Page MenuHomeFreeBSD

No OneTemporary

This file is larger than 256 KB, so syntax highlighting was skipped.
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 8bf605e5e76b..533c5b1f6ff0 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -1,19179 +1,19183 @@
//===--- SemaOpenMP.cpp - Semantic Analysis for OpenMP constructs ---------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
/// \file
/// This file implements semantic analysis for OpenMP directives and
/// clauses.
#include "TreeTransform.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/ASTMutationListener.h"
#include "clang/AST/CXXInheritance.h"
#include "clang/AST/Decl.h"
#include "clang/AST/DeclCXX.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtCXX.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/AST/TypeOrdering.h"
#include "clang/Basic/DiagnosticSema.h"
#include "clang/Basic/OpenMPKinds.h"
#include "clang/Basic/PartialDiagnostic.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Sema/Initialization.h"
#include "clang/Sema/Lookup.h"
#include "clang/Sema/Scope.h"
#include "clang/Sema/ScopeInfo.h"
#include "clang/Sema/SemaInternal.h"
#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/PointerEmbeddedInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include <set>
using namespace clang;
using namespace llvm::omp;
// Stack of data-sharing attributes for variables
/// Forward declaration of a file-local (static) helper; its definition is not
/// part of this section. It receives the expression \p E, a component list
/// \p CurComponents by non-const reference, the clause kind \p CKind and the
/// \p NoDiagnose flag, and returns a `const Expr *`.
// NOTE(review): judging by the name it validates the base of a map-clause
// expression and fills CurComponents — confirm against the definition.
static const Expr *checkMapClauseExpressionBase(
Sema &SemaRef, Expr *E,
OMPClauseMappableExprCommon::MappableExprComponentList &CurComponents,
OpenMPClauseKind CKind, bool NoDiagnose);
namespace {
/// Default data sharing attributes, which can be applied to directive.
/// DSA_unspecified is zero; every other enumerator occupies its own bit.
enum DefaultDataSharingAttributes {
DSA_unspecified = 0, ///< Data sharing attribute not specified.
DSA_none = 1 << 0, ///< Default data sharing attribute 'none'.
DSA_shared = 1 << 1, ///< Default data sharing attribute 'shared'.
DSA_firstprivate = 1 << 2, ///< Default data sharing attribute 'firstprivate'.
/// Stack for tracking declarations used in OpenMP directives and
/// clauses and their data-sharing attributes.
class DSAStackTy {
/// Per-variable data-sharing record. It is the return type of the DSA query
/// functions declared further down (getDSA, getTopDSA, getImplicitDSA, hasDSA,
/// hasInnermostDSA, getTopMostTaskgroupReductionData).
struct DSAVarData {
/// Directive kind stored with the record; OMPD_unknown by default.
OpenMPDirectiveKind DKind = OMPD_unknown;
/// Clause kind stored with the record; OMPC_unknown by default.
OpenMPClauseKind CKind = OMPC_unknown;
/// Clause modifier value; 0 by default.
unsigned Modifier = 0;
/// Expression stored with the record, or nullptr.
const Expr *RefExpr = nullptr;
/// Reference to a private copy, or nullptr.
DeclRefExpr *PrivateCopy = nullptr;
// NOTE(review): presumably the location at which an implicit attribute was
// determined — confirm against the code that fills it in.
SourceLocation ImplicitDSALoc;
/// Leaves every field at its in-class default.
DSAVarData() = default;
/// Member-wise constructor. Note that \p Modifier is the last parameter
/// although the field is declared third.
DSAVarData(OpenMPDirectiveKind DKind, OpenMPClauseKind CKind,
const Expr *RefExpr, DeclRefExpr *PrivateCopy,
SourceLocation ImplicitDSALoc, unsigned Modifier)
: DKind(DKind), CKind(CKind), Modifier(Modifier), RefExpr(RefExpr),
PrivateCopy(PrivateCopy), ImplicitDSALoc(ImplicitDSALoc) {}
/// Small vector of (expression, overloaded-operator kind) pairs.
using OperatorOffsetTy =
llvm::SmallVector<std::pair<Expr *, OverloadedOperatorKind>, 4>;
/// Maps a 'depend' clause to its OperatorOffsetTy list; entries are added by
/// addDoacrossDependClause().
using DoacrossDependMapTy =
llvm::DenseMap<OMPDependClause *, OperatorOffsetTy>;
/// Kind of the declaration used in the uses_allocators clauses.
enum class UsesAllocatorsDeclKind {
// NOTE(review): only the per-enumerator doc comments survive in this view;
// the enumerator names themselves are missing and must be restored from the
// complete file before this compiles.
/// Predefined allocator
/// User-defined allocator
/// The declaration that represents an allocator trait
/// Data-sharing information stored per declaration; the mapped type of
/// DeclSAMapTy.
struct DSAInfo {
/// Clause kind of the attribute; OMPC_unknown by default.
OpenMPClauseKind Attributes = OMPC_unknown;
/// Clause modifier value; 0 by default.
unsigned Modifier = 0;
/// Pointer to a reference expression and a flag which shows that the
/// variable is marked as lastprivate(true) or not (false).
llvm::PointerIntPair<const Expr *, 1, bool> RefExpr;
/// Reference to a private copy, or nullptr.
DeclRefExpr *PrivateCopy = nullptr;
/// Declaration -> DSAInfo map.
using DeclSAMapTy = llvm::SmallDenseMap<const ValueDecl *, DSAInfo, 8>;
/// Declaration -> expression map; the type of AlignedMap and NontemporalMap
/// in SharingMapTy.
using UsedRefMapTy = llvm::SmallDenseMap<const ValueDecl *, const Expr *, 8>;
/// (unsigned, VarDecl *) pair kept per loop control variable.
// NOTE(review): the unsigned is presumably the index of the loop the variable
// controls — confirm against addLoopControlVariable().
using LCDeclInfo = std::pair<unsigned, VarDecl *>;
/// Declaration -> LCDeclInfo map.
using LoopControlVariablesMapTy =
llvm::SmallDenseMap<const ValueDecl *, LCDeclInfo, 8>;
/// Struct that associates a component with the clause kind where they are
/// found.
struct MappedExprComponentTy {
/// Component lists; addMappableExpressionComponents() appends one list per
/// call.
OMPClauseMappableExprCommon::MappableExprComponentLists Components;
/// Clause kind recorded by the most recent
/// addMappableExpressionComponents() call; OMPC_unknown by default.
OpenMPClauseKind Kind = OMPC_unknown;
/// Declaration -> MappedExprComponentTy map.
using MappedExprComponentsTy =
llvm::DenseMap<const ValueDecl *, MappedExprComponentTy>;
/// String-keyed map of (critical directive, hint value) pairs; the key is the
/// directive name (see addCriticalWithHint()).
using CriticalsWithHintsTy =
llvm::StringMap<std::pair<const OMPCriticalDirective *, llvm::APSInt>>;
/// Reduction information stored per declaration; the mapped type of
/// DeclReductionMapTy.
struct ReductionData {
/// Lets a BinaryOperatorKind value be stored inside the pointer union below.
using BOKPtrType = llvm::PointerEmbeddedInt<BinaryOperatorKind, 16>;
/// Source range recorded by set().
SourceRange ReductionRange;
/// Holds either an expression or an embedded BinaryOperatorKind, depending
/// on which set() overload was called last.
llvm::PointerUnion<const Expr *, BOKPtrType> ReductionOp;
ReductionData() = default;
/// Records a reduction identified by the binary operator \p BO together
/// with its source range \p RR.
void set(BinaryOperatorKind BO, SourceRange RR) {
ReductionRange = RR;
ReductionOp = BO;
/// Records a reduction identified by the expression \p RefExpr together
/// with its source range \p RR.
void set(const Expr *RefExpr, SourceRange RR) {
ReductionRange = RR;
ReductionOp = RefExpr;
/// Declaration -> ReductionData map; the type of SharingMapTy::ReductionMap.
using DeclReductionMapTy =
llvm::SmallDenseMap<const ValueDecl *, ReductionData, 4>;
/// Implicit behavior of one defaultmap category plus the source location
/// recorded with it (see setDefaultDMAAttr()).
struct DefaultmapInfo {
// NOTE(review): the initializer of ImplicitBehavior is cut off in this view
// (the line ends with '='). Presumably it is OMPC_DEFAULTMAP_MODIFIER_unknown,
// the value checkDefaultmapCategory() compares against — confirm.
OpenMPDefaultmapClauseModifier ImplicitBehavior =
SourceLocation SLoc;
DefaultmapInfo() = default;
/// Stores the modifier \p M and the location \p Loc.
DefaultmapInfo(OpenMPDefaultmapClauseModifier M, SourceLocation Loc)
: ImplicitBehavior(M), SLoc(Loc) {}
/// State kept for one directive on the stack: data-sharing maps, region flags
/// and source locations. push() appends one instance per directive.
struct SharingMapTy {
/// Data-sharing information per declaration.
DeclSAMapTy SharingMap;
/// Reduction information per declaration.
DeclReductionMapTy ReductionMap;
// NOTE(review): the next two maps presumably back addUniqueAligned() and
// addUniqueNontemporal() — confirm against their definitions.
UsedRefMapTy AlignedMap;
UsedRefMapTy NontemporalMap;
/// Mappable-expression component lists per declaration (see
/// addMappableExpressionComponents()).
MappedExprComponentsTy MappedExprComponents;
/// Loop control variables per declaration.
LoopControlVariablesMapTy LCVMap;
/// Default data-sharing attribute; changed by the setDefaultDSA*() helpers.
DefaultDataSharingAttributes DefaultAttr = DSA_unspecified;
/// Location paired with DefaultAttr; push() initializes it to the construct
/// location.
SourceLocation DefaultAttrLoc;
/// One DefaultmapInfo per defaultmap category, indexed by
/// OpenMPDefaultmapClauseKind.
DefaultmapInfo DefaultmapMap[OMPC_DEFAULTMAP_unknown];
/// Kind of the directive this entry describes.
OpenMPDirectiveKind Directive = OMPD_unknown;
/// Name information passed to push().
DeclarationNameInfo DirectiveName;
/// Scope passed to push().
Scope *CurScope = nullptr;
/// Location passed to push().
SourceLocation ConstructLoc;
/// Set of 'depend' clauses with 'sink|source' dependence kind. Required to
/// get the data (loop counters etc.) about enclosing loop-based construct.
/// This data is required during codegen.
DoacrossDependMapTy DoacrossDepends;
/// Engaged only when the region has an 'ordered' clause (see
/// setOrderedRegion()). The first element (Expr *) is the optional argument
/// of the 'ordered' clause, the second one is the clause itself.
llvm::Optional<std::pair<const Expr *, OMPOrderedClause *>> OrderedRegion;
/// Collapse value for the region (see setAssociatedLoops()).
unsigned AssociatedLoops = 1;
/// Set once setAssociatedLoops() is called with a value above 1.
bool HasMutipleLoops = false;
/// Canonical declaration of the possible loop counter (see
/// resetPossibleLoopCounter()).
const Decl *PossiblyLoopCounter = nullptr;
/// Written by setNowaitRegion().
bool NowaitRegion = false;
/// Written by setParentCancelRegion() on behalf of a nested directive.
bool CancelRegion = false;
/// Set by loopInit() and cleared by loopStart().
bool LoopStart = false;
/// Set by setBodyComplete().
bool BodyComplete = false;
/// Written by setParentHasScanDirective() on behalf of a nested directive.
SourceLocation PrevScanLocation;
/// Written by setParentHasOrderedDirective() on behalf of a nested
/// directive.
SourceLocation PrevOrderedLocation;
/// Written by setParentTeamsRegionLoc() on behalf of a nested directive.
SourceLocation InnerTeamsRegionLoc;
/// Reference to the taskgroup task_reduction reference expression.
Expr *TaskgroupReductionRef = nullptr;
// NOTE(review): the next three members are not referenced in the visible part
// of the file; their exact purpose should be confirmed at their use sites.
llvm::DenseSet<QualType> MappedClassesQualTypes;
SmallVector<Expr *, 4> InnerUsedAllocators;
llvm::DenseSet<CanonicalDeclPtr<Decl>> ImplicitTaskFirstprivates;
/// List of globals marked as declare target link in this target region
/// (isOpenMPTargetExecutionDirective(Directive) == true).
llvm::SmallVector<DeclRefExpr *, 4> DeclareTargetLinkVarDecls;
/// List of decls used in inclusive/exclusive clauses of the scan directive.
llvm::DenseSet<CanonicalDeclPtr<Decl>> UsedInScanDirective;
llvm::DenseMap<CanonicalDeclPtr<const Decl>, UsesAllocatorsDeclKind>
// NOTE(review): the member name for the map type on the previous line is
// missing in this view.
/// Creates the entry for directive \p DKind named \p Name, remembering the
/// scope and the construct location. All other members keep their in-class
/// defaults.
SharingMapTy(OpenMPDirectiveKind DKind, DeclarationNameInfo Name,
Scope *CurScope, SourceLocation Loc)
: Directive(DKind), DirectiveName(Name), CurScope(CurScope),
ConstructLoc(Loc) {}
SharingMapTy() = default;
/// One region stack: a small vector of per-directive entries.
using StackTy = SmallVector<SharingMapTy, 4>;
/// Stack of used declaration and their data-sharing attributes.
DeclSAMapTy Threadprivates;
/// Function scope the region stacks are currently keyed on; updated by
/// pushFunction() and popFunction().
const FunctionScopeInfo *CurrentNonCapturingFunctionScope = nullptr;
/// (region stack, owning function scope) pairs. push() appends a new pair
/// whenever the last pair does not belong to
/// CurrentNonCapturingFunctionScope.
SmallVector<std::pair<StackTy, const FunctionScopeInfo *>, 4> Stack;
/// Clause kind set through setClauseParsingMode(); OMPC_unknown means that
/// clause parsing mode is off (see isClauseParsingMode()).
OpenMPClauseKind ClauseKindMode = OMPC_unknown;
Sema &SemaRef;
/// Flag exposed through isForceVarCapturing()/setForceVarCapturing().
bool ForceCapturing = false;
/// true if all the variables in the target executable directives must be
/// captured by reference.
bool ForceCaptureByReferenceInTargetExecutable = false;
/// Critical directives with hints, keyed by directive name.
CriticalsWithHintsTy Criticals;
/// Number of innermost entries of the current region stack that begin(),
/// isStackEmpty() and getStackSize() pretend are not there.
unsigned IgnoredStackElements = 0;
/// Iterators over the stack iterate in order from innermost to outermost
/// directive.
using const_iterator = StackTy::const_reverse_iterator;
/// First visible entry of the last region stack: the reverse-begin advanced
/// past IgnoredStackElements entries. A default-constructed iterator is
/// returned when no region stack exists.
const_iterator begin() const {
return Stack.empty() ? const_iterator()
: Stack.back().first.rbegin() + IgnoredStackElements;
/// Reverse-end of the last region stack, or a default-constructed iterator
/// when no region stack exists.
const_iterator end() const {
return Stack.empty() ? const_iterator() : Stack.back().first.rend();
using iterator = StackTy::reverse_iterator;
/// Mutable counterpart of begin() const.
iterator begin() {
return Stack.empty() ? iterator()
: Stack.back().first.rbegin() + IgnoredStackElements;
/// Mutable counterpart of end() const.
iterator end() {
return Stack.empty() ? iterator() : Stack.back().first.rend();
// Convenience operations to get at the elements of the stack.
/// Returns true when no directive entry is visible: there is no region stack
/// at all, the last region stack belongs to a different function scope than
/// CurrentNonCapturingFunctionScope, or all of its entries are ignored.
bool isStackEmpty() const {
return Stack.empty() ||
Stack.back().second != CurrentNonCapturingFunctionScope ||
Stack.back().first.size() <= IgnoredStackElements;
/// Number of visible entries: the size of the current region stack minus the
/// ignored entries, or 0 when isStackEmpty().
size_t getStackSize() const {
return isStackEmpty() ? 0
: Stack.back().first.size() - IgnoredStackElements;
/// Returns the innermost visible entry, or nullptr when none is visible.
SharingMapTy *getTopOfStackOrNull() {
size_t Size = getStackSize();
if (Size == 0)
return nullptr;
// Size already excludes ignored entries, so Size - 1 is the innermost entry
// that is still visible.
return &Stack.back().first[Size - 1];
/// Const overload; forwards to the non-const version.
const SharingMapTy *getTopOfStackOrNull() const {
return const_cast<DSAStackTy&>(*this).getTopOfStackOrNull();
/// Returns the innermost visible entry; asserts that there is one.
SharingMapTy &getTopOfStack() {
assert(!isStackEmpty() && "no current directive");
return *getTopOfStackOrNull();
/// Const overload; forwards to the non-const version.
const SharingMapTy &getTopOfStack() const {
return const_cast<DSAStackTy&>(*this).getTopOfStack();
/// Returns the entry directly enclosing the innermost visible one (its
/// parent), or nullptr when fewer than two entries are visible.
SharingMapTy *getSecondOnStackOrNull() {
size_t Size = getStackSize();
if (Size <= 1)
return nullptr;
return &Stack.back().first[Size - 2];
/// Const overload; forwards to the non-const version.
const SharingMapTy *getSecondOnStackOrNull() const {
return const_cast<DSAStackTy&>(*this).getSecondOnStackOrNull();
/// Get the stack element at a certain level (previously returned by
/// \c getNestingLevel).
/// Note that nesting levels count from outermost to innermost, and this is
/// the reverse of our iteration order where new inner levels are pushed at
/// the front of the stack.
/// \p Level indexes the underlying vector directly and must be smaller than
/// getStackSize(); this is checked by an assertion.
SharingMapTy &getStackElemAtLevel(unsigned Level) {
assert(Level < getStackSize() && "no such stack element");
return Stack.back().first[Level];
/// Const overload; forwards to the non-const version.
const SharingMapTy &getStackElemAtLevel(unsigned Level) const {
return const_cast<DSAStackTy&>(*this).getStackElemAtLevel(Level);
/// Declaration only; the definition is outside this section. \p Iter is
/// taken by non-const reference.
// NOTE(review): presumably computes the data-sharing record of \p D starting
// at the entry \p Iter points to and may advance Iter — confirm against the
// definition.
DSAVarData getDSA(const_iterator &Iter, ValueDecl *D) const;
/// Checks if the variable is a local for OpenMP region.
bool isOpenMPLocal(VarDecl *D, const_iterator Iter) const;
/// Vector of previously declared requires directives
SmallVector<const OMPRequiresDecl *, 2> RequiresDecls;
/// omp_allocator_handle_t type.
QualType OMPAllocatorHandleT;
/// omp_depend_t type.
QualType OMPDependT;
/// omp_event_handle_t type.
QualType OMPEventHandleT;
/// omp_alloctrait_t type.
QualType OMPAlloctraitT;
/// Expression for the predefined allocators.
/// Indexed by OMPAllocateDeclAttr::AllocatorTypeTy (see setAllocator() and
/// getAllocator()).
// NOTE(review): the initializer is cut off after '{' in this view.
Expr *OMPPredefinedAllocators[OMPAllocateDeclAttr::OMPUserDefinedMemAlloc] = {
/// Vector of previously encountered target directives
SmallVector<SourceLocation, 2> TargetLocations;
/// Location of the first atomic directive recorded through
/// addAtomicDirectiveLoc(); invalid until then.
SourceLocation AtomicLocation;
/// Binds the stack to the Sema instance \p S; every other member keeps its
/// in-class initializer.
explicit DSAStackTy(Sema &S) : SemaRef(S) {}
/// Sets omp_allocator_handle_t type.
void setOMPAllocatorHandleT(QualType Ty) { OMPAllocatorHandleT = Ty; }
/// Gets omp_allocator_handle_t type.
QualType getOMPAllocatorHandleT() const { return OMPAllocatorHandleT; }
/// Sets omp_alloctrait_t type.
void setOMPAlloctraitT(QualType Ty) { OMPAlloctraitT = Ty; }
/// Gets omp_alloctrait_t type.
QualType getOMPAlloctraitT() const { return OMPAlloctraitT; }
/// Sets the given default allocator.
/// \p AllocatorKind is used directly as an index into
/// OMPPredefinedAllocators; no range check is performed here.
void setAllocator(OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind,
Expr *Allocator) {
OMPPredefinedAllocators[AllocatorKind] = Allocator;
/// Returns the specified default allocator.
/// \p AllocatorKind is used directly as an index into
/// OMPPredefinedAllocators; no range check is performed here.
Expr *getAllocator(OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind) const {
return OMPPredefinedAllocators[AllocatorKind];
/// Sets omp_depend_t type.
void setOMPDependT(QualType Ty) { OMPDependT = Ty; }
/// Gets omp_depend_t type.
QualType getOMPDependT() const { return OMPDependT; }
/// Sets omp_event_handle_t type.
void setOMPEventHandleT(QualType Ty) { OMPEventHandleT = Ty; }
/// Gets omp_event_handle_t type.
QualType getOMPEventHandleT() const { return OMPEventHandleT; }
/// Returns true while a clause kind other than OMPC_unknown is set.
bool isClauseParsingMode() const { return ClauseKindMode != OMPC_unknown; }
/// Returns the clause kind set by setClauseParsingMode(); asserts that clause
/// parsing mode is active.
OpenMPClauseKind getClauseParsingMode() const {
assert(isClauseParsingMode() && "Must be in clause parsing mode.");
return ClauseKindMode;
/// Sets the clause kind; passing OMPC_unknown turns clause parsing mode off.
void setClauseParsingMode(OpenMPClauseKind K) { ClauseKindMode = K; }
/// Returns true if there is a current directive and its BodyComplete flag is
/// set; false when no directive is visible.
bool isBodyComplete() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top && Top->BodyComplete;
/// Sets the BodyComplete flag of the current directive. getTopOfStack()
/// asserts that a current directive exists.
void setBodyComplete() {
getTopOfStack().BodyComplete = true;
/// Returns the ForceCapturing flag.
bool isForceVarCapturing() const { return ForceCapturing; }
/// Sets the ForceCapturing flag to \p V.
void setForceVarCapturing(bool V) { ForceCapturing = V; }
/// Sets the flag that forces by-reference capture in target executable
/// directives to \p V.
void setForceCaptureByReferenceInTargetExecutable(bool V) {
ForceCaptureByReferenceInTargetExecutable = V;
/// Returns the flag set by setForceCaptureByReferenceInTargetExecutable().
bool isForceCaptureByReferenceInTargetExecutable() const {
return ForceCaptureByReferenceInTargetExecutable;
/// Pushes a new entry for the directive \p DKind named \p DirName, with scope
/// \p CurScope and construct location \p Loc.
/// A fresh region stack is started first if none exists yet or the last one
/// belongs to a function scope other than CurrentNonCapturingFunctionScope.
/// Must not be called while stack elements are being ignored.
void push(OpenMPDirectiveKind DKind, const DeclarationNameInfo &DirName,
Scope *CurScope, SourceLocation Loc) {
assert(!IgnoredStackElements &&
"cannot change stack while ignoring elements");
if (Stack.empty() ||
Stack.back().second != CurrentNonCapturingFunctionScope)
Stack.emplace_back(StackTy(), CurrentNonCapturingFunctionScope);
Stack.back().first.emplace_back(DKind, DirName, CurScope, Loc);
// The default-attribute location starts out as the construct location; the
// setDefaultDSA*() helpers overwrite it later.
Stack.back().first.back().DefaultAttrLoc = Loc;
/// Removes the innermost directive entry from the last region stack; the
/// inverse of push().
/// Must not be called while stack elements are being ignored, nor when the
/// last region stack has no entries (both are checked by assertions).
void pop() {
  assert(!IgnoredStackElements &&
         "cannot change stack while ignoring elements");
  assert(!Stack.back().first.empty() &&
         "Data-sharing attributes stack is empty!");
  // Drop the entry that the matching push() appended.
  Stack.back().first.pop_back();
}
/// RAII object to temporarily leave the scope of a directive when we want to
/// logically operate in its parent.
class ParentDirectiveScope {
DSAStackTy &Self;
bool Active;
ParentDirectiveScope(DSAStackTy &Self, bool Activate)
: Self(Self), Active(false) {
if (Activate)
~ParentDirectiveScope() { disable(); }
void disable() {
if (Active) {
Active = false;
void enable() {
if (!Active) {
Active = true;
/// Marks that we're started loop parsing.
/// Sets the LoopStart flag of the current directive, which must be a
/// loop-based directive (checked by an assertion).
void loopInit() {
assert(isOpenMPLoopDirective(getCurrentDirective()) &&
"Expected loop-based directive.");
getTopOfStack().LoopStart = true;
/// Start capturing of the variables in the loop context.
/// Clears the LoopStart flag of the current loop-based directive.
void loopStart() {
assert(isOpenMPLoopDirective(getCurrentDirective()) &&
"Expected loop-based directive.");
getTopOfStack().LoopStart = false;
/// true, if variables are captured, false otherwise.
/// This is the negation of the LoopStart flag: true after loopStart() (and
/// before any loopInit()), false between loopInit() and loopStart().
bool isLoopStarted() const {
assert(isOpenMPLoopDirective(getCurrentDirective()) &&
"Expected loop-based directive.");
return !getTopOfStack().LoopStart;
/// Marks (or clears) declaration as possibly loop counter.
/// A non-null \p D is stored as its canonical declaration; a null \p D (the
/// default) clears the mark on the current directive.
void resetPossibleLoopCounter(const Decl *D = nullptr) {
getTopOfStack().PossiblyLoopCounter =
D ? D->getCanonicalDecl() : D;
/// Gets the possible loop counter decl.
/// Returns the value last stored by resetPossibleLoopCounter() on the
/// current directive (nullptr when cleared or never set).
const Decl *getPossiblyLoopCunter() const {
return getTopOfStack().PossiblyLoopCounter;
/// Start new OpenMP region stack in new non-capturing function.
/// Only records Sema's current function scope as
/// CurrentNonCapturingFunctionScope; the region stack itself is created by
/// the next push(). Must not be called while stack elements are ignored.
void pushFunction() {
assert(!IgnoredStackElements &&
"cannot change stack while ignoring elements");
const FunctionScopeInfo *CurFnScope = SemaRef.getCurFunction();
CurrentNonCapturingFunctionScope = CurFnScope;
/// Pop region stack for non-capturing function.
void popFunction(const FunctionScopeInfo *OldFSI) {
assert(!IgnoredStackElements &&
"cannot change stack while ignoring elements");
if (!Stack.empty() && Stack.back().second == OldFSI) {
CurrentNonCapturingFunctionScope = nullptr;
for (const FunctionScopeInfo *FSI : llvm::reverse(SemaRef.FunctionScopes)) {
if (!isa<CapturingScopeInfo>(FSI)) {
CurrentNonCapturingFunctionScope = FSI;
/// Records the critical directive \p D together with \p Hint under the
/// directive's name. try_emplace leaves an already existing entry for that
/// name unchanged.
void addCriticalWithHint(const OMPCriticalDirective *D, llvm::APSInt Hint) {
Criticals.try_emplace(D->getDirectiveName().getAsString(), D, Hint);
/// Returns the (directive, hint) pair recorded for \p Name, or a pair of
/// nullptr and a default-constructed APSInt when nothing was recorded.
const std::pair<const OMPCriticalDirective *, llvm::APSInt>
getCriticalWithHint(const DeclarationNameInfo &Name) const {
auto I = Criticals.find(Name.getAsString());
if (I != Criticals.end())
return I->second;
return std::make_pair(nullptr, llvm::APSInt());
/// If 'aligned' declaration for given variable \a D was not seen yet,
/// add it and return NULL; otherwise return previous occurrence's expression
/// for diagnostics.
const Expr *addUniqueAligned(const ValueDecl *D, const Expr *NewDE);
/// If 'nontemporal' declaration for given variable \a D was not seen yet,
/// add it and return NULL; otherwise return previous occurrence's expression
/// for diagnostics.
const Expr *addUniqueNontemporal(const ValueDecl *D, const Expr *NewDE);
/// Register specified variable as loop control variable.
void addLoopControlVariable(const ValueDecl *D, VarDecl *Capture);
/// Check if the specified variable is a loop control variable for
/// current region.
/// \return The index of the loop control variable in the list of associated
/// for-loops (from outer to inner).
const LCDeclInfo isLoopControlVariable(const ValueDecl *D) const;
/// Check if the specified variable is a loop control variable for
/// parent region.
/// \return The index of the loop control variable in the list of associated
/// for-loops (from outer to inner).
const LCDeclInfo isParentLoopControlVariable(const ValueDecl *D) const;
/// Check if the specified variable is a loop control variable for
/// current region.
/// \return The index of the loop control variable in the list of associated
/// for-loops (from outer to inner).
// NOTE(review): the comment above says "current region", but this overload
// takes a \p Level; presumably it inspects the region at that nesting level —
// confirm against the definition.
const LCDeclInfo isLoopControlVariable(const ValueDecl *D,
unsigned Level) const;
/// Get the loop control variable for the I-th loop (or nullptr) in
/// parent directive.
const ValueDecl *getParentLoopControlVariable(unsigned I) const;
/// Marks the specified decl \p D as used in scan directive.
/// The mark is stored on the parent (second-from-top) entry, which is the
/// set isUsedInScanDirective() reads once that entry is the current one.
/// Does nothing when there is no parent entry.
void markDeclAsUsedInScanDirective(ValueDecl *D) {
  if (SharingMapTy *Stack = getSecondOnStackOrNull())
    Stack->UsedInScanDirective.insert(D);
}
/// Checks if the specified declaration was used in the inner scan directive.
/// Looks \p D up in the UsedInScanDirective set of the current (top) entry;
/// returns false when no directive is visible.
bool isUsedInScanDirective(ValueDecl *D) const {
// Note: this local pointer shadows the Stack data member.
if (const SharingMapTy *Stack = getTopOfStackOrNull())
return Stack->UsedInScanDirective.count(D) > 0;
return false;
/// Adds explicit data sharing attribute to the specified declaration.
/// \p PrivateCopy and \p Modifier are optional and default to nullptr and 0.
void addDSA(const ValueDecl *D, const Expr *E, OpenMPClauseKind A,
DeclRefExpr *PrivateCopy = nullptr, unsigned Modifier = 0);
/// Adds additional information for the reduction items with the reduction id
/// represented as an operator.
void addTaskgroupReductionData(const ValueDecl *D, SourceRange SR,
BinaryOperatorKind BOK);
/// Adds additional information for the reduction items with the reduction id
/// represented as reduction identifier.
void addTaskgroupReductionData(const ValueDecl *D, SourceRange SR,
const Expr *ReductionRef);
/// Returns the location and reduction operation from the innermost parent
/// region for the given \p D.
/// Operator-based overload: \p SR, \p BOK and \p TaskgroupDescriptor are
/// reference (output) parameters.
const DSAVarData
getTopMostTaskgroupReductionData(const ValueDecl *D, SourceRange &SR,
BinaryOperatorKind &BOK,
Expr *&TaskgroupDescriptor) const;
/// Returns the location and reduction operation from the innermost parent
/// region for the given \p D.
/// Expression-based overload: \p SR, \p ReductionRef and
/// \p TaskgroupDescriptor are reference (output) parameters.
const DSAVarData
getTopMostTaskgroupReductionData(const ValueDecl *D, SourceRange &SR,
const Expr *&ReductionRef,
Expr *&TaskgroupDescriptor) const;
/// Return reduction reference expression for the current taskgroup or
/// parallel/worksharing directives with task reductions.
/// Asserts that the current directive is either a taskgroup, or a parallel
/// or worksharing directive that is not a simd directive.
Expr *getTaskgroupReductionRef() const {
assert((getTopOfStack().Directive == OMPD_taskgroup ||
((isOpenMPParallelDirective(getTopOfStack().Directive) ||
isOpenMPWorksharingDirective(getTopOfStack().Directive)) &&
!isOpenMPSimdDirective(getTopOfStack().Directive))) &&
"taskgroup reference expression requested for non taskgroup or "
"parallel/worksharing directive.");
return getTopOfStack().TaskgroupReductionRef;
/// Checks if the given \p VD declaration is actually a taskgroup reduction
/// descriptor variable at the \p Level of OpenMP regions.
/// Returns false when the entry at \p Level has no TaskgroupReductionRef.
bool isTaskgroupReductionRef(const ValueDecl *VD, unsigned Level) const {
return getStackElemAtLevel(Level).TaskgroupReductionRef &&
// NOTE(review): the left-hand operand of the member access on the next line
// is missing in this view; presumably it is the TaskgroupReductionRef of the
// same entry, cast to a declaration reference — confirm.
->getDecl() == VD;
/// Returns data sharing attributes from top of the stack for the
/// specified declaration.
/// Unlike the other queries below, this one is not const.
const DSAVarData getTopDSA(ValueDecl *D, bool FromParent);
/// Returns data-sharing attributes for the specified declaration.
const DSAVarData getImplicitDSA(ValueDecl *D, bool FromParent) const;
/// Returns data-sharing attributes for the specified declaration.
/// This overload selects the region by nesting \p Level instead of a
/// FromParent flag.
const DSAVarData getImplicitDSA(ValueDecl *D, unsigned Level) const;
/// Checks if the specified variables has data-sharing attributes which
/// match specified \a CPred predicate in any directive which matches \a DPred
/// predicate.
const DSAVarData
hasDSA(ValueDecl *D, const llvm::function_ref<bool(OpenMPClauseKind)> CPred,
const llvm::function_ref<bool(OpenMPDirectiveKind)> DPred,
bool FromParent) const;
/// Checks if the specified variables has data-sharing attributes which
/// match specified \a CPred predicate in any innermost directive which
/// matches \a DPred predicate.
const DSAVarData
hasInnermostDSA(ValueDecl *D,
const llvm::function_ref<bool(OpenMPClauseKind)> CPred,
const llvm::function_ref<bool(OpenMPDirectiveKind)> DPred,
bool FromParent) const;
/// Checks if the specified variables has explicit data-sharing
/// attributes which match specified \a CPred predicate at the specified
/// OpenMP region.
bool hasExplicitDSA(const ValueDecl *D,
const llvm::function_ref<bool(OpenMPClauseKind)> CPred,
unsigned Level, bool NotLastprivate = false) const;
/// Returns true if the directive at level \p Level matches in the
/// specified \a DPred predicate.
bool hasExplicitDirective(
const llvm::function_ref<bool(OpenMPDirectiveKind)> DPred,
unsigned Level) const;
/// Finds a directive which matches specified \a DPred predicate.
/// The predicate receives the directive kind, its name information and a
/// source location.
// NOTE(review): the line naming the predicate parameter (DPred, per the
// comment above) is missing after the function_ref type in this view.
bool hasDirective(
const llvm::function_ref<bool(
OpenMPDirectiveKind, const DeclarationNameInfo &, SourceLocation)>
bool FromParent) const;
/// Returns currently analyzed directive.
/// Yields OMPD_unknown when no directive is visible on the stack.
OpenMPDirectiveKind getCurrentDirective() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->Directive : OMPD_unknown;
/// Returns directive kind at specified level.
/// Asserts that the stack is not empty; getStackElemAtLevel() additionally
/// asserts that \p Level is in range.
OpenMPDirectiveKind getDirective(unsigned Level) const {
assert(!isStackEmpty() && "No directive at specified level.");
return getStackElemAtLevel(Level).Directive;
/// Returns the capture region at the specified level.
/// The capture regions of the directive at \p Level are collected with
/// getOpenMPCaptureRegions() and indexed with \p OpenMPCaptureLevel; no
/// explicit range check on that index is made here.
OpenMPDirectiveKind getCaptureRegion(unsigned Level,
unsigned OpenMPCaptureLevel) const {
SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, getDirective(Level));
return CaptureRegions[OpenMPCaptureLevel];
/// Returns parent directive.
/// Yields OMPD_unknown when fewer than two entries are visible.
OpenMPDirectiveKind getParentDirective() const {
const SharingMapTy *Parent = getSecondOnStackOrNull();
return Parent ? Parent->Directive : OMPD_unknown;
/// Add requires decl to internal vector
// NOTE(review): the body is empty in this view, so \p RD is never stored.
// Presumably the missing statement appends RD to RequiresDecls, the vector
// that hasRequiresDeclWithClause() and hasDuplicateRequiresClause() read —
// confirm against the complete file.
void addRequiresDecl(OMPRequiresDecl *RD) {
/// Checks if the defined 'requires' directive has specified type of clause.
/// Returns true if at least one recorded requires declaration has at least
/// one clause for which isa<ClauseType> holds.
template <typename ClauseType>
bool hasRequiresDeclWithClause() const {
return llvm::any_of(RequiresDecls, [](const OMPRequiresDecl *D) {
return llvm::any_of(D->clauselists(), [](const OMPClause *C) {
return isa<ClauseType>(C);
/// Checks for a duplicate clause amongst previously declared requires
/// directives
/// Every clause in \p ClauseList is compared by clause kind against every
/// clause of every recorded requires declaration. The scan does not stop at
/// the first match, so every duplicate pair is visited.
/// \return true if at least one duplicate was found.
bool hasDuplicateRequiresClause(ArrayRef<OMPClause *> ClauseList) const {
bool IsDuplicate = false;
for (OMPClause *CNew : ClauseList) {
for (const OMPRequiresDecl *D : RequiresDecls) {
for (const OMPClause *CPrev : D->clauselists()) {
if (CNew->getClauseKind() == CPrev->getClauseKind()) {
// NOTE(review): the two lines below are the tails of diagnostic statements
// whose heads (the calls that the '<<' operators stream into) are missing in
// this view; presumably one reports the new clause and one the previous one.
<< getOpenMPClauseName(CNew->getClauseKind());
<< getOpenMPClauseName(CPrev->getClauseKind());
IsDuplicate = true;
return IsDuplicate;
/// Add location of previously encountered target to internal vector
// NOTE(review): the body is empty in this view, so \p LocStart is never
// stored. Presumably the missing statement appends it to TargetLocations,
// which getEncounteredTargetLocs() returns — confirm against the complete
// file.
void addTargetDirLocation(SourceLocation LocStart) {
/// Add location for the first encountered atomic directive.
/// Only the first call has an effect: once AtomicLocation is valid, later
/// locations are ignored.
void addAtomicDirectiveLoc(SourceLocation Loc) {
if (AtomicLocation.isInvalid())
AtomicLocation = Loc;
/// Returns the location of the first encountered atomic directive in the
/// module.
SourceLocation getAtomicDirectiveLoc() const {
return AtomicLocation;
// Return previously encountered target region locations.
ArrayRef<SourceLocation> getEncounteredTargetLocs() const {
return TargetLocations;
/// Set default data sharing attribute to none.
/// Also records \p Loc as the location of the default attribute on the
/// current directive.
void setDefaultDSANone(SourceLocation Loc) {
getTopOfStack().DefaultAttr = DSA_none;
getTopOfStack().DefaultAttrLoc = Loc;
/// Set default data sharing attribute to shared.
/// Also records \p Loc as the location of the default attribute on the
/// current directive.
void setDefaultDSAShared(SourceLocation Loc) {
getTopOfStack().DefaultAttr = DSA_shared;
getTopOfStack().DefaultAttrLoc = Loc;
/// Set default data sharing attribute to firstprivate.
/// Also records \p Loc as the location of the default attribute on the
/// current directive.
void setDefaultDSAFirstPrivate(SourceLocation Loc) {
getTopOfStack().DefaultAttr = DSA_firstprivate;
getTopOfStack().DefaultAttrLoc = Loc;
/// Set default data mapping attribute to Modifier:Kind
/// Stores \p M and \p Loc in the DefaultmapMap slot of the current directive
/// that is selected by \p Kind.
void setDefaultDMAAttr(OpenMPDefaultmapClauseModifier M,
OpenMPDefaultmapClauseKind Kind,
SourceLocation Loc) {
DefaultmapInfo &DMI = getTopOfStack().DefaultmapMap[Kind];
DMI.ImplicitBehavior = M;
DMI.SLoc = Loc;
/// Check whether the implicit-behavior has been set in defaultmap
/// For a specific category, returns whether that category's ImplicitBehavior
/// differs from the comparison value; OMPC_DEFAULTMAP_unknown is handled by
/// the separate branch below.
bool checkDefaultmapCategory(OpenMPDefaultmapClauseKind VariableCategory) {
if (VariableCategory == OMPC_DEFAULTMAP_unknown)
// NOTE(review): the expression below is truncated in this view: the
// DefaultmapMap[...] subscripts between getTopOfStack() and .ImplicitBehavior
// are missing. Presumably it ORs the check over three categories — confirm.
return getTopOfStack()
.ImplicitBehavior != OMPC_DEFAULTMAP_MODIFIER_unknown ||
.ImplicitBehavior != OMPC_DEFAULTMAP_MODIFIER_unknown ||
.ImplicitBehavior != OMPC_DEFAULTMAP_MODIFIER_unknown;
// NOTE(review): the right-hand operand of this comparison is also missing;
// presumably OMPC_DEFAULTMAP_MODIFIER_unknown as above.
return getTopOfStack().DefaultmapMap[VariableCategory].ImplicitBehavior !=
/// Returns the default data-sharing attribute of the entry at \p Level, or
/// DSA_unspecified when \p Level is not below the visible stack size.
DefaultDataSharingAttributes getDefaultDSA(unsigned Level) const {
return getStackSize() <= Level ? DSA_unspecified
: getStackElemAtLevel(Level).DefaultAttr;
/// Returns the default data-sharing attribute of the current directive, or
/// DSA_unspecified when no directive is visible.
DefaultDataSharingAttributes getDefaultDSA() const {
return isStackEmpty() ? DSA_unspecified
: getTopOfStack().DefaultAttr;
/// Returns the location recorded with the default data-sharing attribute of
/// the current directive, or an invalid location when no directive is
/// visible.
SourceLocation getDefaultDSALocation() const {
return isStackEmpty() ? SourceLocation()
: getTopOfStack().DefaultAttrLoc;
/// Returns the implicit behavior recorded for category \p Kind on the
/// current directive.
// NOTE(review): the return type line and the '?' operand for the empty-stack
// case are missing in this view; presumably OpenMPDefaultmapClauseModifier
// and OMPC_DEFAULTMAP_MODIFIER_unknown respectively — confirm.
getDefaultmapModifier(OpenMPDefaultmapClauseKind Kind) const {
return isStackEmpty()
: getTopOfStack().DefaultmapMap[Kind].ImplicitBehavior;
/// Returns the implicit behavior recorded for category \p Kind on the entry
/// at \p Level. Unlike getDefaultDSA(Level), there is no fallback here for
/// an out-of-range level; getStackElemAtLevel() asserts on it.
// NOTE(review): the return type line is missing in this view.
getDefaultmapModifierAtLevel(unsigned Level,
OpenMPDefaultmapClauseKind Kind) const {
return getStackElemAtLevel(Level).DefaultmapMap[Kind].ImplicitBehavior;
/// Decides, from the defaultmap modifier recorded for \p Kind at \p Level,
/// whether capture is by reference. Scalars and pointers are decided by the
/// modifier; every other category yields true.
bool isDefaultmapCapturedByRef(unsigned Level,
OpenMPDefaultmapClauseKind Kind) const {
OpenMPDefaultmapClauseModifier M =
getDefaultmapModifierAtLevel(Level, Kind);
if (Kind == OMPC_DEFAULTMAP_scalar || Kind == OMPC_DEFAULTMAP_pointer) {
// NOTE(review): the remaining operands of this '||' chain are missing in this
// view; only the 'alloc' modifier test is visible.
return (M == OMPC_DEFAULTMAP_MODIFIER_alloc) ||
return true;
/// Shared decision logic for mustBeFirstprivate() and
/// mustBeFirstprivateAtLevel(): given modifier \p M for category \p Kind,
/// tells whether the variable must be firstprivate. Aggregates require the
/// explicit 'firstprivate' modifier.
static bool mustBeFirstprivateBase(OpenMPDefaultmapClauseModifier M,
OpenMPDefaultmapClauseKind Kind) {
switch (Kind) {
case OMPC_DEFAULTMAP_scalar:
case OMPC_DEFAULTMAP_pointer:
// NOTE(review): the last operand of this '||' chain is missing in this view;
// the visible operands accept an unset modifier and 'firstprivate'.
return (M == OMPC_DEFAULTMAP_MODIFIER_unknown) ||
(M == OMPC_DEFAULTMAP_MODIFIER_firstprivate) ||
case OMPC_DEFAULTMAP_aggregate:
return M == OMPC_DEFAULTMAP_MODIFIER_firstprivate;
llvm_unreachable("Unexpected OpenMPDefaultmapClauseKind enum");
/// Applies mustBeFirstprivateBase() to the modifier recorded for \p Kind on
/// the entry at \p Level.
bool mustBeFirstprivateAtLevel(unsigned Level,
OpenMPDefaultmapClauseKind Kind) const {
OpenMPDefaultmapClauseModifier M =
getDefaultmapModifierAtLevel(Level, Kind);
return mustBeFirstprivateBase(M, Kind);
/// Applies mustBeFirstprivateBase() to the modifier recorded for \p Kind on
/// the current directive.
bool mustBeFirstprivate(OpenMPDefaultmapClauseKind Kind) const {
OpenMPDefaultmapClauseModifier M = getDefaultmapModifier(Kind);
return mustBeFirstprivateBase(M, Kind);
/// Checks if the specified variable is a threadprivate.
/// Queries getTopDSA() for \p D on the current directive (FromParent is
/// false) and tests the resulting clause kind with isOpenMPThreadPrivate().
/// Not const because getTopDSA() is not const.
bool isThreadPrivate(VarDecl *D) {
const DSAVarData DVar = getTopDSA(D, false);
return isOpenMPThreadPrivate(DVar.CKind);
/// Marks current region as ordered (it has an 'ordered' clause).
/// Stores (\p Param, \p Clause) on the current directive when \p IsOrdered is
/// true.
void setOrderedRegion(bool IsOrdered, const Expr *Param,
OMPOrderedClause *Clause) {
if (IsOrdered)
getTopOfStack().OrderedRegion.emplace(Param, Clause);
/// Returns true, if region is ordered (has associated 'ordered' clause),
/// false - otherwise.
/// Also false when no directive is visible.
bool isOrderedRegion() const {
if (const SharingMapTy *Top = getTopOfStackOrNull())
return Top->OrderedRegion.hasValue();
return false;
/// Returns optional parameter for the ordered region.
/// Yields (nullptr, nullptr) when there is no current directive or it is not
/// an ordered region.
std::pair<const Expr *, OMPOrderedClause *> getOrderedRegionParam() const {
if (const SharingMapTy *Top = getTopOfStackOrNull())
if (Top->OrderedRegion.hasValue())
return Top->OrderedRegion.getValue();
return std::make_pair(nullptr, nullptr);
/// Returns true, if parent region is ordered (has associated
/// 'ordered' clause), false - otherwise.
/// Also false when there is no parent entry.
bool isParentOrderedRegion() const {
if (const SharingMapTy *Parent = getSecondOnStackOrNull())
return Parent->OrderedRegion.hasValue();
return false;
/// Returns optional parameter for the ordered region.
/// This variant reads the parent entry; it yields (nullptr, nullptr) when
/// there is no parent or the parent is not an ordered region.
std::pair<const Expr *, OMPOrderedClause *>
getParentOrderedRegionParam() const {
if (const SharingMapTy *Parent = getSecondOnStackOrNull())
if (Parent->OrderedRegion.hasValue())
return Parent->OrderedRegion.getValue();
return std::make_pair(nullptr, nullptr);
/// Marks current region as nowait (it has a 'nowait' clause).
/// Passing false clears the mark again.
void setNowaitRegion(bool IsNowait = true) {
getTopOfStack().NowaitRegion = IsNowait;
/// Returns true, if parent region is nowait (has associated
/// 'nowait' clause), false - otherwise.
/// Also false when there is no parent entry.
bool isParentNowaitRegion() const {
if (const SharingMapTy *Parent = getSecondOnStackOrNull())
return Parent->NowaitRegion;
return false;
/// Marks parent region as cancel region.
/// The flag is OR-ed in, so passing false never clears an earlier mark.
/// Does nothing when there is no parent entry.
void setParentCancelRegion(bool Cancel = true) {
if (SharingMapTy *Parent = getSecondOnStackOrNull())
Parent->CancelRegion |= Cancel;
/// Return true if current region has inner cancel construct.
/// False when no directive is visible.
bool isCancelRegion() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->CancelRegion : false;
/// Mark that parent region already has scan directive.
/// Stores \p Loc on the parent entry; does nothing when there is no parent.
void setParentHasScanDirective(SourceLocation Loc) {
if (SharingMapTy *Parent = getSecondOnStackOrNull())
Parent->PrevScanLocation = Loc;
/// Return true if the parent region already has a scan directive, i.e. a
/// valid location was recorded through setParentHasScanDirective().
/// False when there is no parent entry.
bool doesParentHasScanDirective() const {
const SharingMapTy *Top = getSecondOnStackOrNull();
return Top ? Top->PrevScanLocation.isValid() : false;
/// Returns the location of the scan directive recorded for the parent
/// region, or an invalid location when there is no parent entry.
SourceLocation getParentScanDirectiveLoc() const {
const SharingMapTy *Top = getSecondOnStackOrNull();
return Top ? Top->PrevScanLocation : SourceLocation();
/// Mark that parent region already has ordered directive.
/// Stores \p Loc on the parent entry; does nothing when there is no parent.
void setParentHasOrderedDirective(SourceLocation Loc) {
if (SharingMapTy *Parent = getSecondOnStackOrNull())
Parent->PrevOrderedLocation = Loc;
/// Return true if the parent region already has an ordered directive, i.e.
/// a valid location was recorded through setParentHasOrderedDirective().
/// False when there is no parent entry.
bool doesParentHasOrderedDirective() const {
const SharingMapTy *Top = getSecondOnStackOrNull();
return Top ? Top->PrevOrderedLocation.isValid() : false;
/// Returns the location of the previously specified ordered directive.
/// Reads the parent entry; yields an invalid location when there is none.
SourceLocation getParentOrderedDirectiveLoc() const {
const SharingMapTy *Top = getSecondOnStackOrNull();
return Top ? Top->PrevOrderedLocation : SourceLocation();
/// Set collapse value for the region.
/// A value above 1 additionally sets HasMutipleLoops; the flag is never
/// cleared here, so it stays set even if a later call passes 1.
void setAssociatedLoops(unsigned Val) {
getTopOfStack().AssociatedLoops = Val;
if (Val > 1)
getTopOfStack().HasMutipleLoops = true;
/// Return collapse value for region.
/// Yields 0 when no directive is visible.
unsigned getAssociatedLoops() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->AssociatedLoops : 0;
/// Returns true if the construct is associated with multiple loops.
/// False when no directive is visible.
bool hasMutipleLoops() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->HasMutipleLoops : false;
/// Marks current target region as one with closely nested teams
/// region.
/// The location is stored on the parent (second-from-top) entry; nothing
/// happens when there is no parent entry.
void setParentTeamsRegionLoc(SourceLocation TeamsRegionLoc) {
if (SharingMapTy *Parent = getSecondOnStackOrNull())
Parent->InnerTeamsRegionLoc = TeamsRegionLoc;
/// Returns true, if current region has closely nested teams region.
bool hasInnerTeamsRegion() const {
return getInnerTeamsRegionLoc().isValid();
/// Returns location of the nested teams region (if any).
/// Reads the current (top) entry; yields an invalid location when no
/// directive is visible.
SourceLocation getInnerTeamsRegionLoc() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->InnerTeamsRegionLoc : SourceLocation();
/// Returns the scope passed to push() for the current directive, or nullptr
/// when no directive is visible.
Scope *getCurScope() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->CurScope : nullptr;
/// Returns the construct location passed to push() for the current
/// directive, or an invalid location when no directive is visible.
SourceLocation getConstructLoc() const {
const SharingMapTy *Top = getTopOfStackOrNull();
return Top ? Top->ConstructLoc : SourceLocation();
/// Do the check specified in \a Check to all component lists and return true
/// if any issue is found.
/// \param VD declaration whose component lists are inspected.
/// \param CurrentRegionOnly if true, only the innermost region is inspected;
/// otherwise the innermost region is skipped and every enclosing region is
/// inspected.
/// \param Check callback invoked with each component list and the clause kind
/// recorded for \p VD in that region; a true result stops the search.
/// \return true as soon as \p Check returns true, false otherwise (including
/// when the stack is empty).
bool checkMappableExprComponentListsForDecl(
    const ValueDecl *VD, bool CurrentRegionOnly,
    const llvm::function_ref<
        bool(OMPClauseMappableExprCommon::MappableExprComponentListRef,
             OpenMPClauseKind)>
        Check) const {
  if (isStackEmpty())
    return false;
  auto SI = begin();
  auto SE = end();
  if (SI == SE)
    return false;
  // Narrow the range to exactly the innermost region, or step past it. These
  // must be mutually exclusive: doing both would leave an empty range.
  if (CurrentRegionOnly)
    SE = std::next(SI);
  else
    std::advance(SI, 1);
  for (; SI != SE; ++SI) {
    auto MI = SI->MappedExprComponents.find(VD);
    if (MI != SI->MappedExprComponents.end())
      for (OMPClauseMappableExprCommon::MappableExprComponentListRef L :
           MI->second.Components)
        if (Check(L, MI->second.Kind))
          return true;
  }
  return false;
}
/// Applies \a Check to the component lists recorded for \a VD on the stack
/// entry at \a Level and returns true as soon as one invocation of \a Check
/// returns true. Returns false when \a Level is not below the stack size or
/// nothing is reported.
///
/// NOTE(review): the function_ref signature, the range expression of the
/// for-loop and the closing braces are not visible in this excerpt.
bool checkMappableExprComponentListsForDeclAtLevel(
const ValueDecl *VD, unsigned Level,
const llvm::function_ref<
Check) const {
if (getStackSize() <= Level)
return false;
const SharingMapTy &StackElem = getStackElemAtLevel(Level);
auto MI = StackElem.MappedExprComponents.find(VD);
if (MI != StackElem.MappedExprComponents.end())
for (OMPClauseMappableExprCommon::MappableExprComponentListRef L :
if (Check(L, MI->second.Kind))
return true;
return false;
/// Create a new mappable expression component list associated with a given
/// declaration and initialize it with the provided list of components.
void addMappableExpressionComponents(
const ValueDecl *VD,
OMPClauseMappableExprCommon::MappableExprComponentListRef Components,
OpenMPClauseKind WhereFoundClauseKind) {
MappedExprComponentTy &MEC = getTopOfStack().MappedExprComponents[VD];
// Create new entry and append the new components there.
MEC.Components.resize(MEC.Components.size() + 1);
MEC.Components.back().append(Components.begin(), Components.end());
MEC.Kind = WhereFoundClauseKind;
unsigned getNestingLevel() const {
return getStackSize() - 1;
/// Registers the depend clause \a C together with its operator offsets
/// \a OpsOffs on the enclosing (second-on-stack) region, which is asserted to
/// exist and to be a worksharing directive. An already present entry for
/// \a C is left untouched (try_emplace).
void addDoacrossDependClause(OMPDependClause *C,
                             const OperatorOffsetTy &OpsOffs) {
  SharingMapTy *Enclosing = getSecondOnStackOrNull();
  assert(Enclosing && isOpenMPWorksharingDirective(Enclosing->Directive));
  Enclosing->DoacrossDepends.try_emplace(C, OpsOffs);
}
/// For a worksharing directive on top of the stack, returns the range over
/// its recorded doacross depend clauses; otherwise builds a range starting at
/// the map's end().
///
/// NOTE(review): the return-type line, the second argument of the final
/// make_range call and the closing braces are not visible in this excerpt.
getDoacrossDependClauses() const {
const SharingMapTy &StackElem = getTopOfStack();
if (isOpenMPWorksharingDirective(StackElem.Directive)) {
const DoacrossDependMapTy &Ref = StackElem.DoacrossDepends;
return llvm::make_range(Ref.begin(), Ref.end());
return llvm::make_range(StackElem.DoacrossDepends.end(),
/// Store types of classes which have been explicitly mapped.
///
/// NOTE(review): only the lookup of the top stack element is visible here;
/// the statement that records \a QT and the closing brace appear to have been
/// lost in extraction -- presumably QT is inserted into
/// MappedClassesQualTypes (see isClassPreviouslyMapped); confirm.
void addMappedClassesQualTypes(QualType QT) {
SharingMapTy &StackElem = getTopOfStack();
/// True when \a QT is present in the set of mapped class types recorded on
/// the current region.
bool isClassPreviouslyMapped(QualType QT) const {
  const auto &Mapped = getTopOfStack().MappedClassesQualTypes;
  return Mapped.count(QT) > 0;
}
/// Adds global declare target to the parent target region.
///
/// The visible code asserts that the declaration referenced by \a E is a
/// declare target 'link' global and then walks the stack looking for a target
/// execution directive.
///
/// NOTE(review): the head of the assert, the body of the inner `if` and the
/// closing braces are not visible in this excerpt; what is done once the
/// target region is found cannot be confirmed from here.
void addToParentTargetRegionLinkGlobals(DeclRefExpr *E) {
E->getDecl()) == OMPDeclareTargetDeclAttr::MT_Link &&
"Expected declare target link global.");
for (auto &Elem : *this) {
if (isOpenMPTargetExecutionDirective(Elem.Directive)) {
/// Globals with declare target link recorded on the current region. The
/// current directive is asserted to be a target execution directive.
ArrayRef<DeclRefExpr *> getLinkGlobals() const {
  assert(isOpenMPTargetExecutionDirective(getCurrentDirective()) &&
         "Expected target executable directive.");
  const SharingMapTy &Current = getTopOfStack();
  return Current.DeclareTargetLinkVarDecls;
}
/// Adds an allocator expression to the list of used allocators.
///
/// NOTE(review): the body and closing brace are not visible in this excerpt;
/// presumably \a E is appended to InnerUsedAllocators of the current region
/// (the list returned by getInnerAllocators) -- confirm.
void addInnerAllocatorExpr(Expr *E) {
/// Allocator expressions recorded as used on the current region.
ArrayRef<Expr *> getInnerAllocators() const {
  const SharingMapTy &Current = getTopOfStack();
  return Current.InnerUsedAllocators;
}
/// Marks the declaration as implicitly firstprivate in the task-based
/// regions.
///
/// NOTE(review): the body and closing brace are not visible in this excerpt,
/// so how \a Level is used cannot be confirmed from here.
void addImplicitTaskFirstprivate(unsigned Level, Decl *D) {
/// True when \a D is in the set of implicit task firstprivates recorded on
/// the current region.
bool isImplicitTaskFirstprivate(Decl *D) const {
  const auto &Implicit = getTopOfStack().ImplicitTaskFirstprivates;
  return Implicit.count(D) != 0;
}
/// Marks decl as used in uses_allocators clause as the allocator.
void addUsesAllocatorsDecl(const Decl *D, UsesAllocatorsDeclKind Kind) {
getTopOfStack().UsesAllocatorsDecls.try_emplace(D, Kind);
/// Looks \a D up among the uses_allocators declarations of the region on top
/// of the stack and returns its kind, or None when it is not recorded.
///
/// NOTE(review): \a Level is not consulted by this overload; the lookup is
/// always made on the top of the stack. Confirm whether that is intended.
Optional<UsesAllocatorsDeclKind> isUsesAllocatorsDecl(unsigned Level,
                                                      const Decl *D) const {
  const auto &Allocators = getTopOfStack().UsesAllocatorsDecls;
  auto Found = Allocators.find(D);
  if (Found != Allocators.end())
    return Found->getSecond();
  return None;
}
/// Looks \a D up among the uses_allocators declarations of the current region
/// and returns its kind, or None when it is not recorded.
Optional<UsesAllocatorsDeclKind> isUsesAllocatorsDecl(const Decl *D) const {
  const auto &Allocators = getTopOfStack().UsesAllocatorsDecls;
  auto Found = Allocators.find(D);
  if (Found != Allocators.end())
    return Found->getSecond();
  return None;
}
/// True for parallel and teams directives.
bool isImplicitTaskingRegion(OpenMPDirectiveKind DKind) {
  if (isOpenMPParallelDirective(DKind))
    return true;
  return isOpenMPTeamsDirective(DKind);
}
/// True for implicit tasking regions, tasking directives, and OMPD_unknown.
bool isImplicitOrExplicitTaskingRegion(OpenMPDirectiveKind DKind) {
  if (isImplicitTaskingRegion(DKind) || isOpenMPTaskingDirective(DKind))
    return true;
  return DKind == OMPD_unknown;
}
} // namespace
/// Peels implicit wrappers off \a E: one FullExpr, one
/// MaterializeTemporaryExpr, any number of CXXBindTemporaryExpr, one
/// ImplicitCastExpr (down to its sub-expression as written), and finally any
/// enclosing parentheses.
static const Expr *getExprAsWritten(const Expr *E) {
  if (const auto *Full = dyn_cast<FullExpr>(E))
    E = Full->getSubExpr();

  if (const auto *Temporary = dyn_cast<MaterializeTemporaryExpr>(E))
    E = Temporary->getSubExpr();

  for (const auto *Bind = dyn_cast<CXXBindTemporaryExpr>(E); Bind;
       Bind = dyn_cast<CXXBindTemporaryExpr>(E))
    E = Bind->getSubExpr();

  if (const auto *Cast = dyn_cast<ImplicitCastExpr>(E))
    E = Cast->getSubExprAsWritten();

  return E->IgnoreParens();
}
/// Non-const convenience overload that forwards to the const version.
static Expr *getExprAsWritten(Expr *E) {
  const Expr *Written = getExprAsWritten(const_cast<const Expr *>(E));
  return const_cast<Expr *>(Written);
}
/// Returns the canonical declaration for \a D.
///
/// If \a D is an OMPCapturedExprDecl whose initializer (as written) is a
/// MemberExpr, the referenced member declaration is used instead of \a D.
/// The result is the canonical VarDecl when the declaration is a variable,
/// otherwise the canonical FieldDecl. Any other kind of declaration is a
/// caller error and is rejected by the assert rather than dereferencing a
/// null FieldDecl pointer.
static const ValueDecl *getCanonicalDecl(const ValueDecl *D) {
  if (const auto *CED = dyn_cast<OMPCapturedExprDecl>(D))
    if (const auto *ME = dyn_cast<MemberExpr>(getExprAsWritten(CED->getInit())))
      D = ME->getMemberDecl();

  if (const auto *VD = dyn_cast<VarDecl>(D))
    return VD->getCanonicalDecl();

  const auto *FD = dyn_cast<FieldDecl>(D);
  assert(FD && "Expected a variable or a field declaration");
  return FD->getCanonicalDecl();
}
/// Non-const convenience overload that forwards to the const version.
static ValueDecl *getCanonicalDecl(ValueDecl *D) {
  const ValueDecl *Canonical =
      getCanonicalDecl(const_cast<const ValueDecl *>(D));
  return const_cast<ValueDecl *>(Canonical);
}
/// Computes the data-sharing attributes of \a D as seen from the stack entry
/// \a Iter. \a Iter is taken by reference and may be advanced when the
/// attribute is inherited from an enclosing entry (see the final return).
///
/// Fix: the DSA_firstprivate case now checks VD for null before using it; D
/// may be a FieldDecl, in which case VD is null (every other use of VD in
/// this function is already guarded).
///
/// NOTE(review): this excerpt has lost its closing braces and apparently some
/// statements (e.g. no increment of I is visible inside the do/while loop).
/// The visible text is kept as-is apart from the fix above.
DSAStackTy::DSAVarData DSAStackTy::getDSA(const_iterator &Iter,
ValueDecl *D) const {
D = getCanonicalDecl(D);
auto *VD = dyn_cast<VarDecl>(D);
const auto *FD = dyn_cast<FieldDecl>(D);
DSAVarData DVar;
if (Iter == end()) {
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a region but not in construct]
// File-scope or namespace-scope variables referenced in called routines
// in the region are shared unless they appear in a threadprivate
// directive.
if (VD && !VD->isFunctionOrMethodVarDecl() && !isa<ParmVarDecl>(VD))
DVar.CKind = OMPC_shared;
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a region but not in construct]
// Variables with static storage duration that are declared in called
// routines in the region are shared.
if (VD && VD->hasGlobalStorage())
DVar.CKind = OMPC_shared;
// Non-static data members are shared by default.
if (FD)
DVar.CKind = OMPC_shared;
return DVar;
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, predetermined, p.1]
// Variables with automatic storage duration that are declared in a scope
// inside the construct are private.
if (VD && isOpenMPLocal(VD, Iter) && VD->isLocalVarDecl() &&
(VD->getStorageClass() == SC_Auto || VD->getStorageClass() == SC_None)) {
DVar.CKind = OMPC_private;
return DVar;
DVar.DKind = Iter->Directive;
// Explicitly specified attributes and local variables with predetermined
// attributes.
if (Iter->SharingMap.count(D)) {
const DSAInfo &Data = Iter->SharingMap.lookup(D);
DVar.RefExpr = Data.RefExpr.getPointer();
DVar.PrivateCopy = Data.PrivateCopy;
DVar.CKind = Data.Attributes;
DVar.ImplicitDSALoc = Iter->DefaultAttrLoc;
DVar.Modifier = Data.Modifier;
return DVar;
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, implicitly determined, p.1]
// In a parallel or task construct, the data-sharing attributes of these
// variables are determined by the default clause, if present.
switch (Iter->DefaultAttr) {
case DSA_shared:
DVar.CKind = OMPC_shared;
DVar.ImplicitDSALoc = Iter->DefaultAttrLoc;
return DVar;
case DSA_none:
return DVar;
case DSA_firstprivate:
// VD is null when D is a field; such declarations take the firstprivate
// branch below.
if (VD && VD->getStorageDuration() == SD_Static &&
VD->getDeclContext()->isFileContext()) {
DVar.CKind = OMPC_unknown;
} else {
DVar.CKind = OMPC_firstprivate;
DVar.ImplicitDSALoc = Iter->DefaultAttrLoc;
return DVar;
case DSA_unspecified:
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, implicitly determined, p.2]
// In a parallel construct, if no default clause is present, these
// variables are shared.
DVar.ImplicitDSALoc = Iter->DefaultAttrLoc;
if ((isOpenMPParallelDirective(DVar.DKind) &&
!isOpenMPTaskLoopDirective(DVar.DKind)) ||
isOpenMPTeamsDirective(DVar.DKind)) {
DVar.CKind = OMPC_shared;
return DVar;
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, implicitly determined, p.4]
// In a task construct, if no default clause is present, a variable that in
// the enclosing context is determined to be shared by all implicit tasks
// bound to the current team is shared.
if (isOpenMPTaskingDirective(DVar.DKind)) {
DSAVarData DVarTemp;
const_iterator I = Iter, E = end();
do {
// OpenMP [, Data-sharing Attribute Rules for Variables
// Referenced in a Construct, implicitly determined, p.6]
// In a task construct, if no default clause is present, a variable
// whose data-sharing attribute is not determined by the rules above is
// firstprivate.
DVarTemp = getDSA(I, D);
if (DVarTemp.CKind != OMPC_shared) {
DVar.RefExpr = nullptr;
DVar.CKind = OMPC_firstprivate;
return DVar;
} while (I != E && !isImplicitTaskingRegion(I->Directive));
DVar.CKind =
(DVarTemp.CKind == OMPC_unknown) ? OMPC_firstprivate : OMPC_shared;
return DVar;
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, implicitly determined, p.3]
// For constructs other than task, if no default clause is present, these
// variables inherit their data-sharing attributes from the enclosing
// context.
return getDSA(++Iter, D);
/// Registers \a NewDE as the aligned-clause expression for \a D on the
/// current region if \a D has no entry yet, and returns nullptr in that case.
/// If \a D already has an entry, that previously stored expression is
/// returned and nothing is changed.
const Expr *DSAStackTy::addUniqueAligned(const ValueDecl *D,
                                         const Expr *NewDE) {
  assert(!isStackEmpty() && "Data sharing attributes stack is empty");
  D = getCanonicalDecl(D);
  SharingMapTy &Current = getTopOfStack();
  auto Existing = Current.AlignedMap.find(D);
  if (Existing != Current.AlignedMap.end()) {
    assert(Existing->second && "Unexpected nullptr expr in the aligned map");
    return Existing->second;
  }
  assert(NewDE && "Unexpected nullptr expr to be added into aligned map");
  Current.AlignedMap[D] = NewDE;
  return nullptr;
}
/// Registers \a NewDE as the nontemporal-clause expression for \a D on the
/// current region if \a D has no entry yet, and returns nullptr in that case.
/// If \a D already has an entry, that previously stored expression is
/// returned and nothing is changed.
///
/// The assertion messages name the nontemporal map; they previously said
/// "aligned map", carried over from addUniqueAligned.
const Expr *DSAStackTy::addUniqueNontemporal(const ValueDecl *D,
                                             const Expr *NewDE) {
  assert(!isStackEmpty() && "Data sharing attributes stack is empty");
  D = getCanonicalDecl(D);
  SharingMapTy &StackElem = getTopOfStack();
  auto It = StackElem.NontemporalMap.find(D);
  if (It == StackElem.NontemporalMap.end()) {
    assert(NewDE &&
           "Unexpected nullptr expr to be added into nontemporal map");
    StackElem.NontemporalMap[D] = NewDE;
    return nullptr;
  }
  assert(It->second && "Unexpected nullptr expr in the nontemporal map");
  return It->second;
}
/// Registers \a D (canonicalized) as a loop control variable of the current
/// region. The visible arguments pair \a D with an LCDeclInfo built from the
/// current LCVMap size plus one and \a Capture.
///
/// NOTE(review): the head of the call that receives these arguments and the
/// closing brace are not visible in this excerpt -- presumably an insertion
/// into StackElem.LCVMap; confirm.
void DSAStackTy::addLoopControlVariable(const ValueDecl *D, VarDecl *Capture) {
assert(!isStackEmpty() && "Data-sharing attributes stack is empty");
D = getCanonicalDecl(D);
SharingMapTy &StackElem = getTopOfStack();
D, LCDeclInfo(StackElem.LCVMap.size() + 1, Capture));
/// Returns the loop-control info recorded for \a D on the current region, or
/// {0, nullptr} when \a D is not a loop control variable there.
const DSAStackTy::LCDeclInfo
DSAStackTy::isLoopControlVariable(const ValueDecl *D) const {
  assert(!isStackEmpty() && "Data-sharing attributes stack is empty");
  D = getCanonicalDecl(D);
  const auto &LoopVars = getTopOfStack().LCVMap;
  auto Found = LoopVars.find(D);
  if (Found == LoopVars.end())
    return {0, nullptr};
  return Found->second;
}
/// Searches the stack entries from \a Level down to level 0 for loop-control
/// info recorded for \a D and returns the first match, or {0, nullptr} when
/// none of those entries lists \a D.
const DSAStackTy::LCDeclInfo
DSAStackTy::isLoopControlVariable(const ValueDecl *D, unsigned Level) const {
  assert(!isStackEmpty() && "Data-sharing attributes stack is empty");
  D = getCanonicalDecl(D);
  // Remaining counts the levels still to visit; after the post-decrement it
  // is the index of the level being inspected.
  unsigned Remaining = Level + 1;
  while (Remaining-- > 0) {
    const auto &LoopVars = getStackElemAtLevel(Remaining).LCVMap;
    auto Found = LoopVars.find(D);
    if (Found != LoopVars.end())
      return Found->second;
  }
  return {0, nullptr};
}
/// Returns the loop-control info recorded for \a D on the enclosing
/// (second-on-stack) region, or {0, nullptr} when \a D is not listed there.
/// The enclosing region is asserted to exist.
const DSAStackTy::LCDeclInfo
DSAStackTy::isParentLoopControlVariable(const ValueDecl *D) const {
  const SharingMapTy *Parent = getSecondOnStackOrNull();
  assert(Parent && "Data-sharing attributes stack is empty");
  D = getCanonicalDecl(D);
  const auto &LoopVars = Parent->LCVMap;
  auto Found = LoopVars.find(D);
  if (Found == LoopVars.end())
    return {0, nullptr};
  return Found->second;
}
/// Returns the declaration whose recorded loop-control index on the enclosing
/// (second-on-stack) region equals \a I, or nullptr when \a I exceeds the
/// number of recorded variables or no entry carries that index.
const ValueDecl *DSAStackTy::getParentLoopControlVariable(unsigned I) const {
  const SharingMapTy *Parent = getSecondOnStackOrNull();
  assert(Parent && "Data-sharing attributes stack is empty");
  const auto &LoopVars = Parent->LCVMap;
  if (I > LoopVars.size())
    return nullptr;
  for (const auto &Entry : LoopVars) {
    if (Entry.second.first != I)
      continue;
    return Entry.first;
  }
  return nullptr;
}
/// Records data-sharing attribute \a A for \a D (canonicalized).
///
/// Threadprivate attributes go into the Threadprivates map; every other
/// attribute goes into the SharingMap of the current region. When
/// \a PrivateCopy is given, an entry with the same attribute and modifier is
/// also recorded for the declaration it references.
///
/// NOTE(review): closing braces and apparently some statements are missing
/// from this excerpt (e.g. the body of the lastprivate-after-firstprivate
/// `if` is not visible, and no use of \a E is visible in the threadprivate
/// branch). Confirm against the complete file.
void DSAStackTy::addDSA(const ValueDecl *D, const Expr *E, OpenMPClauseKind A,
DeclRefExpr *PrivateCopy, unsigned Modifier) {
D = getCanonicalDecl(D);
if (A == OMPC_threadprivate) {
DSAInfo &Data = Threadprivates[D];
Data.Attributes = A;
Data.PrivateCopy = nullptr;
Data.Modifier = Modifier;
} else {
DSAInfo &Data = getTopOfStack().SharingMap[D];
// An attribute may only be replaced by itself, combined with its
// firstprivate/lastprivate counterpart, or set to private for a loop
// control variable.
assert(Data.Attributes == OMPC_unknown || (A == Data.Attributes) ||
(A == OMPC_firstprivate && Data.Attributes == OMPC_lastprivate) ||
(A == OMPC_lastprivate && Data.Attributes == OMPC_firstprivate) ||
(isLoopControlVariable(D).first && A == OMPC_private));
Data.Modifier = Modifier;
if (A == OMPC_lastprivate && Data.Attributes == OMPC_firstprivate) {
// The int part of RefExpr carries the "is lastprivate" flag.
const bool IsLastprivate =
A == OMPC_lastprivate || Data.Attributes == OMPC_lastprivate;
Data.Attributes = A;
Data.RefExpr.setPointerAndInt(E, IsLastprivate);
Data.PrivateCopy = PrivateCopy;
if (PrivateCopy) {
DSAInfo &Data = getTopOfStack().SharingMap[PrivateCopy->getDecl()];
Data.Modifier = Modifier;
Data.Attributes = A;
Data.RefExpr.setPointerAndInt(PrivateCopy, IsLastprivate);
Data.PrivateCopy = nullptr;
/// Build a variable declaration for OpenMP loop iteration variable.
///
/// Creates a VarDecl named \a Name of type \a Type at \a Loc in the current
/// Sema context with storage class SC_None and returns it.
///
/// NOTE(review): the statements that consume \a Attrs (iterating AlignedAttr
/// entries) and \a OrigRef (an implicit OMPReferencedVarAttr) are only
/// partially visible in this excerpt, and the closing braces are missing.
static VarDecl *buildVarDecl(Sema &SemaRef, SourceLocation Loc, QualType Type,
StringRef Name, const AttrVec *Attrs = nullptr,
DeclRefExpr *OrigRef = nullptr) {
DeclContext *DC = SemaRef.CurContext;
IdentifierInfo *II = &SemaRef.PP.getIdentifierTable().get(Name);
TypeSourceInfo *TInfo = SemaRef.Context.getTrivialTypeSourceInfo(Type, Loc);
auto *Decl =
VarDecl::Create(SemaRef.Context, DC, Loc, Loc, II, Type, TInfo, SC_None);
if (Attrs) {
for (specific_attr_iterator<AlignedAttr> I(Attrs->begin()), E(Attrs->end());
I != E; ++I)
if (OrigRef) {
OMPReferencedVarAttr::CreateImplicit(SemaRef.Context, OrigRef));
return Decl;
/// Creates a DeclRefExpr referring to \a D with type \a Ty at \a Loc, with an
/// empty nested-name-specifier and an invalid template keyword location.
///
/// NOTE(review): the trailing arguments of DeclRefExpr::Create and the
/// closing brace are not visible in this excerpt.
static DeclRefExpr *buildDeclRefExpr(Sema &S, VarDecl *D, QualType Ty,
SourceLocation Loc,
bool RefersToCapture = false) {
return DeclRefExpr::Create(S.getASTContext(), NestedNameSpecifierLoc(),
SourceLocation(), D, RefersToCapture, Loc, Ty,
/// Records additional reduction data (operator kind \a BOK and range \a SR)
/// for the reduction item \a D on the current region, and lazily creates the
/// ".task_red." void-pointer variable reference used as the taskgroup
/// reduction reference.
///
/// NOTE(review): the heads of two asserts, the tail of one assert message,
/// the initializer of TaskgroupReductionRef and the closing braces are not
/// visible in this excerpt.
void DSAStackTy::addTaskgroupReductionData(const ValueDecl *D, SourceRange SR,
BinaryOperatorKind BOK) {
D = getCanonicalDecl(D);
assert(!isStackEmpty() && "Data-sharing attributes stack is empty");
getTopOfStack().SharingMap[D].Attributes == OMPC_reduction &&
"Additional reduction info may be specified only for reduction items.");
ReductionData &ReductionData = getTopOfStack().ReductionMap[D];
// Reduction data may be set once, and only on taskgroup or on non-simd
// parallel/worksharing directives.
assert(ReductionData.ReductionRange.isInvalid() &&
(getTopOfStack().Directive == OMPD_taskgroup ||
((isOpenMPParallelDirective(getTopOfStack().Directive) ||
isOpenMPWorksharingDirective(getTopOfStack().Directive)) &&
!isOpenMPSimdDirective(getTopOfStack().Directive))) &&
"Additional reduction info may be specified only once for reduction "
ReductionData.set(BOK, SR);
Expr *&TaskgroupReductionRef =
// Create the descriptor variable only on first use.
if (!TaskgroupReductionRef) {
VarDecl *VD = buildVarDecl(SemaRef, SR.getBegin(),
SemaRef.Context.VoidPtrTy, ".task_red.");
TaskgroupReductionRef =
buildDeclRefExpr(SemaRef, VD, SemaRef.Context.VoidPtrTy, SR.getBegin());
/// Records additional reduction data (reduction expression \a ReductionRef
/// and range \a SR) for the reduction item \a D on the current region, and
/// lazily creates the ".task_red." void-pointer variable reference used as
/// the taskgroup reduction reference.
///
/// NOTE(review): the heads of two asserts, the tail of one assert message,
/// the initializer of TaskgroupReductionRef and the closing braces are not
/// visible in this excerpt.
void DSAStackTy::addTaskgroupReductionData(const ValueDecl *D, SourceRange SR,
const Expr *ReductionRef) {
D = getCanonicalDecl(D);
assert(!isStackEmpty() && "Data-sharing attributes stack is empty");
getTopOfStack().SharingMap[D].Attributes == OMPC_reduction &&
"Additional reduction info may be specified only for reduction items.");
ReductionData &ReductionData = getTopOfStack().ReductionMap[D];
// Reduction data may be set once, and only on taskgroup or on non-simd
// parallel/worksharing directives.
assert(ReductionData.ReductionRange.isInvalid() &&
(getTopOfStack().Directive == OMPD_taskgroup ||
((isOpenMPParallelDirective(getTopOfStack().Directive) ||
isOpenMPWorksharingDirective(getTopOfStack().Directive)) &&
!isOpenMPSimdDirective(getTopOfStack().Directive))) &&
"Additional reduction info may be specified only once for reduction "
ReductionData.set(ReductionRef, SR);
Expr *&TaskgroupReductionRef =
// Create the descriptor variable only on first use.
if (!TaskgroupReductionRef) {
VarDecl *VD = buildVarDecl(SemaRef, SR.getBegin(),
SemaRef.Context.VoidPtrTy, ".task_red.");
TaskgroupReductionRef =
buildDeclRefExpr(SemaRef, VD, SemaRef.Context.VoidPtrTy, SR.getBegin());
/// Walks the enclosing regions (starting one past the top of the stack)
/// looking for an entry where \a D is a reduction item with the task
/// modifier. On a match it fills \a SR, \a BOK and \a TaskgroupDescriptor
/// from that entry and returns its data-sharing description; otherwise a
/// default DSAVarData is returned.
///
/// NOTE(review): this excerpt is damaged -- the statement following the
/// attribute/modifier test, part of the ReductionOp condition, the tail of an
/// assert message and the closing braces are not visible.
const DSAStackTy::DSAVarData DSAStackTy::getTopMostTaskgroupReductionData(
const ValueDecl *D, SourceRange &SR, BinaryOperatorKind &BOK,
Expr *&TaskgroupDescriptor) const {
D = getCanonicalDecl(D);
assert(!isStackEmpty() && "Data-sharing attributes stack is empty.");
for (const_iterator I = begin() + 1, E = end(); I != E; ++I) {
const DSAInfo &Data = I->SharingMap.lookup(D);
if (Data.Attributes != OMPC_reduction ||
Data.Modifier != OMPC_REDUCTION_task)
const ReductionData &ReductionData = I->ReductionMap.lookup(D);
if (!ReductionData.ReductionOp ||<const Expr *>())
return DSAVarData();
SR = ReductionData.ReductionRange;
BOK = ReductionData.ReductionOp.get<ReductionData::BOKPtrType>();
assert(I->TaskgroupReductionRef && "taskgroup reduction reference "
"expression for the descriptor is not "
TaskgroupDescriptor = I->TaskgroupReductionRef;
return DSAVarData(I->Directive, OMPC_reduction, Data.RefExpr.getPointer(),
Data.PrivateCopy, I->DefaultAttrLoc, OMPC_REDUCTION_task);
return DSAVarData();
/// Walks the enclosing regions (starting one past the top of the stack)
/// looking for an entry where \a D is a reduction item with the task
/// modifier. On a match it fills \a SR, \a ReductionRef and
/// \a TaskgroupDescriptor from that entry and returns its data-sharing
/// description; otherwise a default DSAVarData is returned.
///
/// NOTE(review): this excerpt is damaged -- the statement following the
/// attribute/modifier test, part of the ReductionOp condition, the tail of an
/// assert message and the closing braces are not visible.
const DSAStackTy::DSAVarData DSAStackTy::getTopMostTaskgroupReductionData(
const ValueDecl *D, SourceRange &SR, const Expr *&ReductionRef,
Expr *&TaskgroupDescriptor) const {
D = getCanonicalDecl(D);
assert(!isStackEmpty() && "Data-sharing attributes stack is empty.");
for (const_iterator I = begin() + 1, E = end(); I != E; ++I) {
const DSAInfo &Data = I->SharingMap.lookup(D);
if (Data.Attributes != OMPC_reduction ||
Data.Modifier != OMPC_REDUCTION_task)
const ReductionData &ReductionData = I->ReductionMap.lookup(D);
if (!ReductionData.ReductionOp ||
!<const Expr *>())
return DSAVarData();
SR = ReductionData.ReductionRange;
ReductionRef = ReductionData.ReductionOp.get<const Expr *>();
assert(I->TaskgroupReductionRef && "taskgroup reduction reference "
"expression for the descriptor is not "
TaskgroupDescriptor = I->TaskgroupReductionRef;
return DSAVarData(I->Directive, OMPC_reduction, Data.RefExpr.getPointer(),
Data.PrivateCopy, I->DefaultAttrLoc, OMPC_REDUCTION_task);
return DSAVarData();
/// Starting at stack entry \a I, finds the first tasking or target execution
/// region and reports whether \a D is declared in a scope between the current
/// scope and the parent scope of that region. Returns false when no such
/// region is found.
bool DSAStackTy::isOpenMPLocal(VarDecl *D, const_iterator I) const {
  D = D->getCanonicalDecl();
  for (const_iterator StackEnd = end(); I != StackEnd; ++I) {
    if (!isImplicitOrExplicitTaskingRegion(I->Directive) &&
        !isOpenMPTargetExecutionDirective(I->Directive))
      continue;

    // Walk outwards from the current scope until we either hit the scope
    // that declares D or reach the boundary just outside the region.
    Scope *Boundary = nullptr;
    if (I->CurScope)
      Boundary = I->CurScope->getParent();
    Scope *Walker = getCurScope();
    while (Walker && Walker != Boundary && !Walker->isDeclScope(D))
      Walker = Walker->getParent();
    return Walker != Boundary;
  }
  return false;
}
/// Returns true when \a Type (with references stripped, canonicalized) is
/// constant and, in C++, its base element class has no mutable fields.
///
/// \param AcceptIfMutable when false, the class record is never inspected.
/// \param IsClassType if non-null, receives whether a class record was found
///        for the base element type.
static bool isConstNotMutableType(Sema &SemaRef, QualType Type,
                                  bool AcceptIfMutable = true,
                                  bool *IsClassType = nullptr) {
  ASTContext &Ctx = SemaRef.getASTContext();
  const bool IsCXX = SemaRef.getLangOpts().CPlusPlus;

  Type = Type.getNonReferenceType().getCanonicalType();
  const bool IsConstant = Type.isConstant(Ctx);
  Type = Ctx.getBaseElementType(Type);

  const CXXRecordDecl *RD = nullptr;
  if (AcceptIfMutable && IsCXX)
    RD = Type->getAsCXXRecordDecl();
  // For a specialization, inspect the templated class of its template.
  if (const auto *Spec = dyn_cast_or_null<ClassTemplateSpecializationDecl>(RD))
    if (const ClassTemplateDecl *Tmpl = Spec->getSpecializedTemplate())
      RD = Tmpl->getTemplatedDecl();

  if (IsClassType)
    *IsClassType = RD != nullptr;

  if (!IsConstant)
    return false;
  const bool HasMutableFields =
      IsCXX && RD && RD->hasDefinition() && RD->hasMutableFields();
  return !HasMutableFields;
}
/// Emits a diagnostic at \a ELoc and returns true when \a Type is a constant
/// type without mutable fields (per isConstNotMutableType); returns false
/// otherwise. The diagnostic is selected from \a ListItemNotVar and from
/// whether the type is a class type, and is given the name of clause
/// \a CKind.
///
/// NOTE(review): the tail of the IsDecl initializer, the head of the note
/// diagnostic and the closing braces are not visible in this excerpt.
static bool rejectConstNotMutableType(Sema &SemaRef, const ValueDecl *D,
QualType Type, OpenMPClauseKind CKind,
SourceLocation ELoc,
bool AcceptIfMutable = true,
bool ListItemNotVar = false) {
ASTContext &Context = SemaRef.getASTContext();
// Written by isConstNotMutableType through the out-parameter below.
bool IsClassType;
if (isConstNotMutableType(SemaRef, Type, AcceptIfMutable, &IsClassType)) {
unsigned Diag = ListItemNotVar
? diag::err_omp_const_list_item
: IsClassType ? diag::err_omp_const_not_mutable_variable
: diag::err_omp_const_variable;
SemaRef.Diag(ELoc, Diag) << getOpenMPClauseName(CKind);
if (!ListItemNotVar && D) {
const VarDecl *VD = dyn_cast<VarDecl>(D);
bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) ==
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
return true;
return false;
/// Determines the data-sharing attributes of \a D for the region on top of
/// the stack (or for its parent, judging by the \a FromParent checks below).
///
/// The visible steps, in order: previously recorded threadprivates;
/// variables carrying OMPThreadPrivateDeclAttr; TLS / register-asm globals;
/// the OpenMPCUDAMode handling of local variables around target regions;
/// static data members; the OpenMP <= 3.1 rule for const types without
/// mutable members; and finally explicit entries in the SharingMap.
///
/// NOTE(review): this excerpt is heavily truncated by extraction -- closing
/// braces, several call arguments, and the statements guarded by
/// `if (FromParent && I != EndI)` are not visible. Confirm every branch
/// against the complete file before changing behavior.
const DSAStackTy::DSAVarData DSAStackTy::getTopDSA(ValueDecl *D,
bool FromParent) {
D = getCanonicalDecl(D);
DSAVarData DVar;
auto *VD = dyn_cast<VarDecl>(D);
// Declarations already recorded as threadprivate are answered from the map.
auto TI = Threadprivates.find(D);
if (TI != Threadprivates.end()) {
DVar.RefExpr = TI->getSecond().RefExpr.getPointer();
DVar.CKind = OMPC_threadprivate;
DVar.Modifier = TI->getSecond().Modifier;
return DVar;
// Variables marked with the threadprivate attribute are recorded on first
// query via addDSA.
if (VD && VD->hasAttr<OMPThreadPrivateDeclAttr>()) {
DVar.RefExpr = buildDeclRefExpr(
SemaRef, VD, D->getType().getNonReferenceType(),
DVar.CKind = OMPC_threadprivate;
addDSA(D, DVar.RefExpr, OMPC_threadprivate);
return DVar;
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, predetermined, p.1]
// Variables appearing in threadprivate directives are threadprivate.
if ((VD && VD->getTLSKind() != VarDecl::TLS_None &&
!(VD->hasAttr<OMPThreadPrivateDeclAttr>() &&
SemaRef.getLangOpts().OpenMPUseTLS &&
SemaRef.getASTContext().getTargetInfo().isTLSSupported())) ||
(VD && VD->getStorageClass() == SC_Register &&
VD->hasAttr<AsmLabelAttr>() && !VD->isLocalVarDecl())) {
DVar.RefExpr = buildDeclRefExpr(
SemaRef, VD, D->getType().getNonReferenceType(), D->getLocation());
DVar.CKind = OMPC_threadprivate;
addDSA(D, DVar.RefExpr, OMPC_threadprivate);
return DVar;
// CUDA-mode handling for locals/parameters that are not loop control
// variables: look for the nearest target execution directive on the stack.
if (SemaRef.getLangOpts().OpenMPCUDAMode && VD &&
VD->isLocalVarDeclOrParm() && !isStackEmpty() &&
!isLoopControlVariable(D).first) {
const_iterator IterTarget =
std::find_if(begin(), end(), [](const SharingMapTy &Data) {
return isOpenMPTargetExecutionDirective(Data.Directive);
if (IterTarget != end()) {
const_iterator ParentIterTarget = IterTarget + 1;
for (const_iterator Iter = begin();
Iter != ParentIterTarget; ++Iter) {
if (isOpenMPLocal(VD, Iter)) {
DVar.RefExpr =
buildDeclRefExpr(SemaRef, VD, D->getType().getNonReferenceType(),
DVar.CKind = OMPC_threadprivate;
return DVar;
if (!isClauseParsingMode() || IterTarget != begin()) {
auto DSAIter = IterTarget->SharingMap.find(D);
if (DSAIter != IterTarget->SharingMap.end() &&
isOpenMPPrivate(DSAIter->getSecond().Attributes)) {
DVar.RefExpr = DSAIter->getSecond().RefExpr.getPointer();
DVar.CKind = OMPC_threadprivate;
return DVar;
const_iterator End = end();
if (!SemaRef.isOpenMPCapturedByRef(
D, std::distance(ParentIterTarget, End),
/*OpenMPCaptureLevel=*/0)) {
DVar.RefExpr =
buildDeclRefExpr(SemaRef, VD, D->getType().getNonReferenceType(),
DVar.CKind = OMPC_threadprivate;
return DVar;
if (isStackEmpty())
// Not in OpenMP execution region and top scope was already checked.
return DVar;
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, predetermined, p.4]
// Static data members are shared.
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, predetermined, p.7]
// Variables with static storage duration that are declared in a scope
// inside the construct are shared.
if (VD && VD->isStaticDataMember()) {
// Check for explicitly specified attributes.
const_iterator I = begin();
const_iterator EndI = end();
if (FromParent && I != EndI)
if (I != EndI) {
auto It = I->SharingMap.find(D);
if (It != I->SharingMap.end()) {
const DSAInfo &Data = It->getSecond();
DVar.RefExpr = Data.RefExpr.getPointer();
DVar.PrivateCopy = Data.PrivateCopy;
DVar.CKind = Data.Attributes;
DVar.ImplicitDSALoc = I->DefaultAttrLoc;
DVar.DKind = I->Directive;
DVar.Modifier = Data.Modifier;
return DVar;
DVar.CKind = OMPC_shared;
return DVar;
// Directive predicate that accepts every directive kind.
auto &&MatchesAlways = [](OpenMPDirectiveKind) { return true; };
// The predetermined shared attribute for const-qualified types having no
// mutable members was removed after OpenMP 3.1.
if (SemaRef.LangOpts.OpenMP <= 31) {
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, predetermined, p.6]
// Variables with const qualified type having no mutable member are
// shared.
if (isConstNotMutableType(SemaRef, D->getType())) {
// Variables with const-qualified type having no mutable member may be
// listed in a firstprivate clause, even if they are static data members.
DSAVarData DVarTemp = hasInnermostDSA(
[](OpenMPClauseKind C) {
return C == OMPC_firstprivate || C == OMPC_shared;
MatchesAlways, FromParent);
if (DVarTemp.CKind != OMPC_unknown && DVarTemp.RefExpr)
return DVarTemp;
DVar.CKind = OMPC_shared;
return DVar;
// Explicitly specified attributes and local variables with predetermined
// attributes.
const_iterator I = begin();
const_iterator EndI = end();
if (FromParent && I != EndI)
if (I == EndI)
return DVar;
auto It = I->SharingMap.find(D);
if (It != I->SharingMap.end()) {
const DSAInfo &Data = It->getSecond();
DVar.RefExpr = Data.RefExpr.getPointer();
DVar.PrivateCopy = Data.PrivateCopy;
DVar.CKind = Data.Attributes;
DVar.ImplicitDSALoc = I->DefaultAttrLoc;
DVar.DKind = I->Directive;
DVar.Modifier = Data.Modifier;
return DVar;
/// Returns the data-sharing attributes of \a D as computed by getDSA,
/// starting from the top of the stack. With an empty stack getDSA is invoked
/// with a default-constructed iterator.
///
/// NOTE(review): the statement guarded by `if (FromParent && StartI != EndI)`
/// and the closing braces are not visible in this excerpt -- presumably the
/// start iterator is advanced to the parent entry; confirm.
const DSAStackTy::DSAVarData DSAStackTy::getImplicitDSA(ValueDecl *D,
bool FromParent) const {
if (isStackEmpty()) {
const_iterator I;
return getDSA(I, D);
D = getCanonicalDecl(D);
const_iterator StartI = begin();
const_iterator EndI = end();
if (FromParent && StartI != EndI)
return getDSA(StartI, D);
/// Returns the data-sharing attributes of \a D as computed by getDSA for the
/// stack entry at nesting \a Level, or a default DSAVarData when \a Level is
/// not below the stack size.
const DSAStackTy::DSAVarData DSAStackTy::getImplicitDSA(ValueDecl *D,
                                                        unsigned Level) const {
  const size_t Depth = getStackSize();
  if (Level >= Depth)
    return DSAVarData();
  D = getCanonicalDecl(D);
  // begin() is the innermost entry, so level L sits Depth - 1 - L steps in.
  const_iterator StartI = std::next(begin(), Depth - 1 - Level);
  return getDSA(StartI, D);
}
/// Scans the stack for an entry at which getDSA resolves \a D directly on
/// that entry (the iterator is not advanced) with a clause kind accepted by
/// \a CPred, and returns that result. Returns a default DSAVarData when the
/// stack is empty or nothing matches.
///
/// NOTE(review): the statement guarded by `if (FromParent && I != EndI)`,
/// the remainder of the \a DPred condition with its guarded statement, and
/// the closing braces are not visible in this excerpt.
const DSAStackTy::DSAVarData
DSAStackTy::hasDSA(ValueDecl *D,
const llvm::function_ref<bool(OpenMPClauseKind)> CPred,
const llvm::function_ref<bool(OpenMPDirectiveKind)> DPred,
bool FromParent) const {
if (isStackEmpty())
return {};
D = getCanonicalDecl(D);
const_iterator I = begin();
const_iterator EndI = end();
if (FromParent && I != EndI)
for (; I != EndI; ++I) {
if (!DPred(I->Directive) &&
// getDSA may advance the iterator it is given, so work on a copy and
// compare afterwards.
const_iterator NewI = I;
DSAVarData DVar = getDSA(NewI, D);
if (I == NewI && CPred(DVar.CKind))
return DVar;
return {};
/// Checks only the starting stack entry: if its directive satisfies \a DPred
/// and getDSA resolves \a D directly on that entry with a clause kind
/// accepted by \a CPred, that result is returned; otherwise a default
/// DSAVarData is returned.
///
/// NOTE(review): the statement guarded by
/// `if (FromParent && StartI != EndI)` and the closing braces are not visible
/// in this excerpt -- presumably the start is moved to the parent entry;
/// confirm.
const DSAStackTy::DSAVarData DSAStackTy::hasInnermostDSA(
ValueDecl *D, const llvm::function_ref<bool(OpenMPClauseKind)> CPred,
const llvm::function_ref<bool(OpenMPDirectiveKind)> DPred,
bool FromParent) const {
if (isStackEmpty())
return {};
D = getCanonicalDecl(D);
const_iterator StartI = begin();
const_iterator EndI = end();
if (FromParent && StartI != EndI)
if (StartI == EndI || !DPred(StartI->Directive))
return {};
// getDSA may advance the iterator it is given, so work on a copy and
// compare afterwards.
const_iterator NewI = StartI;
DSAVarData DVar = getDSA(NewI, D);
return (NewI == StartI && CPred(DVar.CKind)) ? DVar : DSAVarData();
/// Checks whether \a D has, on the stack entry at \a Level, an explicitly
/// recorded attribute (with a reference expression) accepted by \a CPred.
/// With \a NotLastprivate set, entries flagged as lastprivate do not count.
/// Loop control variables of that entry are tested as OMPC_private.
/// Returns false when \a Level is not below the stack size.
bool DSAStackTy::hasExplicitDSA(
    const ValueDecl *D, const llvm::function_ref<bool(OpenMPClauseKind)> CPred,
    unsigned Level, bool NotLastprivate) const {
  if (Level >= getStackSize())
    return false;
  D = getCanonicalDecl(D);
  const SharingMapTy &Region = getStackElemAtLevel(Level);

  auto Explicit = Region.SharingMap.find(D);
  if (Explicit != Region.SharingMap.end()) {
    const auto &Info = Explicit->getSecond();
    if (Info.RefExpr.getPointer() && CPred(Info.Attributes) &&
        (!NotLastprivate || !Info.RefExpr.getInt()))
      return true;
  }

  // Check predetermined rules for the loop control variables.
  if (Region.LCVMap.find(D) != Region.LCVMap.end())
    return CPred(OMPC_private);
  return false;
}
/// Applies \a DPred to the directive of the stack entry at \a Level; returns
/// false when \a Level is not below the stack size.
bool DSAStackTy::hasExplicitDirective(
    const llvm::function_ref<bool(OpenMPDirectiveKind)> DPred,
    unsigned Level) const {
  return Level < getStackSize() && DPred(getStackElemAtLevel(Level).Directive);
}
/// Returns true if the predicate accepts any of the visited stack entries,
/// being called with each entry's directive kind, directive name and
/// construct location. The walk skips the first entry, or the first two when
/// \a FromParent is set (clamped to the stack size).
///
/// NOTE(review): the line naming the predicate parameter (used as DPred in
/// the body) and the closing braces are not visible in this excerpt.
bool DSAStackTy::hasDirective(
const llvm::function_ref<bool(OpenMPDirectiveKind,
const DeclarationNameInfo &, SourceLocation)>
bool FromParent) const {
// We look only in the enclosing region.
size_t Skip = FromParent ? 2 : 1;
for (const_iterator I = begin() + std::min(Skip, getStackSize()), E = end();
I != E; ++I) {
if (DPred(I->Directive, I->DirectiveName, I->ConstructLoc))
return true;
return false;
/// Allocates the data-sharing attributes stack for this Sema instance and
/// stores it in VarDataSharingAttributesStack. The object is created with
/// `new`; its release is not visible in this excerpt.
void Sema::InitDataSharingAttributesStack() {
VarDataSharingAttributesStack = new DSAStackTy(*this);
// Shorthand for the stack created in InitDataSharingAttributesStack, cast
// back to its concrete DSAStackTy type.
#define DSAStack static_cast<DSAStackTy *>(VarDataSharingAttributesStack)
// NOTE(review): the body and closing brace of this function are not visible
// in this excerpt (likely lost in extraction); behavior cannot be described
// from here.
void Sema::pushOpenMPFunctionRegion() {
// NOTE(review): the body and closing brace of this function are not visible
// in this excerpt (likely lost in extraction); how \a OldFSI is used cannot
// be described from here.
void Sema::popOpenMPFunctionRegion(const FunctionScopeInfo *OldFSI) {
/// Decides whether diagnostics in the current OpenMP device context should be
/// delayed. Must only be called during OpenMP device compilation (asserted).
///
/// NOTE(review): the return expression is cut off after its first operand
/// (not inside a target execution directive) and the closing brace is
/// missing, so the full condition cannot be documented from here.
static bool isOpenMPDeviceDelayedContext(Sema &S) {
assert(S.LangOpts.OpenMP && S.LangOpts.OpenMPIsDevice &&
"Expected OpenMP device compilation.");
return !S.isInOpenMPTargetExecutionDirective() &&
namespace {
/// Status of the function emission on the host/device.
// NOTE(review): the enumerator list and the closing `};` are not visible in
// this excerpt. The Sema member functions below use enumerators such as
// Emitted, Unknown, TemplateDiscarded, OMPDiscarded and CUDADiscarded;
// whether those come from this enum or from a Sema-scoped one cannot be told
// from here.
enum class FunctionEmissionStatus {
} // anonymous namespace
/// Builds a device diagnostic for \a DiagID at \a Loc whose kind (immediate,
/// deferred or no-op) is chosen from the emission status of the current
/// function. Without a current function the kind stays K_Nop. Must only be
/// called during OpenMP device compilation (asserted).
///
/// NOTE(review): the `break` statements between the switch cases and the
/// closing braces are not visible in this excerpt; as printed every case
/// would fall through. Confirm against the complete file.
Sema::DeviceDiagBuilder Sema::diagIfOpenMPDeviceCode(SourceLocation Loc,
unsigned DiagID) {
assert(LangOpts.OpenMP && LangOpts.OpenMPIsDevice &&
"Expected OpenMP device compilation.");
FunctionDecl *FD = getCurFunctionDecl();
DeviceDiagBuilder::Kind Kind = DeviceDiagBuilder::K_Nop;
if (FD) {
FunctionEmissionStatus FES = getEmissionStatus(FD);
switch (FES) {
case FunctionEmissionStatus::Emitted:
Kind = DeviceDiagBuilder::K_Immediate;
case FunctionEmissionStatus::Unknown:
// Unknown status: defer only when the context allows delayed diagnostics.
Kind = isOpenMPDeviceDelayedContext(*this)
? DeviceDiagBuilder::K_Deferred
: DeviceDiagBuilder::K_Immediate;
case FunctionEmissionStatus::TemplateDiscarded:
case FunctionEmissionStatus::OMPDiscarded:
Kind = DeviceDiagBuilder::K_Nop;
case FunctionEmissionStatus::CUDADiscarded:
llvm_unreachable("CUDADiscarded unexpected in OpenMP device compilation");
return DeviceDiagBuilder(Kind, Loc, DiagID, getCurFunctionDecl(), *this);
/// Builds a host diagnostic for \a DiagID at \a Loc whose kind (immediate,
/// deferred or no-op) is chosen from the emission status of the current
/// function. Must only be called during OpenMP host compilation (asserted).
///
/// Fix: the emission status is now queried only when there is a current
/// function, matching diagIfOpenMPDeviceCode; without one the kind stays
/// K_Nop instead of passing a null declaration to getEmissionStatus.
///
/// NOTE(review): the `break` statements between the switch cases and the
/// closing braces are not visible in this excerpt; the visible text is kept
/// as-is apart from the null guard.
Sema::DeviceDiagBuilder Sema::diagIfOpenMPHostCode(SourceLocation Loc,
unsigned DiagID) {
assert(LangOpts.OpenMP && !LangOpts.OpenMPIsDevice &&
"Expected OpenMP host compilation.");
FunctionDecl *FD = getCurFunctionDecl();
DeviceDiagBuilder::Kind Kind = DeviceDiagBuilder::K_Nop;
if (FD) {
FunctionEmissionStatus FES = getEmissionStatus(FD);
switch (FES) {
case FunctionEmissionStatus::Emitted:
Kind = DeviceDiagBuilder::K_Immediate;
case FunctionEmissionStatus::Unknown:
Kind = DeviceDiagBuilder::K_Deferred;
case FunctionEmissionStatus::TemplateDiscarded:
case FunctionEmissionStatus::OMPDiscarded:
case FunctionEmissionStatus::CUDADiscarded:
Kind = DeviceDiagBuilder::K_Nop;
return DeviceDiagBuilder(Kind, Loc, DiagID, FD, *this);
/// Classifies \a VD for defaultmap purposes from its non-reference type.
/// Pointer types get their own category only when LO.OpenMP is above 45;
/// otherwise (and for all remaining types) the result is scalar for scalar
/// types and aggregate for everything else.
static OpenMPDefaultmapClauseKind
getVariableCategoryFromDecl(const LangOptions &LO, const ValueDecl *VD) {
  const QualType Ty = VD->getType().getNonReferenceType();
  if (LO.OpenMP > 45 && Ty->isAnyPointerType())
    return OMPC_DEFAULTMAP_pointer;
  return Ty->isScalarType() ? OMPC_DEFAULTMAP_scalar
                            : OMPC_DEFAULTMAP_aggregate;
}
/// Decide whether the declaration \p D, captured at directive nesting level
/// \p Level (capture region index \p OpenMPCaptureLevel), is passed by
/// reference (returns true) or by copy (returns false).
/// The answer starts as "by reference" and is refined using map-clause usage,
/// the scalar/pointer nature of the type, reduction/firstprivate data-sharing
/// and, finally, whether the value fits into a uintptr.
/// NOTE(review): several call heads and closing braces are absent from this
/// copy of the function; confirm details against the complete source.
bool Sema::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level,
unsigned OpenMPCaptureLevel) const {
assert(LangOpts.OpenMP && "OpenMP is not allowed");
ASTContext &Ctx = getASTContext();
bool IsByRef = true;
// Find the directive that is associated with the provided scope.
D = cast<ValueDecl>(D->getCanonicalDecl());
QualType Ty = D->getType();
bool IsVariableUsedInMapClause = false;
if (DSAStack->hasExplicitDirective(isOpenMPTargetExecutionDirective, Level)) {
// This table summarizes how a given variable should be passed to the device
// given its type and the clauses where it appears. This table is based on
// the description in OpenMP 4.5 [2.10.4, target Construct] and
// OpenMP 4.5 [2.15.5, Data-mapping Attribute Rules and Clauses].
// =========================================================================
// | type | defaultmap | pvt | first | is_device_ptr | map | res. |
// | |(tofrom:scalar)| | pvt | | | |
// =========================================================================
// | scl | | | | - | | bycopy|
// | scl | | - | x | - | - | bycopy|
// | scl | | x | - | - | - | null |
// | scl | x | | | - | | byref |
// | scl | x | - | x | - | - | bycopy|
// | scl | x | x | - | - | - | null |
// | scl | | - | - | - | x | byref |
// | scl | x | - | - | - | x | byref |
// | agg | n.a. | | | - | | byref |
// | agg | n.a. | - | x | - | - | byref |
// | agg | n.a. | x | - | - | - | null |
// | agg | n.a. | - | - | - | x | byref |
// | agg | n.a. | - | - | - | x[] | byref |
// | ptr | n.a. | | | - | | bycopy|
// | ptr | n.a. | - | x | - | - | bycopy|
// | ptr | n.a. | x | - | - | - | null |
// | ptr | n.a. | - | - | - | x | byref |
// | ptr | n.a. | - | - | - | x[] | bycopy|
// | ptr | n.a. | - | - | x | | bycopy|
// | ptr | n.a. | - | - | x | x | bycopy|
// | ptr | n.a. | - | - | x | x[] | bycopy|
// =========================================================================
// Legend:
// scl - scalar
// ptr - pointer
// agg - aggregate
// x - applies
// - - invalid in this combination
// [] - mapped with an array section
// byref - should be mapped by reference
// bycopy - should be mapped by copy (i.e. by value)
// null - initialize a local variable to null on the device
// Observations:
// - All scalar declarations that show up in a map clause have to be passed
// by reference, because they may have been mapped in the enclosing data
// environment.
// - If the scalar value does not fit the size of uintptr, it has to be
// passed by reference, regardless of the result in the table above.
// - For pointers mapped by value that have either an implicit map or an
// array section, the runtime library may pass the NULL value to the
// device instead of the value passed to it by the compiler.
// Reason about the referenced type when D is a reference.
if (Ty->isReferenceType())
Ty = Ty->castAs<ReferenceType>()->getPointeeType();
// Locate map clauses and see if the variable being captured is referred to
// in any of those clauses. Here we only care about variables, not fields,
// because fields are part of aggregates.
bool IsVariableAssociatedWithSection = false;
// NOTE(review): the head of the call taking (D, Level, <lambda>) is not
// visible in this copy; the lambda is presumably invoked once per component
// list mapped for D at this level -- confirm.
D, Level,
[&IsVariableUsedInMapClause, &IsVariableAssociatedWithSection, D](
OpenMPClauseKind WhereFoundClauseKind) {
// Only the map clause information influences how a variable is
// captured. E.g. is_device_ptr does not require changing the default
// behavior.
if (WhereFoundClauseKind != OMPC_map)
return false;
// The component list is walked from its last element (rbegin).
auto EI = MapExprComponents.rbegin();
auto EE = MapExprComponents.rend();
assert(EI != EE && "Invalid map expression!");
if (isa<DeclRefExpr>(EI->getAssociatedExpression()))
IsVariableUsedInMapClause |= EI->getAssociatedDeclaration() == D;
if (EI == EE)
return false;
// A subscript, array section, member access or array shaping expression
// means the variable is associated with a section.
if (isa<ArraySubscriptExpr>(EI->getAssociatedExpression()) ||
isa<OMPArraySectionExpr>(EI->getAssociatedExpression()) ||
isa<MemberExpr>(EI->getAssociatedExpression()) ||
isa<OMPArrayShapingExpr>(EI->getAssociatedExpression())) {
IsVariableAssociatedWithSection = true;
// There is nothing more we need to know about this variable.
return true;
// Keep looking for more map info.
return false;
if (IsVariableUsedInMapClause) {
// If variable is identified in a map clause it is always captured by
// reference except if it is a pointer that is dereferenced somehow.
IsByRef = !(Ty->isPointerType() && IsVariableAssociatedWithSection);
} else {
// By default, all the data that has a scalar type is mapped by copy
// (except for reduction variables).
// Defaultmap scalar is mutually exclusive with defaultmap pointer.
IsByRef =
(DSAStack->isForceCaptureByReferenceInTargetExecutable() &&
!Ty->isAnyPointerType()) ||
!Ty->isScalarType() ||
Level, getVariableCategoryFromDecl(LangOpts, D)) ||
D, [](OpenMPClauseKind K) { return K == OMPC_reduction; }, Level);
// Second refinement: a scalar that is still "by reference" may be switched to
// "by copy" depending on the conditions below.
if (IsByRef && Ty.getNonReferenceType()->isScalarType()) {
IsByRef =
((IsVariableUsedInMapClause &&
DSAStack->getCaptureRegion(Level, OpenMPCaptureLevel) ==
OMPD_target) ||
[](OpenMPClauseKind K) -> bool {
return K == OMPC_firstprivate;
Level, /*NotLastprivate=*/true) ||
DSAStack->isUsesAllocatorsDecl(Level, D))) &&
// If the variable is artificial and must be captured by value - try to
// capture by value.
!(isa<OMPCapturedExprDecl>(D) && !D->hasAttr<OMPCaptureNoInitAttr>() &&
!cast<OMPCapturedExprDecl>(D)->getInit()->isGLValue()) &&
// If the variable is implicitly firstprivate and scalar - capture by
// copy
!(DSAStack->getDefaultDSA() == DSA_firstprivate &&
D, [](OpenMPClauseKind K) { return K != OMPC_unknown; }, Level) &&
!DSAStack->isLoopControlVariable(D, Level).first);
// When passing data by copy, we need to make sure it fits the uintptr size
// and alignment, because the runtime library only deals with uintptr types.
// If it does not fit the uintptr size, we need to pass the data by reference
// instead.
if (!IsByRef &&
(Ctx.getTypeSizeInChars(Ty) >
Ctx.getTypeSizeInChars(Ctx.getUIntPtrType()) ||
Ctx.getDeclAlign(D) > Ctx.getTypeAlignInChars(Ctx.getUIntPtrType()))) {
IsByRef = true;
return IsByRef;
/// Returns the nesting level reported by the data-sharing attributes stack.
unsigned Sema::getOpenMPNestingLevel() const {
return DSAStack->getNestingLevel();
/// Tells whether analysis is currently inside a target execution directive:
/// either the current directive is one and no clause of it is being parsed,
/// or the second operand of the `||` below reports one.
/// NOTE(review): the call receiving the lambda is not visible in this copy;
/// presumably it searches the enclosing directives -- confirm.
bool Sema::isInOpenMPTargetExecutionDirective() const {
return (isOpenMPTargetExecutionDirective(DSAStack->getCurrentDirective()) &&
!DSAStack->isClauseParsingMode()) ||
[](OpenMPDirectiveKind K, const DeclarationNameInfo &,
SourceLocation) -> bool {
return isOpenMPTargetExecutionDirective(K);
/// Determine whether \p D has to be captured in the current OpenMP region.
///
/// \param D              Declaration being referenced (canonicalized first).
/// \param CheckScopeInfo When true, the function scopes up to \p StopAt are
///                       inspected and the parent directive is consulted if
///                       the body of the top directive is already complete.
/// \param StopAt         Index into FunctionScopes bounding that inspection.
/// \returns The variable to capture (\p D itself as a VarDecl, or the
/// declaration behind a private copy), or nullptr when no capture is needed.
/// NOTE(review): a number of statements and closing braces are absent from
/// this copy; the exact nesting of the conditions must be confirmed against
/// the complete source.
VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo,
unsigned StopAt) {
assert(LangOpts.OpenMP && "OpenMP is not allowed");
D = getCanonicalDecl(D);
auto *VD = dyn_cast<VarDecl>(D);
// Do not capture constexpr variables.
if (VD && VD->isConstexpr())
return nullptr;
// If we want to determine whether the variable should be captured from the
// perspective of the current capturing scope, and we've already left all the
// capturing scopes of the top directive on the stack, check from the
// perspective of its parent directive (if any) instead.
DSAStackTy::ParentDirectiveScope InParentDirectiveRAII(
*DSAStack, CheckScopeInfo && DSAStack->isBodyComplete());
// If we are attempting to capture a global variable in a directive with
// 'target' we return true so that this global is also mapped to the device.
if (VD && !VD->hasLocalStorage() &&
(getCurCapturedRegion() || getCurBlock() || getCurLambda())) {
if (isInOpenMPDeclareTargetContext()) {
// Try to mark variable as declare target if it is used in capturing
// regions.
if (LangOpts.OpenMP <= 45 &&
checkDeclIsAllowedInOpenMPTarget(nullptr, VD);
return nullptr;
} else if (isInOpenMPTargetExecutionDirective()) {
// If the declaration is enclosed in a 'declare target' directive,
// then it should not be captured.
if (OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
return nullptr;
CapturedRegionScopeInfo *CSI = nullptr;
for (FunctionScopeInfo *FSI : llvm::drop_begin(
CheckScopeInfo ? (FunctionScopes.size() - (StopAt + 1)) : 0)) {
if (!isa<CapturingScopeInfo>(FSI))
return nullptr;
if (auto *RSI = dyn_cast<CapturedRegionScopeInfo>(FSI))
if (RSI->CapRegionKind == CR_OpenMP) {
SmallVector<OpenMPDirectiveKind, 4> Regions;
// Globals are returned for capture unless the capture region is a task.
if (Regions[CSI->OpenMPCaptureLevel] != OMPD_task)
return VD;
if (CheckScopeInfo) {
// Walk the function scopes from StopAt downwards looking for an OpenMP
// captured region; without one there is nothing to capture.
bool OpenMPFound = false;
for (unsigned I = StopAt + 1; I > 0; --I) {
FunctionScopeInfo *FSI = FunctionScopes[I - 1];
return nullptr;
if (auto *RSI = dyn_cast<CapturedRegionScopeInfo>(FSI))
if (RSI->CapRegionKind == CR_OpenMP) {
OpenMPFound = true;
if (!OpenMPFound)
return nullptr;
if (DSAStack->getCurrentDirective() != OMPD_unknown &&
(!DSAStack->isClauseParsingMode() ||
DSAStack->getParentDirective() != OMPD_unknown)) {
// Loop control variables, local variables in tasking regions and forced
// captures are always captured.
auto &&Info = DSAStack->isLoopControlVariable(D);
if (Info.first ||
(VD && VD->hasLocalStorage() &&
isImplicitOrExplicitTaskingRegion(DSAStack->getCurrentDirective())) ||
(VD && DSAStack->isForceVarCapturing()))
return VD ? VD : Info.second;
DSAStackTy::DSAVarData DVarTop =
DSAStack->getTopDSA(D, DSAStack->isClauseParsingMode());
if (DVarTop.CKind != OMPC_unknown && isOpenMPPrivate(DVarTop.CKind))
return VD ? VD : cast<VarDecl>(DVarTop.PrivateCopy->getDecl());
// Threadprivate variables must not be captured.
if (isOpenMPThreadPrivate(DVarTop.CKind))
return nullptr;
// The variable is not private or it is the variable in the directive with
// default(none) clause and not used in any clause.
DSAStackTy::DSAVarData DVarPrivate = DSAStack->hasDSA(
D, isOpenMPPrivate, [](OpenMPDirectiveKind) { return true; },
// Global shared must not be captured.
if (VD && !VD->hasLocalStorage() && DVarPrivate.CKind == OMPC_unknown &&
((DSAStack->getDefaultDSA() != DSA_none &&
DSAStack->getDefaultDSA() != DSA_firstprivate) ||
DVarTop.CKind == OMPC_shared))
return nullptr;
if (DVarPrivate.CKind != OMPC_unknown ||
(VD && (DSAStack->getDefaultDSA() == DSA_none ||
DSAStack->getDefaultDSA() == DSA_firstprivate)))
return VD ? VD : cast<VarDecl>(DVarPrivate.PrivateCopy->getDecl());
return nullptr;
/// Lowers \p FunctionScopesIndex (in place) by the number of capture levels
/// of the directive found at nesting level \p Level.
void Sema::adjustOpenMPTargetScopeIndex(unsigned &FunctionScopesIndex,
unsigned Level) const {
FunctionScopesIndex -= getOpenMPCaptureLevels(DSAStack->getDirective(Level));
/// Hook for the start of a loop; only acts when the current directive is an
/// OpenMP loop directive.
/// NOTE(review): the statement guarded by the condition is not visible in
/// this copy -- confirm what is done on the data-sharing stack.
void Sema::startOpenMPLoop() {
assert(LangOpts.OpenMP && "OpenMP must be enabled.");
if (isOpenMPLoopDirective(DSAStack->getCurrentDirective()))
/// Hook for the start of a C++ range-based for loop; only acts when the
/// current directive is an OpenMP loop directive.
/// NOTE(review): the guarded statements are not visible in this copy.
void Sema::startOpenMPCXXRangeFor() {
assert(LangOpts.OpenMP && "OpenMP must be enabled.");
if (isOpenMPLoopDirective(DSAStack->getCurrentDirective())) {
/// Report whether \p D must be treated as private in the region at nesting
/// level \p Level / capture level \p CapLevel.
/// \returns OMPC_firstprivate or OMPC_private when the declaration is to be
/// privatized, OMPC_unknown otherwise.
/// NOTE(review): several call heads and closing braces are absent from this
/// copy; confirm the exact conditions against the complete source.
OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level,
unsigned CapLevel) const {
assert(LangOpts.OpenMP && "OpenMP is not allowed");
// Tasking directives: explicit or implicit firstprivate handling.
if (DSAStack->hasExplicitDirective(
[](OpenMPDirectiveKind K) { return isOpenMPTaskingDirective(K); },
Level)) {
bool IsTriviallyCopyable =
// The `-`/`+` lines below are unified-diff markers carried by this copy: the
// replacement additionally requires that the (canonical, non-reference) type
// is not a C++ record.
- D->getType().getNonReferenceType().isTriviallyCopyableType(Context);
+ D->getType().getNonReferenceType().isTriviallyCopyableType(Context) &&
+ !D->getType()
+ .getNonReferenceType()
+ .getCanonicalType()
+ ->getAsCXXRecordDecl();
OpenMPDirectiveKind DKind = DSAStack->getDirective(Level);
SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, DKind);
// Taskloop capture regions are only considered for trivially copyable types.
if (isOpenMPTaskingDirective(CaptureRegions[CapLevel]) &&
(IsTriviallyCopyable ||
!isOpenMPTaskLoopDirective(CaptureRegions[CapLevel]))) {
if (DSAStack->hasExplicitDSA(
D, [](OpenMPClauseKind K) { return K == OMPC_firstprivate; },
Level, /*NotLastprivate=*/true))
return OMPC_firstprivate;
// Not shared, not a loop control variable and without an explicit reference:
// record it as an implicit task firstprivate.
DSAStackTy::DSAVarData DVar = DSAStack->getImplicitDSA(D, Level);
if (DVar.CKind != OMPC_shared &&
!DSAStack->isLoopControlVariable(D, Level).first && !DVar.RefExpr) {
DSAStack->addImplicitTaskFirstprivate(Level, D);
return OMPC_firstprivate;
if (isOpenMPLoopDirective(DSAStack->getCurrentDirective())) {
if (DSAStack->getAssociatedLoops() > 0 &&
!DSAStack->isLoopStarted()) {
return OMPC_private;
if ((DSAStack->getPossiblyLoopCunter() == D->getCanonicalDecl() ||
DSAStack->isLoopControlVariable(D).first) &&
D, [](OpenMPClauseKind K) { return K != OMPC_private; }, Level) &&
return OMPC_private;
if (const auto *VD = dyn_cast<VarDecl>(D)) {
if (DSAStack->isThreadPrivate(const_cast<VarDecl *>(VD)) &&
DSAStack->isForceVarCapturing() &&
D, [](OpenMPClauseKind K) { return K == OMPC_copyin; }, Level))
return OMPC_private;
// User-defined allocators are private since they must be defined in the
// context of target region.
if (DSAStack->hasExplicitDirective(isOpenMPTargetExecutionDirective, Level) &&
DSAStack->isUsesAllocatorsDecl(Level, D).getValueOr(
DSAStackTy::UsesAllocatorsDeclKind::AllocatorTrait) ==
return OMPC_private;
return (DSAStack->hasExplicitDSA(
D, [](OpenMPClauseKind K) { return K == OMPC_private; }, Level) ||
(DSAStack->isClauseParsingMode() &&
DSAStack->getClauseParsingMode() == OMPC_private) ||
// Consider taskgroup reduction descriptor variable a private
// to avoid possible capture in the region.
[](OpenMPDirectiveKind K) {
return K == OMPD_taskgroup ||
((isOpenMPParallelDirective(K) ||
isOpenMPWorksharingDirective(K)) &&
Level) &&
DSAStack->isTaskgroupReductionRef(D, Level)))
? OMPC_private
: OMPC_unknown;
/// Select the clause kind under which \p D is captured and, if one was
/// selected, attach it to the capture field \p FD as an implicit
/// OMPCaptureKindAttr.
/// Nesting levels are visited from the innermost one down to \p Level.
/// NOTE(review): closing braces, the assignment inside the first lambda and
/// any loop exits are absent from this copy; the precedence between the
/// private / map / firstprivate outcomes must be confirmed.
void Sema::setOpenMPCaptureKind(FieldDecl *FD, const ValueDecl *D,
unsigned Level) {
assert(LangOpts.OpenMP && "OpenMP is not allowed");
D = getCanonicalDecl(D);
OpenMPClauseKind OMPC = OMPC_unknown;
// I runs one past the level so the unsigned counter never underflows.
for (unsigned I = DSAStack->getNestingLevel() + 1; I > Level; --I) {
const unsigned NewLevel = I - 1;
if (DSAStack->hasExplicitDSA(D,
[&OMPC](const OpenMPClauseKind K) {
if (isOpenMPPrivate(K)) {
return true;
return false;
// Any mappable expression component list recorded for D at this level makes
// the capture a map.
if (DSAStack->checkMappableExprComponentListsForDeclAtLevel(
D, NewLevel,
OpenMPClauseKind) { return true; })) {
OMPC = OMPC_map;
if (DSAStack->hasExplicitDirective(isOpenMPTargetExecutionDirective,
NewLevel)) {
OMPC = OMPC_map;
if (DSAStack->mustBeFirstprivateAtLevel(
NewLevel, getVariableCategoryFromDecl(LangOpts, D)))
OMPC = OMPC_firstprivate;
if (OMPC != OMPC_unknown)
FD->addAttr(OMPCaptureKindAttr::CreateImplicit(Context, unsigned(OMPC)));
/// Check whether \p D is captured by a target region at \p Level.
/// The result is true only for variables without local storage whose capture
/// region at index \p CaptureLevel is not a task region.
/// NOTE(review): one operand of the conjunction (ending in `Level) &&`) is
/// not fully visible in this copy -- confirm.
bool Sema::isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level,
unsigned CaptureLevel) const {
assert(LangOpts.OpenMP && "OpenMP is not allowed");
// Return true if the current level is no longer enclosed in a target region.
SmallVector<OpenMPDirectiveKind, 4> Regions;
getOpenMPCaptureRegions(Regions, DSAStack->getDirective(Level));
const auto *VD = dyn_cast<VarDecl>(D);
return VD && !VD->hasLocalStorage() &&
Level) &&
Regions[CaptureLevel] != OMPD_task;
/// Check whether the global (non-local-storage) variable \p D is captured at
/// \p Level / \p CaptureLevel.
/// Declarations that are not variables, or that have local storage, reach the
/// final `return true`.
/// NOTE(review): the initializer of NumLevels and the head of the last call
/// are not visible in this copy -- confirm.
bool Sema::isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level,
unsigned CaptureLevel) const {
assert(LangOpts.OpenMP && "OpenMP is not allowed");
// Return true if the current level is no longer enclosed in a target region.
if (const auto *VD = dyn_cast<VarDecl>(D)) {
if (!VD->hasLocalStorage()) {
DSAStackTy::DSAVarData TopDVar =
DSAStack->getTopDSA(D, /*FromParent=*/false);
unsigned NumLevels =
// At the outermost level only the last capture level counts, and only if the
// variable is not shared.
if (Level == 0)
return (NumLevels == CaptureLevel + 1) && TopDVar.CKind != OMPC_shared;
// Otherwise consult the implicit data-sharing attribute one level up.
DSAStackTy::DSAVarData DVar = DSAStack->getImplicitDSA(D, Level - 1);
return DVar.CKind != OMPC_shared ||
D, Level - 1,
getOpenMPCaptureLevels(DSAStack->getDirective(Level - 1)) - 1);
return true;
/// Deletes the data-sharing attributes stack object referred to by DSAStack.
void Sema::DestroyDataSharingAttributesStack() { delete DSAStack; }
/// Called at the beginning of a `declare variant` scope located at \p Loc.
/// A warning is emitted when another declare-variant scope is already open.
/// NOTE(review): the remainder of the body (presumably recording \p TI as a
/// new scope) is not visible in this copy.
void Sema::ActOnOpenMPBeginDeclareVariant(SourceLocation Loc,
OMPTraitInfo &TI) {
if (!OMPDeclareVariantScopes.empty()) {
Diag(Loc, diag::warn_nested_declare_variant);
/// Called at the end of a `declare variant` scope; asserts that such a scope
/// is currently open.
/// NOTE(review): the statement that closes the scope is not visible here.
void Sema::ActOnOpenMPEndDeclareVariant() {
assert(isInOpenMPDeclareVariantScope() &&
"Not in OpenMP declare variant scope!");
/// Diagnose a call at \p Loc from \p Caller to \p Callee whose `device_type`
/// is incompatible with the current compilation: a `host` function called
/// during device compilation, or a `nohost` function called during host
/// compilation.
/// NOTE(review): the initializer of DevTy, the early exits after the two
/// "Ignore ..." checks and the follow-up note diagnostics are not fully
/// visible in this copy -- confirm against the complete source.
void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller,
const FunctionDecl *Callee,
SourceLocation Loc) {
assert(LangOpts.OpenMP && "Expected OpenMP compilation mode.");
Optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
// Ignore host functions during device analysis.
if (LangOpts.OpenMPIsDevice && DevTy &&
*DevTy == OMPDeclareTargetDeclAttr::DT_Host)
// Ignore nohost functions during host analysis.
if (!LangOpts.OpenMPIsDevice && DevTy &&
*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost)
// From here on the device type of the callee's most recent declaration is
// what matters.
const FunctionDecl *FD = Callee->getMostRecentDecl();
DevTy = OMPDeclareTargetDeclAttr::getDeviceType(FD);
if (LangOpts.OpenMPIsDevice && DevTy &&
*DevTy == OMPDeclareTargetDeclAttr::DT_Host) {
// Diagnose host function called during device codegen.
StringRef HostDevTy =
getOpenMPSimpleClauseTypeName(OMPC_device_type, OMPC_DEVICE_TYPE_host);
Diag(Loc, diag::err_omp_wrong_device_function_call) << HostDevTy << 0;
<< HostDevTy;
if (!LangOpts.OpenMPIsDevice && DevTy &&
*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) {
// Diagnose nohost function called during host codegen.
StringRef NoHostDevTy = getOpenMPSimpleClauseTypeName(
OMPC_device_type, OMPC_DEVICE_TYPE_nohost);
Diag(Loc, diag::err_omp_wrong_device_function_call) << NoHostDevTy << 1;
<< NoHostDevTy;
/// Begin data-sharing analysis for a directive: pushes an entry describing
/// directive \p DKind (named \p DirName, in scope \p CurScope, at \p Loc)
/// onto the data-sharing attributes stack.
/// NOTE(review): further statements of the body may be missing from this copy.
void Sema::StartOpenMPDSABlock(OpenMPDirectiveKind DKind,
const DeclarationNameInfo &DirName,
Scope *CurScope, SourceLocation Loc) {
DSAStack->push(DKind, DirName, CurScope, Loc);
/// Presumably marks the start of analysis of a clause of kind \p K; the body
/// is not visible in this copy -- confirm against the complete source.
void Sema::StartOpenMPClause(OpenMPClauseKind K) {
/// Presumably marks the end of analysis of the current clause; the body is
/// not visible in this copy -- confirm against the complete source.
void Sema::EndOpenMPClause() {
/// Forward declaration. Callers in this file read the first element of the
/// result as the referenced declaration (null when there is none) and the
/// second as a flag meaning the item will be analyzed later.
/// \p AllowArraySection defaults to false.
static std::pair<ValueDecl *, bool>
getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc,
SourceRange &ERange, bool AllowArraySection = false);
/// Check consistency of the reduction clauses in \p Clauses for the directive
/// currently on top of \p Stack.
/// A first pass records whether any reduction clause uses the `inscan`
/// modifier and validates the `task` modifier against the current directive;
/// if `inscan` was seen, a second pass diagnoses reduction clauses without it
/// and checks each list item's use in a scan directive.
/// NOTE(review): `continue` statements, some diagnostics and closing braces
/// are absent from this copy.
static void checkReductionClauses(Sema &S, DSAStackTy *Stack,
ArrayRef<OMPClause *> Clauses) {
bool InscanFound = false;
SourceLocation InscanLoc;
// OpenMP 5.0, reduction Clause, Restrictions.
// A reduction clause without the inscan reduction-modifier may not appear on
// a construct on which a reduction clause with the inscan reduction-modifier
// appears.
for (OMPClause *C : Clauses) {
if (C->getClauseKind() != OMPC_reduction)
auto *RC = cast<OMPReductionClause>(C);
if (RC->getModifier() == OMPC_REDUCTION_inscan) {
InscanFound = true;
InscanLoc = RC->getModifierLoc();
if (RC->getModifier() == OMPC_REDUCTION_task) {
// OpenMP 5.0, reduction Clause.
// A reduction clause with the task reduction-modifier may only appear on
// a parallel construct, a worksharing construct or a combined or
// composite construct for which any of the aforementioned constructs is a
// constituent construct and simd or loop are not constituent constructs.
OpenMPDirectiveKind CurDir = Stack->getCurrentDirective();
if (!(isOpenMPParallelDirective(CurDir) ||
isOpenMPWorksharingDirective(CurDir)) ||
if (InscanFound) {
for (OMPClause *C : Clauses) {
if (C->getClauseKind() != OMPC_reduction)
auto *RC = cast<OMPReductionClause>(C);
if (RC->getModifier() != OMPC_REDUCTION_inscan) {
// Point at the modifier when there is one, otherwise at the clause start.
S.Diag(RC->getModifier() == OMPC_REDUCTION_unknown
? RC->getBeginLoc()
: RC->getModifierLoc(),
S.Diag(InscanLoc, diag::note_omp_previous_inscan_reduction);
for (Expr *Ref : RC->varlists()) {
// NOTE(review): the message names the nontemporal clause although this loop
// walks the variables of a reduction clause; likely a copy/paste slip.
assert(Ref && "NULL expr in OpenMP nontemporal clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = Ref;
auto Res = getPrivateItem(S, SimpleRefExpr, ELoc, ERange,
ValueDecl *D = Res.first;
if (!D)
if (!Stack->isUsedInScanDirective(getCanonicalDecl(D))) {
<< Ref->getSourceRange();
// Forward declarations of file-local helpers that are used before their
// definitions.
/// Checks the allocate clauses among \p Clauses (called from
/// EndOpenMPDSABlock outside dependent contexts).
static void checkAllocateClauses(Sema &S, DSAStackTy *Stack,
ArrayRef<OMPClause *> Clauses);
/// Presumably builds a reference to a capture of \p CaptureExpr for \p D;
/// semantics of \p WithInit are not visible here -- confirm at the definition.
static DeclRefExpr *buildCapture(Sema &S, ValueDecl *D, Expr *CaptureExpr,
bool WithInit);
/// Reports where the data-sharing attribute \p DVar of \p D originates.
/// \p IsLoopIterVar defaults to false.
static void reportOriginalDsa(Sema &SemaRef, const DSAStackTy *Stack,
const ValueDecl *D,
const DSAStackTy::DSAVarData &DVar,
bool IsLoopIterVar = false);
/// Finish data-sharing analysis for the directive \p CurDirective (may be
/// null or a non-executable statement, in which case the clause processing is
/// skipped).
/// For an executable directive this finalizes lastprivate clauses (helper
/// private copies), nontemporal clauses (private references) and
/// uses_allocators clauses (conflicts with other data-sharing / mapping
/// clauses), then checks the allocate and reduction clauses.
/// NOTE(review): `continue` statements, several call heads, diagnostics and
/// closing braces are absent from this copy.
void Sema::EndOpenMPDSABlock(Stmt *CurDirective) {
// OpenMP [lastprivate clause, Restrictions, C/C++, p.1]
// A variable of class type (or array thereof) that appears in a lastprivate
// clause requires an accessible, unambiguous default constructor for the
// class type, unless the list item is also specified in a firstprivate
// clause.
if (const auto *D = dyn_cast_or_null<OMPExecutableDirective>(CurDirective)) {
for (OMPClause *C : D->clauses()) {
if (auto *Clause = dyn_cast<OMPLastprivateClause>(C)) {
SmallVector<Expr *, 8> PrivateCopies;
for (Expr *DE : Clause->varlists()) {
if (DE->isValueDependent() || DE->isTypeDependent()) {
auto *DRE = cast<DeclRefExpr>(DE->IgnoreParens());
auto *VD = cast<VarDecl>(DRE->getDecl());
QualType Type = VD->getType().getNonReferenceType();
const DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(VD, /*FromParent=*/false);
if (DVar.CKind == OMPC_lastprivate) {
// Generate helper private variable and initialize it with the
// default value. The address of the original variable is replaced
// by the address of the new private variable in CodeGen. This new
// variable is not added to IdResolver, so the code in the OpenMP
// region uses original variable for proper diagnostics.
VarDecl *VDPrivate = buildVarDecl(
*this, DE->getExprLoc(), Type.getUnqualifiedType(),
VD->getName(), VD->hasAttrs() ? &VD->getAttrs() : nullptr, DRE);
if (VDPrivate->isInvalidDecl()) {
*this, VDPrivate, DE->getType(), DE->getExprLoc()));
} else {
// The variable is also a firstprivate, so initialization sequence
// for private copy is generated already.
// Finalize nontemporal clause by handling private copies, if any.
if (auto *Clause = dyn_cast<OMPNontemporalClause>(C)) {
SmallVector<Expr *, 8> PrivateRefs;
for (Expr *RefExpr : Clause->varlists()) {
assert(RefExpr && "NULL expr in OpenMP nontemporal clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second)
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
const DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(D, /*FromParent=*/false);
// Prefer the private copy recorded for the variable, if there is one.
PrivateRefs.push_back(DVar.PrivateCopy ? DVar.PrivateCopy
: SimpleRefExpr);
if (auto *Clause = dyn_cast<OMPUsesAllocatorsClause>(C)) {
for (unsigned I = 0, E = Clause->getNumberOfAllocators(); I < E; ++I) {
OMPUsesAllocatorsClause::Data D = Clause->getAllocatorData(I);
auto *DRE = dyn_cast<DeclRefExpr>(D.Allocator->IgnoreParenImpCasts());
if (!DRE)
ValueDecl *VD = DRE->getDecl();
if (!VD || !isa<VarDecl>(VD))
DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(VD, /*FromParent=*/false);
// OpenMP [2.12.5, target Construct]
// Memory allocators that appear in a uses_allocators clause cannot
// appear in other data-sharing attribute clauses or data-mapping
// attribute clauses in the same construct.
Expr *MapExpr = nullptr;
if (DVar.RefExpr ||
VD, /*CurrentRegionOnly=*/true,
[VD, &MapExpr](
OpenMPClauseKind C) {
// Remember the mapped expression when the last component of the list
// refers to the allocator variable.
auto MI = MapExprComponents.rbegin();
auto ME = MapExprComponents.rend();
if (MI != ME &&
MI->getAssociatedDeclaration()->getCanonicalDecl() ==
VD->getCanonicalDecl()) {
MapExpr = MI->getAssociatedExpression();
return true;
return false;
})) {
<< D.Allocator->getSourceRange();
if (DVar.RefExpr)
reportOriginalDsa(*this, DSAStack, VD, DVar);
Diag(MapExpr->getExprLoc(), diag::note_used_here)
<< MapExpr->getSourceRange();
// Check allocate clauses.
if (!CurContext->isDependentContext())
checkAllocateClauses(*this, DSAStack, D->clauses());
checkReductionClauses(*this, DSAStack, D->clauses());
/// Forward declaration; the definition is not part of this section.
/// Presumably finalizes \p Clause using the iteration variable \p IV and the
/// iteration count \p NumIterations -- confirm at the definition.
static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV,
Expr *NumIterations, Sema &SemaRef,
Scope *S, DSAStackTy *Stack);
namespace {
/// Typo-correction filter that accepts only variables with global storage
/// that are declared in the current lexical context's scope.
/// NOTE(review): access specifiers, the trailing arguments of isDeclInScope
/// and closing braces are absent from this copy.
class VarDeclFilterCCC final : public CorrectionCandidateCallback {
Sema &SemaRef;
explicit VarDeclFilterCCC(Sema &S) : SemaRef(S) {}
/// Returns true if \p Candidate corrects to an acceptable variable; false
/// for anything that is not a VarDecl (including a null declaration).
bool ValidateCandidate(const TypoCorrection &Candidate) override {
NamedDecl *ND = Candidate.getCorrectionDecl();
if (const auto *VD = dyn_cast_or_null<VarDecl>(ND)) {
return VD->hasGlobalStorage() &&
SemaRef.isDeclInScope(ND, SemaRef.getCurLexicalContext(),
return false;
/// Returns a heap-allocated copy of this callback.
std::unique_ptr<CorrectionCandidateCallback> clone() override {
return std::make_unique<VarDeclFilterCCC>(*this);
/// Typo-correction filter that accepts plain variables (declaration kind
/// exactly Decl::Var) and functions declared in the current lexical context's
/// scope.
/// NOTE(review): access specifiers, the trailing arguments of isDeclInScope
/// and closing braces are absent from this copy.
class VarOrFuncDeclFilterCCC final : public CorrectionCandidateCallback {
Sema &SemaRef;
explicit VarOrFuncDeclFilterCCC(Sema &S) : SemaRef(S) {}
/// Returns true if \p Candidate corrects to an acceptable variable or
/// function; false otherwise (including a null declaration).
bool ValidateCandidate(const TypoCorrection &Candidate) override {
NamedDecl *ND = Candidate.getCorrectionDecl();
// The kind check excludes VarDecl subclasses that have their own Decl kind.
if (ND && ((isa<VarDecl>(ND) && ND->getKind() == Decl::Var) ||
isa<FunctionDecl>(ND))) {
return SemaRef.isDeclInScope(ND, SemaRef.getCurLexicalContext(),
return false;
/// Returns a heap-allocated copy of this callback.
std::unique_ptr<CorrectionCandidateCallback> clone() override {
return std::make_unique<VarOrFuncDeclFilterCCC>(*this);
} // namespace
/// Resolve the identifier \p Id (qualified by \p ScopeSpec) used as a list
/// item of directive \p Kind and validate it as a variable reference.
///
/// The name is looked up as an ordinary name; when lookup does not yield a
/// single result, typo correction restricted to suitable variables is tried.
/// The variable is then checked against the scope restrictions quoted below.
/// \returns An lvalue DeclRefExpr of the variable's non-reference type, or
/// ExprError() after emitting a diagnostic.
/// NOTE(review): closing braces and the heads of several Diag(...) calls are
/// absent from this copy.
ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope,
CXXScopeSpec &ScopeSpec,
const DeclarationNameInfo &Id,
OpenMPDirectiveKind Kind) {
LookupResult Lookup(*this, Id, LookupOrdinaryName);
LookupParsedName(Lookup, CurScope, &ScopeSpec, true);
if (Lookup.isAmbiguous())
return ExprError();
VarDecl *VD;
if (!Lookup.isSingleResult()) {
// No unique match: try to recover through typo correction.
VarDeclFilterCCC CCC(*this);
if (TypoCorrection Corrected =
CorrectTypo(Id, LookupOrdinaryName, CurScope, nullptr, CCC,
CTK_ErrorRecovery)) {
? diag::err_undeclared_var_use_suggest
: diag::err_omp_expected_var_arg_suggest)
<< Id.getName());
VD = Corrected.getCorrectionDeclAs<VarDecl>();
} else {
Diag(Id.getLoc(), Lookup.empty() ? diag::err_undeclared_var_use
: diag::err_omp_expected_var_arg)
<< Id.getName();
return ExprError();
} else if (!(VD = Lookup.getAsSingle<VarDecl>())) {
// The single result is not a variable.
Diag(Id.getLoc(), diag::err_omp_expected_var_arg) << Id.getName();
Diag(Lookup.getFoundDecl()->getLocation(), diag::note_declared_at);
return ExprError();
// OpenMP [2.9.2, Syntax, C/C++]
// Variables must be file-scope, namespace-scope, or static block-scope.
if (Kind == OMPD_threadprivate && !VD->hasGlobalStorage()) {
Diag(Id.getLoc(), diag::err_omp_global_var_arg)
<< getOpenMPDirectiveName(Kind) << !VD->isStaticLocal();
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
return ExprError();
VarDecl *CanonicalVD = VD->getCanonicalDecl();
NamedDecl *ND = CanonicalVD;
// OpenMP [2.9.2, Restrictions, C/C++, p.2]
// A threadprivate directive for file-scope variables must appear outside
// any definition or declaration.
if (CanonicalVD->getDeclContext()->isTranslationUnit() &&
!getCurLexicalContext()->isTranslationUnit()) {
Diag(Id.getLoc(), diag::err_omp_var_scope)
<< getOpenMPDirectiveName(Kind) << VD;
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
return ExprError();
// OpenMP [2.9.2, Restrictions, C/C++, p.3]
// A threadprivate directive for static class member variables must appear
// in the class definition, in the same scope in which the member
// variables are declared.
if (CanonicalVD->isStaticDataMember() &&
!CanonicalVD->getDeclContext()->Equals(getCurLexicalContext())) {
Diag(Id.getLoc(), diag::err_omp_var_scope)
<< getOpenMPDirectiveName(Kind) << VD;
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
return ExprError();
// OpenMP [2.9.2, Restrictions, C/C++, p.4]
// A threadprivate directive for namespace-scope variables must appear
// outside any definition or declaration other than the namespace
// definition itself.
if (CanonicalVD->getDeclContext()->isNamespace() &&
(!getCurLexicalContext()->isFileContext() ||
!getCurLexicalContext()->Encloses(CanonicalVD->getDeclContext()))) {
Diag(Id.getLoc(), diag::err_omp_var_scope)
<< getOpenMPDirectiveName(Kind) << VD;
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
return ExprError();
// OpenMP [2.9.2, Restrictions, C/C++, p.6]
// A threadprivate directive for static block-scope variables must appear
// in the scope of the variable and not in a nested scope.
if (CanonicalVD->isLocalVarDecl() && CurScope &&
!isDeclInScope(ND, getCurLexicalContext(), CurScope)) {
Diag(Id.getLoc(), diag::err_omp_var_scope)
<< getOpenMPDirectiveName(Kind) << VD;
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
return ExprError();
// OpenMP [2.9.2, Restrictions, C/C++, p.2-6]
// A threadprivate directive must lexically precede all references to any
// of the variables in its list.
if (Kind == OMPD_threadprivate && VD->isUsed() &&
!DSAStack->isThreadPrivate(VD)) {
Diag(Id.getLoc(), diag::err_omp_var_used)
<< getOpenMPDirectiveName(Kind) << VD;
return ExprError();
// The resulting expression refers to the variable with its reference
// qualifier (if any) stripped from the type.
QualType ExprType = VD->getType().getNonReferenceType();
return DeclRefExpr::Create(Context, NestedNameSpecifierLoc(),
SourceLocation(), VD,
Id.getLoc(), ExprType, VK_LValue);
/// Handle a `threadprivate` directive at \p Loc listing \p VarList.
/// \returns A declaration group wrapping the created OMPThreadPrivateDecl, or
/// a null group when CheckOMPThreadPrivateDecl produced no declaration.
/// NOTE(review): the return-type line and at least one statement of the `if`
/// body are absent from this copy.
Sema::ActOnOpenMPThreadprivateDirective(SourceLocation Loc,
ArrayRef<Expr *> VarList) {
if (OMPThreadPrivateDecl *D = CheckOMPThreadPrivateDecl(Loc, VarList)) {
return DeclGroupPtrTy::make(DeclGroupRef(D));
return nullptr;
namespace {
/// Statement visitor that reports (returns true) when an expression refers
/// to a variable with local storage, emitting a diagnostic plus a
/// "defined here" note for the first such reference found.
/// NOTE(review): access specifiers, the head of the first diagnostic and
/// closing braces are absent from this copy.
class LocalVarRefChecker final
: public ConstStmtVisitor<LocalVarRefChecker, bool> {
Sema &SemaRef;
/// Returns true if \p E names a variable with local storage.
bool VisitDeclRefExpr(const DeclRefExpr *E) {
if (const auto *VD = dyn_cast<VarDecl>(E->getDecl())) {
if (VD->hasLocalStorage()) {
<< E->getSourceRange();
SemaRef.Diag(VD->getLocation(), diag::note_defined_here)
<< VD << VD->getSourceRange();
return true;
return false;
/// Visits the children of \p S, skipping null children, and stops at the
/// first one that reports a local variable reference.
bool VisitStmt(const Stmt *S) {
for (const Stmt *Child : S->children()) {
if (Child && Visit(Child))
return true;
return false;
explicit LocalVarRefChecker(Sema &SemaRef) : SemaRef(SemaRef) {}
} // namespace
/// Validate the variables in \p VarList for a `threadprivate` directive at
/// \p Loc and build the corresponding declaration.
/// Each list item is expected to be a DeclRefExpr to a VarDecl (enforced by
/// cast<>). Variables are checked for complete, non-reference type, for TLS /
/// register-asm conflicts and for initializers referring to local variables;
/// accepted variables are recorded as threadprivate on the data-sharing stack.
/// \returns The created OMPThreadPrivateDecl, or nullptr when no variable was
/// accepted.
/// NOTE(review): `continue` statements, the statements filling `Vars`, parts
/// of several calls and closing braces are absent from this copy.
OMPThreadPrivateDecl *
Sema::CheckOMPThreadPrivateDecl(SourceLocation Loc, ArrayRef<Expr *> VarList) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
auto *DE = cast<DeclRefExpr>(RefExpr);
auto *VD = cast<VarDecl>(DE->getDecl());
SourceLocation ILoc = DE->getExprLoc();
// Mark variable as used.
QualType QType = VD->getType();
if (QType->isDependentType() || QType->isInstantiationDependentType()) {
// It will be analyzed later.
// OpenMP [2.9.2, Restrictions, C/C++, p.10]
// A threadprivate variable must not have an incomplete type.
if (RequireCompleteType(ILoc, VD->getType(),
diag::err_omp_threadprivate_incomplete_type)) {
// OpenMP [2.9.2, Restrictions, C/C++, p.10]
// A threadprivate variable must not have a reference type.
if (VD->getType()->isReferenceType()) {
Diag(ILoc, diag::err_omp_ref_type_arg)
<< getOpenMPDirectiveName(OMPD_threadprivate) << VD->getType();
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
// Check if this is a TLS variable. If TLS is not being supported, produce
// the corresponding diagnostic.
if ((VD->getTLSKind() != VarDecl::TLS_None &&
!(VD->hasAttr<OMPThreadPrivateDeclAttr>() &&
getLangOpts().OpenMPUseTLS &&
getASTContext().getTargetInfo().isTLSSupported())) ||
(VD->getStorageClass() == SC_Register && VD->hasAttr<AsmLabelAttr>() &&
!VD->isLocalVarDecl())) {
// The selector distinguishes the TLS case (0) from the register-asm case (1).
Diag(ILoc, diag::err_omp_var_thread_local)
<< VD << ((VD->getTLSKind() != VarDecl::TLS_None) ? 0 : 1);
bool IsDecl =
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
// Check whether the initial value of the threadprivate variable references
// a variable with local storage (this is not supported by the runtime).
if (const Expr *Init = VD->getAnyInitializer()) {
LocalVarRefChecker Checker(*this);
if (Checker.Visit(Init))
DSAStack->addDSA(VD, DE, OMPC_threadprivate);
Context, SourceRange(Loc, Loc)));
if (ASTMutationListener *ML = Context.getASTMutationListener())
OMPThreadPrivateDecl *D = nullptr;
if (!Vars.empty()) {
D = OMPThreadPrivateDecl::Create(Context, getCurLexicalContext(), Loc,
return D;
/// Map the allocator expression \p Allocator to an allocator kind.
/// A null expression yields OMPNullMemAlloc and a dependent expression yields
/// OMPUserDefinedMemAlloc. Otherwise the expression (parentheses and implicit
/// casts ignored) is compared, via canonical profiling, with the allocators
/// that \p Stack provides for each kind below OMPUserDefinedMemAlloc; a match
/// selects that kind, and OMPUserDefinedMemAlloc is returned when none match.
/// NOTE(review): the last operand of the dependence check, the loop exit
/// after a match and closing braces are absent from this copy.
static OMPAllocateDeclAttr::AllocatorTypeTy
getAllocatorKind(Sema &S, DSAStackTy *Stack, Expr *Allocator) {
if (!Allocator)
return OMPAllocateDeclAttr::OMPNullMemAlloc;
if (Allocator->isTypeDependent() || Allocator->isValueDependent() ||
Allocator->isInstantiationDependent() ||
return OMPAllocateDeclAttr::OMPUserDefinedMemAlloc;
auto AllocatorKindRes = OMPAllocateDeclAttr::OMPUserDefinedMemAlloc;
const Expr *AE = Allocator->IgnoreParenImpCasts();
for (int I = 0; I < OMPAllocateDeclAttr::OMPUserDefinedMemAlloc; ++I) {
auto AllocatorKind = static_cast<OMPAllocateDeclAttr::AllocatorTypeTy>(I);
const Expr *DefAllocator = Stack->getAllocator(AllocatorKind);
// Two expressions are considered the same allocator when their canonical
// profiles are equal.
llvm::FoldingSetNodeID AEId, DAEId;
AE->Profile(AEId, S.getASTContext(), /*Canonical=*/true);
DefAllocator->Profile(DAEId, S.getASTContext(), /*Canonical=*/true);
if (AEId == DAEId) {
AllocatorKindRes = AllocatorKind;
return AllocatorKindRes;
/// Check a new allocator against one already attached to \p VD.
/// \returns false when \p VD has no OMPAllocateDeclAttr or the previous
/// allocator matches (\p AllocatorKind, \p Allocator); true after warning
/// that a different allocator was used and noting the previous one.
/// \p RefExpr supplies the location and range for the warning when
/// \p Allocator is null.
/// NOTE(review): closing braces and the tail of one printPretty call are
/// absent from this copy.
static bool checkPreviousOMPAllocateAttribute(
Sema &S, DSAStackTy *Stack, Expr *RefExpr, VarDecl *VD,
OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind, Expr *Allocator) {
if (!VD->hasAttr<OMPAllocateDeclAttr>())
return false;
const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
Expr *PrevAllocator = A->getAllocator();
OMPAllocateDeclAttr::AllocatorTypeTy PrevAllocatorKind =
getAllocatorKind(S, Stack, PrevAllocator);
bool AllocatorsMatch = AllocatorKind == PrevAllocatorKind;
// Two user-defined allocators only match when their expressions have the
// same canonical profile.
if (AllocatorsMatch &&
AllocatorKind == OMPAllocateDeclAttr::OMPUserDefinedMemAlloc &&
Allocator && PrevAllocator) {
const Expr *AE = Allocator->IgnoreParenImpCasts();
const Expr *PAE = PrevAllocator->IgnoreParenImpCasts();
llvm::FoldingSetNodeID AEId, PAEId;
AE->Profile(AEId, S.Context, /*Canonical=*/true);
PAE->Profile(PAEId, S.Context, /*Canonical=*/true);
AllocatorsMatch = AEId == PAEId;
if (!AllocatorsMatch) {
// Pretty-print both allocator expressions (empty text for a null one) for
// use in the warning.
SmallString<256> AllocatorBuffer;
llvm::raw_svector_ostream AllocatorStream(AllocatorBuffer);
if (Allocator)
Allocator->printPretty(AllocatorStream, nullptr, S.getPrintingPolicy());
SmallString<256> PrevAllocatorBuffer;
llvm::raw_svector_ostream PrevAllocatorStream(PrevAllocatorBuffer);
if (PrevAllocator)
PrevAllocator->printPretty(PrevAllocatorStream, nullptr,
// Fall back to the list item / attribute location when an allocator
// expression is missing.
SourceLocation AllocatorLoc =
Allocator ? Allocator->getExprLoc() : RefExpr->getExprLoc();
SourceRange AllocatorRange =
Allocator ? Allocator->getSourceRange() : RefExpr->getSourceRange();
SourceLocation PrevAllocatorLoc =
PrevAllocator ? PrevAllocator->getExprLoc() : A->getLocation();
SourceRange PrevAllocatorRange =
PrevAllocator ? PrevAllocator->getSourceRange() : A->getRange();
S.Diag(AllocatorLoc, diag::warn_omp_used_different_allocator)
<< (Allocator ? 1 : 0) << AllocatorStream.str()
<< (PrevAllocator ? 1 : 0) << PrevAllocatorStream.str()
<< AllocatorRange;
S.Diag(PrevAllocatorLoc, diag::note_omp_previous_allocator)
<< PrevAllocatorRange;
return true;
return false;
/// Creates an implicit OMPAllocateDeclAttr for \p VD from \p AllocatorKind,
/// \p Allocator and the source range \p SR, and reports the change to the
/// ASTMutationListener when one is installed.
///
/// The function first tests whether \p VD already has the attribute and
/// whether \p Allocator is a dependent expression; presumably both cases
/// bail out early (the corresponding statements are not visible here).
static void
applyOMPAllocateAttribute(Sema &S, VarDecl *VD,
OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind,
Expr *Allocator, SourceRange SR) {
if (VD->hasAttr<OMPAllocateDeclAttr>())
if (Allocator &&
(Allocator->isTypeDependent() || Allocator->isValueDependent() ||
Allocator->isInstantiationDependent() ||
auto *A = OMPAllocateDeclAttr::CreateImplicit(S.Context, AllocatorKind,
Allocator, SR);
// Let AST consumers know that the declaration was marked.
if (ASTMutationListener *ML = S.Context.getASTMutationListener())
ML->DeclarationMarkedOpenMPAllocate(VD, A);
/// Semantic action for an OpenMP 'allocate' directive located at \p Loc.
///
/// \param VarList  Referenced list items; each one is cast to a DeclRefExpr
///                 that refers to a VarDecl.
/// \param Clauses  At most one clause (asserted). When present it is cast to
///                 OMPAllocatorClause and supplies the allocator expression.
/// \param Owner    Declaration context for the resulting OMPAllocateDecl;
///                 the current lexical context is used when it is null.
/// \returns nullptr when no variable was accepted (Vars is empty), otherwise
///          the new OMPAllocateDecl wrapped in a DeclGroupPtrTy.
Sema::DeclGroupPtrTy Sema::ActOnOpenMPAllocateDirective(
SourceLocation Loc, ArrayRef<Expr *> VarList,
ArrayRef<OMPClause *> Clauses, DeclContext *Owner) {
assert(Clauses.size() <= 1 && "Expected at most one clause.");
Expr *Allocator = nullptr;
if (Clauses.empty()) {
// OpenMP 5.0, 2.11.3 allocate Directive, Restrictions.
// allocate directives that appear in a target region must specify an
// allocator clause unless a requires directive with the dynamic_allocators
// clause is present in the same compilation unit.
if (LangOpts.OpenMPIsDevice &&
targetDiag(Loc, diag::err_expected_allocator_clause);
} else {
Allocator = cast<OMPAllocatorClause>(Clauses.back())->getAllocator();
// Classify the allocator once; the same kind is used for every list item.
OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind =
getAllocatorKind(*this, DSAStack, Allocator);
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
auto *DE = cast<DeclRefExpr>(RefExpr);
auto *VD = cast<VarDecl>(DE->getDecl());
// Check if this is a TLS variable or global register.
if (VD->getTLSKind() != VarDecl::TLS_None ||
VD->hasAttr<OMPThreadPrivateDeclAttr>() ||
(VD->getStorageClass() == SC_Register && VD->hasAttr<AsmLabelAttr>() &&
// If the variable is used several times in the allocate directive, the same
// allocator must be used.
if (checkPreviousOMPAllocateAttribute(*this, DSAStack, RefExpr, VD,
AllocatorKind, Allocator))
// OpenMP, 2.11.3 allocate Directive, Restrictions, C / C++
// If a list item has a static storage type, the allocator expression in the
// allocator clause must be a constant expression that evaluates to one of
// the predefined memory allocator values.
if (Allocator && VD->hasGlobalStorage()) {
if (AllocatorKind == OMPAllocateDeclAttr::OMPUserDefinedMemAlloc) {
<< Allocator->getSourceRange();
// Point at the declaration or the definition of the variable, whichever
// this VarDecl is.
bool IsDecl = VD->isThisDeclarationADefinition(Context) ==
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< VD;
applyOMPAllocateAttribute(*this, VD, AllocatorKind, Allocator,
if (Vars.empty())
return nullptr;
if (!Owner)
Owner = getCurLexicalContext();
auto *D = OMPAllocateDecl::Create(Context, Owner, Loc, Vars, Clauses);
return DeclGroupPtrTy::make(DeclGroupRef(D));
/// Semantic action for an OpenMP 'requires' directive at \p Loc with the
/// clauses in \p ClauseList.
///
/// Outside of a file context err_omp_invalid_scope is diagnosed; otherwise
/// the clauses are validated by CheckOMPRequiresDecl. A resulting declaration
/// is returned wrapped in a DeclGroupPtrTy.
Sema::ActOnOpenMPRequiresDirective(SourceLocation Loc,
ArrayRef<OMPClause *> ClauseList) {
OMPRequiresDecl *D = nullptr;
if (!CurContext->isFileContext()) {
Diag(Loc, diag::err_omp_invalid_scope) << "requires";
} else {
D = CheckOMPRequiresDecl(Loc, ClauseList);
if (D) {
return DeclGroupPtrTy::make(DeclGroupRef(D));
/// Validates the clauses of a 'requires' directive and builds its declaration.
///
/// When target regions or an atomic directive have already been recorded,
/// clauses that would affect them are diagnosed with
/// err_omp_directive_before_requires plus a note at each earlier directive.
/// An OMPRequiresDecl is created when DSAStack reports no duplicate requires
/// clause; the trailing statement returns nullptr.
OMPRequiresDecl *Sema::CheckOMPRequiresDecl(SourceLocation Loc,
ArrayRef<OMPClause *> ClauseList) {
/// For target specific clauses, the requires directive cannot be
/// specified after the handling of any of the target regions in the
/// current compilation unit.
ArrayRef<SourceLocation> TargetLocations =
SourceLocation AtomicLoc = DSAStack->getAtomicDirectiveLoc();
if (!TargetLocations.empty() || !AtomicLoc.isInvalid()) {
for (const OMPClause *CNew : ClauseList) {
// Check if any of the requires clauses affect target regions.
if (isa<OMPUnifiedSharedMemoryClause>(CNew) ||
isa<OMPUnifiedAddressClause>(CNew) ||
isa<OMPReverseOffloadClause>(CNew) ||
isa<OMPDynamicAllocatorsClause>(CNew)) {
Diag(Loc, diag::err_omp_directive_before_requires)
<< "target" << getOpenMPClauseName(CNew->getClauseKind());
// Point at every target region that was seen before this directive.
for (SourceLocation TargetLoc : TargetLocations) {
Diag(TargetLoc, diag::note_omp_requires_encountered_directive)
<< "target";
// atomic_default_mem_order must not follow an atomic directive.
} else if (!AtomicLoc.isInvalid() &&
isa<OMPAtomicDefaultMemOrderClause>(CNew)) {
Diag(Loc, diag::err_omp_directive_before_requires)
<< "atomic" << getOpenMPClauseName(CNew->getClauseKind());
Diag(AtomicLoc, diag::note_omp_requires_encountered_directive)
<< "atomic";
if (!DSAStack->hasDuplicateRequiresClause(ClauseList))
return OMPRequiresDecl::Create(Context, getCurLexicalContext(), Loc,
return nullptr;
/// Emits a note that explains where the data-sharing attribute described by
/// \p DVar for declaration \p D comes from.
///
/// - If \p DVar has a referencing expression, note_omp_explicit_dsa is
///   emitted there (presumably the function returns afterwards).
/// - Otherwise a predetermined reason is derived from \p IsLoopIterVar, the
///   directive/clause kinds in \p DVar and the storage properties of \p D,
///   and reported with note_omp_predetermined_dsa.
/// - If no predetermined reason applies and DVar.ImplicitDSALoc is valid,
///   note_omp_implicit_dsa is emitted at that location.
static void reportOriginalDsa(Sema &SemaRef, const DSAStackTy *Stack,
const ValueDecl *D,
const DSAStackTy::DSAVarData &DVar,
bool IsLoopIterVar) {
if (DVar.RefExpr) {
SemaRef.Diag(DVar.RefExpr->getExprLoc(), diag::note_omp_explicit_dsa)
<< getOpenMPClauseName(DVar.CKind);
// Reason selects the wording of note_omp_predetermined_dsa; PDSA_Implicit
// means that no predetermined rule matched.
enum {
} Reason = PDSA_Implicit;
bool ReportHint = false;
auto ReportLoc = D->getLocation();
// VD is null when D is not a variable declaration.
auto *VD = dyn_cast<VarDecl>(D);
if (IsLoopIterVar) {
if (DVar.CKind == OMPC_private)
Reason = PDSA_LoopIterVarPrivate;
else if (DVar.CKind == OMPC_lastprivate)
Reason = PDSA_LoopIterVarLastprivate;
Reason = PDSA_LoopIterVarLinear;
} else if (isOpenMPTaskingDirective(DVar.DKind) &&
DVar.CKind == OMPC_firstprivate) {
Reason = PDSA_TaskVarFirstprivate;
// For task firstprivates the note points at the implicit DSA location
// instead of the declaration.
ReportLoc = DVar.ImplicitDSALoc;
} else if (VD && VD->isStaticLocal())
Reason = PDSA_StaticLocalVarShared;
else if (VD && VD->isStaticDataMember())
Reason = PDSA_StaticMemberShared;
else if (VD && VD->isFileVarDecl())
Reason = PDSA_GlobalVarShared;
else if (D->getType().isConstant(SemaRef.getASTContext()))
Reason = PDSA_ConstVarShared;
else if (VD && VD->isLocalVarDecl() && DVar.CKind == OMPC_private) {
ReportHint = true;
Reason = PDSA_LocalVarPrivate;
if (Reason != PDSA_Implicit) {
SemaRef.Diag(ReportLoc, diag::note_omp_predetermined_dsa)
<< Reason << ReportHint
<< getOpenMPDirectiveName(Stack->getCurrentDirective());
} else if (DVar.ImplicitDSALoc.isValid()) {
SemaRef.Diag(DVar.ImplicitDSALoc, diag::note_omp_implicit_dsa)
<< getOpenMPClauseName(DVar.CKind);
/// Translates the defaultmap implicit-behavior modifier \p M into the map
/// clause kind to use for an implicit map.
///
/// \param IsAggregateOrDeclareTarget  Selects OMPC_MAP_tofrom in the branch
///        guarded by it (see the comment in the body for when it is true).
/// \returns a kind other than OMPC_MAP_unknown (asserted before returning).
static OpenMPMapClauseKind
getMapClauseKindFromModifier(OpenMPDefaultmapClauseModifier M,
bool IsAggregateOrDeclareTarget) {
OpenMPMapClauseKind Kind = OMPC_MAP_unknown;
// NOTE(review): the case labels of this switch are not visible here; the
// assignments below presumably belong to the alloc/to/from/tofrom modifiers.
switch (M) {
Kind = OMPC_MAP_alloc;
Kind = OMPC_MAP_to;
Kind = OMPC_MAP_from;
Kind = OMPC_MAP_tofrom;
llvm_unreachable("Unexpected defaultmap implicit behavior");
// IsAggregateOrDeclareTarget could be true if:
// 1. the implicit behavior for aggregate is tofrom
// 2. it's a declare target link
if (IsAggregateOrDeclareTarget) {
Kind = OMPC_MAP_tofrom;
llvm_unreachable("Unexpected defaultmap implicit behavior");
assert(Kind != OMPC_MAP_unknown && "Expect map kind to be known");
return Kind;
namespace {
/// Statement visitor that walks the statements of an OpenMP region and
/// analyses every referenced variable / member for its data-sharing
/// attribute.
///
/// The results are exposed through the accessors at the end of the class:
/// expressions for implicit firstprivate and map clauses, the variables whose
/// data-sharing attribute should have been given explicitly, and a flag that
/// tells whether an error was diagnosed during the walk.
class DSAAttrChecker final : public StmtVisitor<DSAAttrChecker, void> {
// DSA stack queried for the current directive and per-declaration data.
DSAStackTy *Stack;
Sema &SemaRef;
// Set once an error has been diagnosed by one of the visit methods.
bool ErrorFound = false;
// Mode flag that is temporarily raised by VisitSubCaptures and tested in
// VisitDeclRefExpr / VisitMemberExpr.
bool TryCaptureCXXThisMembers = false;
// Captured statement of the analysed region; may be null.
CapturedStmt *CS = nullptr;
llvm::SmallVector<Expr *, 4> ImplicitFirstprivate;
// NOTE(review): the array is sized by the map-type enumerator
// OMPC_MAP_delete, while getImplicitMap() indexes it with an
// OpenMPDefaultmapClauseKind - confirm that every index stays in bounds.
llvm::SmallVector<Expr *, 4> ImplicitMap[OMPC_MAP_delete];
Sema::VarsWithInheritedDSAType VarsWithInheritedDSA;
// Declarations that were already analysed; used to process each one once.
llvm::SmallDenseSet<const ValueDecl *, 4> ImplicitDeclarations;
/// Analyses the captures of the statement associated with directive \p S.
/// (Distinct from visitSubCaptures(CapturedStmt *) below, which differs only
/// in the case of its first letter.)
void VisitSubCaptures(OMPExecutableDirective *S) {
// Check implicitly captured variables.
if (!S->hasAssociatedStmt() || !S->getAssociatedStmt())
// Try to capture inner this->member references to generate correct mappings
// and diagnostics.
if (TryCaptureCXXThisMembers ||
(isOpenMPTargetExecutionDirective(Stack->getCurrentDirective()) &&
[](const CapturedStmt::Capture &C) {
return C.capturesThis();
}))) {
// Raise the flag only for the nested walk and restore it afterwards.
bool SavedTryCaptureCXXThisMembers = TryCaptureCXXThisMembers;
TryCaptureCXXThisMembers = true;
TryCaptureCXXThisMembers = SavedTryCaptureCXXThisMembers;
// In tasks firstprivates are not captured anymore, need to analyze them
// explicitly.
if (isOpenMPTaskingDirective(S->getDirectiveKind()) &&
!isOpenMPTaskLoopDirective(S->getDirectiveKind())) {
for (OMPClause *C : S->clauses())
if (auto *FC = dyn_cast<OMPFirstprivateClause>(C)) {
for (Expr *Ref : FC->varlists())
/// Analyses a reference to a declaration. Only references to VarDecls are
/// examined further.
void VisitDeclRefExpr(DeclRefExpr *E) {
if (TryCaptureCXXThisMembers || E->isTypeDependent() ||
E->isValueDependent() || E->containsUnexpandedParameterPack() ||
if (auto *VD = dyn_cast<VarDecl>(E->getDecl())) {
// Check the datasharing rules for the expressions in the clauses.
if (!CS) {
if (auto *CED = dyn_cast<OMPCapturedExprDecl>(VD))
if (!CED->hasAttr<OMPCaptureNoInitAttr>()) {
} else if (VD->isImplicit() || isa<OMPCapturedExprDecl>(VD))
// Do not analyze internal variables and do not enclose them into
// implicit clauses.
VD = VD->getCanonicalDecl();
// Skip internally declared variables.
if (VD->hasLocalStorage() && CS && !CS->capturesVariable(VD) &&
// Skip allocators in uses_allocators clauses.
if (Stack->isUsesAllocatorsDecl(VD).hasValue())
DSAStackTy::DSAVarData DVar = Stack->getTopDSA(VD, /*FromParent=*/false);
// Check if the variable has explicit DSA set and stop analysis if it is so.
if (DVar.RefExpr || !ImplicitDeclarations.insert(VD).second)
// Skip internally declared static variables.
llvm::Optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
if (VD->hasGlobalStorage() && CS && !CS->capturesVariable(VD) &&
(Stack->hasRequiresDeclWithClause<OMPUnifiedSharedMemoryClause>() ||
!Res || *Res != OMPDeclareTargetDeclAttr::MT_Link) &&
SourceLocation ELoc = E->getExprLoc();
OpenMPDirectiveKind DKind = Stack->getCurrentDirective();
// The default(none) clause requires that each variable that is referenced
// in the construct, and does not have a predetermined data-sharing
// attribute, must have its data-sharing attribute explicitly determined
// by being listed in a data-sharing attribute clause.
if (DVar.CKind == OMPC_unknown &&
(Stack->getDefaultDSA() == DSA_none ||
Stack->getDefaultDSA() == DSA_firstprivate) &&
isImplicitOrExplicitTaskingRegion(DKind) &&
VarsWithInheritedDSA.count(VD) == 0) {
bool InheritedDSA = Stack->getDefaultDSA() == DSA_none;
// With default(firstprivate) the variable is recorded only when no
// implicit DSA can be determined for it.
if (!InheritedDSA && Stack->getDefaultDSA() == DSA_firstprivate) {
DSAStackTy::DSAVarData DVar =
Stack->getImplicitDSA(VD, /*FromParent=*/false);
InheritedDSA = DVar.CKind == OMPC_unknown;
if (InheritedDSA)
VarsWithInheritedDSA[VD] = E;
// OpenMP 5.0 [, defaultmap clause, Description]
// If implicit-behavior is none, each variable referenced in the
// construct that does not have a predetermined data-sharing attribute
// and does not appear in a to or link clause on a declare target
// directive must be listed in a data-mapping attribute clause, a
// data-sharing attribute clause (including a data-sharing attribute
// clause on a combined construct where target is one of the
// constituent constructs), or an is_device_ptr clause.
OpenMPDefaultmapClauseKind ClauseKind =
getVariableCategoryFromDecl(SemaRef.getLangOpts(), VD);
if (SemaRef.getLangOpts().OpenMP >= 50) {
bool IsModifierNone = Stack->getDefaultmapModifier(ClauseKind) ==
if (DVar.CKind == OMPC_unknown && IsModifierNone &&
VarsWithInheritedDSA.count(VD) == 0 && !Res) {
// Only check for data-mapping attribute and is_device_ptr here
// since we have already made sure that the declaration does not
// have a data-sharing attribute above
if (!Stack->checkMappableExprComponentListsForDecl(
VD, /*CurrentRegionOnly=*/true,
OpenMPClauseKind) {
auto MI = MapExprComponents.rbegin();
auto ME = MapExprComponents.rend();
return MI != ME && MI->getAssociatedDeclaration() == VD;
})) {
VarsWithInheritedDSA[VD] = E;
if (isOpenMPTargetExecutionDirective(DKind) &&
!Stack->isLoopControlVariable(VD).first) {
if (!Stack->checkMappableExprComponentListsForDecl(
VD, /*CurrentRegionOnly=*/true,
OpenMPClauseKind) {
// Variable is used if it has been marked as an array, array
// section, array shaping or the variable itself.
return StackComponents.size() == 1 ||
[](const OMPClauseMappableExprCommon::
MappableComponent &MC) {
return MC.getAssociatedDeclaration() ==
nullptr &&
MC.getAssociatedExpression()) ||
MC.getAssociatedExpression()) ||
})) {
bool IsFirstprivate = false;
// By default lambdas are captured as firstprivates.
if (const auto *RD =
IsFirstprivate = RD->isLambda();
IsFirstprivate =
IsFirstprivate || (Stack->mustBeFirstprivate(ClauseKind) && !Res);
if (IsFirstprivate) {
} else {
OpenMPDefaultmapClauseModifier M =
OpenMPMapClauseKind Kind = getMapClauseKindFromModifier(
M, ClauseKind == OMPC_DEFAULTMAP_aggregate || Res);
// OpenMP [, Restrictions, p.2]
// A list item that appears in a reduction clause of the innermost
// enclosing worksharing or parallel construct may not be accessed in an
// explicit task.
DVar = Stack->hasInnermostDSA(
VD, [](OpenMPClauseKind C) { return C == OMPC_reduction; },
[](OpenMPDirectiveKind K) {
return isOpenMPParallelDirective(K) ||
isOpenMPWorksharingDirective(K) || isOpenMPTeamsDirective(K);
if (isOpenMPTaskingDirective(DKind) && DVar.CKind == OMPC_reduction) {
ErrorFound = true;
SemaRef.Diag(ELoc, diag::err_omp_reduction_in_task);
reportOriginalDsa(SemaRef, Stack, VD, DVar);
// Define implicit data-sharing attributes for task.
DVar = Stack->getImplicitDSA(VD, /*FromParent=*/false);
if (((isOpenMPTaskingDirective(DKind) && DVar.CKind != OMPC_shared) ||
(Stack->getDefaultDSA() == DSA_firstprivate &&
DVar.CKind == OMPC_firstprivate && !DVar.RefExpr)) &&
!Stack->isLoopControlVariable(VD).first) {
// Store implicitly used globals with declare target link for parent
// target.
if (!isOpenMPTargetExecutionDirective(DKind) && Res &&
*Res == OMPDeclareTargetDeclAttr::MT_Link) {
/// Analyses a member access. Accesses whose base is 'this' (ignoring
/// parentheses and casts) and that name a field get the data-sharing /
/// mapping analysis; the final branch handles the remaining case when
/// TryCaptureCXXThisMembers is not set.
void VisitMemberExpr(MemberExpr *E) {
if (E->isTypeDependent() || E->isValueDependent() ||
E->containsUnexpandedParameterPack() || E->isInstantiationDependent())
// FD is null when the member is not a field.
auto *FD = dyn_cast<FieldDecl>(E->getMemberDecl());
OpenMPDirectiveKind DKind = Stack->getCurrentDirective();
if (auto *TE = dyn_cast<CXXThisExpr>(E->getBase()->IgnoreParenCasts())) {
if (!FD)
DSAStackTy::DSAVarData DVar = Stack->getTopDSA(FD, /*FromParent=*/false);
// Check if the variable has explicit DSA set and stop analysis if it
// is so.
if (DVar.RefExpr || !ImplicitDeclarations.insert(FD).second)
if (isOpenMPTargetExecutionDirective(DKind) &&
!Stack->isLoopControlVariable(FD).first &&
FD, /*CurrentRegionOnly=*/true,
OpenMPClauseKind) {
return isa<CXXThisExpr>(
})) {
// OpenMP 4.5 [, map Clause, Restrictions, C/C++, p.3]
// A bit-field cannot appear in a map clause.
if (FD->isBitField())
// Check to see if the member expression is referencing a class that
// has already been explicitly mapped
if (Stack->isClassPreviouslyMapped(TE->getType()))
OpenMPDefaultmapClauseModifier Modifier =
OpenMPMapClauseKind Kind = getMapClauseKindFromModifier(
Modifier, /*IsAggregateOrDeclareTarget*/ true);
SourceLocation ELoc = E->getExprLoc();
// OpenMP [, Restrictions, p.2]
// A list item that appears in a reduction clause of the innermost
// enclosing worksharing or parallel construct may not be accessed in
// an explicit task.
DVar = Stack->hasInnermostDSA(
FD, [](OpenMPClauseKind C) { return C == OMPC_reduction; },
[](OpenMPDirectiveKind K) {
return isOpenMPParallelDirective(K) ||
isOpenMPWorksharingDirective(K) || isOpenMPTeamsDirective(K);
if (isOpenMPTaskingDirective(DKind) && DVar.CKind == OMPC_reduction) {
ErrorFound = true;
SemaRef.Diag(ELoc, diag::err_omp_reduction_in_task);
reportOriginalDsa(SemaRef, Stack, FD, DVar);
// Define implicit data-sharing attributes for task.
DVar = Stack->getImplicitDSA(FD, /*FromParent=*/false);
if (isOpenMPTaskingDirective(DKind) && DVar.CKind != OMPC_shared &&
!Stack->isLoopControlVariable(FD).first) {
// Check if there is a captured expression for the current field in the
// region. Do not mark it as firstprivate unless there is no captured
// expression.
// TODO: try to make it firstprivate.
if (DVar.CKind != OMPC_unknown)
if (isOpenMPTargetExecutionDirective(DKind)) {
OMPClauseMappableExprCommon::MappableExprComponentList CurComponents;
if (!checkMapClauseExpressionBase(SemaRef, E, CurComponents, OMPC_map,
const auto *VD = cast<ValueDecl>(
if (!Stack->checkMappableExprComponentListsForDecl(
VD, /*CurrentRegionOnly=*/true,
OpenMPClauseKind) {
// Compare the current component list with a recorded one, walking
// both from their last element backwards.
auto CCI = CurComponents.rbegin();
auto CCE = CurComponents.rend();
for (const auto &SC : llvm::reverse(StackComponents)) {
// Do both expressions have the same kind?
if (CCI->getAssociatedExpression()->getStmtClass() !=
if (!((isa<OMPArraySectionExpr>(
SC.getAssociatedExpression()) ||
SC.getAssociatedExpression())) &&
return false;
// Declarations are compared through their canonical declaration;
// a missing declaration is represented by nullptr on both sides.
const Decl *CCD = CCI->getAssociatedDeclaration();
const Decl *SCD = SC.getAssociatedDeclaration();
CCD = CCD ? CCD->getCanonicalDecl() : nullptr;
SCD = SCD ? SCD->getCanonicalDecl() : nullptr;
if (SCD != CCD)
return false;
std::advance(CCI, 1);
if (CCI == CCE)
return true;
})) {
} else if (!TryCaptureCXXThisMembers) {
/// Visits the children of every clause of \p S except implicit
/// firstprivate and implicit map clauses.
void VisitOMPExecutableDirective(OMPExecutableDirective *S) {
for (OMPClause *C : S->clauses()) {
// Skip analysis of arguments of implicitly defined firstprivate clause
// for task|target directives.
// Skip analysis of arguments of implicitly defined map clause for target
// directives.
if (C && !((isa<OMPFirstprivateClause>(C) || isa<OMPMapClause>(C)) &&
C->isImplicit())) {
for (Stmt *CC : C->children()) {
if (CC)
// Check implicitly captured variables.
/// Fallback for all other statements: iterates over the non-null children.
void VisitStmt(Stmt *S) {
for (Stmt *C : S->children()) {
if (C) {
// Check implicitly captured variables in the task-based directives to
// check if they must be firstprivatized.
void visitSubCaptures(CapturedStmt *S) {
for (const CapturedStmt::Capture &Cap : S->captures()) {
// Only captures of variables (by reference or by copy) are of interest.
if (!Cap.capturesVariable() && !Cap.capturesVariableByCopy())
VarDecl *VD = Cap.getCapturedVar();
// Do not try to map the variable if it or its sub-component was mapped
// already.
if (isOpenMPTargetExecutionDirective(Stack->getCurrentDirective()) &&
VD, /*CurrentRegionOnly=*/true,
OpenMPClauseKind) { return true; }))
// Build a reference to the captured variable located at the capture.
DeclRefExpr *DRE = buildDeclRefExpr(
SemaRef, VD, VD->getType().getNonLValueExprType(SemaRef.Context),
Cap.getLocation(), /*RefersToCapture=*/true);
/// Returns true if an error was diagnosed during the analysis.
bool isErrorFound() const { return ErrorFound; }
/// Returns the expressions collected for the implicit firstprivate clause.
ArrayRef<Expr *> getImplicitFirstprivate() const {
return ImplicitFirstprivate;
/// Returns the expressions stored in the ImplicitMap slot selected by
/// \p Kind.
ArrayRef<Expr *> getImplicitMap(OpenMPDefaultmapClauseKind Kind) const {
return ImplicitMap[Kind];
/// Returns the variables recorded as lacking a required explicit
/// data-sharing attribute, keyed by declaration.
const Sema::VarsWithInheritedDSAType &getVarsWithInheritedDSA() const {
return VarsWithInheritedDSA;
/// Creates a checker that works on DSA stack \p S and captured statement
/// \p CS (which may be null).
DSAAttrChecker(DSAStackTy *S, Sema &SemaRef, CapturedStmt *CS)
: Stack(S), SemaRef(SemaRef), ErrorFound(false), CS(CS) {
// Process declare target link variables for the target directives.
if (isOpenMPTargetExecutionDirective(S->getCurrentDirective())) {
for (DeclRefExpr *E : Stack->getLinkGlobals())
} // namespace
/// Opens the captured region(s) required for the OpenMP directive \p DKind.
///
/// For every capture level of the directive an array of implicit parameters
/// (name/type pairs, terminated by an empty-name/null-type entry that stands
/// for the context with the shared variables) is built and passed to
/// ActOnCapturedRegionStart together with the construct location,
/// \p CurScope and CR_OpenMP. Directive kinds listed in the last case groups
/// end in llvm_unreachable.
void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
switch (DKind) {
// Parallel- and teams-like directives: one region with the two tid
// parameters.
case OMPD_parallel:
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_parallel_sections:
case OMPD_parallel_master:
case OMPD_teams:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType KmpInt32PtrTy =
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
// Combined target directives: capture level 0 uses the task-style parameter
// list, level 1 is the 'target' region, level 2 the 'teams'/'parallel'
// region.
case OMPD_target_teams:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType KmpInt32PtrTy =
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params, /*OpenMPCaptureLevel=*/0);
// Mark this captured region as inlined, because we don't use outlined
// function directly.
Context, {}, AttributeCommonInfo::AS_Keyword,
Sema::CapturedParamNameType ParamsTarget[] = {
std::make_pair(StringRef(), QualType()) // __context with shared vars
// Start a captured region for 'target' with no implicit parameters.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsTarget, /*OpenMPCaptureLevel=*/1);
Sema::CapturedParamNameType ParamsTeamsOrParallel[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
// Start a captured region for 'teams' or 'parallel'. Both regions have
// the same implicit parameters.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsTeamsOrParallel, /*OpenMPCaptureLevel=*/2);
// Plain target directives: task-style region at level 0 followed by a
// second region.
case OMPD_target:
case OMPD_target_simd: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType KmpInt32PtrTy =
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params, /*OpenMPCaptureLevel=*/0);
// Mark this captured region as inlined, because we don't use outlined
// function directly.
Context, {}, AttributeCommonInfo::AS_Keyword,
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
std::make_pair(StringRef(), QualType()),
// Directives whose single region has no implicit parameters besides the
// context.
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_distribute_simd:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_target_data: {
Sema::CapturedParamNameType Params[] = {
std::make_pair(StringRef(), QualType()) // __context with shared vars
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
// 'task': one region with the task-style parameter list.
case OMPD_task: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType KmpInt32PtrTy =
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
// Mark this captured region as inlined, because we don't use outlined
// function directly.
Context, {}, AttributeCommonInfo::AS_Keyword,
// Taskloop directives: the task-style list is extended by the .lb./.ub./
// .st./.liter./.reductions. parameters.
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd: {
QualType KmpInt32Ty =
Context.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1)
QualType KmpUInt64Ty =
Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0)
QualType KmpInt64Ty =
Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1)
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType KmpInt32PtrTy =
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(".lb.", KmpUInt64Ty),
std::make_pair(".ub.", KmpUInt64Ty),
std::make_pair(".st.", KmpInt64Ty),
std::make_pair(".liter.", KmpInt32Ty),
std::make_pair(".reductions.", VoidPtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
// Mark this captured region as inlined, because we don't use outlined
// function directly.
Context, {}, AttributeCommonInfo::AS_Keyword,
// Parallel master taskloop directives: 'parallel' region at level 0, the
// taskloop region at level 1.
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd: {
QualType KmpInt32Ty =
Context.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1)
QualType KmpUInt64Ty =
Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0)
QualType KmpInt64Ty =
Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1)
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType KmpInt32PtrTy =
Sema::CapturedParamNameType ParamsParallel[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
// Start a captured region for 'parallel'.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsParallel, /*OpenMPCaptureLevel=*/0);
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(".lb.", KmpUInt64Ty),
std::make_pair(".ub.", KmpUInt64Ty),
std::make_pair(".st.", KmpInt64Ty),
std::make_pair(".liter.", KmpInt32Ty),
std::make_pair(".reductions.", VoidPtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params, /*OpenMPCaptureLevel=*/1);
// Mark this captured region as inlined, because we don't use outlined
// function directly.
Context, {}, AttributeCommonInfo::AS_Keyword,
// distribute parallel for: the tid parameters plus two size-typed entries.
case OMPD_distribute_parallel_for_simd:
case OMPD_distribute_parallel_for: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType KmpInt32PtrTy =
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
// NOTE(review): this entry has an empty name while its sibling below is
// ".previous.ub." - confirm the intended parameter name.
std::make_pair("", Context.getSizeType().withConst()),
std::make_pair(".previous.ub.", Context.getSizeType().withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
// target teams distribute parallel for: four capture levels (task-style,
// 'target', 'teams', 'parallel').
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType KmpInt32PtrTy =
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
Params, /*OpenMPCaptureLevel=*/0);
// Mark this captured region as inlined, because we don't use outlined
// function directly.
Context, {}, AttributeCommonInfo::AS_Keyword,
Sema::CapturedParamNameType ParamsTarget[] = {
std::make_pair(StringRef(), QualType()) // __context with shared vars
// Start a captured region for 'target' with no implicit parameters.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsTarget, /*OpenMPCaptureLevel=*/1);
Sema::CapturedParamNameType ParamsTeams[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
// Start a captured region for 'teams' with the two tid parameters.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsTeams, /*OpenMPCaptureLevel=*/2);
Sema::CapturedParamNameType ParamsParallel[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
// NOTE(review): this entry has an empty name while its sibling below is
// ".previous.ub." - confirm the intended parameter name.
std::make_pair("", Context.getSizeType().withConst()),
std::make_pair(".previous.ub.", Context.getSizeType().withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
// Start a captured region for 'parallel'; besides the tid parameters its
// list carries two size-typed entries.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsParallel, /*OpenMPCaptureLevel=*/3);
// teams distribute parallel for: 'teams' region at level 0, 'parallel'
// region at level 1.
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType KmpInt32PtrTy =
Sema::CapturedParamNameType ParamsTeams[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
std::make_pair(StringRef(), QualType()) // __context with shared vars
// Start a captured region for 'teams' with the two tid parameters.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsTeams, /*OpenMPCaptureLevel=*/0);
Sema::CapturedParamNameType ParamsParallel[] = {
std::make_pair(".global_tid.", KmpInt32PtrTy),
std::make_pair(".bound_tid.", KmpInt32PtrTy),
// NOTE(review): this entry has an empty name while its sibling below is
// ".previous.ub." - confirm the intended parameter name.
std::make_pair("", Context.getSizeType().withConst()),
std::make_pair(".previous.ub.", Context.getSizeType().withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
// Start a captured region for 'parallel'; besides the tid parameters its
// list carries two size-typed entries.
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
ParamsParallel, /*OpenMPCaptureLevel=*/1);
// Stand-alone target data-movement directives: one task-style region.
case OMPD_target_update:
case OMPD_target_enter_data:
case OMPD_target_exit_data: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict();
QualType KmpInt32PtrTy =
QualType Args[] = {VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
EPI.Variadic = true;
QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI);
Sema::CapturedParamNameType Params[] = {
std::make_pair(".global_tid.", KmpInt32Ty),
std::make_pair(".part_id.", KmpInt32PtrTy),
std::make_pair(".privates.", VoidPtrTy),
std::make_pair(".task_t.", Context.VoidPtrTy.withConst()),
std::make_pair(StringRef(), QualType()) // __context with shared vars
ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
// Mark this captured region as inlined, because we don't use outlined
// function directly.
Context, {}, AttributeCommonInfo::AS_Keyword,
// Directives for which no captured region may be started.
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_cancel:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_requires:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
llvm_unreachable("OpenMP Directive is not allowed");
case OMPD_unknown:
llvm_unreachable("Unknown OpenMP directive");
/// Returns the number of capture levels of the directive that the
/// data-sharing stack reports for \p Level.
int Sema::getNumberOfConstructScopes(unsigned Level) const {
return getOpenMPCaptureLevels(DSAStack->getDirective(Level));
/// Returns how many capture regions getOpenMPCaptureRegions() reports for
/// directive \p DKind.
int Sema::getOpenMPCaptureLevels(OpenMPDirectiveKind DKind) {
SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, DKind);
// The size_t count is narrowed to the int return type.
return CaptureRegions.size();
/// Creates an OMPCapturedExprDecl named \p Id in the current context that
/// captures \p CaptureExpr.
///
/// \param AsExpression when true the expression is used as written; otherwise
///        its implicit casts are stripped before it becomes the initializer.
/// \returns the new declaration, or nullptr if taking the address of the
///          initializer fails on the non-C++ path.
static OMPCapturedExprDecl *buildCaptureDecl(Sema &S, IdentifierInfo *Id,
Expr *CaptureExpr, bool WithInit,
bool AsExpression) {
ASTContext &C = S.getASTContext();
Expr *Init = AsExpression ? CaptureExpr : CaptureExpr->IgnoreImpCasts();
QualType Ty = Init->getType();
// An ordinary glvalue is captured by reference in C++ and through a pointer
// (initialized with the address of the expression) otherwise.
if (CaptureExpr->getObjectKind() == OK_Ordinary && CaptureExpr->isGLValue()) {
if (S.getLangOpts().CPlusPlus) {
Ty = C.getLValueReferenceType(Ty);
} else {
Ty = C.getPointerType(Ty);
ExprResult Res =
S.CreateBuiltinUnaryOp(CaptureExpr->getExprLoc(), UO_AddrOf, Init);
if (!Res.isUsable())
return nullptr;
Init = Res.get();
WithInit = true;
auto *CED = OMPCapturedExprDecl::Create(C, S.CurContext, Id, Ty,
if (!WithInit)
// Attach the initializer using copy-initialization (DirectInit is false).
S.AddInitializerToDecl(CED, Init, /*DirectInit=*/false);
return CED;
/// Builds a reference to the captured-expression declaration associated with
/// \p D. The reference is typed with the declaration's non-reference type.
static DeclRefExpr *buildCapture(Sema &S, ValueDecl *D, Expr *CaptureExpr,
bool WithInit) {
OMPCapturedExprDecl *CD;
// A declaration that isOpenMPCapturedDecl() already knows about is used
// directly.
if (VarDecl *VD = S.isOpenMPCapturedDecl(D))
CD = cast<OMPCapturedExprDecl>(VD);
// A new capture declaration is named after D's identifier.
CD = buildCaptureDecl(S, D->getIdentifier(), CaptureExpr, WithInit,
return buildDeclRefExpr(S, CD, CD->getType().getNonReferenceType(),
/// Captures \p CaptureExpr (after default lvalue conversion) and returns an
/// expression that reads the captured value.
///
/// \param Ref in/out: when null, a declaration named ".capture_expr." is
///        created and \p Ref is set to a reference to it; a non-null \p Ref
///        is used as is.
/// \returns the lvalue-converted result, or ExprError() if the dereference
///          built on the non-C++ path is unusable.
static ExprResult buildCapture(Sema &S, Expr *CaptureExpr, DeclRefExpr *&Ref) {
CaptureExpr = S.DefaultLvalueConversion(CaptureExpr).get();
if (!Ref) {
OMPCapturedExprDecl *CD = buildCaptureDecl(
S, &S.getASTContext().Idents.get(".capture_expr."), CaptureExpr,
/*WithInit=*/true, /*AsExpression=*/true);
Ref = buildDeclRefExpr(S, CD, CD->getType().getNonReferenceType(),
ExprResult Res = Ref;
// Outside C++, an ordinary glvalue whose reference has pointer type is
// dereferenced before use.
if (!S.getLangOpts().CPlusPlus &&
CaptureExpr->getObjectKind() == OK_Ordinary && CaptureExpr->isGLValue() &&
Ref->getType()->isPointerType()) {
Res = S.CreateBuiltinUnaryOp(CaptureExpr->getExprLoc(), UO_Deref, Ref);
if (!Res.isUsable())
return ExprError();
return S.DefaultLvalueConversion(Res.get());
namespace {
// OpenMP directives parsed in this section are represented as a
// CapturedStatement with an associated statement. If a syntax error
// is detected during the parsing of the associated statement, the
// compiler must abort processing and close the CapturedStatement.
// Combined directives such as 'target parallel' have more than one
// nested CapturedStatements. This RAII ensures that we unwind out
// of all the nested CapturedStatements when an error is found.
class CaptureRegionUnwinderRAII {
Sema &S;
// Held by reference, so the value tested in the destructor is whatever the
// owner has stored by the time this object goes out of scope.
bool &ErrorFound;
// Directive whose capture-level count bounds the loop in the destructor.
OpenMPDirectiveKind DKind = OMPD_unknown;
CaptureRegionUnwinderRAII(Sema &S, bool &ErrorFound,
OpenMPDirectiveKind DKind)
: S(S), ErrorFound(ErrorFound), DKind(DKind) {}
~CaptureRegionUnwinderRAII() {
// Acts only when an error was flagged: one loop iteration per capture
// level of DKind.
if (ErrorFound) {
int ThisCaptureLevel = S.getOpenMPCaptureLevels(DKind);
while (--ThisCaptureLevel >= 0)
} // namespace
/// Inspects the captures of \p V when its canonical type is a lambda class.
/// Nothing is done in a dependent context.
void Sema::tryCaptureOpenMPLambdas(ValueDecl *V) {
// Capture variables captured by reference in lambdas for target-based
// directives.
if (!CurContext->isDependentContext() &&
(isOpenMPTargetExecutionDirective(DSAStack->getCurrentDirective()) ||
DSAStack->getCurrentDirective()))) {
QualType Type = V->getType();
if (const auto *RD = Type.getCanonicalType()
->getAsCXXRecordDecl()) {
bool SavedForceCaptureByReferenceInTargetExecutable =
if (RD->isLambda()) {
// Collect the lambda's capture fields, including the field that holds
// 'this' (if any).
llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
FieldDecl *ThisCapture;
RD->getCaptureFields(Captures, ThisCapture);
for (const LambdaCapture &LC : RD->captures()) {
if (LC.getCaptureKind() == LCK_ByRef) {
// By-reference capture: look at where the captured variable is declared
// relative to the current context.
VarDecl *VD = LC.getCapturedVar();
DeclContext *VDC = VD->getDeclContext();
if (!VDC->Encloses(CurContext))
MarkVariableReferenced(LC.getLocation(), VD);
} else if (LC.getCaptureKind() == LCK_This) {
// 'this' capture: considered only when the current 'this' type is known
// and compatible with the type of the lambda's 'this' field.
QualType ThisTy = getCurrentThisType();
if (!ThisTy.isNull() &&
Context.typesAreCompatible(ThisTy, ThisCapture->getType()))
/// Looks in \p Clauses for an 'ordered' clause appearing together with an
/// 'order' clause of kind 'concurrent'.
///
/// \returns true (after reporting, including a note at the ordered clause)
///          when both are present, false otherwise.
static bool checkOrderedOrderSpecified(Sema &S,
const ArrayRef<OMPClause *> Clauses) {
const OMPOrderedClause *Ordered = nullptr;
const OMPOrderClause *Order = nullptr;
for (const OMPClause *Clause : Clauses) {
if (Clause->getClauseKind() == OMPC_ordered)
Ordered = cast<OMPOrderedClause>(Clause);
else if (Clause->getClauseKind() == OMPC_order) {
Order = cast<OMPOrderClause>(Clause);
// Only order(concurrent) is of interest; any other kind is forgotten.
if (Order->getKind() != OMPC_ORDER_concurrent)
Order = nullptr;
if (Ordered && Order)
if (Ordered && Order) {
<< getOpenMPClauseName(OMPC_order)
<< getOpenMPSimpleClauseTypeName(OMPC_order, OMPC_ORDER_concurrent)
<< SourceRange(Order->getBeginLoc(), Order->getEndLoc());
// Point at the conflicting ordered clause.
S.Diag(Ordered->getBeginLoc(), diag::note_omp_ordered_param)
<< 0 << SourceRange(Ordered->getBeginLoc(), Ordered->getEndLoc());
return true;
return false;
/// Finishes the captured region(s) of the current OpenMP directive.
///
/// Walks \p Clauses, runs the cross-clause checks below (schedule modifier
/// 'nonmonotonic' vs. 'ordered', order(concurrent) vs. 'ordered', 'linear' vs.
/// 'ordered' with a loop count, 'ordered' with a loop count on a worksharing
/// simd directive), then processes each capture region of the directive.
///
/// \returns StmtError() if \p S is unusable or any check failed; otherwise the
///          statement produced by ActOnCapturedRegionEnd().
StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S,
ArrayRef<OMPClause *> Clauses) {
bool ErrorFound = false;
// The unwinder sees ErrorFound by reference and consults it when this
// function returns.
CaptureRegionUnwinderRAII CaptureRegionUnwinder(
*this, ErrorFound, DSAStack->getCurrentDirective());
if (!S.isUsable()) {
ErrorFound = true;
return StmtError();
SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, DSAStack->getCurrentDirective());
// Clauses remembered for the cross-clause checks after the loop.
OMPOrderedClause *OC = nullptr;
OMPScheduleClause *SC = nullptr;
SmallVector<const OMPLinearClause *, 4> LCs;
SmallVector<const OMPClauseWithPreInit *, 4> PICs;
// This is required for proper codegen.
for (OMPClause *Clause : Clauses) {
if (!LangOpts.OpenMPSimd &&
isOpenMPTaskingDirective(DSAStack->getCurrentDirective()) &&
Clause->getClauseKind() == OMPC_in_reduction) {
// Capture taskgroup task_reduction descriptors inside the tasking regions
// with the corresponding in_reduction items.
auto *IRC = cast<OMPInReductionClause>(Clause);
for (Expr *E : IRC->taskgroup_descriptors())
if (E)
// Privatizing clauses, copyprivate, and (when TLS is both requested and
// supported by the target) copyin.
if (isOpenMPPrivate(Clause->getClauseKind()) ||
Clause->getClauseKind() == OMPC_copyprivate ||
(getLangOpts().OpenMPUseTLS &&
getASTContext().getTargetInfo().isTLSSupported() &&
Clause->getClauseKind() == OMPC_copyin)) {
// Forced capturing is switched on for copyin only.
DSAStack->setForceVarCapturing(Clause->getClauseKind() == OMPC_copyin);
// Mark all variables in private list clauses as used in inner region.
for (Stmt *VarRef : Clause->children()) {
if (auto *E = cast_or_null<Expr>(VarRef)) {
} else if (CaptureRegions.size() > 1 ||
CaptureRegions.back() != OMPD_unknown) {
if (auto *C = OMPClauseWithPreInit::get(Clause))
if (auto *C = OMPClauseWithPostUpdate::get(Clause)) {
if (Expr *E = C->getPostUpdateExpr())
if (Clause->getClauseKind() == OMPC_schedule)
SC = cast<OMPScheduleClause>(Clause);
else if (Clause->getClauseKind() == OMPC_ordered)
OC = cast<OMPOrderedClause>(Clause);
else if (Clause->getClauseKind() == OMPC_linear)
// Capture allocator expressions if used.
for (Expr *E : DSAStack->getInnerAllocators())
// OpenMP, 2.7.1 Loop Construct, Restrictions
// The nonmonotonic modifier cannot be specified if an ordered clause is
// specified.
if (SC &&
(SC->getFirstScheduleModifier() == OMPC_SCHEDULE_MODIFIER_nonmonotonic ||
SC->getSecondScheduleModifier() ==
OC) {
// Report at whichever of the two modifier positions holds 'nonmonotonic'.
Diag(SC->getFirstScheduleModifier() == OMPC_SCHEDULE_MODIFIER_nonmonotonic
? SC->getFirstScheduleModifierLoc()
: SC->getSecondScheduleModifierLoc(),
<< getOpenMPClauseName(OMPC_schedule)
<< getOpenMPSimpleClauseTypeName(OMPC_schedule,
<< SourceRange(OC->getBeginLoc(), OC->getEndLoc());
ErrorFound = true;
// OpenMP 5.0, 2.9.2 Worksharing-Loop Construct, Restrictions.
// If an order(concurrent) clause is present, an ordered clause may not appear
// on the same directive.
if (checkOrderedOrderSpecified(*this, Clauses))
ErrorFound = true;
// Every linear clause is diagnosed when an ordered clause with a non-zero
// loop count is present.
if (!LCs.empty() && OC && OC->getNumForLoops()) {
for (const OMPLinearClause *C : LCs) {
Diag(C->getBeginLoc(), diag::err_omp_linear_ordered)
<< SourceRange(OC->getBeginLoc(), OC->getEndLoc());
ErrorFound = true;
// An ordered clause with a loop count is diagnosed on a directive that is
// both a worksharing and a simd directive.
if (isOpenMPWorksharingDirective(DSAStack->getCurrentDirective()) &&
isOpenMPSimdDirective(DSAStack->getCurrentDirective()) && OC &&
OC->getNumForLoops()) {
Diag(OC->getBeginLoc(), diag::err_omp_ordered_simd)
<< getOpenMPDirectiveName(DSAStack->getCurrentDirective());
ErrorFound = true;
if (ErrorFound) {
return StmtError();
StmtResult SR = S;
unsigned CompletedRegions = 0;
// Regions are visited in the reverse of the order getOpenMPCaptureRegions()
// produced them.
for (OpenMPDirectiveKind ThisCaptureRegion : llvm::reverse(CaptureRegions)) {
// Mark all variables in private list clauses as used in inner region.
// Required for proper codegen of combined directives.
// TODO: add processing for other clauses.
if (ThisCaptureRegion != OMPD_unknown) {
for (const clang::OMPClauseWithPreInit *C : PICs) {
OpenMPDirectiveKind CaptureRegion = C->getCaptureRegion();
// Find the particular capture region for the clause if the
// directive is a combined one with multiple capture regions.
// If the directive is not a combined one, the capture region
// associated with the clause is OMPD_unknown and is generated
// only once.
if (CaptureRegion == ThisCaptureRegion ||
CaptureRegion == OMPD_unknown) {
if (auto *DS = cast_or_null<DeclStmt>(C->getPreInitStmt())) {
for (Decl *D : DS->decls())
MarkVariableReferenced(D->getLocation(), cast<VarDecl>(D));
if (ThisCaptureRegion == OMPD_target) {
// Capture allocator traits in the target region. They are used implicitly
// and, thus, are not captured by default.
for (OMPClause *C : Clauses) {
if (const auto *UAC = dyn_cast<OMPUsesAllocatorsClause>(C)) {
for (unsigned I = 0, End = UAC->getNumberOfAllocators(); I < End;
++I) {
OMPUsesAllocatorsClause::Data D = UAC->getAllocatorData(I);
if (Expr *E = D.AllocatorTraits)
// CompletedRegions counts the regions handled so far.
if (++CompletedRegions == CaptureRegions.size())
SR = ActOnCapturedRegionEnd(SR.get());
return SR;
/// Validates the construct-type named by a 'cancel' or 'cancellation point'
/// directive.
///
/// \param CurrentRegion the directive being checked; anything other than
///        OMPD_cancel / OMPD_cancellation_point is accepted immediately.
/// \param CancelRegion the construct-type; OMPD_parallel, OMPD_for,
///        OMPD_sections and OMPD_taskgroup are accepted.
/// \returns true (after emitting err_omp_wrong_cancel_region at \p StartLoc)
///          if \p CancelRegion is not accepted, false otherwise.
static bool checkCancelRegion(Sema &SemaRef, OpenMPDirectiveKind CurrentRegion,
OpenMPDirectiveKind CancelRegion,
SourceLocation StartLoc) {
// CancelRegion is only needed for cancel and cancellation_point.
if (CurrentRegion != OMPD_cancel && CurrentRegion != OMPD_cancellation_point)
return false;
if (CancelRegion == OMPD_parallel || CancelRegion == OMPD_for ||
CancelRegion == OMPD_sections || CancelRegion == OMPD_taskgroup)
return false;
SemaRef.Diag(StartLoc, diag::err_omp_wrong_cancel_region)
<< getOpenMPDirectiveName(CancelRegion);
return true;
/// Checks whether directive \p CurrentRegion may be nested where it appears,
/// given the parent directive recorded on \p Stack.
///
/// \param CurrentName name of the directive (compared for 'critical').
/// \param CancelRegion construct-type of a cancel / cancellation point
///        directive; consulted only for those two directives.
/// \param StartLoc location used for the diagnostics.
/// \returns true if the nesting is prohibited (a diagnostic was emitted, except
///          for the simd-in-simd case, which only warns and returns false);
///          false otherwise.
static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack,
OpenMPDirectiveKind CurrentRegion,
const DeclarationNameInfo &CurrentName,
OpenMPDirectiveKind CancelRegion,
SourceLocation StartLoc) {
if (Stack->getCurScope()) {
OpenMPDirectiveKind ParentRegion = Stack->getParentDirective();
// Directive named in the final diagnostic; starts as the parent and may be
// replaced by an enclosing target directive found further down.
OpenMPDirectiveKind OffendingRegion = ParentRegion;
bool NestingProhibited = false;
// Streamed into err_omp_prohibited_region; cleared by the nested-target
// check near the end.
bool CloseNesting = true;
// Set when the violation is that the directive has no parent at all.
bool OrphanSeen = false;
enum {
} Recommend = NoRecommend;
if (isOpenMPSimdDirective(ParentRegion) &&
((SemaRef.LangOpts.OpenMP <= 45 && CurrentRegion != OMPD_ordered) ||
(SemaRef.LangOpts.OpenMP >= 50 && CurrentRegion != OMPD_ordered &&
CurrentRegion != OMPD_simd && CurrentRegion != OMPD_atomic &&
CurrentRegion != OMPD_scan))) {
// OpenMP [2.16, Nesting of Regions]
// OpenMP constructs may not be nested inside a simd region.
// OpenMP [2.8.1,simd Construct, Restrictions]
// An ordered construct with the simd clause is the only OpenMP
// construct that can appear in the simd region.
// Allowing a SIMD construct nested in another SIMD construct is an
// extension. The OpenMP 4.5 spec does not allow it. Issue a warning
// message.
// OpenMP 5.0 [, simd Construct, Restrictions]
// The only OpenMP constructs that can be encountered during execution of
// a simd region are the atomic construct, the loop construct, the simd
// construct and the ordered construct with the simd clause.
SemaRef.Diag(StartLoc, (CurrentRegion != OMPD_simd)
? diag::err_omp_prohibited_region_simd
: diag::warn_omp_nesting_simd)
<< (SemaRef.LangOpts.OpenMP >= 50 ? 1 : 0);
// simd nested in simd is only a warning, so it is not reported as an error.
return CurrentRegion != OMPD_simd;
if (ParentRegion == OMPD_atomic) {
// OpenMP [2.16, Nesting of Regions]
// OpenMP constructs may not be nested inside an atomic region.
SemaRef.Diag(StartLoc, diag::err_omp_prohibited_region_atomic);
return true;
if (CurrentRegion == OMPD_section) {
// OpenMP [2.7.2, sections Construct, Restrictions]
// Orphaned section directives are prohibited. That is, the section
// directives must appear within the sections construct and must not be
// encountered elsewhere in the sections region.
if (ParentRegion != OMPD_sections &&
ParentRegion != OMPD_parallel_sections) {
SemaRef.Diag(StartLoc, diag::err_omp_orphaned_section_directive)
<< (ParentRegion != OMPD_unknown)
<< getOpenMPDirectiveName(ParentRegion);
return true;
return false;
// Allow some constructs (except teams and cancellation constructs) to be
// orphaned (they could be used in functions, called from OpenMP regions
// with the required preconditions).
if (ParentRegion == OMPD_unknown &&
!isOpenMPNestingTeamsDirective(CurrentRegion) &&
CurrentRegion != OMPD_cancellation_point &&
CurrentRegion != OMPD_cancel && CurrentRegion != OMPD_scan)
return false;
if (CurrentRegion == OMPD_cancellation_point ||
CurrentRegion == OMPD_cancel) {
// OpenMP [2.16, Nesting of Regions]
// A cancellation point construct for which construct-type-clause is
// taskgroup must be nested inside a task construct. A cancellation
// point construct for which construct-type-clause is not taskgroup must
// be closely nested inside an OpenMP construct that matches the type
// specified in construct-type-clause.
// A cancel construct for which construct-type-clause is taskgroup must be
// nested inside a task construct. A cancel construct for which
// construct-type-clause is not taskgroup must be closely nested inside an
// OpenMP construct that matches the type specified in
// construct-type-clause.
NestingProhibited =
!((CancelRegion == OMPD_parallel &&
(ParentRegion == OMPD_parallel ||
ParentRegion == OMPD_target_parallel)) ||
(CancelRegion == OMPD_for &&
(ParentRegion == OMPD_for || ParentRegion == OMPD_parallel_for ||
ParentRegion == OMPD_target_parallel_for ||
ParentRegion == OMPD_distribute_parallel_for ||
ParentRegion == OMPD_teams_distribute_parallel_for ||
ParentRegion == OMPD_target_teams_distribute_parallel_for)) ||
(CancelRegion == OMPD_taskgroup &&
(ParentRegion == OMPD_task ||
(SemaRef.getLangOpts().OpenMP >= 50 &&
(ParentRegion == OMPD_taskloop ||
ParentRegion == OMPD_master_taskloop ||
ParentRegion == OMPD_parallel_master_taskloop)))) ||
(CancelRegion == OMPD_sections &&
(ParentRegion == OMPD_section || ParentRegion == OMPD_sections ||
ParentRegion == OMPD_parallel_sections)));
OrphanSeen = ParentRegion == OMPD_unknown;
} else if (CurrentRegion == OMPD_master) {
// OpenMP [2.16, Nesting of Regions]
// A master region may not be closely nested inside a worksharing,
// atomic, or explicit task region.
NestingProhibited = isOpenMPWorksharingDirective(ParentRegion) ||
} else if (CurrentRegion == OMPD_critical && CurrentName.getName()) {
// OpenMP [2.16, Nesting of Regions]
// A critical region may not be nested (closely or otherwise) inside a
// critical region with the same name. Note that this restriction is not
// sufficient to prevent deadlock.
SourceLocation PreviousCriticalLoc;
// Search the stack for a critical directive carrying the same name and
// remember where it was found.
bool DeadLock = Stack->hasDirective(
[CurrentName, &PreviousCriticalLoc](OpenMPDirectiveKind K,
const DeclarationNameInfo &DNI,
SourceLocation Loc) {
if (K == OMPD_critical && DNI.getName() == CurrentName.getName()) {
PreviousCriticalLoc = Loc;
return true;
return false;
false /* skip top directive */);
if (DeadLock) {
<< CurrentName.getName();
if (PreviousCriticalLoc.isValid())
return true;
} else if (CurrentRegion == OMPD_barrier) {
// OpenMP [2.16, Nesting of Regions]
// A barrier region may not be closely nested inside a worksharing,
// explicit task, critical, ordered, atomic, or master region.
NestingProhibited = isOpenMPWorksharingDirective(ParentRegion) ||
isOpenMPTaskingDirective(ParentRegion) ||
ParentRegion == OMPD_master ||
ParentRegion == OMPD_parallel_master ||
ParentRegion == OMPD_critical ||
ParentRegion == OMPD_ordered;
} else if (isOpenMPWorksharingDirective(CurrentRegion) &&
!isOpenMPParallelDirective(CurrentRegion) &&
!isOpenMPTeamsDirective(CurrentRegion)) {
// OpenMP [2.16, Nesting of Regions]
// A worksharing region may not be closely nested inside a worksharing,
// explicit task, critical, ordered, atomic, or master region.
NestingProhibited = isOpenMPWorksharingDirective(ParentRegion) ||
isOpenMPTaskingDirective(ParentRegion) ||
ParentRegion == OMPD_master ||
ParentRegion == OMPD_parallel_master ||
ParentRegion == OMPD_critical ||
ParentRegion == OMPD_ordered;
Recommend = ShouldBeInParallelRegion;
} else if (CurrentRegion == OMPD_ordered) {
// OpenMP [2.16, Nesting of Regions]
// An ordered region may not be closely nested inside a critical,
// atomic, or explicit task region.
// An ordered region must be closely nested inside a loop region (or
// parallel loop region) with an ordered clause.
// OpenMP [2.8.1,simd Construct, Restrictions]
// An ordered construct with the simd clause is the only OpenMP construct
// that can appear in the simd region.
NestingProhibited = ParentRegion == OMPD_critical ||
isOpenMPTaskingDirective(ParentRegion) ||
!(isOpenMPSimdDirective(ParentRegion) ||
Recommend = ShouldBeInOrderedRegion;
} else if (isOpenMPNestingTeamsDirective(CurrentRegion)) {
// OpenMP [2.16, Nesting of Regions]
// If specified, a teams construct must be contained within a target
// construct.
// Up to OpenMP 4.5 the parent must be 'target'; from 5.0 on, having no
// parent at all is accepted as well.
NestingProhibited =
(SemaRef.LangOpts.OpenMP <= 45 && ParentRegion != OMPD_target) ||
(SemaRef.LangOpts.OpenMP >= 50 && ParentRegion != OMPD_unknown &&
ParentRegion != OMPD_target);
OrphanSeen = ParentRegion == OMPD_unknown;
Recommend = ShouldBeInTargetRegion;
} else if (CurrentRegion == OMPD_scan) {
// A scan directive is rejected before OpenMP 5.0. From 5.0 on it is
// accepted only when the parent directive is OMPD_simd, OMPD_for,
// OMPD_for_simd, OMPD_parallel_for or OMPD_parallel_for_simd.
NestingProhibited =
SemaRef.LangOpts.OpenMP < 50 ||
(ParentRegion != OMPD_simd && ParentRegion != OMPD_for &&
ParentRegion != OMPD_for_simd && ParentRegion != OMPD_parallel_for &&
ParentRegion != OMPD_parallel_for_simd);
OrphanSeen = ParentRegion == OMPD_unknown;
Recommend = ShouldBeInLoopSimdRegion;
if (!NestingProhibited &&
!isOpenMPTargetExecutionDirective(CurrentRegion) &&
!isOpenMPTargetDataManagementDirective(CurrentRegion) &&
(ParentRegion == OMPD_teams || ParentRegion == OMPD_target_teams)) {
// OpenMP [2.16, Nesting of Regions]
// distribute, parallel, parallel sections, parallel workshare, and the
// parallel loop and parallel loop SIMD constructs are the only OpenMP
// constructs that can be closely nested in the teams region.
NestingProhibited = !isOpenMPParallelDirective(CurrentRegion) &&
Recommend = ShouldBeInParallelRegion;
if (!NestingProhibited &&
isOpenMPNestingDistributeDirective(CurrentRegion)) {
// OpenMP 4.5 [2.17 Nesting of Regions]
// The region associated with the distribute construct must be strictly
// nested inside a teams region
NestingProhibited =
(ParentRegion != OMPD_teams && ParentRegion != OMPD_target_teams);
Recommend = ShouldBeInTeamsRegion;
if (!NestingProhibited &&
(isOpenMPTargetExecutionDirective(CurrentRegion) ||
isOpenMPTargetDataManagementDirective(CurrentRegion))) {
// OpenMP 4.5 [2.17 Nesting of Regions]
// If a target, target update, target data, target enter data, or
// target exit data construct is encountered during execution of a
// target region, the behavior is unspecified.
// Any target-execution directive found on the stack prohibits the nesting;
// it becomes the directive named in the diagnostic.
NestingProhibited = Stack->hasDirective(
[&OffendingRegion](OpenMPDirectiveKind K, const DeclarationNameInfo &,
SourceLocation) {
if (isOpenMPTargetExecutionDirective(K)) {
OffendingRegion = K;
return true;
return false;
false /* don't skip top directive */);
CloseNesting = false;
if (NestingProhibited) {
// An orphaned directive gets its own diagnostic; every other violation is
// reported against OffendingRegion.
if (OrphanSeen) {
SemaRef.Diag(StartLoc, diag::err_omp_orphaned_device_directive)
<< getOpenMPDirectiveName(CurrentRegion) << Recommend;
} else {
SemaRef.Diag(StartLoc, diag::err_omp_prohibited_region)
<< CloseNesting << getOpenMPDirectiveName(OffendingRegion)
<< Recommend << getOpenMPDirectiveName(CurrentRegion);
return true;
return false;
/// Functor converting an OpenMPDirectiveKind to its unsigned value; used as
/// the index functor of the llvm::IndexedMap in checkIfClauses().
struct Kind2Unsigned {
using argument_type = OpenMPDirectiveKind;
unsigned operator()(argument_type DK) { return unsigned(DK); }
/// Validates the 'if' clauses among \p Clauses for directive \p Kind.
///
/// \param AllowedNameModifiers the directive-name-modifiers that may appear on
///        an 'if' clause of this directive.
/// \returns true if any problem was diagnosed, false otherwise.
static bool checkIfClauses(Sema &S, OpenMPDirectiveKind Kind,
ArrayRef<OMPClause *> Clauses,
ArrayRef<OpenMPDirectiveKind> AllowedNameModifiers) {
bool ErrorFound = false;
unsigned NamedModifiersNumber = 0;
// Maps a directive-name-modifier (OMPD_unknown meaning "no modifier") to an
// 'if' clause seen with it. Sized to hold every directive kind.
llvm::IndexedMap<const OMPIfClause *, Kind2Unsigned> FoundNameModifiers;
FoundNameModifiers.resize(llvm::omp::Directive_enumSize + 1);
SmallVector<SourceLocation, 4> NameModifierLoc;
for (const OMPClause *C : Clauses) {
if (const auto *IC = dyn_cast_or_null<OMPIfClause>(C)) {
// At most one if clause without a directive-name-modifier can appear on
// the directive.
OpenMPDirectiveKind CurNM = IC->getNameModifier();
// An entry already recorded for this modifier means the clause is repeated.
if (FoundNameModifiers[CurNM]) {
S.Diag(C->getBeginLoc(), diag::err_omp_more_one_clause)
<< getOpenMPDirectiveName(Kind) << getOpenMPClauseName(OMPC_if)
<< (CurNM != OMPD_unknown) << getOpenMPDirectiveName(CurNM);
ErrorFound = true;
} else if (CurNM != OMPD_unknown) {
FoundNameModifiers[CurNM] = IC;
if (CurNM == OMPD_unknown)
// Check if the specified name modifier is allowed for the current
// directive.
// At most one if clause with the particular directive-name-modifier can
// appear on the directive.
bool MatchFound = false;
for (auto NM : AllowedNameModifiers) {
if (CurNM == NM) {
MatchFound = true;
if (!MatchFound) {
<< getOpenMPDirectiveName(CurNM) << getOpenMPDirectiveName(Kind);
ErrorFound = true;
// If any if clause on the directive includes a directive-name-modifier then
// all if clauses on the directive must include a directive-name-modifier.
if (FoundNameModifiers[OMPD_unknown] && NamedModifiersNumber > 0) {
if (NamedModifiersNumber == AllowedNameModifiers.size()) {
} else {
// Build a quoted list of the allowed modifiers that have not been used,
// separated by ", " with " or " before the last entry.
std::string Values;
std::string Sep(", ");
unsigned AllowedCnt = 0;
unsigned TotalAllowedNum =
AllowedNameModifiers.size() - NamedModifiersNumber;
for (unsigned Cnt = 0, End = AllowedNameModifiers.size(); Cnt < End;
++Cnt) {
OpenMPDirectiveKind NM = AllowedNameModifiers[Cnt];
if (!FoundNameModifiers[NM]) {
Values += "'";
Values += getOpenMPDirectiveName(NM);
Values += "'";
if (AllowedCnt + 2 == TotalAllowedNum)
Values += " or ";
else if (AllowedCnt + 1 != TotalAllowedNum)
Values += Sep;
<< (TotalAllowedNum > 1) << Values;
// Point at each recorded location in NameModifierLoc.
for (SourceLocation Loc : NameModifierLoc) {
S.Diag(Loc, diag::note_omp_previous_named_if_clause);
ErrorFound = true;
return ErrorFound;
/// Resolves the clause list item \p RefExpr to the declaration it names.
///
/// \param RefExpr in/out: parentheses and implicit casts are stripped; with
///        \p AllowArraySection, an array subscript or OpenMP array section is
///        replaced by its innermost base expression.
/// \param ELoc out: location of the (base) expression.
/// \param ERange out: source range of the (base) expression.
/// \param AllowArraySection whether array subscripts / array sections are
///        looked through.
/// \returns {nullptr, true} when \p RefExpr is type- or value-dependent;
///          {nullptr, false} after a diagnostic when the item is neither a
///          reference to a variable nor a field accessed through 'this';
///          otherwise {canonical declaration, false}.
static std::pair<ValueDecl *, bool> getPrivateItem(Sema &S, Expr *&RefExpr,
SourceLocation &ELoc,
SourceRange &ERange,
bool AllowArraySection) {
if (RefExpr->isTypeDependent() || RefExpr->isValueDependent() ||
return std::make_pair(nullptr, true);
// OpenMP [3.1, C/C++]
// A list item is a variable name.
// OpenMP [, Restrictions, p.1]
// A variable that is part of another variable (as an array or
// structure element) cannot appear in a private clause.
RefExpr = RefExpr->IgnoreParens();
// Records which array form (if any) was looked through; the value is also
// streamed into err_omp_expected_base_var_name below.
enum {
NoArrayExpr = -1,
ArraySubscript = 0,
OMPArraySection = 1
} IsArrayExpr = NoArrayExpr;
if (AllowArraySection) {
if (auto *ASE = dyn_cast_or_null<ArraySubscriptExpr>(RefExpr)) {
// Walk through nested subscripts down to the innermost base.
Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
while (auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
Base = TempASE->getBase()->IgnoreParenImpCasts();
RefExpr = Base;
IsArrayExpr = ArraySubscript;
} else if (auto *OASE = dyn_cast_or_null<OMPArraySectionExpr>(RefExpr)) {
// Walk through nested array sections first, then any subscripts beneath
// them.
Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
while (auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
Base = TempOASE->getBase()->IgnoreParenImpCasts();
while (auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
Base = TempASE->getBase()->IgnoreParenImpCasts();
RefExpr = Base;
IsArrayExpr = OMPArraySection;
ELoc = RefExpr->getExprLoc();
ERange = RefExpr->getSourceRange();
RefExpr = RefExpr->IgnoreParenImpCasts();
auto *DE = dyn_cast_or_null<DeclRefExpr>(RefExpr);
auto *ME = dyn_cast_or_null<MemberExpr>(RefExpr);
// Accept either a reference to a VarDecl, or (when a 'this' type exists) a
// member expression on 'this' that names a FieldDecl.
if ((!DE || !isa<VarDecl>(DE->getDecl())) &&
(S.getCurrentThisType().isNull() || !ME ||
!isa<CXXThisExpr>(ME->getBase()->IgnoreParenImpCasts()) ||
!isa<FieldDecl>(ME->getMemberDecl()))) {
if (IsArrayExpr != NoArrayExpr) {
S.Diag(ELoc, diag::err_omp_expected_base_var_name) << IsArrayExpr
<< ERange;
} else {
? diag::err_omp_expected_var_name_member_expr_or_array_item
: diag::err_omp_expected_var_name_member_expr)
<< (S.getCurrentThisType().isNull() ? 0 : 1) << ERange;
return std::make_pair(nullptr, false);
return std::make_pair(
getCanonicalDecl(DE ? DE->getDecl() : ME->getMemberDecl()), false);
namespace {
/// Checks if the allocator is used in uses_allocators clause to be allowed in
/// target regions.
class AllocatorChecker final : public ConstStmtVisitor<AllocatorChecker, bool> {
// Data-sharing stack queried through isUsesAllocatorsDecl().
DSAStackTy *S = nullptr;
// Declaration references are checked against the stack's uses_allocators
// bookkeeping.
bool VisitDeclRefExpr(const DeclRefExpr *E) {
return S->isUsesAllocatorsDecl(E->getDecl())
DSAStackTy::UsesAllocatorsDeclKind::AllocatorTrait) ==
// Any other statement: true as soon as visiting a non-null child yields
// true. Note that the parameter S shadows the member S here.
bool VisitStmt(const Stmt *S) {
for (const Stmt *Child : S->children()) {
if (Child && Visit(Child))
return true;
return false;
explicit AllocatorChecker(DSAStackTy *S) : S(S) {}
} // namespace
/// Processes the 'allocate' clauses among \p Clauses for the current directive
/// on \p Stack, relating their list items to the private copies created by the
/// privatizing clauses.
///
/// Must not be called in a dependent context (asserted).
static void checkAllocateClauses(Sema &S, DSAStackTy *Stack,
ArrayRef<OMPClause *> Clauses) {
assert(!S.CurContext->isDependentContext() &&
"Expected non-dependent context.");
// Lazily filtered view of the allocate clauses only.
auto AllocateRange =
llvm::make_filter_range(Clauses, OMPAllocateClause::classof);
llvm::DenseMap<CanonicalDeclPtr<Decl>, CanonicalDeclPtr<VarDecl>>
// Lazily filtered view of the clauses that isOpenMPPrivate() accepts.
auto PrivateRange = llvm::make_filter_range(Clauses, [](const OMPClause *C) {
return isOpenMPPrivate(C->getClauseKind());
for (OMPClause *Cl : PrivateRange) {
// For each clause kind pick the iterator over its private copies (I) and
// the bounds of its variable list (It, Et).
MutableArrayRef<Expr *>::iterator I, It, Et;
if (Cl->getClauseKind() == OMPC_private) {
auto *PC = cast<OMPPrivateClause>(Cl);
I = PC->private_copies().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else if (Cl->getClauseKind() == OMPC_firstprivate) {
auto *PC = cast<OMPFirstprivateClause>(Cl);
I = PC->private_copies().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else if (Cl->getClauseKind() == OMPC_lastprivate) {
auto *PC = cast<OMPLastprivateClause>(Cl);
I = PC->private_copies().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else if (Cl->getClauseKind() == OMPC_linear) {
auto *PC = cast<OMPLinearClause>(Cl);
I = PC->privates().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else if (Cl->getClauseKind() == OMPC_reduction) {
auto *PC = cast<OMPReductionClause>(Cl);
I = PC->privates().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else if (Cl->getClauseKind() == OMPC_task_reduction) {
auto *PC = cast<OMPTaskReductionClause>(Cl);
I = PC->privates().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else if (Cl->getClauseKind() == OMPC_in_reduction) {
auto *PC = cast<OMPInReductionClause>(Cl);
I = PC->privates().begin();
It = PC->varlist_begin();
Et = PC->varlist_end();
} else {
llvm_unreachable("Expected private clause.");
for (Expr *E : llvm::make_range(It, Et)) {
if (!*I) {
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = E;
auto Res = getPrivateItem(S, SimpleRefExpr, ELoc, ERange,
for (OMPClause *C : AllocateRange) {
auto *AC = cast<OMPAllocateClause>(C);
// The uses_allocators check applies from OpenMP 5.0 on, to target
// execution directives with an explicit allocator, when no 'requires'
// declaration with a dynamic_allocators clause is recorded on the stack.
if (S.getLangOpts().OpenMP >= 50 &&
!Stack->hasRequiresDeclWithClause<OMPDynamicAllocatorsClause>() &&
isOpenMPTargetExecutionDirective(Stack->getCurrentDirective()) &&
AC->getAllocator()) {
Expr *Allocator = AC->getAllocator();
// OpenMP, 2.12.5 target Construct
// Memory allocators that do not appear in a uses_allocators clause cannot
// appear as an allocator in an allocate clause or be used in the target
// region unless a requires directive with the dynamic_allocators clause
// is present in the same compilation unit.
AllocatorChecker Checker(Stack);
if (Checker.Visit(Allocator))
<< Allocator->getSourceRange();
OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind =
getAllocatorKind(S, Stack, AC->getAllocator());
// OpenMP, 2.11.4 allocate Clause, Restrictions.
// For task, taskloop or target directives, allocation requests to memory
// allocators with the trait access set to thread result in unspecified
// behavior.
if (AllocatorKind == OMPAllocateDeclAttr::OMPThreadMemAlloc &&
(isOpenMPTaskingDirective(Stack->getCurrentDirective()) ||
isOpenMPTargetExecutionDirective(Stack->getCurrentDirective()))) {
<< getOpenMPDirectiveName(Stack->getCurrentDirective());
for (Expr *E : AC->varlists()) {
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = E;
auto Res = getPrivateItem(S, SimpleRefExpr, ELoc, ERange);
ValueDecl *VD = Res.first;
// Look up the item's data-sharing attribute on the current directive.
DSAStackTy::DSAVarData Data = Stack->getTopDSA(VD, /*FromParent=*/false);
if (!isOpenMPPrivate(Data.CKind)) {
// The private copy recorded for this variable receives the allocate
// attribute.
VarDecl *PrivateVD = DeclToCopy[VD];
if (checkPreviousOMPAllocateAttribute(S, Stack, E, PrivateVD,
AllocatorKind, AC->getAllocator()))
applyOMPAllocateAttribute(S, PrivateVD, AllocatorKind, AC->getAllocator(),
StmtResult Sema::ActOnOpenMPExecutableDirective(
OpenMPDirectiveKind Kind, const DeclarationNameInfo &DirName,
OpenMPDirectiveKind CancelRegion, ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc) {
StmtResult Res = StmtError();
// First check CancelRegion which is then used in checkNestingOfRegions.
if (checkCancelRegion(*this, Kind, CancelRegion, StartLoc) ||
checkNestingOfRegions(*this, DSAStack, Kind, DirName, CancelRegion,
return StmtError();
llvm::SmallVector<OMPClause *, 8> ClausesWithImplicit;
VarsWithInheritedDSAType VarsWithInheritedDSA;
bool ErrorFound = false;
ClausesWithImplicit.append(Clauses.begin(), Clauses.end());
if (AStmt && !CurContext->isDependentContext()) {
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
// Check default data sharing attributes for referenced variables.
DSAAttrChecker DSAChecker(DSAStack, *this, cast<CapturedStmt>(AStmt));
int ThisCaptureLevel = getOpenMPCaptureLevels(Kind);
Stmt *S = AStmt;
while (--ThisCaptureLevel >= 0)
S = cast<CapturedStmt>(S)->getCapturedStmt();
if (!isOpenMPTargetDataManagementDirective(Kind) &&
!isOpenMPTaskingDirective(Kind)) {
// Visit subcaptures to generate implicit clauses for captured vars.
auto *CS = cast<CapturedStmt>(AStmt);
SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, Kind);
// Ignore outer tasking regions for target directives.
if (CaptureRegions.size() > 1 && CaptureRegions.front() == OMPD_task)
CS = cast<CapturedStmt>(CS->getCapturedStmt());
if (DSAChecker.isErrorFound())
return StmtError();
// Generate list of implicitly defined firstprivate variables.
VarsWithInheritedDSA = DSAChecker.getVarsWithInheritedDSA();
SmallVector<Expr *, 4> ImplicitFirstprivates(
SmallVector<Expr *, 4> ImplicitMaps[OMPC_MAP_delete];
for (unsigned I = 0; I < OMPC_MAP_delete; ++I) {
ArrayRef<Expr *> ImplicitMap =
ImplicitMaps[I].append(ImplicitMap.begin(), ImplicitMap.end());
// Mark taskgroup task_reduction descriptors as implicitly firstprivate.
for (OMPClause *C : Clauses) {
if (auto *IRC = dyn_cast<OMPInReductionClause>(C)) {
for (Expr *E : IRC->taskgroup_descriptors())
if (E)
// OpenMP 5.0, 2.10.1 task Construct
// [detach clause]... The event-handle will be considered as if it was
// specified on a firstprivate clause.
if (auto *DC = dyn_cast<OMPDetachClause>(C))
if (!ImplicitFirstprivates.empty()) {
if (OMPClause *Implicit = ActOnOpenMPFirstprivateClause(
ImplicitFirstprivates, SourceLocation(), SourceLocation(),
SourceLocation())) {
ErrorFound = cast<OMPFirstprivateClause>(Implicit)->varlist_size() !=
} else {
ErrorFound = true;
int ClauseKindCnt = -1;
for (ArrayRef<Expr *> ImplicitMap : ImplicitMaps) {
if (ImplicitMap.empty())
CXXScopeSpec MapperIdScopeSpec;
DeclarationNameInfo MapperId;
auto Kind = static_cast<OpenMPMapClauseKind>(ClauseKindCnt);
if (OMPClause *Implicit = ActOnOpenMPMapClause(
llvm::None, llvm::None, MapperIdScopeSpec, MapperId, Kind,
/*IsMapTypeImplicit=*/true, SourceLocation(), SourceLocation(),
ImplicitMap, OMPVarListLocTy())) {
ErrorFound |=
cast<OMPMapClause>(Implicit)->varlist_size() != ImplicitMap.size();
} else {
ErrorFound = true;
llvm::SmallVector<OpenMPDirectiveKind, 4> AllowedNameModifiers;
switch (Kind) {
case OMPD_parallel:
Res = ActOnOpenMPParallelDirective(ClausesWithImplicit, AStmt, StartLoc,
case OMPD_simd:
Res = ActOnOpenMPSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc,
if (LangOpts.OpenMP >= 50)
case OMPD_for:
Res = ActOnOpenMPForDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc,
case OMPD_for_simd:
Res = ActOnOpenMPForSimdDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
case OMPD_sections:
Res = ActOnOpenMPSectionsDirective(ClausesWithImplicit, AStmt, StartLoc,
case OMPD_section:
assert(ClausesWithImplicit.empty() &&
"No clauses are allowed for 'omp section' directive");
Res = ActOnOpenMPSectionDirective(AStmt, StartLoc, EndLoc);
case OMPD_single:
Res = ActOnOpenMPSingleDirective(ClausesWithImplicit, AStmt, StartLoc,
case OMPD_master:
assert(ClausesWithImplicit.empty() &&
"No clauses are allowed for 'omp master' directive");
Res = ActOnOpenMPMasterDirective(AStmt, StartLoc, EndLoc);
case OMPD_critical:
Res = ActOnOpenMPCriticalDirective(DirName, ClausesWithImplicit, AStmt,
StartLoc, EndLoc);
case OMPD_parallel_for:
Res = ActOnOpenMPParallelForDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc, VarsWithInheritedDSA);
case OMPD_parallel_for_simd:
Res = ActOnOpenMPParallelForSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
case OMPD_parallel_master:
Res = ActOnOpenMPParallelMasterDirective(ClausesWithImplicit, AStmt,
StartLoc, EndLoc);
case OMPD_parallel_sections:
Res = ActOnOpenMPParallelSectionsDirective(ClausesWithImplicit, AStmt,
StartLoc, EndLoc);
case OMPD_task:
Res =
ActOnOpenMPTaskDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc);
case OMPD_taskyield:
assert(ClausesWithImplicit.empty() &&
"No clauses are allowed for 'omp taskyield' directive");
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp taskyield' directive");
Res = ActOnOpenMPTaskyieldDirective(StartLoc, EndLoc);
case OMPD_barrier:
assert(ClausesWithImplicit.empty() &&
"No clauses are allowed for 'omp barrier' directive");
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp barrier' directive");
Res = ActOnOpenMPBarrierDirective(StartLoc, EndLoc);
case OMPD_taskwait:
assert(ClausesWithImplicit.empty() &&
"No clauses are allowed for 'omp taskwait' directive");
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp taskwait' directive");
Res = ActOnOpenMPTaskwaitDirective(StartLoc, EndLoc);
case OMPD_taskgroup:
Res = ActOnOpenMPTaskgroupDirective(ClausesWithImplicit, AStmt, StartLoc,
case OMPD_flush:
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp flush' directive");
Res = ActOnOpenMPFlushDirective(ClausesWithImplicit, StartLoc, EndLoc);
case OMPD_depobj:
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp depobj' directive");
Res = ActOnOpenMPDepobjDirective(ClausesWithImplicit, StartLoc, EndLoc);
case OMPD_scan:
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp scan' directive");
Res = ActOnOpenMPScanDirective(ClausesWithImplicit, StartLoc, EndLoc);
case OMPD_ordered:
Res = ActOnOpenMPOrderedDirective(ClausesWithImplicit, AStmt, StartLoc,
case OMPD_atomic:
Res = ActOnOpenMPAtomicDirective(ClausesWithImplicit, AStmt, StartLoc,
case OMPD_teams:
Res =
ActOnOpenMPTeamsDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc);
case OMPD_target:
Res = ActOnOpenMPTargetDirective(ClausesWithImplicit, AStmt, StartLoc,
case OMPD_target_parallel:
Res = ActOnOpenMPTargetParallelDirective(ClausesWithImplicit, AStmt,
StartLoc, EndLoc);
case OMPD_target_parallel_for:
Res = ActOnOpenMPTargetParallelForDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
case OMPD_cancellation_point:
assert(ClausesWithImplicit.empty() &&
"No clauses are allowed for 'omp cancellation point' directive");
assert(AStmt == nullptr && "No associated statement allowed for 'omp "
"cancellation point' directive");
Res = ActOnOpenMPCancellationPointDirective(StartLoc, EndLoc, CancelRegion);
case OMPD_cancel:
assert(AStmt == nullptr &&
"No associated statement allowed for 'omp cancel' directive");
Res = ActOnOpenMPCancelDirective(ClausesWithImplicit, StartLoc, EndLoc,
case OMPD_target_data:
Res = ActOnOpenMPTargetDataDirective(ClausesWithImplicit, AStmt, StartLoc,
case OMPD_target_enter_data:
Res = ActOnOpenMPTargetEnterDataDirective(ClausesWithImplicit, StartLoc,
EndLoc, AStmt);
case OMPD_target_exit_data:
Res = ActOnOpenMPTargetExitDataDirective(ClausesWithImplicit, StartLoc,
EndLoc, AStmt);
case OMPD_taskloop:
Res = ActOnOpenMPTaskLoopDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc, VarsWithInheritedDSA);
case OMPD_taskloop_simd:
Res = ActOnOpenMPTaskLoopSimdDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
case OMPD_master_taskloop:
Res = ActOnOpenMPMasterTaskLoopDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
case OMPD_master_taskloop_simd:
Res = ActOnOpenMPMasterTaskLoopSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
case OMPD_parallel_master_taskloop:
Res = ActOnOpenMPParallelMasterTaskLoopDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
case OMPD_parallel_master_taskloop_simd:
Res = ActOnOpenMPParallelMasterTaskLoopSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
case OMPD_distribute:
Res = ActOnOpenMPDistributeDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc, VarsWithInheritedDSA);
case OMPD_target_update:
Res = ActOnOpenMPTargetUpdateDirective(ClausesWithImplicit, StartLoc,
EndLoc, AStmt);
case OMPD_distribute_parallel_for:
Res = ActOnOpenMPDistributeParallelForDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
case OMPD_distribute_parallel_for_simd:
Res = ActOnOpenMPDistributeParallelForSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
case OMPD_distribute_simd:
Res = ActOnOpenMPDistributeSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
case OMPD_target_parallel_for_simd:
Res = ActOnOpenMPTargetParallelForSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
case OMPD_target_simd:
Res = ActOnOpenMPTargetSimdDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
case OMPD_teams_distribute:
Res = ActOnOpenMPTeamsDistributeDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
case OMPD_teams_distribute_simd:
Res = ActOnOpenMPTeamsDistributeSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
case OMPD_teams_distribute_parallel_for_simd:
Res = ActOnOpenMPTeamsDistributeParallelForSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
case OMPD_teams_distribute_parallel_for:
Res = ActOnOpenMPTeamsDistributeParallelForDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
case OMPD_target_teams:
Res = ActOnOpenMPTargetTeamsDirective(ClausesWithImplicit, AStmt, StartLoc,
case OMPD_target_teams_distribute:
Res = ActOnOpenMPTargetTeamsDistributeDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
case OMPD_target_teams_distribute_parallel_for:
Res = ActOnOpenMPTargetTeamsDistributeParallelForDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
case OMPD_target_teams_distribute_parallel_for_simd:
Res = ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
case OMPD_target_teams_distribute_simd:
Res = ActOnOpenMPTargetTeamsDistributeSimdDirective(
ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA);
if (LangOpts.OpenMP >= 50)
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_requires:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
llvm_unreachable("OpenMP Directive is not allowed");
case OMPD_unknown:
llvm_unreachable("Unknown OpenMP directive");
ErrorFound = Res.isInvalid() || ErrorFound;
// Check variables in the clauses if default(none) or
// default(firstprivate) was specified.
if (DSAStack->getDefaultDSA() == DSA_none ||
DSAStack->getDefaultDSA() == DSA_firstprivate) {
DSAAttrChecker DSAChecker(DSAStack, *this, nullptr);
for (OMPClause *C : Clauses) {
switch (C->getClauseKind()) {
case OMPC_num_threads:
case OMPC_dist_schedule:
// Do not analyse if no parent teams directive.
if (isOpenMPTeamsDirective(Kind))
case OMPC_if:
if (isOpenMPTeamsDirective(Kind) &&
cast<OMPIfClause>(C)->getNameModifier() != OMPD_target)
if (isOpenMPParallelDirective(Kind) &&
isOpenMPTaskLoopDirective(Kind) &&
cast<OMPIfClause>(C)->getNameModifier() != OMPD_parallel)
case OMPC_schedule:
case OMPC_detach:
case OMPC_grainsize:
case OMPC_num_tasks:
case OMPC_final:
case OMPC_priority:
// Do not analyze if no parent parallel directive.
if (isOpenMPParallelDirective(Kind))
case OMPC_ordered:
case OMPC_device:
case OMPC_num_teams:
case OMPC_thread_limit:
case OMPC_hint:
case OMPC_collapse:
case OMPC_safelen:
case OMPC_simdlen:
case OMPC_default:
case OMPC_proc_bind:
case OMPC_private:
case OMPC_firstprivate:
case OMPC_lastprivate:
case OMPC_shared:
case OMPC_reduction:
case OMPC_task_reduction:
case OMPC_in_reduction:
case OMPC_linear:
case OMPC_aligned:
case OMPC_copyin:
case OMPC_copyprivate:
case OMPC_nowait:
case OMPC_untied:
case OMPC_mergeable:
case OMPC_allocate:
case OMPC_read:
case OMPC_write:
case OMPC_update:
case OMPC_capture:
case OMPC_seq_cst:
case OMPC_acq_rel:
case OMPC_acquire:
case OMPC_release:
case OMPC_relaxed:
case OMPC_depend:
case OMPC_threads:
case OMPC_simd:
case OMPC_map:
case OMPC_nogroup:
case OMPC_defaultmap:
case OMPC_to:
case OMPC_from:
case OMPC_use_device_ptr:
case OMPC_use_device_addr:
case OMPC_is_device_ptr:
case OMPC_nontemporal:
case OMPC_order:
case OMPC_destroy:
case OMPC_inclusive:
case OMPC_exclusive:
case OMPC_uses_allocators:
case OMPC_affinity:
case OMPC_allocator:
case OMPC_flush:
case OMPC_depobj:
case OMPC_threadprivate:
case OMPC_uniform:
case OMPC_unknown:
case OMPC_unified_address:
case OMPC_unified_shared_memory:
case OMPC_reverse_offload:
case OMPC_dynamic_allocators:
case OMPC_atomic_default_mem_order:
case OMPC_device_type:
case OMPC_match:
llvm_unreachable("Unexpected clause");
for (Stmt *CC : C->children()) {
if (CC)
for (const auto &P : DSAChecker.getVarsWithInheritedDSA())
VarsWithInheritedDSA[P.getFirst()] = P.getSecond();
for (const auto &P : VarsWithInheritedDSA) {
if (P.getFirst()->isImplicit() || isa<OMPCapturedExprDecl>(P.getFirst()))
ErrorFound = true;
if (DSAStack->getDefaultDSA() == DSA_none ||
DSAStack->getDefaultDSA() == DSA_firstprivate) {
Diag(P.second->getExprLoc(), diag::err_omp_no_dsa_for_variable)
<< P.first << P.second->getSourceRange();
Diag(DSAStack->getDefaultDSALocation(), diag::note_omp_default_dsa_none);
} else if (getLangOpts().OpenMP >= 50) {
<< P.first << P.second->getSourceRange();
if (!AllowedNameModifiers.empty())
ErrorFound = checkIfClauses(*this, Kind, Clauses, AllowedNameModifiers) ||
if (ErrorFound)
return StmtError();
if (!CurContext->isDependentContext() &&
isOpenMPTargetExecutionDirective(Kind) &&
!(DSAStack->hasRequiresDeclWithClause<OMPUnifiedSharedMemoryClause>() ||
DSAStack->hasRequiresDeclWithClause<OMPUnifiedAddressClause>() ||
DSAStack->hasRequiresDeclWithClause<OMPReverseOffloadClause>() ||
DSAStack->hasRequiresDeclWithClause<OMPDynamicAllocatorsClause>())) {
// Register target to DSA Stack.
return Res;
/// Semantic checks for an OpenMP 'declare simd' directive applied to the
/// declaration group \p DG.
///
/// Validates the simdlen expression and the uniform, aligned and linear list
/// items (each must be a parameter of the function or 'this'), the alignment
/// expressions and the linear steps, and then creates an implicit
/// OMPDeclareSimdDeclAttr from the checked data.
///
/// Returns an empty DeclGroupPtrTy when \p DG is null or does not name a
/// function; otherwise returns \p DG (also after diagnosing a group that is
/// not a single declaration).
///
/// NOTE(review): this listing appears to have lost lines (closing braces,
/// 'continue' statements and several continuation lines); the comments here
/// describe only what is visible.
Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareSimdDirective(
DeclGroupPtrTy DG, OMPDeclareSimdDeclAttr::BranchStateTy BS, Expr *Simdlen,
ArrayRef<Expr *> Uniforms, ArrayRef<Expr *> Aligneds,
ArrayRef<Expr *> Alignments, ArrayRef<Expr *> Linears,
ArrayRef<unsigned> LinModifiers, ArrayRef<Expr *> Steps, SourceRange SR) {
// Aligneds/Alignments and Linears/LinModifiers/Steps are parallel arrays.
assert(Aligneds.size() == Alignments.size());
assert(Linears.size() == LinModifiers.size());
assert(Linears.size() == Steps.size());
if (!DG || DG.get().isNull())
return DeclGroupPtrTy();
// Value streamed into the diagnostics below; presumably selects the
// 'declare simd' wording of the shared simd/variant messages -- confirm
// against the diagnostic definitions.
const int SimdId = 0;
if (!DG.get().isSingleDecl()) {
Diag(SR.getBegin(), diag::err_omp_single_decl_in_declare_simd_variant)
<< SimdId;
return DG;
Decl *ADecl = DG.get().getSingleDecl();
// For a function template, check the templated function itself.
if (auto *FTD = dyn_cast<FunctionTemplateDecl>(ADecl))
ADecl = FTD->getTemplatedDecl();
auto *FD = dyn_cast<FunctionDecl>(ADecl);
if (!FD) {
Diag(ADecl->getLocation(), diag::err_omp_function_expected) << SimdId;
return DeclGroupPtrTy();
// OpenMP [2.8.2, declare simd construct, Description]
// The parameter of the simdlen clause must be a constant positive integer
// expression.
ExprResult SL;
if (Simdlen)
SL = VerifyPositiveIntegerConstantInClause(Simdlen, OMPC_simdlen);
// OpenMP [2.8.2, declare simd construct, Description]
// The special this pointer can be used as if was one of the arguments to the
// function in any of the linear, aligned, or uniform clauses.
// The uniform clause declares one or more arguments to have an invariant
// value for all concurrent invocations of the function in the execution of a
// single SIMD loop.
// Maps the canonical declaration of each uniform parameter to the expression
// that named it; a uniform 'this' is remembered in UniformedLinearThis.
llvm::DenseMap<const Decl *, const Expr *> UniformedArgs;
const Expr *UniformedLinearThis = nullptr;
for (const Expr *E : Uniforms) {
E = E->IgnoreParenImpCasts();
if (const auto *DRE = dyn_cast<DeclRefExpr>(E))
if (const auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl()))
if (FD->getNumParams() > PVD->getFunctionScopeIndex() &&
->getCanonicalDecl() == PVD->getCanonicalDecl()) {
UniformedArgs.try_emplace(PVD->getCanonicalDecl(), E);
if (isa<CXXThisExpr>(E)) {
UniformedLinearThis = E;
// Anything else is neither a parameter of FD nor 'this'.
Diag(E->getExprLoc(), diag::err_omp_param_or_this_in_clause)
<< FD->getDeclName() << (isa<CXXMethodDecl>(ADecl) ? 1 : 0);
// OpenMP [2.8.2, declare simd construct, Description]
// The aligned clause declares that the object to which each list item points
// is aligned to the number of bytes expressed in the optional parameter of
// the aligned clause.
// The special this pointer can be used as if was one of the arguments to the
// function in any of the linear, aligned, or uniform clauses.
// The type of list items appearing in the aligned clause must be array,
// pointer, reference to array, or reference to pointer.
// Same bookkeeping as for uniform: canonical parameter -> naming expression,
// with 'this' tracked separately in AlignedThis.
llvm::DenseMap<const Decl *, const Expr *> AlignedArgs;
const Expr *AlignedThis = nullptr;
for (const Expr *E : Aligneds) {
E = E->IgnoreParenImpCasts();
if (const auto *DRE = dyn_cast<DeclRefExpr>(E))
if (const auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl())) {
const VarDecl *CanonPVD = PVD->getCanonicalDecl();
if (FD->getNumParams() > PVD->getFunctionScopeIndex() &&
->getCanonicalDecl() == CanonPVD) {
// OpenMP [2.8.1, simd construct, Restrictions]
// A list-item cannot appear in more than one aligned clause.
if (AlignedArgs.count(CanonPVD) > 0) {
Diag(E->getExprLoc(), diag::err_omp_used_in_clause_twice)
<< 1 << getOpenMPClauseName(OMPC_aligned)
<< E->getSourceRange();
<< getOpenMPClauseName(OMPC_aligned);
AlignedArgs[CanonPVD] = E;
QualType QTy = PVD->getType()
const Type *Ty = QTy.getTypePtrOrNull();
// Only array and pointer types are accepted for aligned list items.
if (!Ty || (!Ty->isArrayType() && !Ty->isPointerType())) {
Diag(E->getExprLoc(), diag::err_omp_aligned_expected_array_or_ptr)
<< QTy << getLangOpts().CPlusPlus << E->getSourceRange();
Diag(PVD->getLocation(), diag::note_previous_decl) << PVD;
if (isa<CXXThisExpr>(E)) {
// 'this' may be listed only once across the aligned clauses.
if (AlignedThis) {
Diag(E->getExprLoc(), diag::err_omp_used_in_clause_twice)
<< 2 << getOpenMPClauseName(OMPC_aligned) << E->getSourceRange();
Diag(AlignedThis->getExprLoc(), diag::note_omp_explicit_dsa)
<< getOpenMPClauseName(OMPC_aligned);
AlignedThis = E;
Diag(E->getExprLoc(), diag::err_omp_param_or_this_in_clause)
<< FD->getDeclName() << (isa<CXXMethodDecl>(ADecl) ? 1 : 0);
// The optional parameter of the aligned clause, alignment, must be a constant
// positive integer expression. If no optional parameter is specified,
// implementation-defined default alignments for SIMD instructions on the
// target platforms are assumed.
SmallVector<const Expr *, 4> NewAligns;
for (Expr *E : Alignments) {
ExprResult Align;
if (E)
Align = VerifyPositiveIntegerConstantInClause(E, OMPC_aligned);
// OpenMP [2.8.2, declare simd construct, Description]
// The linear clause declares one or more list items to be private to a SIMD
// lane and to have a linear relationship with respect to the iteration space
// of a loop.
// The special this pointer can be used as if was one of the arguments to the
// function in any of the linear, aligned, or uniform clauses.
// When a linear-step expression is specified in a linear clause it must be
// either a constant integer expression or an integer-typed parameter that is
// specified in a uniform clause on the directive.
llvm::DenseMap<const Decl *, const Expr *> LinearArgs;
// Captured before the loop: whether 'this' was already claimed by a uniform
// clause; used to pick the clause name in the diagnostics below.
const bool IsUniformedThis = UniformedLinearThis != nullptr;
// MI reads the linear modifier that belongs to the current list item.
// NOTE(review): no increment of MI is visible in this listing.
auto MI = LinModifiers.begin();
for (const Expr *E : Linears) {
auto LinKind = static_cast<OpenMPLinearClauseKind>(*MI);
E = E->IgnoreParenImpCasts();
if (const auto *DRE = dyn_cast<DeclRefExpr>(E))
if (const auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl())) {
const VarDecl *CanonPVD = PVD->getCanonicalDecl();
if (FD->getNumParams() > PVD->getFunctionScopeIndex() &&
->getCanonicalDecl() == CanonPVD) {
// OpenMP [, linear Clause, Restrictions]
// A list-item cannot appear in more than one linear clause.
if (LinearArgs.count(CanonPVD) > 0) {
Diag(E->getExprLoc(), diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(OMPC_linear)
<< getOpenMPClauseName(OMPC_linear) << E->getSourceRange();
<< getOpenMPClauseName(OMPC_linear);
// Each argument can appear in at most one uniform or linear clause.
if (UniformedArgs.count(CanonPVD) > 0) {
Diag(E->getExprLoc(), diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(OMPC_linear)
<< getOpenMPClauseName(OMPC_uniform) << E->getSourceRange();
<< getOpenMPClauseName(OMPC_uniform);
LinearArgs[CanonPVD] = E;
if (E->isValueDependent() || E->isTypeDependent() ||
E->isInstantiationDependent() ||
(void)CheckOpenMPLinearDecl(CanonPVD, E->getExprLoc(), LinKind,
if (isa<CXXThisExpr>(E)) {
// 'this' may appear in at most one uniform or linear clause.
if (UniformedLinearThis) {
Diag(E->getExprLoc(), diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(OMPC_linear)
<< getOpenMPClauseName(IsUniformedThis ? OMPC_uniform : OMPC_linear)
<< E->getSourceRange();
Diag(UniformedLinearThis->getExprLoc(), diag::note_omp_explicit_dsa)
<< getOpenMPClauseName(IsUniformedThis ? OMPC_uniform
: OMPC_linear);
UniformedLinearThis = E;
if (E->isValueDependent() || E->isTypeDependent() ||
E->isInstantiationDependent() || E->containsUnexpandedParameterPack())
(void)CheckOpenMPLinearDecl(/*D=*/nullptr, E->getExprLoc(), LinKind,
E->getType(), /*IsDeclareSimd=*/true);
Diag(E->getExprLoc(), diag::err_omp_param_or_this_in_clause)
<< FD->getDeclName() << (isa<CXXMethodDecl>(ADecl) ? 1 : 0);
// Step holds the most recently examined step expression and NewStep the
// result of checking it, so that a repeated step is not checked again.
Expr *Step = nullptr;
Expr *NewStep = nullptr;
SmallVector<Expr *, 4> NewSteps;
for (Expr *E : Steps) {
// Skip the same step expression, it was checked already.
if (Step == E || !E) {
NewSteps.push_back(E ? NewStep : nullptr);
Step = E;
// A step that names a parameter must name one listed in a uniform clause and
// (unless dependent) have an integer representation.
if (const auto *DRE = dyn_cast<DeclRefExpr>(Step))
if (const auto *PVD = dyn_cast<ParmVarDecl>(DRE->getDecl())) {
const VarDecl *CanonPVD = PVD->getCanonicalDecl();
if (UniformedArgs.count(CanonPVD) == 0) {
Diag(Step->getExprLoc(), diag::err_omp_expected_uniform_param)
<< Step->getSourceRange();
} else if (E->isValueDependent() || E->isTypeDependent() ||
E->isInstantiationDependent() ||
E->containsUnexpandedParameterPack() ||
CanonPVD->getType()->hasIntegerRepresentation()) {
} else {
Diag(Step->getExprLoc(), diag::err_omp_expected_int_param)
<< Step->getSourceRange();
NewStep = Step;
// A non-dependent step is converted to an integer and then verified to be
// an integer constant expression.
if (Step && !Step->isValueDependent() && !Step->isTypeDependent() &&
!Step->isInstantiationDependent() &&
!Step->containsUnexpandedParameterPack()) {
NewStep = PerformOpenMPImplicitIntegerConversion(Step->getExprLoc(), Step)
if (NewStep)
NewStep = VerifyIntegerConstantExpression(NewStep).get();
// Build the implicit attribute from the checked clause data.
// NOTE(review): the argument list below is truncated in this listing.
auto *NewAttr = OMPDeclareSimdDeclAttr::CreateImplicit(
Context, BS, SL.get(), const_cast<Expr **>(,
Uniforms.size(), const_cast<Expr **>(, Aligneds.size(),
const_cast<Expr **>(, NewAligns.size(),
const_cast<Expr **>(, Linears.size(),
const_cast<unsigned *>(, LinModifiers.size(),, NewSteps.size(), SR);
return DG;
/// Gives the prototype-less function \p FD parameters that mirror those of
/// \p FDWithProto.
///
/// Preconditions (asserted): \p NewType is a function prototype type, \p FD
/// currently has a no-prototype function type, and \p FDWithProto has a
/// prototype.
///
/// NOTE(review): the tail of this function is not visible in this listing;
/// presumably the synthesized parameters are collected in Params and then
/// installed on FD together with NewType -- confirm against the full source.
static void setPrototype(Sema &S, FunctionDecl *FD, FunctionDecl *FDWithProto,
QualType NewType) {
assert(NewType->isFunctionProtoType() &&
"Expected function type with prototype.");
assert(FD->getType()->isFunctionNoProtoType() &&
"Expected function with type with no prototype.");
assert(FDWithProto->getType()->isFunctionProtoType() &&
"Expected function with prototype.");
// Synthesize parameters with the same types.
SmallVector<ParmVarDecl *, 16> Params;
for (const ParmVarDecl *P : FDWithProto->parameters()) {
// The new parameter is owned by FD, has no name, location or type source
// info, and copies only the type of P.
auto *Param = ParmVarDecl::Create(S.getASTContext(), FD, SourceLocation(),
SourceLocation(), nullptr, P->getType(),
/*TInfo=*/nullptr, SC_None, nullptr);
// Scope depth 0; the index is the number of parameters collected so far.
Param->setScopeInfo(0, Params.size());
/// Creates a declare-variant scope for the trait info \p TI: keeps a pointer to
/// \p TI and caches the result of TI.getMangledName() as the scope's name
/// suffix.
Sema::OMPDeclareVariantScope::OMPDeclareVariantScope(OMPTraitInfo &TI)
: TI(&TI), NameSuffix(TI.getMangledName()) {}
/// Called for a function definition inside an OpenMP declare-variant scope.
///
/// Looks up the declarator's name and searches the results for a function
/// whose type merges with the declarator's type; if none is found, a base
/// function is declared via ActOnDeclarator. The declarator \p D is then
/// renamed to "<name><variant mangling separator><scope name suffix>", using
/// the innermost entry of OMPDeclareVariantScopes.
///
/// \returns the base function found or created.
///
/// NOTE(review): several lines are missing from this listing (the tail of the
/// LookupResult constructor call and the bodies of the guards in the loop,
/// presumably 'continue'/'break'); comments describe only what is visible.
FunctionDecl *
Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(Scope *S,
Declarator &D) {
IdentifierInfo *BaseII = D.getIdentifier();
LookupResult Lookup(*this, DeclarationName(BaseII), D.getIdentifierLoc(),
LookupParsedName(Lookup, S, &D.getCXXScopeSpec());
TypeSourceInfo *TInfo = GetTypeForDeclarator(D, S);
QualType FType = TInfo->getType();
bool IsConstexpr = D.getDeclSpec().getConstexprSpecifier() == CSK_constexpr;
bool IsConsteval = D.getDeclSpec().getConstexprSpecifier() == CSK_consteval;
FunctionDecl *BaseFD = nullptr;
for (auto *Candidate : Lookup) {
// Only function declarations are considered as a base.
auto *UDecl = dyn_cast<FunctionDecl>(Candidate->getUnderlyingDecl());
if (!UDecl)
// Don't specialize constexpr/consteval functions with
// non-constexpr/consteval functions.
if (UDecl->isConstexpr() && !IsConstexpr)
if (UDecl->isConsteval() && !IsConsteval)
// A null merged type means the candidate's type is not compatible with the
// type of the definition being started.
QualType NewType = Context.mergeFunctionTypes(
FType, UDecl->getType(), /* OfBlockPointer */ false,
/* Unqualified */ false, /* AllowCXX */ true);
if (NewType.isNull())
// Found a base!
BaseFD = UDecl;
// No suitable base was found: declare one from the declarator itself.
if (!BaseFD) {
BaseFD = cast<FunctionDecl>(ActOnDeclarator(S, D));
// Rename the declarator so the variant definition gets a mangled identifier
// built from the original name, the separator and the scope's suffix.
OMPDeclareVariantScope &DVScope = OMPDeclareVariantScopes.back();
std::string MangledName;
MangledName += D.getIdentifier()->getName();
MangledName += getOpenMPVariantManglingSeparatorStr();
MangledName += DVScope.NameSuffix;
IdentifierInfo &VariantII = Context.Idents.get(MangledName);
D.SetIdentifier(&VariantII, D.getBeginLoc());
return BaseFD;
/// Called when the definition of the variant function \p FD, started inside an
/// OpenMP declare-variant scope, is finished.
///
/// Builds a reference to \p FD in an unevaluated context and creates an
/// implicit OMPDeclareVariantAttr from that reference and the trait info of
/// the innermost declare-variant scope.
///
/// NOTE(review): the statement that attaches the attribute is not visible in
/// this listing; presumably it is added to \p BaseFD -- confirm against the
/// full source.
void Sema::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(
FunctionDecl *FD, FunctionDecl *BaseFD) {
// Do not mark the function as used, to prevent its emission if this is the
// only place where it is used.
EnterExpressionEvaluationContext Unevaluated(
*this, Sema::ExpressionEvaluationContext::Unevaluated);
Expr *VariantFuncRef = DeclRefExpr::Create(
Context, NestedNameSpecifierLoc(), SourceLocation(), FD,
/* RefersToEnclosingVariableOrCapture */ false,
/* NameLoc */ FD->getLocation(), FD->getType(), ExprValueKind::VK_RValue);
OMPDeclareVariantScope &DVScope = OMPDeclareVariantScopes.back();
auto *OMPDeclareVariantA = OMPDeclareVariantAttr::CreateImplicit(
Context, VariantFuncRef, DVScope.TI);
/// Tries to replace the call \p Call with a call to an OpenMP variant of the
/// callee.
///
/// \p Call is returned unchanged when it is not a CallExpr, has no direct
/// callee, the callee has no OMPDeclareVariantAttr, no variant matches the
/// current OpenMP context, or no call to a matching variant could be built.
/// Otherwise the result is a PseudoObjectExpr created from the original call
/// and the newly built variant call.
///
/// \p LParenLoc, \p ArgExprs, \p RParenLoc (and presumably \p ExecConfig, whose
/// use is not visible here) are forwarded to BuildCallExpr.
///
/// NOTE(review): lines are missing from this listing (closing braces, the
/// remaining OMPContext constructor arguments, and the statements that fill
/// Exprs/VMIs and leave the retry loop); comments describe only what is
/// visible.
ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope,
SourceLocation LParenLoc,
MultiExprArg ArgExprs,
SourceLocation RParenLoc, Expr *ExecConfig) {
// The common case is a regular call we do not want to specialize at all. Try
// to make that case fast by bailing early.
CallExpr *CE = dyn_cast<CallExpr>(Call.get());
if (!CE)
return Call;
FunctionDecl *CalleeFnDecl = CE->getDirectCallee();
if (!CalleeFnDecl)
return Call;
if (!CalleeFnDecl->hasAttr<OMPDeclareVariantAttr>())
return Call;
ASTContext &Context = getASTContext();
OMPContext OMPCtx(getLangOpts().OpenMPIsDevice,
// Exprs and VMIs are erased at the same index below, so they are parallel:
// variant reference expression and its match info.
SmallVector<Expr *, 4> Exprs;
SmallVector<VariantMatchInfo, 4> VMIs;
// Walk the callee and its previous declarations, inspecting every
// declare-variant attribute found on them.
while (CalleeFnDecl) {
for (OMPDeclareVariantAttr *A :
CalleeFnDecl->specific_attrs<OMPDeclareVariantAttr>()) {
Expr *VariantRef = A->getVariantFuncRef();
VariantMatchInfo VMI;
OMPTraitInfo &TI = A->getTraitInfo();
TI.getAsVariantMatchInfo(Context, VMI);
if (!isVariantApplicableInContext(VMI, OMPCtx, /* DeviceSetOnly */ false))
CalleeFnDecl = CalleeFnDecl->getPreviousDecl();
ExprResult NewCall;
// Try the candidates from best to worst until a call can be built.
do {
int BestIdx = getBestVariantMatchForContext(VMIs, OMPCtx);
if (BestIdx < 0)
return Call;
// NOTE(review): BestExpr is already produced by cast<DeclRefExpr>, so the
// second cast on the next line looks redundant.
Expr *BestExpr = cast<DeclRefExpr>(Exprs[BestIdx]);
Decl *BestDecl = cast<DeclRefExpr>(BestExpr)->getDecl();
// Try to build a (member) call expression for the current best applicable
// variant expression. We allow this to fail in which case we continue
// with the next best variant expression. The fail case is part of the
// implementation defined behavior in the OpenMP standard when it talks
// about what differences in the function prototypes: "Any differences
// that the specific OpenMP context requires in the prototype of the
// variant from the base function prototype are implementation defined."
// This wording is there to allow the specialized variant to have a
// different type than the base function. This is intended and OK but if
// we cannot create a call the difference is not in the "implementation
// defined range" we allow.
Sema::TentativeAnalysisScope Trap(*this);
if (auto *SpecializedMethod = dyn_cast<CXXMethodDecl>(BestDecl)) {
// NOTE(review): MemberCall comes from dyn_cast and is dereferenced below
// with no null check visible here -- confirm CE is always a
// CXXMemberCallExpr when the variant is a method.
auto *MemberCall = dyn_cast<CXXMemberCallExpr>(CE);
BestExpr = MemberExpr::CreateImplicit(
Context, MemberCall->getImplicitObjectArgument(),
/* IsArrow */ false, SpecializedMethod, Context.BoundMemberTy,
MemberCall->getValueKind(), MemberCall->getObjectKind());
NewCall = BuildCallExpr(Scope, BestExpr, LParenLoc, ArgExprs, RParenLoc,
if (NewCall.isUsable())
// Drop this candidate so the next iteration picks the next best one.
VMIs.erase(VMIs.begin() + BestIdx);
Exprs.erase(Exprs.begin() + BestIdx);
} while (!VMIs.empty());
if (!NewCall.isUsable())
return Call;
return PseudoObjectExpr::Create(Context, CE, {NewCall.get()}, 0);
/// Checks that an OpenMP 'declare variant' directive is well formed: \p DG
/// must be a single function declaration and \p VariantRef must refer to a
/// function compatible with it.
///
/// Visible checks, in order: single declaration; declaration is a function;
/// no CPU dispatch/specific attributes; warnings when the base function was
/// already used or emitted; \p VariantRef is present; early success for
/// dependent contexts; non-constant scores/conditions in \p TI; conversion of
/// \p VariantRef to the base function's (member) pointer type in C++; the
/// reference resolves to a FunctionDecl; function type merging in C; the
/// variant is not itself marked 'declare variant'; unsupported kinds of base
/// function; general multiversion compatibility.
///
/// \returns None after a diagnosed error, otherwise the base function paired
/// with the expression referring to the variant.
///
/// NOTE(review): lines are missing from this listing (closing braces, several
/// 'Diag(' openers and continuation lines); comments describe only what is
/// visible.
Optional<std::pair<FunctionDecl *, Expr *>>
Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG,
Expr *VariantRef, OMPTraitInfo &TI,
SourceRange SR) {
if (!DG || DG.get().isNull())
return None;
// Value streamed into the diagnostics below; presumably selects the
// 'declare variant' wording of the shared simd/variant messages -- confirm
// against the diagnostic definitions.
const int VariantId = 1;
// Must be applied only to single decl.
if (!DG.get().isSingleDecl()) {
Diag(SR.getBegin(), diag::err_omp_single_decl_in_declare_simd_variant)
<< VariantId << SR;
return None;
Decl *ADecl = DG.get().getSingleDecl();
// For a function template, check the templated function itself.
if (auto *FTD = dyn_cast<FunctionTemplateDecl>(ADecl))
ADecl = FTD->getTemplatedDecl();
// Decl must be a function.
auto *FD = dyn_cast<FunctionDecl>(ADecl);
if (!FD) {
Diag(ADecl->getLocation(), diag::err_omp_function_expected)
<< VariantId << SR;
return None;
auto &&HasMultiVersionAttributes = [](const FunctionDecl *FD) {
return FD->hasAttrs() &&
(FD->hasAttr<CPUDispatchAttr>() || FD->hasAttr<CPUSpecificAttr>() ||
// OpenMP is not compatible with CPU-specific attributes.
if (HasMultiVersionAttributes(FD)) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_incompat_attributes)
<< SR;
return None;
// Allow #pragma omp declare variant only if the function is not used.
// (Only a warning is issued; checking continues.)
if (FD->isUsed(false))
Diag(SR.getBegin(), diag::warn_omp_declare_variant_after_used)
<< FD->getLocation();
// Check if the function was emitted already.
const FunctionDecl *Definition;
if (!FD->isThisDeclarationADefinition() && FD->isDefined(Definition) &&
(LangOpts.EmitAllDecls || Context.DeclMustBeEmitted(Definition)))
Diag(SR.getBegin(), diag::warn_omp_declare_variant_after_emitted)
<< FD->getLocation();
// The VariantRef must point to function.
if (!VariantRef) {
Diag(SR.getBegin(), diag::err_omp_function_expected) << VariantId;
return None;
auto ShouldDelayChecks = [](Expr *&E, bool) {
return E && (E->isTypeDependent() || E->isValueDependent() ||
E->containsUnexpandedParameterPack() ||
// Do not check templates, wait until instantiation.
if (FD->isDependentContext() || ShouldDelayChecks(VariantRef, false) ||
return std::make_pair(FD, VariantRef);
// Deal with non-constant score and user condition expressions.
// The callback returns true only for a non-constant user condition, which
// makes this function fail; a non-constant score is warned about and
// cleared instead.
auto HandleNonConstantScoresAndConditions = [this](Expr *&E,
bool IsScore) -> bool {
llvm::APSInt Result;
if (!E || E->isIntegerConstantExpr(Result, Context))
return false;
if (IsScore) {
// We warn on non-constant scores and pretend they were not present.
Diag(E->getExprLoc(), diag::warn_omp_declare_variant_score_not_constant)
<< E;
E = nullptr;
} else {
// We could replace a non-constant user condition with "false" but we
// will soon need to handle these anyway for the dynamic version of
// OpenMP context selectors.
<< E;
return true;
if (TI.anyScoreOrCondition(HandleNonConstantScoresAndConditions))
return None;
// Convert VariantRef expression to the type of the original function to
// resolve possible conflicts.
ExprResult VariantRefCast;
if (LangOpts.CPlusPlus) {
QualType FnPtrType;
auto *Method = dyn_cast<CXXMethodDecl>(FD);
// Non-static member functions are compared as pointers to members, all other
// functions as plain function pointers.
if (Method && !Method->isStatic()) {
const Type *ClassType =
FnPtrType = Context.getMemberPointerType(FD->getType(), ClassType);
ExprResult ER;
// Build addr_of unary op to correctly handle type checks for member
// functions.
Sema::TentativeAnalysisScope Trap(*this);
ER = CreateBuiltinUnaryOp(VariantRef->getBeginLoc(), UO_AddrOf,
if (!ER.isUsable()) {
Diag(VariantRef->getExprLoc(), diag::err_omp_function_expected)
<< VariantId << VariantRef->getSourceRange();
return None;
VariantRef = ER.get();
} else {
FnPtrType = Context.getPointerType(FD->getType());
ImplicitConversionSequence ICS =
TryImplicitConversion(VariantRef, FnPtrType.getUnqualifiedType(),
if (ICS.isFailure()) {
<< VariantRef->getType()
<< ((Method && !Method->isStatic()) ? FnPtrType : FD->getType())
<< VariantRef->getSourceRange();
return None;
VariantRefCast = PerformImplicitConversion(
VariantRef, FnPtrType.getUnqualifiedType(), AA_Converting);
if (!VariantRefCast.isUsable())
return None;
// Drop previously built artificial addr_of unary op for member functions.
if (Method && !Method->isStatic()) {
Expr *PossibleAddrOfVariantRef = VariantRefCast.get();
if (auto *UO = dyn_cast<UnaryOperator>(
VariantRefCast = UO->getSubExpr();
} else {
// No conversion is attempted outside C++.
VariantRefCast = VariantRef;
ExprResult ER = CheckPlaceholderExpr(VariantRefCast.get());
if (!ER.isUsable() ||
!ER.get()->IgnoreParenImpCasts()->getType()->isFunctionType()) {
Diag(VariantRef->getExprLoc(), diag::err_omp_function_expected)
<< VariantId << VariantRef->getSourceRange();
return None;
// The VariantRef must point to function.
auto *DRE = dyn_cast<DeclRefExpr>(ER.get()->IgnoreParenImpCasts());
if (!DRE) {
Diag(VariantRef->getExprLoc(), diag::err_omp_function_expected)
<< VariantId << VariantRef->getSourceRange();
return None;
auto *NewFD = dyn_cast_or_null<FunctionDecl>(DRE->getDecl());
if (!NewFD) {
Diag(VariantRef->getExprLoc(), diag::err_omp_function_expected)
<< VariantId << VariantRef->getSourceRange();
return None;
// Check if function types are compatible in C.
if (!LangOpts.CPlusPlus) {
QualType NewType =
Context.mergeFunctionTypes(FD->getType(), NewFD->getType());
if (NewType.isNull()) {
<< NewFD->getType() << FD->getType() << VariantRef->getSourceRange();
return None;
// If exactly one side lacks a prototype, give it one based on the other
// side's parameters.
if (NewType->isFunctionProtoType()) {
if (FD->getType()->isFunctionNoProtoType())
setPrototype(*this, FD, NewFD, NewType);
else if (NewFD->getType()->isFunctionNoProtoType())
setPrototype(*this, NewFD, FD, NewType);
// Check if variant function is not marked with declare variant directive.
if (NewFD->hasAttrs() && NewFD->hasAttr<OMPDeclareVariantAttr>()) {
<< VariantRef->getSourceRange();
// NOTE(review): this local SR shadows the SR parameter.
SourceRange SR =
Diag(SR.getBegin(), diag::note_omp_marked_declare_variant_here) << SR;
return None;
// Values streamed into err_omp_declare_variant_doesnt_support; presumably
// they index a %select in that diagnostic -- confirm against its definition.
enum DoesntSupport {
VirtFuncs = 1,
Constructors = 3,
Destructors = 4,
DeletedFuncs = 5,
DefaultedFuncs = 6,
ConstexprFuncs = 7,
ConstevalFuncs = 8,
if (const auto *CXXFD = dyn_cast<CXXMethodDecl>(FD)) {
if (CXXFD->isVirtual()) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_doesnt_support)
<< VirtFuncs;
return None;
if (isa<CXXConstructorDecl>(FD)) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_doesnt_support)
<< Constructors;
return None;
if (isa<CXXDestructorDecl>(FD)) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_doesnt_support)
<< Destructors;
return None;
if (FD->isDeleted()) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_doesnt_support)
<< DeletedFuncs;
return None;
if (FD->isDefaulted()) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_doesnt_support)
<< DefaultedFuncs;
return None;
// NOTE(review): the condition tests FD but the wording is chosen from NewFD's
// consteval-ness -- confirm that this asymmetry is intended.
if (FD->isConstexpr()) {
Diag(FD->getLocation(), diag::err_omp_declare_variant_doesnt_support)
<< (NewFD->isConsteval() ? ConstevalFuncs : ConstexprFuncs);
return None;
// Check general compatibility.
if (areMultiversionVariantFunctionsCompatible(
FD, NewFD, PartialDiagnostic::NullDiagnostic(),
<< FD->getLocation()),
/*TemplatesSupported=*/true, /*ConstexprSupported=*/false,
return None;
return std::make_pair(FD, cast<Expr>(DRE));
/// Builds an implicit OMPDeclareVariantAttr for a declare variant directive.
/// \param FD Function the directive refers to.
/// \param VariantRef Expression referencing the variant function.
/// \param TI Trait information; passed to the attribute by address.
/// \param SR Source range forwarded to the attribute.
void Sema::ActOnOpenMPDeclareVariantDirective(FunctionDecl *FD,
Expr *VariantRef,
OMPTraitInfo &TI,
SourceRange SR) {
// The attribute is created as implicit from the variant expression, the
// trait info and the source range.
auto *NewAttr =
OMPDeclareVariantAttr::CreateImplicit(Context, VariantRef, &TI, SR);
/// Builds the AST node for an OpenMP parallel directive.
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; may be null.
/// \param StartLoc Start location forwarded to the created directive.
/// \param EndLoc End location forwarded to the created directive.
/// \returns StmtError() if \p AStmt is null, otherwise the directive built by
/// OMPParallelDirective::Create.
StmtResult Sema::ActOnOpenMPParallelDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
// Without an associated statement there is nothing to build.
if (!AStmt)
return StmtError();
// The associated statement is required to be a CapturedStmt here.
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
return OMPParallelDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
namespace {
/// Iteration space of a single for loop: the expressions, flags and source
/// ranges that describe one loop of a loop nest. Every pointer member starts
/// out null, every flag false and the dependent-loop index zero.
struct LoopIterationSpace final {
/// True if the condition operator is the strict compare operator (<, > or
/// !=).
bool IsStrictCompare = false;
/// Condition of the loop.
Expr *PreCond = nullptr;
/// This expression calculates the number of iterations in the loop.
/// It is always possible to calculate it before starting the loop.
Expr *NumIterations = nullptr;
/// The loop counter variable.
Expr *CounterVar = nullptr;
/// Private loop counter variable.
Expr *PrivateCounterVar = nullptr;
/// This is the initializer for the initial value of #CounterVar.
Expr *CounterInit = nullptr;
/// This is the step for #CounterVar, used to generate its update:
/// #CounterVar = #CounterInit + #CounterStep * CurrentIteration.
Expr *CounterStep = nullptr;
/// Should step be subtracted?
bool Subtract = false;
/// Source range of the loop init.
SourceRange InitSrcRange;
/// Source range of the loop condition.
SourceRange CondSrcRange;
/// Source range of the loop increment.
SourceRange IncSrcRange;
/// Minimum value the loop control variable can take. Used to support
/// non-rectangular loops. Applied only for LCV with the non-iterator types,
/// since only such variables can be used in non-loop invariant expressions.
Expr *MinValue = nullptr;
/// Maximum value the loop control variable can take. Used to support
/// non-rectangular loops. Applied only for LCV with the non-iterator type,
/// since only such variables can be used in non-loop invariant expressions.
Expr *MaxValue = nullptr;
/// true, if the lower bound depends on the outer loop control var.
bool IsNonRectangularLB = false;
/// true, if the upper bound depends on the outer loop control var.
bool IsNonRectangularUB = false;
/// Index of the loop this loop depends on and forms non-rectangular loop
/// nest.
unsigned LoopDependentIdx = 0;
/// Final condition for the non-rectangular loop nest support. It is used to
/// check that the number of iterations for this particular counter must be
/// finished.
Expr *FinalCondition = nullptr;
/// Helper class for checking canonical form of the OpenMP loops and
/// extracting iteration space of each loop in the loop nest, that will be used
/// for IR generation.
///
/// The setters assert on the checker state: the loop counter and lower bound
/// must be set before the upper bound, and both before the step.
class OpenMPIterationSpaceChecker {
/// Reference to Sema.
Sema &SemaRef;
/// Data-sharing stack.
DSAStackTy &Stack;
/// A location for diagnostics (when there is no better location).
SourceLocation DefaultLoc;
/// A location for diagnostics (when increment is not compatible).
SourceLocation ConditionLoc;
/// A source location for referring to loop init later.
SourceRange InitSrcRange;
/// A source location for referring to condition later.
SourceRange ConditionSrcRange;
/// A source location for referring to increment later.
SourceRange IncrementSrcRange;
/// Loop variable.
ValueDecl *LCDecl = nullptr;
/// Reference to loop variable.
Expr *LCRef = nullptr;
/// Lower bound (initializer for the var).
Expr *LB = nullptr;
/// Upper bound.
Expr *UB = nullptr;
/// Loop step (increment).
Expr *Step = nullptr;
/// This flag is true when condition is one of:
/// Var < UB
/// Var <= UB
/// UB > Var
/// UB >= Var
/// This will have no value when the condition is !=
llvm::Optional<bool> TestIsLessOp;
/// This flag is true when condition is strict ( < or > ).
bool TestIsStrictOp = false;
/// This flag is true when step is subtracted on each iteration.
bool SubtractStep = false;
/// The outer loop counter this loop depends on (if any).
const ValueDecl *DepDecl = nullptr;
/// Contains number of loop (starts from 1) on which loop counter init
/// expression of this loop depends on.
Optional<unsigned> InitDependOnLC;
/// Contains number of loop (starts from 1) on which loop counter condition
/// expression of this loop depends on.
Optional<unsigned> CondDependOnLC;
/// Checks if the provided statement depends on the loop counter.
Optional<unsigned> doesDependOnLoopCounter(const Stmt *S, bool IsInitializer);
/// Original condition required for checking of the exit condition for
/// non-rectangular loop.
Expr *Condition = nullptr;
/// Creates a checker; \p DefaultLoc is also used as the initial condition
/// location.
OpenMPIterationSpaceChecker(Sema &SemaRef, DSAStackTy &Stack,
SourceLocation DefaultLoc)
: SemaRef(SemaRef), Stack(Stack), DefaultLoc(DefaultLoc),
ConditionLoc(DefaultLoc) {}
/// Check init-expr for canonical loop form and save loop counter
/// variable - #Var and its initialization value - #LB.
bool checkAndSetInit(Stmt *S, bool EmitDiags = true);
/// Check test-expr for canonical form, save upper-bound (#UB), flags
/// for less/greater and for strict/non-strict comparison.
bool checkAndSetCond(Expr *S);
/// Check incr-expr for canonical loop form and return true if it
/// does not conform, otherwise save loop step (#Step).
bool checkAndSetInc(Expr *S);
/// Return the loop counter variable.
ValueDecl *getLoopDecl() const { return LCDecl; }
/// Return the reference expression to loop counter variable.
Expr *getLoopDeclRefExpr() const { return LCRef; }
/// Source range of the loop init.
SourceRange getInitSrcRange() const { return InitSrcRange; }
/// Source range of the loop condition.
SourceRange getConditionSrcRange() const { return ConditionSrcRange; }
/// Source range of the loop increment.
SourceRange getIncrementSrcRange() const { return IncrementSrcRange; }
/// True if the step should be subtracted.
bool shouldSubtractStep() const { return SubtractStep; }
/// True, if the compare operator is strict (<, > or !=).
bool isStrictTestOp() const { return TestIsStrictOp; }
/// Build the expression to calculate the number of iterations.
Expr *buildNumIterations(
Scope *S, ArrayRef<LoopIterationSpace> ResultIterSpaces, bool LimitedType,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) const;
/// Build the precondition expression for the loops.
Expr *
buildPreCond(Scope *S, Expr *Cond,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) const;
/// Build reference expression to the counter to be used for codegen.
DeclRefExpr *
buildCounterVar(llvm::MapVector<const Expr *, DeclRefExpr *> &Captures,
DSAStackTy &DSA) const;
/// Build reference expression to the private counter to be used for
/// codegen.
Expr *buildPrivateCounterVar() const;
/// Build initialization of the counter to be used for codegen.
Expr *buildCounterInit() const;
/// Build step of the counter to be used for codegen.
Expr *buildCounterStep() const;
/// Build loop data with counter value for depend clauses in ordered
/// directives.
Expr *
buildOrderedLoopData(Scope *S, Expr *Counter,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures,
SourceLocation Loc, Expr *Inc = nullptr,
OverloadedOperatorKind OOK = OO_Amp);
/// Builds the minimum and maximum values for the loop counter.
std::pair<Expr *, Expr *> buildMinMaxValues(
Scope *S, llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) const;
/// Builds final condition for the non-rectangular loops.
Expr *buildFinalCondition(Scope *S) const;
/// Return true if any expression is dependent.
bool dependent() const;
/// Returns true if the initializer forms non-rectangular loop.
bool doesInitDependOnLC() const { return InitDependOnLC.hasValue(); }
/// Returns true if the condition forms non-rectangular loop.
bool doesCondDependOnLC() const { return CondDependOnLC.hasValue(); }
/// Returns index of the loop we depend on (starting from 1), or 0 otherwise.
/// The dependency recorded for the initializer takes precedence over the one
/// recorded for the condition.
unsigned getLoopDependentIdx() const {
return InitDependOnLC.getValueOr(CondDependOnLC.getValueOr(0));
/// Check the right-hand side of an assignment in the increment
/// expression.
bool checkAndSetIncRHS(Expr *RHS);
/// Helper to set loop counter variable and its initializer.
bool setLCDeclAndLB(ValueDecl *NewLCDecl, Expr *NewDeclRefExpr, Expr *NewLB,
bool EmitDiags);
/// Helper to set upper bound.
bool setUB(Expr *NewUB, llvm::Optional<bool> LessOp, bool StrictOp,
SourceRange SR, SourceLocation SL);
/// Helper to set loop increment.
bool setStep(Expr *NewStep, bool Subtract);
/// Reports whether the loop counter type is dependent or any of the recorded
/// bound/step expressions is value-dependent.
/// \returns false when no loop counter has been recorded yet.
bool OpenMPIterationSpaceChecker::dependent() const {
if (!LCDecl) {
// Bounds and step may only be set once the loop counter is known.
assert(!LB && !UB && !Step);
return false;
// Null bound/step expressions are skipped.
return LCDecl->getType()->isDependentType() ||
(LB && LB->isValueDependent()) || (UB && UB->isValueDependent()) ||
(Step && Step->isValueDependent());
/// Records the loop counter declaration, its reference expression and the
/// lower bound of the loop.
/// \param NewLCDecl Loop counter declaration; stored in canonical form.
/// \param NewLCRefExpr Expression referring to the loop counter.
/// \param NewLB Initializer of the loop counter.
/// \param EmitDiags When true, the lower bound is also checked for a
/// dependency on a loop counter.
/// \returns true if \p NewLCDecl or \p NewLB is null, false otherwise.
bool OpenMPIterationSpaceChecker::setLCDeclAndLB(ValueDecl *NewLCDecl,
Expr *NewLCRefExpr,
Expr *NewLB, bool EmitDiags) {
// State consistency checking to ensure correct usage.
assert(LCDecl == nullptr && LB == nullptr && LCRef == nullptr &&
UB == nullptr && Step == nullptr && !TestIsLessOp && !TestIsStrictOp);
if (!NewLCDecl || !NewLB)
return true;
LCDecl = getCanonicalDecl(NewLCDecl);
LCRef = NewLCRefExpr;
// If the initializer is a copy/move or non-explicit converting constructor
// call with a non-null first argument, use that argument (without parens and
// implicit casts) as the lower bound.
if (auto *CE = dyn_cast_or_null<CXXConstructExpr>(NewLB))
if (const CXXConstructorDecl *Ctor = CE->getConstructor())
if ((Ctor->isCopyOrMoveConstructor() ||
Ctor->isConvertingConstructor(/*AllowExplicit=*/false)) &&
CE->getNumArgs() > 0 && CE->getArg(0) != nullptr)
NewLB = CE->getArg(0)->IgnoreParenImpCasts();
LB = NewLB;
// The loop-counter dependency check is skipped when diagnostics are disabled.
if (EmitDiags)
InitDependOnLC = doesDependOnLoopCounter(LB, /*IsInitializer=*/true);
return false;
/// Records the upper bound of the loop and the properties of the comparison.
/// \param NewUB Upper bound expression.
/// \param LessOp Direction of the comparison; left unset by the caller when
/// it is not known yet.
/// \param StrictOp True if the comparison is strict.
/// \param SR Source range of the condition.
/// \param SL Location used for condition-related diagnostics.
/// \returns true if \p NewUB is null, false otherwise.
bool OpenMPIterationSpaceChecker::setUB(Expr *NewUB,
llvm::Optional<bool> LessOp,
bool StrictOp, SourceRange SR,
SourceLocation SL) {
// State consistency checking to ensure correct usage.
assert(LCDecl != nullptr && LB != nullptr && UB == nullptr &&
Step == nullptr && !TestIsLessOp && !TestIsStrictOp);
if (!NewUB)
return true;
UB = NewUB;
// Only overwrite the direction when the caller supplied one.
if (LessOp)
TestIsLessOp = LessOp;
TestIsStrictOp = StrictOp;
ConditionSrcRange = SR;
ConditionLoc = SL;
CondDependOnLC = doesDependOnLoopCounter(UB, /*IsInitializer=*/false);
return false;
/// Records the loop step and whether it is subtracted on each iteration.
/// \param NewStep Step expression.
/// \param Subtract True if the increment expression subtracts the step.
/// \returns true on error (null step, failed integer conversion or a step
/// whose direction contradicts the loop condition), false otherwise.
bool OpenMPIterationSpaceChecker::setStep(Expr *NewStep, bool Subtract) {
// State consistency checking to ensure correct usage.
assert(LCDecl != nullptr && LB != nullptr && Step == nullptr);
if (!NewStep)
return true;
if (!NewStep->isValueDependent()) {
// Check that the step is integer expression.
SourceLocation StepLoc = NewStep->getBeginLoc();
ExprResult Val = SemaRef.PerformOpenMPImplicitIntegerConversion(
StepLoc, getExprAsWritten(NewStep));
if (Val.isInvalid())
return true;
NewStep = Val.get();
// OpenMP [2.6, Canonical Loop Form, Restrictions]
// If test-expr is of form var relational-op b and relational-op is < or
// <= then incr-expr must cause var to increase on each iteration of the
// loop. If test-expr is of form var relational-op b and relational-op is
// > or >= then incr-expr must cause var to decrease on each iteration of
// the loop.
// If test-expr is of form b relational-op var and relational-op is < or
// <= then incr-expr must cause var to decrease on each iteration of the
// loop. If test-expr is of form b relational-op var and relational-op is
// > or >= then incr-expr must cause var to increase on each iteration of
// the loop.
llvm::APSInt Result;
bool IsConstant = NewStep->isIntegerConstantExpr(Result, SemaRef.Context);
bool IsUnsigned = !NewStep->getType()->hasSignedIntegerRepresentation();
// Effective sign of a signed constant step once Subtract is taken into
// account.
bool IsConstNeg =
IsConstant && Result.isSigned() && (Subtract != Result.isNegative());
bool IsConstPos =
IsConstant && Result.isSigned() && (Subtract == Result.isNegative());
bool IsConstZero = IsConstant && !Result.getBoolValue();
// != with increment is treated as <; != with decrement is treated as >
if (!TestIsLessOp.hasValue())
TestIsLessOp = IsConstPos || (IsUnsigned && !Subtract);
// Reject a zero step, and a step that moves the counter away from the bound.
if (UB && (IsConstZero ||
(TestIsLessOp.getValue() ?
(IsConstNeg || (IsUnsigned && Subtract)) :
(IsConstPos || (IsUnsigned && !Subtract))))) {
<< LCDecl << TestIsLessOp.getValue() << NewStep->getSourceRange();
<< TestIsLessOp.getValue() << ConditionSrcRange;
return true;
// When the comparison direction equals Subtract, negate the step and flip
// Subtract.
if (TestIsLessOp.getValue() == Subtract) {
NewStep =
SemaRef.CreateBuiltinUnaryOp(NewStep->getExprLoc(), UO_Minus, NewStep)
Subtract = !Subtract;
Step = NewStep;
SubtractStep = Subtract;
return false;
namespace {
/// Checker for the non-rectangular loops. Checks if the initializer or
/// condition expression references loop counter variable.
///
/// Visit() yields true when the visited statement references a variable that
/// the data-sharing stack reports as a loop control variable.
class LoopCounterRefChecker final
: public ConstStmtVisitor<LoopCounterRefChecker, bool> {
Sema &SemaRef;
DSAStackTy &Stack;
/// Counter of the loop currently being analyzed.
const ValueDecl *CurLCDecl = nullptr;
/// Loop counter the visited expression was found to depend on.
const ValueDecl *DepDecl = nullptr;
/// Dependency supplied by the creator of the checker, if any.
const ValueDecl *PrevDepDecl = nullptr;
/// True when checking the initializer, false when checking the condition.
bool IsInitializer = true;
/// Value reported by the stack for the loop counter we depend on.
unsigned BaseLoopId = 0;
/// Checks a single variable reference.
/// \returns true if \p VD is reported as a loop control variable and no
/// restriction is violated; false otherwise.
bool checkDecl(const Expr *E, const ValueDecl *VD) {
// The expression must not reference the counter of the loop it belongs to.
if (getCanonicalDecl(VD) == getCanonicalDecl(CurLCDecl)) {
SemaRef.Diag(E->getExprLoc(), diag::err_omp_stmt_depends_on_loop_counter)
<< (IsInitializer ? 0 : 1);
return false;
const auto &&Data = Stack.isLoopControlVariable(VD);
// OpenMP, 2.9.1 Canonical Loop Form, Restrictions.
// The type of the loop iterator on which we depend may not have a random
// access iterator type.
if (Data.first && VD->getType()->isRecordType()) {
SmallString<128> Name;
llvm::raw_svector_ostream OS(Name);
VD->getNameForDiagnostic(OS, SemaRef.getPrintingPolicy(),
<< OS.str();
SemaRef.Diag(VD->getLocation(), diag::note_previous_decl) << VD;
return false;
// A dependency is already known (found during this visit, or a previous one
// on a different variable): this reference is rejected.
if (Data.first &&
(DepDecl || (PrevDepDecl &&
getCanonicalDecl(VD) != getCanonicalDecl(PrevDepDecl)))) {
if (!DepDecl && PrevDepDecl)
DepDecl = PrevDepDecl;
SmallString<128> Name;
llvm::raw_svector_ostream OS(Name);
DepDecl->getNameForDiagnostic(OS, SemaRef.getPrintingPolicy(),
<< OS.str();
return false;
// Remember the loop counter we depend on and the value reported for it.
if (Data.first) {
DepDecl = VD;
BaseLoopId = Data.first;
return Data.first;
/// Only references to variables are checked.
bool VisitDeclRefExpr(const DeclRefExpr *E) {
const ValueDecl *VD = E->getDecl();
if (isa<VarDecl>(VD))
return checkDecl(E, VD);
return false;
/// Only members accessed through 'this' (ignoring parentheses) are checked.
bool VisitMemberExpr(const MemberExpr *E) {
if (isa<CXXThisExpr>(E->getBase()->IgnoreParens())) {
const ValueDecl *VD = E->getMemberDecl();
if (isa<VarDecl>(VD) || isa<FieldDecl>(VD))
return checkDecl(E, VD);
return false;
/// Visits all non-null children; every child is visited even after one of
/// them has already produced true.
bool VisitStmt(const Stmt *S) {
bool Res = false;
for (const Stmt *Child : S->children())
Res = (Child && Visit(Child)) || Res;
return Res;
explicit LoopCounterRefChecker(Sema &SemaRef, DSAStackTy &Stack,
const ValueDecl *CurLCDecl, bool IsInitializer,
const ValueDecl *PrevDepDecl = nullptr)
: SemaRef(SemaRef), Stack(Stack), CurLCDecl(CurLCDecl),
PrevDepDecl(PrevDepDecl), IsInitializer(IsInitializer) {}
/// Returns the value recorded for the loop counter we depend on (0 if none
/// was recorded).
unsigned getBaseLoopId() const {
assert(CurLCDecl && "Expected loop dependency.");
return BaseLoopId;
/// Returns the loop counter we depend on (null if none was recorded).
const ValueDecl *getDepDecl() const {
assert(CurLCDecl && "Expected loop dependency.");
return DepDecl;
} // namespace
// Runs LoopCounterRefChecker over S. When the checker reports a dependency,
// the loop counter it found is stored in DepDecl and the checker's base loop
// id is returned; otherwise llvm::None is returned.
OpenMPIterationSpaceChecker::doesDependOnLoopCounter(const Stmt *S,
bool IsInitializer) {
// Check for the non-rectangular loops.
LoopCounterRefChecker LoopStmtChecker(SemaRef, Stack, LCDecl, IsInitializer,
if (LoopStmtChecker.Visit(S)) {
DepDecl = LoopStmtChecker.getDepDecl();
return LoopStmtChecker.getBaseLoopId();
return llvm::None;
/// Checks the init-expr of a loop against the OpenMP canonical loop form and,
/// when it matches, records the loop counter and its lower bound through
/// setLCDeclAndLB().
/// \param S Init statement of the loop; may be null.
/// \param EmitDiags When false, the diagnostics issued here are suppressed.
/// \returns true if the init-expr is missing or not canonical; false if the
/// loop counter was recorded or if the checker/context is dependent.
bool OpenMPIterationSpaceChecker::checkAndSetInit(Stmt *S, bool EmitDiags) {
// Check init-expr for canonical loop form and save loop counter
// variable - #Var and its initialization value - #LB.
// OpenMP [2.6] Canonical loop form. init-expr may be one of the following:
// var = lb
// integer-type var = lb
// random-access-iterator-type var = lb
// pointer-type var = lb
if (!S) {
if (EmitDiags) {
SemaRef.Diag(DefaultLoc, diag::err_omp_loop_not_canonical_init);
return true;
// Look through cleanups that have no side effects.
if (auto *ExprTemp = dyn_cast<ExprWithCleanups>(S))
if (!ExprTemp->cleanupsHaveSideEffects())
S = ExprTemp->getSubExpr();
InitSrcRange = S->getSourceRange();
if (Expr *E = dyn_cast<Expr>(S))
S = E->IgnoreParens();
// Form 'var = lb' using the builtin assignment operator.
if (auto *BO = dyn_cast<BinaryOperator>(S)) {
if (BO->getOpcode() == BO_Assign) {
Expr *LHS = BO->getLHS()->IgnoreParens();
if (auto *DRE = dyn_cast<DeclRefExpr>(LHS)) {
// A captured expression whose initializer is a member access stands for
// that member.
if (auto *CED = dyn_cast<OMPCapturedExprDecl>(DRE->getDecl()))
if (auto *ME = dyn_cast<MemberExpr>(getExprAsWritten(CED->getInit())))
return setLCDeclAndLB(ME->getMemberDecl(), ME, BO->getRHS(),
return setLCDeclAndLB(DRE->getDecl(), DRE, BO->getRHS(), EmitDiags);
if (auto *ME = dyn_cast<MemberExpr>(LHS)) {
if (ME->isArrow() &&
return setLCDeclAndLB(ME->getMemberDecl(), ME, BO->getRHS(),
// Form 'type var = lb' as a single variable declaration.
} else if (auto *DS = dyn_cast<DeclStmt>(S)) {
if (DS->isSingleDecl()) {
if (auto *Var = dyn_cast_or_null<VarDecl>(DS->getSingleDecl())) {
if (Var->hasInit() && !Var->getType()->isReferenceType()) {
// Accept non-canonical init form here but emit ext. warning.
if (Var->getInitStyle() != VarDecl::CInit && EmitDiags)
<< S->getSourceRange();
return setLCDeclAndLB(
buildDeclRefExpr(SemaRef, Var,
Var->getInit(), EmitDiags);
// Form 'var = lb' using an overloaded assignment operator.
} else if (auto *CE = dyn_cast<CXXOperatorCallExpr>(S)) {
if (CE->getOperator() == OO_Equal) {
Expr *LHS = CE->getArg(0);
if (auto *DRE = dyn_cast<DeclRefExpr>(LHS)) {
if (auto *CED = dyn_cast<OMPCapturedExprDecl>(DRE->getDecl()))
if (auto *ME = dyn_cast<MemberExpr>(getExprAsWritten(CED->getInit())))
// BO (declared in the first condition of this if/else chain) is still in
// scope here but is null on this path, so the lower bound is taken from
// the second argument of the operator call instead.
return setLCDeclAndLB(ME->getMemberDecl(), ME, CE->getArg(1),
return setLCDeclAndLB(DRE->getDecl(), DRE, CE->getArg(1), EmitDiags);
if (auto *ME = dyn_cast<MemberExpr>(LHS)) {
if (ME->isArrow() &&
// Same as above: BO is null in this branch, use the call's second argument.
return setLCDeclAndLB(ME->getMemberDecl(), ME, CE->getArg(1),
// Nothing matched: no error is reported for dependent code.
if (dependent() || SemaRef.CurContext->isDependentContext())
return false;
if (EmitDiags) {
SemaRef.Diag(S->getBeginLoc(), diag::err_omp_loop_not_canonical_init)
<< S->getSourceRange();
return true;
/// Ignore parentheses, implicit casts and copy constructors, and return the
/// variable (which may be the loop variable) if possible.
/// \param E Expression to inspect; may be null.
/// \returns the canonical declaration of the referenced variable, or of the
/// member accessed as 'this->member'; nullptr if \p E is null or has any
/// other form.
static const ValueDecl *getInitLCDecl(const Expr *E) {
if (!E)
return nullptr;
E = getExprAsWritten(E);
// Look through a copy/move or non-explicit converting constructor call with a
// non-null first argument.
if (const auto *CE = dyn_cast_or_null<CXXConstructExpr>(E))
if (const CXXConstructorDecl *Ctor = CE->getConstructor())
if ((Ctor->isCopyOrMoveConstructor() ||
Ctor->isConvertingConstructor(/*AllowExplicit=*/false)) &&
CE->getNumArgs() > 0 && CE->getArg(0) != nullptr)
E = CE->getArg(0)->IgnoreParenImpCasts();
// Plain reference to a variable.
if (const auto *DRE = dyn_cast_or_null<DeclRefExpr>(E)) {
if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl()))
return getCanonicalDecl(VD);
// Member accessed through 'this->'.
if (const auto *ME = dyn_cast_or_null<MemberExpr>(E))
if (ME->isArrow() && isa<CXXThisExpr>(ME->getBase()->IgnoreParenImpCasts()))
return getCanonicalDecl(ME->getMemberDecl());
return nullptr;
/// Checks the test-expr of a loop against the OpenMP canonical loop form and,
/// when it matches, records the upper bound through setUB().
/// \param S Condition of the loop; may be null.
/// \returns true if the condition is missing or not canonical; false if the
/// bound was recorded or if the checker/context is dependent.
bool OpenMPIterationSpaceChecker::checkAndSetCond(Expr *S) {
// Check test-expr for canonical form, save upper-bound UB, flags for
// less/greater and for strict/non-strict comparison.
// OpenMP [2.9] Canonical loop form. Test-expr may be one of the following:
// var relational-op b
// b relational-op var
// The '!=' form is only accepted when the OpenMP language option is >= 50.
bool IneqCondIsCanonical = SemaRef.getLangOpts().OpenMP >= 50;
if (!S) {
SemaRef.Diag(DefaultLoc, diag::err_omp_loop_not_canonical_cond)
<< (IneqCondIsCanonical ? 1 : 0) << LCDecl;
return true;
// Keep the original condition before looking at its as-written form.
Condition = S;
S = getExprAsWritten(S);
SourceLocation CondLoc = S->getBeginLoc();
// Builtin comparison operators.
if (auto *BO = dyn_cast<BinaryOperator>(S)) {
if (BO->isRelationalOp()) {
// 'var relational-op b': "less" for < and <=, strict for < and >.
if (getInitLCDecl(BO->getLHS()) == LCDecl)
return setUB(BO->getRHS(),
(BO->getOpcode() == BO_LT || BO->getOpcode() == BO_LE),
(BO->getOpcode() == BO_LT || BO->getOpcode() == BO_GT),
BO->getSourceRange(), BO->getOperatorLoc());
// 'b relational-op var': "less" for > and >=, strict for < and >.
if (getInitLCDecl(BO->getRHS()) == LCDecl)
return setUB(BO->getLHS(),
(BO->getOpcode() == BO_GT || BO->getOpcode() == BO_GE),
(BO->getOpcode() == BO_LT || BO->getOpcode() == BO_GT),
BO->getSourceRange(), BO->getOperatorLoc());
// '!=': the operand that is not the loop counter on the left becomes the
// bound.
} else if (IneqCondIsCanonical && BO->getOpcode() == BO_NE)
return setUB(
getInitLCDecl(BO->getLHS()) == LCDecl ? BO->getRHS() : BO->getLHS(),
/*StrictOp=*/true, BO->getSourceRange(), BO->getOperatorLoc());
// Overloaded comparison operators with exactly two arguments.
} else if (auto *CE = dyn_cast<CXXOperatorCallExpr>(S)) {
if (CE->getNumArgs() == 2) {
auto Op = CE->getOperator();
switch (Op) {
case OO_Greater:
case OO_GreaterEqual:
case OO_Less:
case OO_LessEqual:
if (getInitLCDecl(CE->getArg(0)) == LCDecl)
return setUB(CE->getArg(1), Op == OO_Less || Op == OO_LessEqual,
Op == OO_Less || Op == OO_Greater, CE->getSourceRange(),
if (getInitLCDecl(CE->getArg(1)) == LCDecl)
return setUB(CE->getArg(0), Op == OO_Greater || Op == OO_GreaterEqual,
Op == OO_Less || Op == OO_Greater, CE->getSourceRange(),
case OO_ExclaimEqual:
if (IneqCondIsCanonical)
return setUB(getInitLCDecl(CE->getArg(0)) == LCDecl ? CE->getArg(1)
: CE->getArg(0),
/*StrictOp=*/true, CE->getSourceRange(),
// Nothing matched: no error is reported for dependent code.
if (dependent() || SemaRef.CurContext->isDependentContext())
return false;
SemaRef.Diag(CondLoc, diag::err_omp_loop_not_canonical_cond)
<< (IneqCondIsCanonical ? 1 : 0) << S->getSourceRange() << LCDecl;
return true;
/// Checks the right-hand side of 'var = ...' in the increment expression and,
/// when it matches, records the step through setStep().
/// \param RHS Right-hand side of the assignment.
/// \returns true if the expression is not canonical; false if the step was
/// recorded or if the checker/context is dependent.
bool OpenMPIterationSpaceChecker::checkAndSetIncRHS(Expr *RHS) {
// RHS of canonical loop form increment can be:
// var + incr
// incr + var
// var - incr
RHS = RHS->IgnoreParenImpCasts();
// Builtin + and -.
if (auto *BO = dyn_cast<BinaryOperator>(RHS)) {
if (BO->isAdditiveOp()) {
bool IsAdd = BO->getOpcode() == BO_Add;
// 'var + incr' or 'var - incr'; the step is subtracted for '-'.
if (getInitLCDecl(BO->getLHS()) == LCDecl)
return setStep(BO->getRHS(), !IsAdd);
// 'incr + var' is only accepted for addition.
if (IsAdd && getInitLCDecl(BO->getRHS()) == LCDecl)
return setStep(BO->getLHS(), /*Subtract=*/false);
// Overloaded + and - with exactly two arguments.
} else if (auto *CE = dyn_cast<CXXOperatorCallExpr>(RHS)) {
bool IsAdd = CE->getOperator() == OO_Plus;
if ((IsAdd || CE->getOperator() == OO_Minus) && CE->getNumArgs() == 2) {
if (getInitLCDecl(CE->getArg(0)) == LCDecl)
return setStep(CE->getArg(1), !IsAdd);
if (IsAdd && getInitLCDecl(CE->getArg(1)) == LCDecl)
return setStep(CE->getArg(0), /*Subtract=*/false);
// Nothing matched: no error is reported for dependent code.
if (dependent() || SemaRef.CurContext->isDependentContext())
return false;
SemaRef.Diag(RHS->getBeginLoc(), diag::err_omp_loop_not_canonical_incr)
<< RHS->getSourceRange() << LCDecl;
return true;
/// Checks the incr-expr of a loop against the OpenMP canonical loop form and,
/// when it matches, records the step through setStep() or
/// checkAndSetIncRHS().
/// \param S Increment expression of the loop; may be null.
/// \returns true if the increment is missing or not canonical; false if the
/// step was recorded or if the checker/context is dependent.
bool OpenMPIterationSpaceChecker::checkAndSetInc(Expr *S) {
// Check incr-expr for canonical loop form and return true if it
// does not conform.
// OpenMP [2.6] Canonical loop form. Incr-expr may be one of the following:
// ++var
// var++
// --var
// var--
// var += incr
// var -= incr
// var = var + incr
// var = incr + var
// var = var - incr
if (!S) {
SemaRef.Diag(DefaultLoc, diag::err_omp_loop_not_canonical_incr) << LCDecl;
return true;
// Look through cleanups that have no side effects.
if (auto *ExprTemp = dyn_cast<ExprWithCleanups>(S))
if (!ExprTemp->cleanupsHaveSideEffects())
S = ExprTemp->getSubExpr();
IncrementSrcRange = S->getSourceRange();
S = S->IgnoreParens();
// Builtin ++/--: the step is the constant 1 or -1.
if (auto *UO = dyn_cast<UnaryOperator>(S)) {
if (UO->isIncrementDecrementOp() &&
getInitLCDecl(UO->getSubExpr()) == LCDecl)
return setStep(SemaRef
(UO->isDecrementOp() ? -1 : 1))
// Builtin +=, -= and =.
} else if (auto *BO = dyn_cast<BinaryOperator>(S)) {
switch (BO->getOpcode()) {
case BO_AddAssign:
case BO_SubAssign:
if (getInitLCDecl(BO->getLHS()) == LCDecl)
return setStep(BO->getRHS(), BO->getOpcode() == BO_SubAssign);
case BO_Assign:
if (getInitLCDecl(BO->getLHS()) == LCDecl)
return checkAndSetIncRHS(BO->getRHS());
// Overloaded ++, --, +=, -= and =.
} else if (auto *CE = dyn_cast<CXXOperatorCallExpr>(S)) {
switch (CE->getOperator()) {
case OO_PlusPlus:
case OO_MinusMinus:
if (getInitLCDecl(CE->getArg(0)) == LCDecl)
return setStep(SemaRef
((CE->getOperator() == OO_MinusMinus) ? -1 : 1))
case OO_PlusEqual:
case OO_MinusEqual:
if (getInitLCDecl(CE->getArg(0)) == LCDecl)
return setStep(CE->getArg(1), CE->getOperator() == OO_MinusEqual);
case OO_Equal:
if (getInitLCDecl(CE->getArg(0)) == LCDecl)
return checkAndSetIncRHS(CE->getArg(1));
// Nothing matched: no error is reported for dependent code.
if (dependent() || SemaRef.CurContext->isDependentContext())
return false;
SemaRef.Diag(S->getBeginLoc(), diag::err_omp_loop_not_canonical_incr)
<< S->getSourceRange() << LCDecl;
return true;
/// Builds a capture for \p Capture unless that is unnecessary.
/// \param Capture Expression to capture.
/// \param Captures Map from already captured expressions to their references;
/// a newly built capture is recorded here.
/// \returns \p Capture itself in a dependent context or when it contains
/// errors; an implicit conversion of it when it is evaluatable; otherwise the
/// result of buildCapture().
static ExprResult
tryBuildCapture(Sema &SemaRef, Expr *Capture,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) {
if (SemaRef.CurContext->isDependentContext() || Capture->containsErrors())
return Capture;
// Evaluatable expressions are converted in place rather than captured.
if (Capture->isEvaluatable(SemaRef.Context, Expr::SE_AllowSideEffects))
return SemaRef.PerformImplicitConversion(
Capture->IgnoreImpCasts(), Capture->getType(), Sema::AA_Converting,
// Reuse the reference recorded for an expression that was captured before.
auto I = Captures.find(Capture);
if (I != Captures.end())
return buildCapture(SemaRef, Capture, I->second);
// First capture of this expression: build it and remember the reference.
DeclRefExpr *Ref = nullptr;
ExprResult Res = buildCapture(SemaRef, Capture, Ref);
Captures[Capture] = Ref;
return Res;
/// Calculate number of iterations, transforming to unsigned, if number of
/// iterations may be larger than the original type.
/// \param S Scope passed to the binary-operator builders.
/// \param DefaultLoc Location used for all built expressions.
/// \param Lower Lower bound expression.
/// \param Upper Upper bound expression.
/// \param Step Step expression; captured through tryBuildCapture().
/// \param LCTy Type of the loop counter.
/// \param TestIsStrictOp True if the loop condition is strict; adjusts the
/// difference by one.
/// \param RoundToStep True if the step is added to the difference before the
/// division.
/// \param Captures Map of captured expressions, updated for the step.
/// \returns the built expression, or nullptr if any sub-expression could not
/// be built.
static Expr *
calculateNumIters(Sema &SemaRef, Scope *S, SourceLocation DefaultLoc,
Expr *Lower, Expr *Upper, Expr *Step, QualType LCTy,
bool TestIsStrictOp, bool RoundToStep,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) {
ExprResult NewStep = tryBuildCapture(SemaRef, Step, Captures);
if (!NewStep.isUsable())
return nullptr;
llvm::APSInt LRes, URes, SRes;
bool IsLowerConst = Lower->isIntegerConstantExpr(LRes, SemaRef.Context);
bool IsStepConst = Step->isIntegerConstantExpr(SRes, SemaRef.Context);
// No conversion is needed for a constant lower bound without rounding when it
// is non-negative (non-strict test) or strictly positive (strict test).
bool NoNeedToConvert = IsLowerConst && !RoundToStep &&
((!TestIsStrictOp && LRes.isNonNegative()) ||
(TestIsStrictOp && LRes.isStrictlyPositive()));
bool NeedToReorganize = false;
// Check if any subexpressions in Lower -Step [+ 1] lead to overflow.
if (!NoNeedToConvert && IsLowerConst &&
(TestIsStrictOp || (RoundToStep && IsStepConst))) {
NoNeedToConvert = true;
if (RoundToStep) {
// Compute Lower - Step with one extra bit and check that the result still
// fits into the original width.
unsigned BW = LRes.getBitWidth() > SRes.getBitWidth()
? LRes.getBitWidth()
: SRes.getBitWidth();
LRes = LRes.extend(BW + 1);
SRes = SRes.extend(BW + 1);
LRes -= SRes;
NoNeedToConvert = LRes.trunc(BW).extend(BW + 1) == LRes;
LRes = LRes.trunc(BW);
if (TestIsStrictOp) {
unsigned BW = LRes.getBitWidth();
LRes = LRes.extend(BW + 1);
NoNeedToConvert =
NoNeedToConvert && LRes.trunc(BW).extend(BW + 1) == LRes;
// truncate to the original bitwidth.
LRes = LRes.trunc(BW);
NeedToReorganize = NoNeedToConvert;
bool IsUpperConst = Upper->isIntegerConstantExpr(URes, SemaRef.Context);
// With constant bounds, also check that Upper - Lower fits into the wider of
// the two widths.
if (NoNeedToConvert && IsLowerConst && IsUpperConst &&
(!RoundToStep || IsStepConst)) {
unsigned BW = LRes.getBitWidth() > URes.getBitWidth() ? LRes.getBitWidth()
: URes.getBitWidth();
LRes = LRes.extend(BW + 1);
URes = URes.extend(BW + 1);
URes -= LRes;
NoNeedToConvert = URes.trunc(BW).extend(BW + 1) == URes;
NeedToReorganize = NoNeedToConvert;
// If the boundaries are not constant or (Lower - Step [+ 1]) is not constant
// or less than zero (Upper - (Lower - Step [+ 1]) may overflow) - promote to
// unsigned.
if ((!NoNeedToConvert || (LRes.isNegative() && !IsUpperConst)) &&
!LCTy->isDependentType() && LCTy->isIntegerType()) {
QualType LowerTy = Lower->getType();
QualType UpperTy = Upper->getType();
uint64_t LowerSize = SemaRef.Context.getTypeSize(LowerTy);
uint64_t UpperSize = SemaRef.Context.getTypeSize(UpperTy);
// Convert only when the wider of the two bound types is signed; the target
// is the unsigned integer type of that width.
if ((LowerSize <= UpperSize && UpperTy->hasSignedIntegerRepresentation()) ||
(LowerSize > UpperSize && LowerTy->hasSignedIntegerRepresentation())) {
QualType CastType = SemaRef.Context.getIntTypeForBitwidth(
LowerSize > UpperSize ? LowerSize : UpperSize, /*Signed=*/0);
Upper =
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Upper).get(),
CastType, Sema::AA_Converting)
Lower = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Lower).get();
NewStep = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, NewStep.get());
if (!Lower || !Upper || NewStep.isInvalid())
return nullptr;
ExprResult Diff;
// If need to reorganize, then calculate the form as Upper - (Lower - Step [+
// 1]).
if (NeedToReorganize) {
Diff = Lower;
if (RoundToStep) {
// Lower - Step
Diff =
SemaRef.BuildBinOp(S, DefaultLoc, BO_Sub, Diff.get(), NewStep.get());
if (!Diff.isUsable())
return nullptr;
// Lower - Step [+ 1]
if (TestIsStrictOp)
Diff = SemaRef.BuildBinOp(
S, DefaultLoc, BO_Add, Diff.get(),
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get());
if (!Diff.isUsable())
return nullptr;
Diff = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Diff.get());
if (!Diff.isUsable())
return nullptr;
// Upper - (Lower - Step [+ 1]).
Diff = SemaRef.BuildBinOp(S, DefaultLoc, BO_Sub, Upper, Diff.get());
if (!Diff.isUsable())
return nullptr;
} else {
Diff = SemaRef.BuildBinOp(S, DefaultLoc, BO_Sub, Upper, Lower);
if (!Diff.isUsable() && LCTy->getAsCXXRecordDecl()) {
// BuildBinOp already emitted error, this one is to point user to upper
// and lower bound, and to tell what is passed to 'operator-'.
SemaRef.Diag(Upper->getBeginLoc(), diag::err_omp_loop_diff_cxx)
<< Upper->getSourceRange() << Lower->getSourceRange();
return nullptr;
if (!Diff.isUsable())
return nullptr;
// Upper - Lower [- 1]
if (TestIsStrictOp)
Diff = SemaRef.BuildBinOp(
S, DefaultLoc, BO_Sub, Diff.get(),
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get());
if (!Diff.isUsable())
return nullptr;
if (RoundToStep) {
// Upper - Lower [- 1] + Step
Diff =
SemaRef.BuildBinOp(S, DefaultLoc, BO_Add, Diff.get(), NewStep.get());
if (!Diff.isUsable())
return nullptr;
// Parentheses (for dumping/debugging purposes only).
Diff = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Diff.get());
if (!Diff.isUsable())
return nullptr;
// (Upper - Lower [- 1] + Step) / Step or (Upper - Lower) / Step
Diff = SemaRef.BuildBinOp(S, DefaultLoc, BO_Div, Diff.get(), NewStep.get());
if (!Diff.isUsable())
return nullptr;
return Diff.get();
/// Build the expression to calculate the number of iterations.
///
/// The lower and upper bounds are taken from LB/UB. When the init or the
/// condition depends on another loop counter (InitDependOnLC /
/// CondDependOnLC), the bound is first evaluated with that counter set to its
/// MinValue and to its MaxValue and the smaller/larger of the two results is
/// selected. The difference is then produced by calculateNumIters and
/// converted to an integer type.
///
/// \param S Scope forwarded to the Sema expression builders.
/// \param ResultIterSpaces Iteration spaces of the loop nest; MinValue,
/// MaxValue and CounterVar of one entry are read for dependent bounds.
/// \param LimitedType If true, the result is additionally converted to a
/// 32-bit or 64-bit integer type.
/// \param Captures Map handed to tryBuildCapture and calculateNumIters.
/// \returns The iteration-count expression, or nullptr if the early type
/// check fails or any intermediate expression cannot be built.
///
/// NOTE(review): a few expressions in this view are cut off mid-line (see
/// the notes below); the comments describe only what is visible.
Expr *OpenMPIterationSpaceChecker::buildNumIterations(
Scope *S, ArrayRef<LoopIterationSpace> ResultIterSpaces, bool LimitedType,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) const {
QualType VarType = LCDecl->getType().getNonReferenceType();
// NOTE(review): the last operand of this condition is not visible here.
if (!VarType->isIntegerType() && !VarType->isPointerType() &&
return nullptr;
Expr *LBVal = LB;
Expr *UBVal = UB;
// LB = TestIsLessOp.getValue() ? min(LB(MinVal), LB(MaxVal)) :
// max(LB(MinVal), LB(MaxVal))
if (InitDependOnLC) {
// NOTE(review): the index expression below is truncated in this view.
const LoopIterationSpace &IS =
ResultIterSpaces[ResultIterSpaces.size() - 1 -
if (!IS.MinValue || !IS.MaxValue)
return nullptr;
// OuterVar = Min
ExprResult MinValue =
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, IS.MinValue);
if (!MinValue.isUsable())
return nullptr;
ExprResult LBMinVal = SemaRef.BuildBinOp(S, DefaultLoc, BO_Assign,
IS.CounterVar, MinValue.get());
if (!LBMinVal.isUsable())
return nullptr;
// OuterVar = Min, LBVal
LBMinVal =
SemaRef.BuildBinOp(S, DefaultLoc, BO_Comma, LBMinVal.get(), LBVal);
if (!LBMinVal.isUsable())
return nullptr;
// (OuterVar = Min, LBVal)
LBMinVal = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, LBMinVal.get());
if (!LBMinVal.isUsable())
return nullptr;
// OuterVar = Max
ExprResult MaxValue =
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, IS.MaxValue);
if (!MaxValue.isUsable())
return nullptr;
ExprResult LBMaxVal = SemaRef.BuildBinOp(S, DefaultLoc, BO_Assign,
IS.CounterVar, MaxValue.get());
if (!LBMaxVal.isUsable())
return nullptr;
// OuterVar = Max, LBVal
LBMaxVal =
SemaRef.BuildBinOp(S, DefaultLoc, BO_Comma, LBMaxVal.get(), LBVal);
if (!LBMaxVal.isUsable())
return nullptr;
// (OuterVar = Max, LBVal)
LBMaxVal = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, LBMaxVal.get());
if (!LBMaxVal.isUsable())
return nullptr;
// Capture both candidate lower bounds before comparing them.
Expr *LBMin = tryBuildCapture(SemaRef, LBMinVal.get(), Captures).get();
Expr *LBMax = tryBuildCapture(SemaRef, LBMaxVal.get(), Captures).get();
if (!LBMin || !LBMax)
return nullptr;
// LB(MinVal) < LB(MaxVal)
ExprResult MinLessMaxRes =
SemaRef.BuildBinOp(S, DefaultLoc, BO_LT, LBMin, LBMax);
if (!MinLessMaxRes.isUsable())
return nullptr;
Expr *MinLessMax =
tryBuildCapture(SemaRef, MinLessMaxRes.get(), Captures).get();
if (!MinLessMax)
return nullptr;
if (TestIsLessOp.getValue()) {
// LB(MinVal) < LB(MaxVal) ? LB(MinVal) : LB(MaxVal) - min(LB(MinVal),
// LB(MaxVal))
ExprResult MinLB = SemaRef.ActOnConditionalOp(DefaultLoc, DefaultLoc,
MinLessMax, LBMin, LBMax);
if (!MinLB.isUsable())
return nullptr;
LBVal = MinLB.get();
} else {
// LB(MinVal) < LB(MaxVal) ? LB(MaxVal) : LB(MinVal) - max(LB(MinVal),
// LB(MaxVal))
ExprResult MaxLB = SemaRef.ActOnConditionalOp(DefaultLoc, DefaultLoc,
MinLessMax, LBMax, LBMin);
if (!MaxLB.isUsable())
return nullptr;
LBVal = MaxLB.get();
// UB = TestIsLessOp.getValue() ? max(UB(MinVal), UB(MaxVal)) :
// min(UB(MinVal), UB(MaxVal))
if (CondDependOnLC) {
// NOTE(review): the index expression below is truncated in this view.
const LoopIterationSpace &IS =
ResultIterSpaces[ResultIterSpaces.size() - 1 -
if (!IS.MinValue || !IS.MaxValue)
return nullptr;
// OuterVar = Min
ExprResult MinValue =
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, IS.MinValue);
if (!MinValue.isUsable())
return nullptr;
ExprResult UBMinVal = SemaRef.BuildBinOp(S, DefaultLoc, BO_Assign,
IS.CounterVar, MinValue.get());
if (!UBMinVal.isUsable())
return nullptr;
// OuterVar = Min, UBVal
UBMinVal =
SemaRef.BuildBinOp(S, DefaultLoc, BO_Comma, UBMinVal.get(), UBVal);
if (!UBMinVal.isUsable())
return nullptr;
// (OuterVar = Min, UBVal)
UBMinVal = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, UBMinVal.get());
if (!UBMinVal.isUsable())
return nullptr;
// OuterVar = Max
ExprResult MaxValue =
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, IS.MaxValue);
if (!MaxValue.isUsable())
return nullptr;
ExprResult UBMaxVal = SemaRef.BuildBinOp(S, DefaultLoc, BO_Assign,
IS.CounterVar, MaxValue.get());
if (!UBMaxVal.isUsable())
return nullptr;
// OuterVar = Max, UBVal
UBMaxVal =
SemaRef.BuildBinOp(S, DefaultLoc, BO_Comma, UBMaxVal.get(), UBVal);
if (!UBMaxVal.isUsable())
return nullptr;
// (OuterVar = Max, UBVal)
UBMaxVal = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, UBMaxVal.get());
if (!UBMaxVal.isUsable())
return nullptr;
// Capture both candidate upper bounds before comparing them.
Expr *UBMin = tryBuildCapture(SemaRef, UBMinVal.get(), Captures).get();
Expr *UBMax = tryBuildCapture(SemaRef, UBMaxVal.get(), Captures).get();
if (!UBMin || !UBMax)
return nullptr;
// UB(MinVal) > UB(MaxVal)
ExprResult MinGreaterMaxRes =
SemaRef.BuildBinOp(S, DefaultLoc, BO_GT, UBMin, UBMax);
if (!MinGreaterMaxRes.isUsable())
return nullptr;
Expr *MinGreaterMax =
tryBuildCapture(SemaRef, MinGreaterMaxRes.get(), Captures).get();
if (!MinGreaterMax)
return nullptr;
if (TestIsLessOp.getValue()) {
// UB(MinVal) > UB(MaxVal) ? UB(MinVal) : UB(MaxVal) - max(UB(MinVal),
// UB(MaxVal))
ExprResult MaxUB = SemaRef.ActOnConditionalOp(
DefaultLoc, DefaultLoc, MinGreaterMax, UBMin, UBMax);
if (!MaxUB.isUsable())
return nullptr;
UBVal = MaxUB.get();
} else {
// UB(MinVal) > UB(MaxVal) ? UB(MaxVal) : UB(MinVal) - min(UB(MinVal),
// UB(MaxVal))
ExprResult MinUB = SemaRef.ActOnConditionalOp(
DefaultLoc, DefaultLoc, MinGreaterMax, UBMax, UBMin);
if (!MinUB.isUsable())
return nullptr;
UBVal = MinUB.get();
// For a decreasing loop (TestIsLessOp is false) the roles of the two bounds
// are swapped so that Upper - Lower is what gets computed.
Expr *UBExpr = TestIsLessOp.getValue() ? UBVal : LBVal;
Expr *LBExpr = TestIsLessOp.getValue() ? LBVal : UBVal;
Expr *Upper = tryBuildCapture(SemaRef, UBExpr, Captures).get();
Expr *Lower = tryBuildCapture(SemaRef, LBExpr, Captures).get();
if (!Upper || !Lower)
return nullptr;
ExprResult Diff =
calculateNumIters(SemaRef, S, DefaultLoc, Lower, Upper, Step, VarType,
TestIsStrictOp, /*RoundToStep=*/true, Captures);
if (!Diff.isUsable())
return nullptr;
// OpenMP runtime requires 32-bit or 64-bit loop variables.
QualType Type = Diff.get()->getType();
ASTContext &C = SemaRef.Context;
// Prefer the loop variable's own type when the computed difference is wider
// than it.
bool UseVarType = VarType->hasIntegerRepresentation() &&
C.getTypeSize(Type) > C.getTypeSize(VarType);
if (!Type->isIntegerType() || UseVarType) {
unsigned NewSize =
UseVarType ? C.getTypeSize(VarType) : C.getTypeSize(Type);
bool IsSigned = UseVarType ? VarType->hasSignedIntegerRepresentation()
: Type->hasSignedIntegerRepresentation();
Type = C.getIntTypeForBitwidth(NewSize, IsSigned);
if (!SemaRef.Context.hasSameType(Diff.get()->getType(), Type)) {
Diff = SemaRef.PerformImplicitConversion(
Diff.get(), Type, Sema::AA_Converting, /*AllowExplicit=*/true);
if (!Diff.isUsable())
return nullptr;
if (LimitedType) {
// Anything wider than 32 bits is mapped to 64 bits, everything else to 32.
unsigned NewSize = (C.getTypeSize(Type) > 32) ? 64 : 32;
if (NewSize != C.getTypeSize(Type)) {
if (NewSize < C.getTypeSize(Type)) {
// The count is being narrowed to 64 bits; warn about it.
assert(NewSize == 64 && "incorrect loop var size");
SemaRef.Diag(DefaultLoc, diag::warn_omp_loop_64_bit_var)
<< InitSrcRange << ConditionSrcRange;
// The new type is signed if the old one was, or if it is being widened.
QualType NewType = C.getIntTypeForBitwidth(
NewSize, Type->hasSignedIntegerRepresentation() ||
C.getTypeSize(Type) < NewSize);
if (!SemaRef.Context.hasSameType(Diff.get()->getType(), NewType)) {
Diff = SemaRef.PerformImplicitConversion(Diff.get(), NewType,
Sema::AA_Converting, true);
if (!Diff.isUsable())
return nullptr;
return Diff.get();
/// Build the minimum and maximum value expressions for the loop counter,
/// based on the number of iterations.
///
/// One of the two values is the bound itself (Lower or Upper); the other is
/// derived as Lower + ((Upper - Lower [- 1]) / Step) * Step, or
/// Upper - ((Upper - Lower [- 1]) / Step) * Step.
///
/// \param S Scope forwarded to the Sema expression builders.
/// \param Captures Map handed to tryBuildCapture and calculateNumIters.
/// \returns {MinExpr, MaxExpr}, or {nullptr, nullptr} when the counter has
/// record type or any intermediate expression cannot be built.
///
/// NOTE(review): several statements in this view are cut off (missing
/// `else` lines and call arguments); comments describe only what is visible.
std::pair<Expr *, Expr *> OpenMPIterationSpaceChecker::buildMinMaxValues(
Scope *S, llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) const {
// Do not build for iterators, they cannot be used in non-rectangular loop
// nests.
if (LCDecl->getType()->isRecordType())
return std::make_pair(nullptr, nullptr);
// If we subtract, the min is in the condition, otherwise the min is in the
// init value.
Expr *MinExpr = nullptr;
Expr *MaxExpr = nullptr;
Expr *LBExpr = TestIsLessOp.getValue() ? LB : UB;
Expr *UBExpr = TestIsLessOp.getValue() ? UB : LB;
bool LBNonRect = TestIsLessOp.getValue() ? InitDependOnLC.hasValue()
: CondDependOnLC.hasValue();
bool UBNonRect = TestIsLessOp.getValue() ? CondDependOnLC.hasValue()
: InitDependOnLC.hasValue();
// A bound flagged as non-rectangular is used directly, without a capture.
Expr *Lower =
LBNonRect ? LBExpr : tryBuildCapture(SemaRef, LBExpr, Captures).get();
Expr *Upper =
UBNonRect ? UBExpr : tryBuildCapture(SemaRef, UBExpr, Captures).get();
if (!Upper || !Lower)
return std::make_pair(nullptr, nullptr);
// NOTE(review): an `else` between the two assignments below appears to be
// missing from this view.
if (TestIsLessOp.getValue())
MinExpr = Lower;
MaxExpr = Upper;
// Build minimum/maximum value based on number of iterations.
QualType VarType = LCDecl->getType().getNonReferenceType();
ExprResult Diff =
calculateNumIters(SemaRef, S, DefaultLoc, Lower, Upper, Step, VarType,
TestIsStrictOp, /*RoundToStep=*/false, Captures);
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
// ((Upper - Lower [- 1]) / Step) * Step
// Parentheses (for dumping/debugging purposes only).
Diff = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Diff.get());
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
ExprResult NewStep = tryBuildCapture(SemaRef, Step, Captures);
if (!NewStep.isUsable())
return std::make_pair(nullptr, nullptr);
Diff = SemaRef.BuildBinOp(S, DefaultLoc, BO_Mul, Diff.get(), NewStep.get());
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
// Parentheses (for dumping/debugging purposes only).
Diff = SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Diff.get());
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
// Convert to the ptrdiff_t, if original type is pointer.
// NOTE(review): part of this condition is not visible in this view.
if (VarType->isAnyPointerType() &&
SemaRef.Context.getUnsignedPointerDiffType())) {
Diff = SemaRef.PerformImplicitConversion(
Diff.get(), SemaRef.Context.getUnsignedPointerDiffType(),
Sema::AA_Converting, /*AllowExplicit=*/true);
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
if (TestIsLessOp.getValue()) {
// MinExpr = Lower;
// MaxExpr = Lower + (((Upper - Lower [- 1]) / Step) * Step)
// NOTE(review): the final argument of this call is not visible here.
Diff = SemaRef.BuildBinOp(
S, DefaultLoc, BO_Add,
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Lower).get(),
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
} else {
// MaxExpr = Upper;
// MinExpr = Upper - (((Upper - Lower [- 1]) / Step) * Step)
// NOTE(review): the final argument of this call is not visible here.
Diff = SemaRef.BuildBinOp(
S, DefaultLoc, BO_Sub,
SemaRef.ActOnParenExpr(DefaultLoc, DefaultLoc, Upper).get(),
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
// Convert to the original type.
// NOTE(review): other conversions in this file are guarded by
// `!hasSameType(...)`; this guard is not negated, so the conversion only
// runs when the types already match. Confirm this is intended.
if (SemaRef.Context.hasSameType(Diff.get()->getType(), VarType))
Diff = SemaRef.PerformImplicitConversion(Diff.get(), VarType,
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
Diff = SemaRef.ActOnFinishFullExpr(Diff.get(), /*DiscardedValue=*/false);
if (!Diff.isUsable())
return std::make_pair(nullptr, nullptr);
// NOTE(review): an `else` between the two assignments below appears to be
// missing from this view.
if (TestIsLessOp.getValue())
MaxExpr = Diff.get();
MinExpr = Diff.get();
return std::make_pair(MinExpr, MaxExpr);
/// Yield the stored loop condition when either InitDependOnLC or
/// CondDependOnLC is set; otherwise yield null.
Expr *OpenMPIterationSpaceChecker::buildFinalCondition(Scope *S) const {
  // Neither dependency is set: there is nothing to return.
  if (!InitDependOnLC && !CondDependOnLC)
    return nullptr;
  return Condition;
}
/// Build the precondition expression of the loop.
///
/// When the condition or the init is loop-counter dependent, the integer
/// constant 1 converted to bool is used. Otherwise the function tries to
/// build `LB <op> UB` from the captured bounds.
///
/// \param S Scope forwarded to Sema::BuildBinOp.
/// \param Cond Original loop condition, returned as a fallback when the
/// comparison cannot be built.
/// \param Captures Map handed to tryBuildCapture.
/// \returns The precondition, \p Cond as a fallback, or nullptr when either
/// bound cannot be captured.
///
/// NOTE(review): two PerformImplicitConversion calls below are cut off in
/// this view; their trailing arguments are not visible.
Expr *OpenMPIterationSpaceChecker::buildPreCond(
Scope *S, Expr *Cond,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) const {
// Do not build a precondition when the condition/initialization is dependent
// to prevent pessimistic early loop exit.
// TODO: this can be improved by calculating min/max values but not sure that
// it will be very effective.
if (CondDependOnLC || InitDependOnLC)
return SemaRef.PerformImplicitConversion(
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get(),
SemaRef.Context.BoolTy, /*Action=*/Sema::AA_Casting,
// Try to build LB <op> UB, where <op> is <, >, <=, or >=.
Sema::TentativeAnalysisScope Trap(SemaRef);
ExprResult NewLB = tryBuildCapture(SemaRef, LB, Captures);
ExprResult NewUB = tryBuildCapture(SemaRef, UB, Captures);
if (!NewLB.isUsable() || !NewUB.isUsable())
return nullptr;
// The operator is picked from the loop direction (TestIsLessOp) and from
// whether the comparison is strict (TestIsStrictOp).
ExprResult CondExpr =
SemaRef.BuildBinOp(S, DefaultLoc,
TestIsLessOp.getValue() ?
(TestIsStrictOp ? BO_LT : BO_LE) :
(TestIsStrictOp ? BO_GT : BO_GE),
NewLB.get(), NewUB.get());
if (CondExpr.isUsable()) {
// Convert the comparison result to bool when it is not bool already.
if (!SemaRef.Context.hasSameUnqualifiedType(CondExpr.get()->getType(),
CondExpr = SemaRef.PerformImplicitConversion(
CondExpr.get(), SemaRef.Context.BoolTy, /*Action=*/Sema::AA_Casting,
// Otherwise use original loop condition and evaluate it in runtime.
return CondExpr.isUsable() ? CondExpr.get() : Cond;
/// Build a reference expression to the counter to be used for codegen.
///
/// If the loop control declaration (LCDecl) is not a VarDecl, the
/// declaration returned by Sema::isOpenMPCapturedDecl is referenced instead,
/// and the (LCRef, Ref) pair may be recorded in \p Captures.
///
/// \param Captures Map that receives the (LCRef, Ref) pair.
/// \param DSA Data-sharing stack queried for the top DSA of LCDecl.
/// \returns The newly built reference, or LCRef cast to DeclRefExpr.
DeclRefExpr *OpenMPIterationSpaceChecker::buildCounterVar(
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures,
DSAStackTy &DSA) const {
auto *VD = dyn_cast<VarDecl>(LCDecl);
if (!VD) {
VD = SemaRef.isOpenMPCapturedDecl(LCDecl);
DeclRefExpr *Ref = buildDeclRefExpr(
SemaRef, VD, VD->getType().getNonReferenceType(), DefaultLoc);
const DSAStackTy::DSAVarData Data =
DSA.getTopDSA(LCDecl, /*FromParent=*/false);
// If the loop control decl is explicitly marked as private, do not mark it
// as captured again.
if (!isOpenMPPrivate(Data.CKind) || !Data.RefExpr)
Captures.insert(std::make_pair(LCRef, Ref));
return Ref;
// LCDecl is already a VarDecl: reuse the existing reference expression.
return cast<DeclRefExpr>(LCRef);
/// Build a reference to a new variable that mirrors the loop counter.
///
/// The new variable is created with buildVarDecl using the counter's
/// non-reference type, name and attributes.
///
/// \returns A DeclRefExpr to the new variable, or nullptr when LCDecl is
/// null or invalid, or when the new variable is invalid.
///
/// NOTE(review): the condition of the ternary that supplies the last
/// buildVarDecl argument is not visible in this view.
Expr *OpenMPIterationSpaceChecker::buildPrivateCounterVar() const {
if (LCDecl && !LCDecl->isInvalidDecl()) {
QualType Type = LCDecl->getType().getNonReferenceType();
VarDecl *PrivateVar = buildVarDecl(
SemaRef, DefaultLoc, Type, LCDecl->getName(),
LCDecl->hasAttrs() ? &LCDecl->getAttrs() : nullptr,
? buildDeclRefExpr(SemaRef, cast<VarDecl>(LCDecl), Type, DefaultLoc)
: nullptr);
if (PrivateVar->isInvalidDecl())
return nullptr;
return buildDeclRefExpr(SemaRef, PrivateVar, Type, DefaultLoc);
return nullptr;
/// Build the initialization of the counter to be used for codegen.
/// \returns The stored LB expression, unchanged.
Expr *OpenMPIterationSpaceChecker::buildCounterInit() const {
  return LB;
}
/// Build the step of the counter to be used for codegen.
/// \returns The stored Step expression, unchanged.
Expr *OpenMPIterationSpaceChecker::buildCounterStep() const {
  return Step;
}
/// Build the iteration-distance expression between the loop's LB and the
/// counter value, optionally offset by \p Inc.
///
/// \param S Scope forwarded to the Sema expression builders.
/// \param Counter Counter expression; an lvalue conversion is applied first.
/// \param Captures Map handed to tryBuildCapture and calculateNumIters.
/// \param Loc Location used for the `Counter (+|-) Inc` operation.
/// \param Inc Optional offset; when non-null it is added to or subtracted
/// from the counter according to \p OOK.
/// \param OOK Must be OO_Plus or OO_Minus when \p Inc is non-null.
/// \returns The expression built by calculateNumIters, or nullptr on failure.
///
/// NOTE(review): the last operand of the type check below is not visible in
/// this view.
Expr *OpenMPIterationSpaceChecker::buildOrderedLoopData(
Scope *S, Expr *Counter,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures, SourceLocation Loc,
Expr *Inc, OverloadedOperatorKind OOK) {
Expr *Cnt = SemaRef.DefaultLvalueConversion(Counter).get();
if (!Cnt)
return nullptr;
if (Inc) {
assert((OOK == OO_Plus || OOK == OO_Minus) &&
"Expected only + or - operations for depend clauses.");
BinaryOperatorKind BOK = (OOK == OO_Plus) ? BO_Add : BO_Sub;
Cnt = SemaRef.BuildBinOp(S, Loc, BOK, Cnt, Inc).get();
if (!Cnt)
return nullptr;
QualType VarType = LCDecl->getType().getNonReferenceType();
if (!VarType->isIntegerType() && !VarType->isPointerType() &&
return nullptr;
// Upper - Lower
// For an increasing loop the counter is the upper value and LB the lower
// one; for a decreasing loop the roles are swapped.
Expr *Upper = TestIsLessOp.getValue()
? Cnt
: tryBuildCapture(SemaRef, LB, Captures).get();
Expr *Lower = TestIsLessOp.getValue()
? tryBuildCapture(SemaRef, LB, Captures).get()
: Cnt;
if (!Upper || !Lower)
return nullptr;
ExprResult Diff = calculateNumIters(SemaRef, S, DefaultLoc, Lower, Upper,
Step, VarType, /*TestIsStrictOp=*/false,
/*RoundToStep=*/false, Captures);
if (!Diff.isUsable())
return nullptr;
return Diff.get();
} // namespace
/// Process the init statement of a loop associated with an OpenMP loop
/// directive.
///
/// When the current directive is a loop directive with associated loops
/// remaining, the init statement is analysed (without diagnostics), the loop
/// control variable is registered on the DSA stack, and its data-sharing
/// attribute is checked against the predetermined one. The associated-loops
/// counter on the DSA stack is then set to one less.
///
/// \param ForLoc Location of the `for` keyword.
/// \param Init Init statement of the loop; must not be null.
///
/// NOTE(review): many lines of this function are missing from this view
/// (truncated calls and conditions); comments describe only what is visible.
void Sema::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init) {
assert(getLangOpts().OpenMP && "OpenMP is not active.");
assert(Init && "Expected loop in canonical form.");
unsigned AssociatedLoops = DSAStack->getAssociatedLoops();
if (AssociatedLoops > 0 &&
isOpenMPLoopDirective(DSAStack->getCurrentDirective())) {
OpenMPIterationSpaceChecker ISC(*this, *DSAStack, ForLoc);
// checkAndSetInit returns false on success here; diagnostics are disabled.
if (!ISC.checkAndSetInit(Init, /*EmitDiags=*/false)) {
if (ValueDecl *D = ISC.getLoopDecl()) {
auto *VD = dyn_cast<VarDecl>(D);
DeclRefExpr *PrivateRef = nullptr;
// The loop decl is not a VarDecl: use the already captured decl if there
// is one, otherwise build a capture for it.
if (!VD) {
if (VarDecl *Private = isOpenMPCapturedDecl(D)) {
VD = Private;
} else {
PrivateRef = buildCapture(*this, D, ISC.getLoopDeclRefExpr(),
VD = cast<VarDecl>(PrivateRef->getDecl());
DSAStack->addLoopControlVariable(D, VD);
const Decl *LD = DSAStack->getPossiblyLoopCunter();
if (LD != D->getCanonicalDecl()) {
if (auto *Var = dyn_cast_or_null<VarDecl>(LD))
buildDeclRefExpr(*this, const_cast<VarDecl *>(Var),
ForLoc, /*RefersToCapture=*/true));
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
// OpenMP [, Data-sharing Attribute Rules for Variables
// Referenced in a Construct, C/C++]. The loop iteration variable in the
// associated for-loop of a simd construct with just one associated
// for-loop may be listed in a linear clause with a constant-linear-step
// that is the increment of the associated for-loop. The loop iteration
// variable(s) in the associated for-loop(s) of a for or parallel for
// construct may be listed in a private or lastprivate clause.
DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(D, /*FromParent=*/false);
// If LoopVarRefExpr is nullptr it means the corresponding loop variable
// is declared in the loop and it is predetermined as a private.
Expr *LoopDeclRefExpr = ISC.getLoopDeclRefExpr();
OpenMPClauseKind PredeterminedCKind =
? (DSAStack->hasMutipleLoops() ? OMPC_lastprivate : OMPC_linear)
: OMPC_private;
// Diagnose an explicit data-sharing attribute on the loop variable that
// conflicts with what is allowed for the current directive kind.
if (((isOpenMPSimdDirective(DKind) && DVar.CKind != OMPC_unknown &&
DVar.CKind != PredeterminedCKind && DVar.RefExpr &&
(LangOpts.OpenMP <= 45 || (DVar.CKind != OMPC_lastprivate &&
DVar.CKind != OMPC_private))) ||
((isOpenMPWorksharingDirective(DKind) || DKind == OMPD_taskloop ||
DKind == OMPD_master_taskloop ||
DKind == OMPD_parallel_master_taskloop ||
isOpenMPDistributeDirective(DKind)) &&
!isOpenMPSimdDirective(DKind) && DVar.CKind != OMPC_unknown &&
DVar.CKind != OMPC_private && DVar.CKind != OMPC_lastprivate)) &&
(DVar.CKind != OMPC_private || DVar.RefExpr)) {
Diag(Init->getBeginLoc(), diag::err_omp_loop_var_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPDirectiveName(DKind)
<< getOpenMPClauseName(PredeterminedCKind);
if (DVar.RefExpr == nullptr)
DVar.CKind = PredeterminedCKind;
reportOriginalDsa(*this, DSAStack, D, DVar,
} else if (LoopDeclRefExpr) {
// Make the loop iteration variable private (for worksharing
// constructs), linear (for simd directives with the only one
// associated loop) or lastprivate (for simd directives with several
// collapsed or ordered loops).
if (DVar.CKind == OMPC_unknown)
DSAStack->addDSA(D, LoopDeclRefExpr, PredeterminedCKind,
// One associated loop has now been processed.
DSAStack->setAssociatedLoops(AssociatedLoops - 1);
/// Called on a for stmt to check and extract its iteration space
/// for further processing (such as collapsing).
///
/// \param DKind Kind of the OpenMP directive the loop belongs to.
/// \param S Statement expected to be a ForStmt or, for OpenMP versions
/// above 45, a CXXForRangeStmt.
/// \param SemaRef Sema instance used for diagnostics and expression building.
/// \param DSA Data-sharing attributes stack.
/// \param CurrentNestedLoopCount Index of this loop; selects the entry of
/// \p ResultIterSpaces that is filled in.
/// \param NestedLoopCount Number of nested loops being processed.
/// \param TotalNestedLoopCount Loop count reported in diagnostics.
/// \param CollapseLoopCountExpr Expression of the 'collapse' clause, if any.
/// \param OrderedLoopCountExpr Expression of the 'ordered' clause, if any.
/// \param VarsWithImplicitDSA Variables with implicit data-sharing attributes.
/// \param ResultIterSpaces Output array of iteration-space descriptions.
/// \param Captures Map of captured expressions shared across the loop nest.
/// \returns true if an error was found, false otherwise.
///
/// NOTE(review): many lines of this function are missing from this view
/// (right-hand sides of assignments, diagnostic calls, closing braces);
/// comments describe only what is visible.
static bool checkOpenMPIterationSpace(
OpenMPDirectiveKind DKind, Stmt *S, Sema &SemaRef, DSAStackTy &DSA,
unsigned CurrentNestedLoopCount, unsigned NestedLoopCount,
unsigned TotalNestedLoopCount, Expr *CollapseLoopCountExpr,
Expr *OrderedLoopCountExpr,
Sema::VarsWithInheritedDSAType &VarsWithImplicitDSA,
llvm::MutableArrayRef<LoopIterationSpace> ResultIterSpaces,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) {
// OpenMP [2.9.1, Canonical Loop Form]
// for (init-expr; test-expr; incr-expr) structured-block
// for (range-decl: range-expr) structured-block
auto *For = dyn_cast_or_null<ForStmt>(S);
auto *CXXFor = dyn_cast_or_null<CXXForRangeStmt>(S);
// Ranged for is supported only in OpenMP 5.0.
if (!For && (SemaRef.LangOpts.OpenMP <= 45 || !CXXFor)) {
SemaRef.Diag(S->getBeginLoc(), diag::err_omp_not_for)
<< (CollapseLoopCountExpr != nullptr || OrderedLoopCountExpr != nullptr)
<< getOpenMPDirectiveName(DKind) << TotalNestedLoopCount
<< (CurrentNestedLoopCount > 0) << CurrentNestedLoopCount;
// With more than one loop expected, point at the clause(s) that requested
// them: 2 = both clauses, 0 = collapse only, 1 = ordered only.
if (TotalNestedLoopCount > 1) {
if (CollapseLoopCountExpr && OrderedLoopCountExpr)
<< 2 << CollapseLoopCountExpr->getSourceRange()
<< OrderedLoopCountExpr->getSourceRange();
else if (CollapseLoopCountExpr)
<< 0 << CollapseLoopCountExpr->getSourceRange();
<< 1 << OrderedLoopCountExpr->getSourceRange();
return true;
assert(((For && For->getBody()) || (CXXFor && CXXFor->getBody())) &&
"No loop body.");
OpenMPIterationSpaceChecker ISC(SemaRef, DSA,
For ? For->getForLoc() : CXXFor->getForLoc());
// Check init.
Stmt *Init = For ? For->getInit() : CXXFor->getBeginStmt();
if (ISC.checkAndSetInit(Init))
return true;
bool HasErrors = false;
// Check loop variable's type.
if (ValueDecl *LCDecl = ISC.getLoopDecl()) {
// OpenMP [2.6, Canonical Loop Form]
// Var is one of the following:
// A variable of signed or unsigned integer type.
// For C++, a variable of a random access iterator type.
// For C, a variable of a pointer type.
QualType VarType = LCDecl->getType().getNonReferenceType();
if (!VarType->isDependentType() && !VarType->isIntegerType() &&
!VarType->isPointerType() &&
!(SemaRef.getLangOpts().CPlusPlus && VarType->isOverloadableType())) {
SemaRef.Diag(Init->getBeginLoc(), diag::err_omp_loop_variable_type)
<< SemaRef.getLangOpts().CPlusPlus;
HasErrors = true;
// OpenMP, Data-sharing Attribute Rules for Variables Referenced in
// a Construct
// The loop iteration variable(s) in the associated for-loop(s) of a for or
// parallel for construct is (are) private.
// The loop iteration variable in the associated for-loop of a simd
// construct with just one associated for-loop is linear with a
// constant-linear-step that is the increment of the associated for-loop.
// Exclude loop var from the list of variables with implicitly defined data
// sharing attributes.
assert(isOpenMPLoopDirective(DKind) && "DSA for non-loop vars");
// Check test-expr.
HasErrors |= ISC.checkAndSetCond(For ? For->getCond() : CXXFor->getCond());
// Check incr-expr.
HasErrors |= ISC.checkAndSetInc(For ? For->getInc() : CXXFor->getInc());
// In dependent contexts, or after an error, the iteration space is not built.
if (ISC.dependent() || SemaRef.CurContext->isDependentContext() || HasErrors)
return HasErrors;
// Build the loop's iteration space representation.
ResultIterSpaces[CurrentNestedLoopCount].PreCond = ISC.buildPreCond(
DSA.getCurScope(), For ? For->getCond() : CXXFor->getCond(), Captures);
ResultIterSpaces[CurrentNestedLoopCount].NumIterations =
ISC.buildNumIterations(DSA.getCurScope(), ResultIterSpaces,
(isOpenMPWorksharingDirective(DKind) ||
isOpenMPTaskLoopDirective(DKind) ||
ResultIterSpaces[CurrentNestedLoopCount].CounterVar =
ISC.buildCounterVar(Captures, DSA);
ResultIterSpaces[CurrentNestedLoopCount].PrivateCounterVar =
ResultIterSpaces[CurrentNestedLoopCount].CounterInit = ISC.buildCounterInit();
ResultIterSpaces[CurrentNestedLoopCount].CounterStep = ISC.buildCounterStep();
ResultIterSpaces[CurrentNestedLoopCount].InitSrcRange = ISC.getInitSrcRange();
ResultIterSpaces[CurrentNestedLoopCount].CondSrcRange =
ResultIterSpaces[CurrentNestedLoopCount].IncSrcRange =
ResultIterSpaces[CurrentNestedLoopCount].Subtract = ISC.shouldSubtractStep();
ResultIterSpaces[CurrentNestedLoopCount].IsStrictCompare =
ResultIterSpaces[CurrentNestedLoopCount].MaxValue) =
ISC.buildMinMaxValues(DSA.getCurScope(), Captures);
ResultIterSpaces[CurrentNestedLoopCount].FinalCondition =
ResultIterSpaces[CurrentNestedLoopCount].IsNonRectangularLB =
ResultIterSpaces[CurrentNestedLoopCount].IsNonRectangularUB =
ResultIterSpaces[CurrentNestedLoopCount].LoopDependentIdx =
// Any of these expressions being null counts as an error.
HasErrors |=
(ResultIterSpaces[CurrentNestedLoopCount].PreCond == nullptr ||
ResultIterSpaces[CurrentNestedLoopCount].NumIterations == nullptr ||
ResultIterSpaces[CurrentNestedLoopCount].CounterVar == nullptr ||
ResultIterSpaces[CurrentNestedLoopCount].PrivateCounterVar == nullptr ||
ResultIterSpaces[CurrentNestedLoopCount].CounterInit == nullptr ||
ResultIterSpaces[CurrentNestedLoopCount].CounterStep == nullptr);
// For ordered regions, record per-loop data on the doacross depend clauses.
if (!HasErrors && DSA.isOrderedRegion()) {
if (DSA.getOrderedRegionParam().second->getNumForLoops()) {
if (CurrentNestedLoopCount <
DSA.getOrderedRegionParam().second->getLoopNumIterations().size()) {
for (auto &Pair : DSA.getDoacrossDependClauses()) {
if (CurrentNestedLoopCount >= Pair.first->getNumLoops()) {
// Erroneous case - clause has some problems.
if (Pair.first->getDependencyKind() == OMPC_DEPEND_sink &&
Pair.second.size() <= CurrentNestedLoopCount) {
// Erroneous case - clause has some problems.
Pair.first->setLoopData(CurrentNestedLoopCount, nullptr);
Expr *CntValue;
if (Pair.first->getDependencyKind() == OMPC_DEPEND_source)
CntValue = ISC.buildOrderedLoopData(
ResultIterSpaces[CurrentNestedLoopCount].CounterVar, Captures,
CntValue = ISC.buildOrderedLoopData(
ResultIterSpaces[CurrentNestedLoopCount].CounterVar, Captures,
Pair.first->setLoopData(CurrentNestedLoopCount, CntValue);
return HasErrors;
/// Build 'VarRef = Start'.
///
/// \param SemaRef Sema instance used to build the expressions.
/// \param S Scope forwarded to Sema::BuildBinOp.
/// \param Loc Location used for the assignment.
/// \param VarRef Reference to the variable being initialized.
/// \param Start Initial value expression.
/// \param IsNonRectangularLB If true, \p Start is used directly instead of
/// going through tryBuildCapture.
/// \param Captures Map handed to tryBuildCapture.
/// \returns The assignment expression, or ExprError() if the start value
/// cannot be built or converted.
///
/// NOTE(review): the trailing argument of the PerformImplicitConversion call
/// is not visible in this view.
static ExprResult
buildCounterInit(Sema &SemaRef, Scope *S, SourceLocation Loc, ExprResult VarRef,
ExprResult Start, bool IsNonRectangularLB,
llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) {
// Build 'VarRef = Start'.
ExprResult NewStart = IsNonRectangularLB
? Start.get()
: tryBuildCapture(SemaRef, Start.get(), Captures);
if (!NewStart.isUsable())
return ExprError();
// Convert the start value to the variable's type when the types differ.
if (!SemaRef.Context.hasSameType(NewStart.get()->getType(),
VarRef.get()->getType())) {
NewStart = SemaRef.PerformImplicitConversion(
NewStart.get(), VarRef.get()->getType(), Sema::AA_Converting,
if (!NewStart.isUsable())
return ExprError();
ExprResult Init =
SemaRef.BuildBinOp(S, Loc, BO_Assign, VarRef.get(), NewStart.get());
return Init;
/// Build 'VarRef = Start + Iter * Step'.
///
/// Two forms are tried. If any of the operand types is overloadable, the
/// function first attempts 'VarRef = Start, VarRef (+|-)= Iter * Step' inside
/// a tentative analysis scope. If that is not attempted or fails, it builds
/// 'VarRef = Start (+|-) Iter * Step', converting the right-hand side to the
/// type of \p VarRef when needed.
///
/// \param SemaRef Sema instance used to build the expressions.
/// \param S Scope forwarded to Sema::BuildBinOp.
/// \param Loc Location used for all built expressions.
/// \param VarRef Reference to the variable being updated.
/// \param Start Start value expression.
/// \param Iter Iteration number expression.
/// \param Step Step expression.
/// \param Subtract If true, 'Iter * Step' is subtracted instead of added.
/// \param IsNonRectangularLB If true, \p Start is not passed through
/// tryBuildCapture.
/// \param Captures Optional capture map; when null no captures are built.
/// \returns The update expression, or ExprError() on failure.
///
/// NOTE(review): two lines of this function are cut off in this view (the
/// end of the first usability check and the comma-expression call).
static ExprResult buildCounterUpdate(
Sema &SemaRef, Scope *S, SourceLocation Loc, ExprResult VarRef,
ExprResult Start, ExprResult Iter, ExprResult Step, bool Subtract,
bool IsNonRectangularLB,
llvm::MapVector<const Expr *, DeclRefExpr *> *Captures = nullptr) {
// Add parentheses (for debugging purposes only).
Iter = SemaRef.ActOnParenExpr(Loc, Loc, Iter.get());
if (!VarRef.isUsable() || !Start.isUsable() || !Iter.isUsable() ||
return ExprError();
ExprResult NewStep = Step;
if (Captures)
NewStep = tryBuildCapture(SemaRef, Step.get(), *Captures);
if (NewStep.isInvalid())
return ExprError();
// Iter * Step
ExprResult Update =
SemaRef.BuildBinOp(S, Loc, BO_Mul, Iter.get(), NewStep.get());
if (!Update.isUsable())
return ExprError();
// Try to build 'VarRef = Start, VarRef (+|-)= Iter * Step' or
// 'VarRef = Start (+|-) Iter * Step'.
if (!Start.isUsable())
return ExprError();
ExprResult NewStart = SemaRef.ActOnParenExpr(Loc, Loc, Start.get());
if (!NewStart.isUsable())
return ExprError();
if (Captures && !IsNonRectangularLB)
NewStart = tryBuildCapture(SemaRef, Start.get(), *Captures);
if (NewStart.isInvalid())
return ExprError();
// First attempt: try to build 'VarRef = Start, VarRef += Iter * Step'.
ExprResult SavedUpdate = Update;
ExprResult UpdateVal;
if (VarRef.get()->getType()->isOverloadableType() ||
NewStart.get()->getType()->isOverloadableType() ||
Update.get()->getType()->isOverloadableType()) {
Sema::TentativeAnalysisScope Trap(SemaRef);
Update =
SemaRef.BuildBinOp(S, Loc, BO_Assign, VarRef.get(), NewStart.get());
if (Update.isUsable()) {
UpdateVal =
SemaRef.BuildBinOp(S, Loc, Subtract ? BO_SubAssign : BO_AddAssign,
VarRef.get(), SavedUpdate.get());
if (UpdateVal.isUsable()) {
Update = SemaRef.CreateBuiltinBinOp(Loc, BO_Comma, Update.get(),
// Second attempt: try to build 'VarRef = Start (+|-) Iter * Step'.
if (!Update.isUsable() || !UpdateVal.isUsable()) {
Update = SemaRef.BuildBinOp(S, Loc, Subtract ? BO_Sub : BO_Add,
NewStart.get(), SavedUpdate.get());
if (!Update.isUsable())
return ExprError();
// Convert the computed value to the variable's type when the types differ.
if (!SemaRef.Context.hasSameType(Update.get()->getType(),
VarRef.get()->getType())) {
Update = SemaRef.PerformImplicitConversion(
Update.get(), VarRef.get()->getType(), Sema::AA_Converting, true);
if (!Update.isUsable())
return ExprError();
Update = SemaRef.BuildBinOp(S, Loc, BO_Assign, VarRef.get(), Update.get());
return Update;
/// Convert integer expression \a E to make it have at least \a Bits
/// bits.
///
/// \param Bits Minimum bit width requested for the result.
/// \param E Expression to widen; a null pointer yields ExprError().
/// \param SemaRef Sema instance providing the ASTContext and conversions.
/// \returns \a E unchanged if its type already has at least \a Bits bits,
/// otherwise \a E converted to a signed integer type of \a Bits bits.
///
/// NOTE(review): the trailing argument of the PerformImplicitConversion call
/// is not visible in this view.
static ExprResult widenIterationCount(unsigned Bits, Expr *E, Sema &SemaRef) {
if (E == nullptr)
return ExprError();
ASTContext &C = SemaRef.Context;
QualType OldType = E->getType();
unsigned HasBits = C.getTypeSize(OldType);
// Already wide enough: nothing to convert.
if (HasBits >= Bits)
return ExprResult(E);
// OK to convert to signed, because new type has more bits than old.
QualType NewType = C.getIntTypeForBitwidth(Bits, /* Signed */ true);
return SemaRef.PerformImplicitConversion(E, NewType, Sema::AA_Converting,
/// Check if the given expression \a E is a constant integer that fits
/// into \a Bits bits.
static bool fitsInto(unsigned Bits, bool Signed, const Expr *E, Sema &SemaRef) {
if (E == nullptr)
return false;
llvm::APSInt Result;
if (E->isIntegerConstantExpr(Result, SemaRef.Context))
return Signed ? Result.isSignedIntN(Bits) : Result.isIntN(Bits);
return false;
/// Wrap the declarations in \a PreInits into a single DeclStmt with empty
/// source locations. An empty list produces a null statement.
static Stmt *buildPreInits(ASTContext &Context,
                           MutableArrayRef<Decl *> PreInits) {
  if (PreInits.empty())
    return nullptr;
  DeclGroupRef Group =
      DeclGroupRef::Create(Context, PreInits.begin(), PreInits.size());
  return new (Context) DeclStmt(Group, SourceLocation(), SourceLocation());
}
/// Build preinits statement for the declarations of the given captures.
///
/// \param Context AST context forwarded to the MutableArrayRef overload.
/// \param Captures Captured expressions; an empty map yields nullptr.
/// \returns The statement built by the MutableArrayRef overload of
/// buildPreInits, or nullptr when there are no captures.
///
/// NOTE(review): the body of the range-for loop is not visible in this view;
/// presumably it appends each capture's declaration to PreInits - confirm.
static Stmt *
buildPreInits(ASTContext &Context,
const llvm::MapVector<const Expr *, DeclRefExpr *> &Captures) {
if (!Captures.empty()) {
SmallVector<Decl *, 16> PreInits;
for (const auto &Pair : Captures)
return buildPreInits(Context, PreInits);
return nullptr;
/// Build postupdate expression for the given list of postupdates expressions.
///
/// Each expression is passed through a C-style cast and the results are
/// chained left to right with the builtin comma operator.
///
/// \param S Sema instance used to build the casts and comma expressions.
/// \param PostUpdates Expressions to combine; may be empty.
/// \returns The combined expression, or nullptr when \p PostUpdates is empty.
///
/// NOTE(review): the BuildCStyleCastExpr call is cut off in this view, so
/// the target type of the cast is not visible.
static Expr *buildPostUpdate(Sema &S, ArrayRef<Expr *> PostUpdates) {
Expr *PostUpdate = nullptr;
if (!PostUpdates.empty()) {
for (Expr *E : PostUpdates) {
Expr *ConvE = S.BuildCStyleCastExpr(
E->getExprLoc(), E)
// The first expression starts the chain; later ones are appended with ','.
PostUpdate = PostUpdate
? S.CreateBuiltinBinOp(ConvE->getExprLoc(), BO_Comma,
PostUpdate, ConvE)
: ConvE;
return PostUpdate;
/// Called on a for stmt to check itself and nested loops (if any).
/// \return Returns 0 if one of the collapsed stmts is not canonical for loop,
/// number of collapsed loops otherwise.
static unsigned
checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
Expr *OrderedLoopCountExpr, Stmt *AStmt, Sema &SemaRef,
DSAStackTy &DSA,
Sema::VarsWithInheritedDSAType &VarsWithImplicitDSA,
OMPLoopDirective::HelperExprs &Built) {
unsigned NestedLoopCount = 1;
if (CollapseLoopCountExpr) {
// Found 'collapse' clause - calculate collapse number.
Expr::EvalResult Result;
if (!CollapseLoopCountExpr->isValueDependent() &&
CollapseLoopCountExpr->EvaluateAsInt(Result, SemaRef.getASTContext())) {
NestedLoopCount = Result.Val.getInt().getLimitedValue();
} else {
return 1;
unsigned OrderedLoopCount = 1;
if (OrderedLoopCountExpr) {
// Found 'ordered' clause - calculate the ordered loop count.
Expr::EvalResult EVResult;
if (!OrderedLoopCountExpr->isValueDependent() &&
SemaRef.getASTContext())) {
llvm::APSInt Result = EVResult.Val.getInt();
if (Result.getLimitedValue() < NestedLoopCount) {
<< OrderedLoopCountExpr->getSourceRange();
<< CollapseLoopCountExpr->getSourceRange();
OrderedLoopCount = Result.getLimitedValue();
} else {
return 1;
// This is helper routine for loop directives (e.g., 'for', 'simd',
// 'for simd', etc.).
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
SmallVector<LoopIterationSpace, 4> IterSpaces(
std::max(OrderedLoopCount, NestedLoopCount));
Stmt *CurStmt = AStmt->IgnoreContainers(/* IgnoreCaptured */ true);
for (unsigned Cnt = 0; Cnt < NestedLoopCount; ++Cnt) {
if (checkOpenMPIterationSpace(
DKind, CurStmt, SemaRef, DSA, Cnt, NestedLoopCount,
std::max(OrderedLoopCount, NestedLoopCount), CollapseLoopCountExpr,
OrderedLoopCountExpr, VarsWithImplicitDSA, IterSpaces, Captures))
return 0;
// Move on to the next nested for loop, or to the loop body.
// OpenMP [2.8.1, simd construct, Restrictions]
// All loops associated with the construct must be perfectly nested; that
// is, there must be no intervening code nor any OpenMP directive between
// any two loops.
if (auto *For = dyn_cast<ForStmt>(CurStmt)) {
CurStmt = For->getBody();
} else {
assert(isa<CXXForRangeStmt>(CurStmt) &&
"Expected canonical for or range-based for loops.");
CurStmt = cast<CXXForRangeStmt>(CurStmt)->getBody();
CurStmt = OMPLoopDirective::tryToFindNextInnerLoop(
CurStmt, SemaRef.LangOpts.OpenMP >= 50);
for (unsigned Cnt = NestedLoopCount; Cnt < OrderedLoopCount; ++Cnt) {
if (checkOpenMPIterationSpace(
DKind, CurStmt, SemaRef, DSA, Cnt, NestedLoopCount,
std::max(OrderedLoopCount, NestedLoopCount), CollapseLoopCountExpr,
OrderedLoopCountExpr, VarsWithImplicitDSA, IterSpaces, Captures))
return 0;
if (Cnt > 0 && IterSpaces[Cnt].CounterVar) {
// Handle initialization of captured loop iterator variables.
auto *DRE = cast<DeclRefExpr>(IterSpaces[Cnt].CounterVar);
if (isa<OMPCapturedExprDecl>(DRE->getDecl())) {
Captures[DRE] = DRE;
// Move on to the next nested for loop, or to the loop body.
// OpenMP [2.8.1, simd construct, Restrictions]
// All loops associated with the construct must be perfectly nested; that
// is, there must be no intervening code nor any OpenMP directive between
// any two loops.
if (auto *For = dyn_cast<ForStmt>(CurStmt)) {
CurStmt = For->getBody();
} else {
assert(isa<CXXForRangeStmt>(CurStmt) &&
"Expected canonical for or range-based for loops.");
CurStmt = cast<CXXForRangeStmt>(CurStmt)->getBody();
CurStmt = OMPLoopDirective::tryToFindNextInnerLoop(
CurStmt, SemaRef.LangOpts.OpenMP >= 50);
Built.clear(/* size */ NestedLoopCount);
if (SemaRef.CurContext->isDependentContext())
return NestedLoopCount;
// An example of what is generated for the following code:
// #pragma omp simd collapse(2) ordered(2)
// for (i = 0; i < NI; ++i)
// for (k = 0; k < NK; ++k)
// for (j = J0; j < NJ; j+=2) {
// <loop body>
// }
// We generate the code below.
// Note: the loop body may be outlined in CodeGen.
// Note: some counters may be C++ classes, operator- is used to find number of
// iterations and operator+= to calculate counter value.
// Note: decltype(NumIterations) must be integer type (in 'omp for', only i32
// or i64 is currently supported).
// #define NumIterations (NI * ((NJ - J0 - 1 + 2) / 2))
// for (int[32|64]_t IV = 0; IV < NumIterations; ++IV ) {
// .local.i = IV / ((NJ - J0 - 1 + 2) / 2);
// .local.j = J0 + (IV % ((NJ - J0 - 1 + 2) / 2)) * 2;
// // similar updates for vars in clauses (e.g. 'linear')
// <loop body (using local i and j)>
// }
// i = NI; // assign final values of counters
// j = NJ;
// Last iteration number is (I1 * I2 * ... In) - 1, where I1, I2 ... In are
// the iteration counts of the collapsed for loops.
// Precondition tests if there is at least one iteration (all conditions are
// true).
auto PreCond = ExprResult(IterSpaces[0].PreCond);
Expr *N0 = IterSpaces[0].NumIterations;
ExprResult LastIteration32 =
N0->IgnoreImpCasts(), N0->getType(),
Sema::AA_Converting, /*AllowExplicit=*/true)
ExprResult LastIteration64 = widenIterationCount(
.PerformImplicitConversion(N0->IgnoreImpCasts(), N0->getType(),
if (!LastIteration32.isUsable() || !LastIteration64.isUsable())
return NestedLoopCount;
ASTContext &C = SemaRef.Context;
bool AllCountsNeedLessThan32Bits = C.getTypeSize(N0->getType()) < 32;
Scope *CurScope = DSA.getCurScope();
for (unsigned Cnt = 1; Cnt < NestedLoopCount; ++Cnt) {
if (PreCond.isUsable()) {
PreCond =
SemaRef.BuildBinOp(CurScope, PreCond.get()->getExprLoc(), BO_LAnd,
PreCond.get(), IterSpaces[Cnt].PreCond);
Expr *N = IterSpaces[Cnt].NumIterations;
SourceLocation Loc = N->getExprLoc();
AllCountsNeedLessThan32Bits &= C.getTypeSize(N->getType()) < 32;
if (LastIteration32.isUsable())
LastIteration32 = SemaRef.BuildBinOp(
CurScope, Loc, BO_Mul, LastIteration32.get(),
.PerformImplicitConversion(N->IgnoreImpCasts(), N->getType(),
if (LastIteration64.isUsable())
LastIteration64 = SemaRef.BuildBinOp(
CurScope, Loc, BO_Mul, LastIteration64.get(),
.PerformImplicitConversion(N->IgnoreImpCasts(), N->getType(),
// Choose either the 32-bit or 64-bit version.
ExprResult LastIteration = LastIteration64;
if (SemaRef.getLangOpts().OpenMPOptimisticCollapse ||
(LastIteration32.isUsable() &&
C.getTypeSize(LastIteration32.get()->getType()) == 32 &&
(AllCountsNeedLessThan32Bits || NestedLoopCount == 1 ||
LastIteration64.get(), SemaRef))))
LastIteration = LastIteration32;
QualType VType = LastIteration.get()->getType();
QualType RealVType = VType;
QualType StrideVType = VType;
if (isOpenMPTaskLoopDirective(DKind)) {
VType =
SemaRef.Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0);
StrideVType =
SemaRef.Context.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1);
if (!LastIteration.isUsable())
return 0;
// Save the number of iterations.
ExprResult NumIterations = LastIteration;
LastIteration = SemaRef.BuildBinOp(
CurScope, LastIteration.get()->getExprLoc(), BO_Sub,
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get());
if (!LastIteration.isUsable())
return 0;
// Calculate the last iteration number beforehand instead of doing this on
// each iteration. Do not do this if the number of iterations may be kfold-ed.
llvm::APSInt Result;
bool IsConstant =
LastIteration.get()->isIntegerConstantExpr(Result, SemaRef.Context);
ExprResult CalcLastIteration;
if (!IsConstant) {
ExprResult SaveRef =
tryBuildCapture(SemaRef, LastIteration.get(), Captures);
LastIteration = SaveRef;
// Prepare SaveRef + 1.
NumIterations = SemaRef.BuildBinOp(
CurScope, SaveRef.get()->getExprLoc(), BO_Add, SaveRef.get(),
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get());
if (!NumIterations.isUsable())
return 0;
SourceLocation InitLoc = IterSpaces[0].InitSrcRange.getBegin();
// Build variables passed into runtime, necessary for worksharing directives.
ExprResult LB, UB, IL, ST, EUB, CombLB, CombUB, PrevLB, PrevUB, CombEUB;
if (isOpenMPWorksharingDirective(DKind) || isOpenMPTaskLoopDirective(DKind) ||
isOpenMPDistributeDirective(DKind)) {
// Lower bound variable, initialized with zero.
VarDecl *LBDecl = buildVarDecl(SemaRef, InitLoc, VType, "");
LB = buildDeclRefExpr(SemaRef, LBDecl, VType, InitLoc);
SemaRef.ActOnIntegerConstant(InitLoc, 0).get(),
/*DirectInit*/ false);
// Upper bound variable, initialized with last iteration number.
VarDecl *UBDecl = buildVarDecl(SemaRef, InitLoc, VType, ".omp.ub");
UB = buildDeclRefExpr(SemaRef, UBDecl, VType, InitLoc);
SemaRef.AddInitializerToDecl(UBDecl, LastIteration.get(),
/*DirectInit*/ false);
// A 32-bit variable-flag where runtime returns 1 for the last iteration.
// This will be used to implement clause 'lastprivate'.
QualType Int32Ty = SemaRef.Context.getIntTypeForBitwidth(32, true);
VarDecl *ILDecl = buildVarDecl(SemaRef, InitLoc, Int32Ty, ".omp.is_last");
IL = buildDeclRefExpr(SemaRef, ILDecl, Int32Ty, InitLoc);
SemaRef.ActOnIntegerConstant(InitLoc, 0).get(),
/*DirectInit*/ false);
// Stride variable returned by runtime (we initialize it to 1 by default).
VarDecl *STDecl =
buildVarDecl(SemaRef, InitLoc, StrideVType, ".omp.stride");
ST = buildDeclRefExpr(SemaRef, STDecl, StrideVType, InitLoc);
SemaRef.ActOnIntegerConstant(InitLoc, 1).get(),
/*DirectInit*/ false);
// Build expression: UB = min(UB, LastIteration)
// It is necessary for CodeGen of directives with static scheduling.
ExprResult IsUBGreater = SemaRef.BuildBinOp(CurScope, InitLoc, BO_GT,
UB.get(), LastIteration.get());
ExprResult CondOp = SemaRef.ActOnConditionalOp(
LastIteration.get()->getExprLoc(), InitLoc, IsUBGreater.get(),
LastIteration.get(), UB.get());
EUB = SemaRef.BuildBinOp(CurScope, InitLoc, BO_Assign, UB.get(),
EUB = SemaRef.ActOnFinishFullExpr(EUB.get(), /*DiscardedValue*/ false);
// If we have a combined directive that combines 'distribute', 'for' or
// 'simd' we need to be able to access the bounds of the schedule of the
// enclosing region. E.g. in 'distribute parallel for' the bounds obtained
// by scheduling 'distribute' have to be passed to the schedule of 'for'.
if (isOpenMPLoopBoundSharingDirective(DKind)) {
// Lower bound variable, initialized with zero.
VarDecl *CombLBDecl =
buildVarDecl(SemaRef, InitLoc, VType, "");
CombLB = buildDeclRefExpr(SemaRef, CombLBDecl, VType, InitLoc);
CombLBDecl, SemaRef.ActOnIntegerConstant(InitLoc, 0).get(),
/*DirectInit*/ false);
// Upper bound variable, initialized with last iteration number.
VarDecl *CombUBDecl =
buildVarDecl(SemaRef, InitLoc, VType, ".omp.comb.ub");
CombUB = buildDeclRefExpr(SemaRef, CombUBDecl, VType, InitLoc);
SemaRef.AddInitializerToDecl(CombUBDecl, LastIteration.get(),
/*DirectInit*/ false);
ExprResult CombIsUBGreater = SemaRef.BuildBinOp(
CurScope, InitLoc, BO_GT, CombUB.get(), LastIteration.get());
ExprResult CombCondOp =
SemaRef.ActOnConditionalOp(InitLoc, InitLoc, CombIsUBGreater.get(),
LastIteration.get(), CombUB.get());
CombEUB = SemaRef.BuildBinOp(CurScope, InitLoc, BO_Assign, CombUB.get(),
CombEUB =
SemaRef.ActOnFinishFullExpr(CombEUB.get(), /*DiscardedValue*/ false);
const CapturedDecl *CD = cast<CapturedStmt>(AStmt)->getCapturedDecl();
// We expect to have at least 2 more parameters than the 'parallel'
// directive does - the lower and upper bounds of the previous schedule.
assert(CD->getNumParams() >= 4 &&
"Unexpected number of parameters in loop combined directive");
// Set the proper type for the bounds given what we learned from the
// enclosed loops.
ImplicitParamDecl *PrevLBDecl = CD->getParam(/*PrevLB=*/2);
ImplicitParamDecl *PrevUBDecl = CD->getParam(/*PrevUB=*/3);
// Previous lower and upper bounds are obtained from the region
// parameters.
PrevLB =
buildDeclRefExpr(SemaRef, PrevLBDecl, PrevLBDecl->getType(), InitLoc);
PrevUB =
buildDeclRefExpr(SemaRef, PrevUBDecl, PrevUBDecl->getType(), InitLoc);
// Build the iteration variable and its initialization before loop.
ExprResult IV;
ExprResult Init, CombInit;
VarDecl *IVDecl = buildVarDecl(SemaRef, InitLoc, RealVType, ".omp.iv");
IV = buildDeclRefExpr(SemaRef, IVDecl, RealVType, InitLoc);
Expr *RHS =
(isOpenMPWorksharingDirective(DKind) ||
isOpenMPTaskLoopDirective(DKind) || isOpenMPDistributeDirective(DKind))
? LB.get()
: SemaRef.ActOnIntegerConstant(SourceLocation(), 0).get();
Init = SemaRef.BuildBinOp(CurScope, InitLoc, BO_Assign, IV.get(), RHS);
Init = SemaRef.ActOnFinishFullExpr(Init.get(), /*DiscardedValue*/ false);
if (isOpenMPLoopBoundSharingDirective(DKind)) {
Expr *CombRHS =
(isOpenMPWorksharingDirective(DKind) ||
isOpenMPTaskLoopDirective(DKind) ||
? CombLB.get()
: SemaRef.ActOnIntegerConstant(SourceLocation(), 0).get();
CombInit =
SemaRef.BuildBinOp(CurScope, InitLoc, BO_Assign, IV.get(), CombRHS);
CombInit =
SemaRef.ActOnFinishFullExpr(CombInit.get(), /*DiscardedValue*/ false);
bool UseStrictCompare =
RealVType->hasUnsignedIntegerRepresentation() &&
llvm::all_of(IterSpaces, [](const LoopIterationSpace &LIS) {
return LIS.IsStrictCompare;
// Loop condition (IV < NumIterations) or (IV <= UB or IV < UB + 1 (for
// unsigned IV)) for worksharing loops.
SourceLocation CondLoc = AStmt->getBeginLoc();
Expr *BoundUB = UB.get();
if (UseStrictCompare) {
BoundUB =
.BuildBinOp(CurScope, CondLoc, BO_Add, BoundUB,
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get())
BoundUB =
SemaRef.ActOnFinishFullExpr(BoundUB, /*DiscardedValue*/ false).get();
ExprResult Cond =
(isOpenMPWorksharingDirective(DKind) ||
isOpenMPTaskLoopDirective(DKind) || isOpenMPDistributeDirective(DKind))
? SemaRef.BuildBinOp(CurScope, CondLoc,
UseStrictCompare ? BO_LT : BO_LE, IV.get(),
: SemaRef.BuildBinOp(CurScope, CondLoc, BO_LT, IV.get(),
ExprResult CombDistCond;
if (isOpenMPLoopBoundSharingDirective(DKind)) {
CombDistCond = SemaRef.BuildBinOp(CurScope, CondLoc, BO_LT, IV.get(),
ExprResult CombCond;
if (isOpenMPLoopBoundSharingDirective(DKind)) {
Expr *BoundCombUB = CombUB.get();
if (UseStrictCompare) {
BoundCombUB =
CurScope, CondLoc, BO_Add, BoundCombUB,
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get())
BoundCombUB =
SemaRef.ActOnFinishFullExpr(BoundCombUB, /*DiscardedValue*/ false)
CombCond =
SemaRef.BuildBinOp(CurScope, CondLoc, UseStrictCompare ? BO_LT : BO_LE,
IV.get(), BoundCombUB);
// Loop increment (IV = IV + 1)
SourceLocation IncLoc = AStmt->getBeginLoc();
ExprResult Inc =
SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, IV.get(),
SemaRef.ActOnIntegerConstant(IncLoc, 1).get());
if (!Inc.isUsable())
return 0;
Inc = SemaRef.BuildBinOp(CurScope, IncLoc, BO_Assign, IV.get(), Inc.get());
Inc = SemaRef.ActOnFinishFullExpr(Inc.get(), /*DiscardedValue*/ false);
if (!Inc.isUsable())
return 0;
// Increments for worksharing loops (LB = LB + ST; UB = UB + ST).
// Used for directives with static scheduling.
// In combined construct, add combined version that use CombLB and CombUB
// base variables for the update
ExprResult NextLB, NextUB, CombNextLB, CombNextUB;
if (isOpenMPWorksharingDirective(DKind) || isOpenMPTaskLoopDirective(DKind) ||
isOpenMPDistributeDirective(DKind)) {
// LB + ST
NextLB = SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, LB.get(), ST.get());
if (!NextLB.isUsable())
return 0;
// LB = LB + ST
NextLB =
SemaRef.BuildBinOp(CurScope, IncLoc, BO_Assign, LB.get(), NextLB.get());
NextLB =
SemaRef.ActOnFinishFullExpr(NextLB.get(), /*DiscardedValue*/ false);
if (!NextLB.isUsable())
return 0;
// UB + ST
NextUB = SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, UB.get(), ST.get());
if (!NextUB.isUsable())
return 0;
// UB = UB + ST
NextUB =
SemaRef.BuildBinOp(CurScope, IncLoc, BO_Assign, UB.get(), NextUB.get());
NextUB =
SemaRef.ActOnFinishFullExpr(NextUB.get(), /*DiscardedValue*/ false);
if (!NextUB.isUsable())
return 0;
if (isOpenMPLoopBoundSharingDirective(DKind)) {
CombNextLB =
SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, CombLB.get(), ST.get());
if (!NextLB.isUsable())
return 0;
// LB = LB + ST
CombNextLB = SemaRef.BuildBinOp(CurScope, IncLoc, BO_Assign, CombLB.get(),
CombNextLB = SemaRef.ActOnFinishFullExpr(CombNextLB.get(),
/*DiscardedValue*/ false);
if (!CombNextLB.isUsable())
return 0;
// UB + ST
CombNextUB =
SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, CombUB.get(), ST.get());
if (!CombNextUB.isUsable())
return 0;
// UB = UB + ST
CombNextUB = SemaRef.BuildBinOp(CurScope, IncLoc, BO_Assign, CombUB.get(),
CombNextUB = SemaRef.ActOnFinishFullExpr(CombNextUB.get(),
/*DiscardedValue*/ false);
if (!CombNextUB.isUsable())
return 0;
// Create increment expression for distribute loop when combined in a same
// directive with for as IV = IV + ST; ensure upper bound expression based
// on PrevUB instead of NumIterations - used to implement 'for' when found
// in combination with 'distribute', like in 'distribute parallel for'
SourceLocation DistIncLoc = AStmt->getBeginLoc();
ExprResult DistCond, DistInc, PrevEUB, ParForInDistCond;
if (isOpenMPLoopBoundSharingDirective(DKind)) {
DistCond = SemaRef.BuildBinOp(
CurScope, CondLoc, UseStrictCompare ? BO_LT : BO_LE, IV.get(), BoundUB);
assert(DistCond.isUsable() && "distribute cond expr was not built");
DistInc =
SemaRef.BuildBinOp(CurScope, DistIncLoc, BO_Add, IV.get(), ST.get());
assert(DistInc.isUsable() && "distribute inc expr was not built");
DistInc = SemaRef.BuildBinOp(CurScope, DistIncLoc, BO_Assign, IV.get(),
DistInc =
SemaRef.ActOnFinishFullExpr(DistInc.get(), /*DiscardedValue*/ false);
assert(DistInc.isUsable() && "distribute inc expr was not built");
// Build expression: UB = min(UB, prevUB) for #for in composite or combined
// construct
SourceLocation DistEUBLoc = AStmt->getBeginLoc();
ExprResult IsUBGreater =
SemaRef.BuildBinOp(CurScope, DistEUBLoc, BO_GT, UB.get(), PrevUB.get());
ExprResult CondOp = SemaRef.ActOnConditionalOp(
DistEUBLoc, DistEUBLoc, IsUBGreater.get(), PrevUB.get(), UB.get());
PrevEUB = SemaRef.BuildBinOp(CurScope, DistIncLoc, BO_Assign, UB.get(),
PrevEUB =
SemaRef.ActOnFinishFullExpr(PrevEUB.get(), /*DiscardedValue*/ false);
// Build IV <= PrevUB or IV < PrevUB + 1 for unsigned IV to be used in
// parallel for is in combination with a distribute directive with
// schedule(static, 1)
Expr *BoundPrevUB = PrevUB.get();
if (UseStrictCompare) {
BoundPrevUB =
CurScope, CondLoc, BO_Add, BoundPrevUB,
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get())
BoundPrevUB =
SemaRef.ActOnFinishFullExpr(BoundPrevUB, /*DiscardedValue*/ false)
ParForInDistCond =
SemaRef.BuildBinOp(CurScope, CondLoc, UseStrictCompare ? BO_LT : BO_LE,
IV.get(), BoundPrevUB);
// Build updates and final values of the loop counters.
bool HasErrors = false;
// We implement the following algorithm for obtaining the
// original loop iteration variable values based on the
// value of the collapsed loop iteration variable IV.
// Let n+1 be the number of collapsed loops in the nest.
// Iteration variables (I0, I1, .... In)
// Iteration counts (N0, N1, ... Nn)
// Acc = IV;
// To compute Ik for loop k, 0 <= k <= n, generate:
// Prod = N(k+1) * N(k+2) * ... * Nn;
// Ik = Acc / Prod;
// Acc -= Ik * Prod;
ExprResult Acc = IV;
for (unsigned int Cnt = 0; Cnt < NestedLoopCount; ++Cnt) {
LoopIterationSpace &IS = IterSpaces[Cnt];
SourceLocation UpdLoc = IS.IncSrcRange.getBegin();
ExprResult Iter;
// Compute prod
ExprResult Prod =
SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get();
for (unsigned int K = Cnt+1; K < NestedLoopCount; ++K)
Prod = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Mul, Prod.get(),
// Iter = Acc / Prod
// If there is at least one more inner loop to avoid
// multiplication by 1.
if (Cnt + 1 < NestedLoopCount)
Iter = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Div,
Acc.get(), Prod.get());
Iter = Acc;
if (!Iter.isUsable()) {
HasErrors = true;
// Update Acc:
// Acc -= Iter * Prod
// Check if there is at least one more inner loop to avoid
// multiplication by 1.
if (Cnt + 1 < NestedLoopCount)
Prod = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Mul,
Iter.get(), Prod.get());
Prod = Iter;
Acc = SemaRef.BuildBinOp(CurScope, UpdLoc, BO_Sub,
Acc.get(), Prod.get());
// Build update: IS.CounterVar(Private) = IS.Start + Iter * IS.Step
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(IS.CounterVar)->getDecl());
DeclRefExpr *CounterVar = buildDeclRefExpr(
SemaRef, VD, IS.CounterVar->getType(), IS.CounterVar->getExprLoc(),
ExprResult Init =
buildCounterInit(SemaRef, CurScope, UpdLoc, CounterVar,
IS.CounterInit, IS.IsNonRectangularLB, Captures);
if (!Init.isUsable()) {
HasErrors = true;
ExprResult Update = buildCounterUpdate(
SemaRef, CurScope, UpdLoc, CounterVar, IS.CounterInit, Iter,
IS.CounterStep, IS.Subtract, IS.IsNonRectangularLB, &Captures);
if (!Update.isUsable()) {
HasErrors = true;
// Build final: IS.CounterVar = IS.Start + IS.NumIters * IS.Step
ExprResult Final =
buildCounterUpdate(SemaRef, CurScope, UpdLoc, CounterVar,
IS.CounterInit, IS.NumIterations, IS.CounterStep,
IS.Subtract, IS.IsNonRectangularLB, &Captures);
if (!Final.isUsable()) {
HasErrors = true;
if (!Update.isUsable() || !Final.isUsable()) {
HasErrors = true;
// Save results
Built.Counters[Cnt] = IS.CounterVar;
Built.PrivateCounters[Cnt] = IS.PrivateCounterVar;
Built.Inits[Cnt] = Init.get();
Built.Updates[Cnt] = Update.get();
Built.Finals[Cnt] = Final.get();
Built.DependentCounters[Cnt] = nullptr;
Built.DependentInits[Cnt] = nullptr;
Built.FinalsConditions[Cnt] = nullptr;
if (IS.IsNonRectangularLB || IS.IsNonRectangularUB) {
Built.DependentCounters[Cnt] =
Built.Counters[NestedLoopCount - 1 - IS.LoopDependentIdx];
Built.DependentInits[Cnt] =
Built.Inits[NestedLoopCount - 1 - IS.LoopDependentIdx];
Built.FinalsConditions[Cnt] = IS.FinalCondition;
if (HasErrors)
return 0;
// Save results
Built.IterationVarRef = IV.get();
Built.LastIteration = LastIteration.get();
Built.NumIterations = NumIterations.get();
Built.CalcLastIteration = SemaRef
Built.PreCond = PreCond.get();
Built.PreInits = buildPreInits(C, Captures);
Built.Cond = Cond.get();
Built.Init = Init.get();
Built.Inc = Inc.get();
Built.LB = LB.get();
Built.UB = UB.get();
Built.IL = IL.get();
Built.ST = ST.get();
Built.EUB = EUB.get();
Built.NLB = NextLB.get();
Built.NUB = NextUB.get();
Built.PrevLB = PrevLB.get();
Built.PrevUB = PrevUB.get();
Built.DistInc = DistInc.get();
Built.PrevEUB = PrevEUB.get();
Built.DistCombinedFields.LB = CombLB.get();
Built.DistCombinedFields.UB = CombUB.get();
Built.DistCombinedFields.EUB = CombEUB.get();
Built.DistCombinedFields.Init = CombInit.get();
Built.DistCombinedFields.Cond = CombCond.get();
Built.DistCombinedFields.NLB = CombNextLB.get();
Built.DistCombinedFields.NUB = CombNextUB.get();
Built.DistCombinedFields.DistCond = CombDistCond.get();
Built.DistCombinedFields.ParForInDistCond = ParForInDistCond.get();
return NestedLoopCount;
/// Returns the loop-count expression (getNumForLoops()) of the first entry in
/// the CollapseClauses range, or nullptr when that range is empty.
/// \param Clauses Clauses attached to the directive being analyzed.
static Expr *getCollapseNumberExpr(ArrayRef<OMPClause *> Clauses) {
// NOTE(review): the initializer of CollapseClauses is missing from this
// extract; presumably it selects the 'collapse' clauses out of Clauses --
// confirm against the upstream file.
auto CollapseClauses =
// A non-empty range means at least one matching clause; only the first one is
// consulted.
if (CollapseClauses.begin() != CollapseClauses.end())
return (*CollapseClauses.begin())->getNumForLoops();
return nullptr;
/// Returns the loop-count expression (getNumForLoops()) of the first entry in
/// the OrderedClauses range, or nullptr when that range is empty.
/// \param Clauses Clauses attached to the directive being analyzed.
static Expr *getOrderedNumberExpr(ArrayRef<OMPClause *> Clauses) {
// NOTE(review): the initializer of OrderedClauses is missing from this
// extract; presumably it selects the 'ordered' clauses out of Clauses --
// confirm against the upstream file.
auto OrderedClauses =
// A non-empty range means at least one matching clause; only the first one is
// consulted.
if (OrderedClauses.begin() != OrderedClauses.end())
return (*OrderedClauses.begin())->getNumForLoops();
return nullptr;
/// Checks the 'simdlen' / 'safelen' pair of a directive.
/// \param S Sema instance; its ASTContext is used to evaluate the clause
/// arguments.
/// \param Clauses Clauses attached to the directive.
/// \returns true when both clauses are present, both arguments are
/// non-dependent, and the evaluated simdlen value is greater than the safelen
/// value; false otherwise.
static bool checkSimdlenSafelenSpecified(Sema &S,
const ArrayRef<OMPClause *> Clauses) {
const OMPSafelenClause *Safelen = nullptr;
const OMPSimdlenClause *Simdlen = nullptr;
// Remember the safelen and simdlen clauses found while walking the list.
for (const OMPClause *Clause : Clauses) {
if (Clause->getClauseKind() == OMPC_safelen)
Safelen = cast<OMPSafelenClause>(Clause);
else if (Clause->getClauseKind() == OMPC_simdlen)
Simdlen = cast<OMPSimdlenClause>(Clause);
// NOTE(review): the statement controlled by the next 'if' (and the end of the
// loop) is missing from this extract; presumably the loop stops once both
// clauses were found -- confirm.
if (Safelen && Simdlen)
if (Simdlen && Safelen) {
const Expr *SimdlenLength = Simdlen->getSimdlen();
const Expr *SafelenLength = Safelen->getSafelen();
// Dependent arguments cannot be evaluated yet, so nothing is reported for
// them here.
// NOTE(review): the tail of both conditions below is cut off in this extract.
if (SimdlenLength->isValueDependent() || SimdlenLength->isTypeDependent() ||
SimdlenLength->isInstantiationDependent() ||
return false;
if (SafelenLength->isValueDependent() || SafelenLength->isTypeDependent() ||
SafelenLength->isInstantiationDependent() ||
return false;
// Evaluate both arguments as integers; the results of EvaluateAsInt are not
// inspected before the values are read.
Expr::EvalResult SimdlenResult, SafelenResult;
SimdlenLength->EvaluateAsInt(SimdlenResult, S.Context);
SafelenLength->EvaluateAsInt(SafelenResult, S.Context);
llvm::APSInt SimdlenRes = SimdlenResult.Val.getInt();
llvm::APSInt SafelenRes = SafelenResult.Val.getInt();
// OpenMP 4.5 [2.8.1, simd Construct, Restrictions]
// If both simdlen and safelen clauses are specified, the value of the
// simdlen parameter must be less than or equal to the value of the safelen
// parameter.
if (SimdlenRes > SafelenRes) {
// NOTE(review): the head of the diagnostic statement (the Diag call) is
// missing from this extract; only its source-range arguments remain.
<< SimdlenLength->getSourceRange() << SafelenLength->getSourceRange();
return true;
return false;
/// Semantic action for a 'simd' directive (OMPD_simd): analyzes the associated
/// loop nest with checkOpenMPLoop, finishes any 'linear' clauses, checks the
/// simdlen/safelen pair and creates the OMPSimdDirective node.
/// Returns StmtError() when AStmt is null or any of those checks fails.
/// NOTE(review): the return-type line of this definition is not visible in
/// this extract.
Sema::ActOnOpenMPSimdDirective(ArrayRef<OMPClause *> Clauses, Stmt *AStmt,
SourceLocation StartLoc, SourceLocation EndLoc,
VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
// B receives the helper expressions produced by checkOpenMPLoop.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' or 'ordered' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_simd, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses),
AStmt, *this, *DSAStack, VarsWithImplicitDSA, B);
// A count of zero is treated as failure of the loop analysis.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp simd loop exprs were not built");
// Linear clauses are only finalized outside of dependent contexts.
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
// NOTE(review): the last argument(s) of this call are cut off in this
// extract.
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPSimdDirective::Create(Context, StartLoc, EndLoc, NestedLoopCount,
Clauses, AStmt, B);
/// Semantic action for a 'for' directive (OMPD_for): analyzes the associated
/// loop nest with checkOpenMPLoop, finishes any 'linear' clauses and creates
/// the OMPForDirective node, passing along the taskgroup reduction reference
/// and cancel-region flag read from DSAStack.
/// Returns StmtError() when AStmt is null or any of those checks fails.
/// NOTE(review): the return-type line of this definition is not visible in
/// this extract.
Sema::ActOnOpenMPForDirective(ArrayRef<OMPClause *> Clauses, Stmt *AStmt,
SourceLocation StartLoc, SourceLocation EndLoc,
VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
// B receives the helper expressions produced by checkOpenMPLoop.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' or 'ordered' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_for, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses),
AStmt, *this, *DSAStack, VarsWithImplicitDSA, B);
// A count of zero is treated as failure of the loop analysis.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
// Linear clauses are only finalized outside of dependent contexts.
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
// NOTE(review): the last argument(s) of this call are cut off in this
// extract.
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
return OMPForDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
/// Semantic action for a 'for simd' directive (OMPD_for_simd): analyzes the
/// associated loop nest with checkOpenMPLoop, finishes any 'linear' clauses,
/// checks the simdlen/safelen pair and creates the OMPForSimdDirective node.
/// Returns StmtError() when AStmt is null or any of those checks fails.
StmtResult Sema::ActOnOpenMPForSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
// B receives the helper expressions produced by checkOpenMPLoop.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' or 'ordered' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_for_simd, getCollapseNumberExpr(Clauses),
getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A count of zero is treated as failure of the loop analysis.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for simd loop exprs were not built");
// Linear clauses are only finalized outside of dependent contexts.
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
// NOTE(review): the last argument(s) of this call are cut off in this
// extract.
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPForSimdDirective::Create(Context, StartLoc, EndLoc, NestedLoopCount,
Clauses, AStmt, B);
/// Semantic action for a 'sections' directive: checks the shape of the
/// associated statement and creates the OMPSectionsDirective node.
/// The statement found under the captured-statement wrappers must be a
/// non-empty compound statement; otherwise StmtError() is returned.
StmtResult Sema::ActOnOpenMPSectionsDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
// Peel off every nested CapturedStmt to reach the underlying statement.
auto BaseStmt = AStmt;
while (auto *CS = dyn_cast_or_null<CapturedStmt>(BaseStmt))
BaseStmt = CS->getCapturedStmt();
if (auto *C = dyn_cast_or_null<CompoundStmt>(BaseStmt)) {
auto S = C->children();
// An empty compound statement is rejected.
if (S.begin() == S.end())
return StmtError();
// All associated statements must be '#pragma omp section' except for
// the first one.
for (Stmt *SectionStmt : llvm::make_range(std::next(S.begin()), S.end())) {
if (!SectionStmt || !isa<OMPSectionDirective>(SectionStmt)) {
// NOTE(review): the statement controlled by this 'if' (presumably a
// diagnostic) is missing from this extract -- confirm.
if (SectionStmt)
return StmtError();
} else {
// The underlying statement is not a compound statement.
Diag(AStmt->getBeginLoc(), diag::err_omp_sections_not_compound_stmt);
return StmtError();
// NOTE(review): the trailing argument(s) of this call are cut off in this
// extract.
return OMPSectionsDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
/// Semantic action for a 'section' directive: creates the OMPSectionDirective
/// node for the captured statement, or returns StmtError() when AStmt is null.
StmtResult Sema::ActOnOpenMPSectionDirective(Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
// NOTE(review): the statements between the assert and the return, and the
// trailing argument(s) of this call, are not visible in this extract.
return OMPSectionDirective::Create(Context, StartLoc, EndLoc, AStmt,
/// Semantic action for a 'single' directive: rejects the combination of
/// 'copyprivate' and 'nowait' clauses and otherwise creates the
/// OMPSingleDirective node. Returns StmtError() when AStmt is null or both
/// clauses are present.
StmtResult Sema::ActOnOpenMPSingleDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
// OpenMP [2.7.3, single Construct, Restrictions]
// The copyprivate clause must not be used with the nowait clause.
const OMPClause *Nowait = nullptr;
const OMPClause *Copyprivate = nullptr;
// Remember the nowait and copyprivate clauses found while walking the list.
for (const OMPClause *Clause : Clauses) {
if (Clause->getClauseKind() == OMPC_nowait)
Nowait = Clause;
else if (Clause->getClauseKind() == OMPC_copyprivate)
Copyprivate = Clause;
if (Copyprivate && Nowait) {
// NOTE(review): only the note pointing at the nowait clause is visible here;
// the primary error diagnostic appears to be missing from this extract.
Diag(Nowait->getBeginLoc(), diag::note_omp_nowait_clause_here);
return StmtError();
return OMPSingleDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt);
/// Semantic action for a 'master' directive: creates the OMPMasterDirective
/// node for the captured statement, or returns StmtError() when AStmt is null.
StmtResult Sema::ActOnOpenMPMasterDirective(Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
// The associated statement is expected to be a CapturedStmt at this point.
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
return OMPMasterDirective::Create(Context, StartLoc, EndLoc, AStmt);
/// Semantic action for a 'critical' directive.
/// Validates the 'hint' clause (it requires a directive name), compares the
/// hint with the one recorded in DSAStack for an earlier critical directive
/// of the same name, and creates the OMPCriticalDirective node.
/// \param DirName Name of the critical region; may have no name.
/// Returns StmtError() when AStmt is null or a hint clause is used without a
/// name.
StmtResult Sema::ActOnOpenMPCriticalDirective(
const DeclarationNameInfo &DirName, ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
bool ErrorFound = false;
// Hint stays default-constructed and HintLoc invalid when no evaluable hint
// clause is present.
llvm::APSInt Hint;
SourceLocation HintLoc;
bool DependentHint = false;
for (const OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_hint) {
// A hint clause on an unnamed critical directive is an error.
if (!DirName.getName()) {
Diag(C->getBeginLoc(), diag::err_omp_hint_clause_no_name);
ErrorFound = true;
Expr *E = cast<OMPHintClause>(C)->getHint();
// A dependent hint expression cannot be evaluated yet; only record that fact.
if (E->isTypeDependent() || E->isValueDependent() ||
E->isInstantiationDependent()) {
DependentHint = true;
} else {
Hint = E->EvaluateKnownConstInt(Context);
HintLoc = C->getBeginLoc();
if (ErrorFound)
return StmtError();
// Pair.first is the previously recorded critical directive with this name
// (if any) and Pair.second the hint value stored with it.
const auto Pair = DSAStack->getCriticalWithHint(DirName);
if (Pair.first && DirName.getName() && !DependentHint) {
// The hint differs from the recorded one: report the error and attach notes
// describing both directives. The function does not return early here.
if (llvm::APSInt::compareValues(Hint, Pair.second) != 0) {
Diag(StartLoc, diag::err_omp_critical_with_hint);
// NOTE(review): an 'else' between the two notes below appears to be missing
// from this extract -- confirm against the upstream file.
if (HintLoc.isValid())
Diag(HintLoc, diag::note_omp_critical_hint_here)
<< 0 << Hint.toString(/*Radix=*/10, /*Signed=*/false);
Diag(StartLoc, diag::note_omp_critical_no_hint) << 0;
if (const auto *C = Pair.first->getSingleClause<OMPHintClause>()) {
Diag(C->getBeginLoc(), diag::note_omp_critical_hint_here)
<< 1
<< C->getHint()->EvaluateKnownConstInt(Context).toString(
/*Radix=*/10, /*Signed=*/false);
} else {
Diag(Pair.first->getBeginLoc(), diag::note_omp_critical_no_hint) << 1;
auto *Dir = OMPCriticalDirective::Create(Context, DirName, StartLoc, EndLoc,
Clauses, AStmt);
// Record the first named critical directive together with its hint so that
// later directives of the same name can be compared against it.
if (!Pair.first && DirName.getName() && !DependentHint)
DSAStack->addCriticalWithHint(Dir, Hint);
return Dir;
/// Semantic action for a 'parallel for' directive (OMPD_parallel_for):
/// analyzes the associated loop nest with checkOpenMPLoop, finishes any
/// 'linear' clauses and creates the OMPParallelForDirective node, passing
/// along the taskgroup reduction reference and cancel-region flag read from
/// DSAStack.
/// Returns StmtError() when AStmt is null or any of those checks fails.
StmtResult Sema::ActOnOpenMPParallelForDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
// NOTE(review): CS is not used by any statement visible in this extract; the
// statement that follows the comment below appears to be missing.
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// B receives the helper expressions produced by checkOpenMPLoop.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' or 'ordered' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_parallel_for, getCollapseNumberExpr(Clauses),
getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A count of zero is treated as failure of the loop analysis.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp parallel for loop exprs were not built");
// Linear clauses are only finalized outside of dependent contexts.
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
// NOTE(review): the last argument(s) of this call are cut off in this
// extract.
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
return OMPParallelForDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
/// Semantic action for a 'parallel for simd' directive
/// (OMPD_parallel_for_simd): analyzes the associated loop nest with
/// checkOpenMPLoop, finishes any 'linear' clauses, checks the simdlen/safelen
/// pair and creates the OMPParallelForSimdDirective node.
/// Returns StmtError() when AStmt is null or any of those checks fails.
StmtResult Sema::ActOnOpenMPParallelForSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
// NOTE(review): CS is not used by any statement visible in this extract; the
// statement that follows the comment below appears to be missing.
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// B receives the helper expressions produced by checkOpenMPLoop.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' or 'ordered' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_parallel_for_simd, getCollapseNumberExpr(Clauses),
getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A count of zero is treated as failure of the loop analysis.
if (NestedLoopCount == 0)
return StmtError();
// Linear clauses are only finalized outside of dependent contexts.
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
// NOTE(review): the last argument(s) of this call are cut off in this
// extract.
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPParallelForSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for a 'parallel master' directive: creates the
/// OMPParallelMasterDirective node for the captured statement, or returns
/// StmtError() when AStmt is null.
/// NOTE(review): the return-type line of this definition is not visible in
/// this extract.
Sema::ActOnOpenMPParallelMasterDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
// NOTE(review): CS is not used by any statement visible in this extract; the
// statement that follows the comment below appears to be missing.
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// NOTE(review): the trailing argument(s) of this call are cut off in this
// extract.
return OMPParallelMasterDirective::Create(
Context, StartLoc, EndLoc, Clauses, AStmt,
/// Semantic action for a 'parallel sections' directive: checks the shape of
/// the associated statement and creates the OMPParallelSectionsDirective
/// node, passing along the taskgroup reduction reference and cancel-region
/// flag read from DSAStack.
/// The statement found under the captured-statement wrappers must be a
/// non-empty compound statement; otherwise StmtError() is returned.
/// NOTE(review): the return-type line of this definition is not visible in
/// this extract.
Sema::ActOnOpenMPParallelSectionsDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
// Peel off every nested CapturedStmt to reach the underlying statement.
auto BaseStmt = AStmt;
while (auto *CS = dyn_cast_or_null<CapturedStmt>(BaseStmt))
BaseStmt = CS->getCapturedStmt();
if (auto *C = dyn_cast_or_null<CompoundStmt>(BaseStmt)) {
auto S = C->children();
// An empty compound statement is rejected.
if (S.begin() == S.end())
return StmtError();
// All associated statements must be '#pragma omp section' except for
// the first one.
for (Stmt *SectionStmt : llvm::make_range(std::next(S.begin()), S.end())) {
if (!SectionStmt || !isa<OMPSectionDirective>(SectionStmt)) {
// NOTE(review): the statement controlled by this 'if' (presumably a
// diagnostic) is missing from this extract -- confirm.
if (SectionStmt)
return StmtError();
} else {
// The underlying statement is not a compound statement.
// NOTE(review): no diagnostic is visible on this path in this extract.
return StmtError();
return OMPParallelSectionsDirective::Create(
Context, StartLoc, EndLoc, Clauses, AStmt,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
/// The 'detach' and 'mergeable' clauses are mutually exclusive; check for it.
/// Walks \p Clauses and, once one of the two kinds has been seen, diagnoses a
/// later clause of the other kind with err_omp_clauses_mutually_exclusive plus
/// a note pointing at the earlier clause.
/// \param S       Sema instance used to emit diagnostics.
/// \param Clauses Clauses of the directive being checked.
/// \return true if a conflict was diagnosed, false otherwise.
static bool checkDetachMergeableClauses(Sema &S,
ArrayRef<OMPClause *> Clauses) {
// First 'detach' or 'mergeable' clause encountered so far.
const OMPClause *PrevClause = nullptr;
bool ErrorFound = false;
for (const OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_detach ||
C->getClauseKind() == OMPC_mergeable) {
if (!PrevClause) {
PrevClause = C;
} else if (PrevClause->getClauseKind() != C->getClauseKind()) {
// A different kind than the remembered one: the two clauses conflict.
S.Diag(C->getBeginLoc(), diag::err_omp_clauses_mutually_exclusive)
<< getOpenMPClauseName(C->getClauseKind())
<< getOpenMPClauseName(PrevClause->getClauseKind());
S.Diag(PrevClause->getBeginLoc(), diag::note_omp_previous_clause)
<< getOpenMPClauseName(PrevClause->getClauseKind());
ErrorFound = true;
return ErrorFound;
/// Semantic action for '#pragma omp task'.
/// Rejects a missing associated statement and the combination of 'detach' and
/// 'mergeable' clauses, then builds the node via OMPTaskDirective::Create.
/// \return StmtError() on either failure.
StmtResult Sema::ActOnOpenMPTaskDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
// OpenMP 5.0, 2.10.1 task Construct
// If a detach clause appears on the directive, then a mergeable clause cannot
// appear on the same directive.
if (checkDetachMergeableClauses(*this, Clauses))
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
return OMPTaskDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
/// Semantic action for '#pragma omp taskyield'. The directive takes neither
/// clauses nor an associated statement here, so the node is created directly
/// from the source locations.
StmtResult Sema::ActOnOpenMPTaskyieldDirective(SourceLocation StartLoc,
SourceLocation EndLoc) {
return OMPTaskyieldDirective::Create(Context, StartLoc, EndLoc);
/// Semantic action for '#pragma omp barrier'. The directive takes neither
/// clauses nor an associated statement here, so the node is created directly
/// from the source locations.
StmtResult Sema::ActOnOpenMPBarrierDirective(SourceLocation StartLoc,
SourceLocation EndLoc) {
return OMPBarrierDirective::Create(Context, StartLoc, EndLoc);
/// Semantic action for '#pragma omp taskwait'. The directive takes neither
/// clauses nor an associated statement here, so the node is created directly
/// from the source locations.
StmtResult Sema::ActOnOpenMPTaskwaitDirective(SourceLocation StartLoc,
SourceLocation EndLoc) {
return OMPTaskwaitDirective::Create(Context, StartLoc, EndLoc);
/// Semantic action for '#pragma omp taskgroup'.
/// Returns StmtError() when no associated statement is supplied; otherwise the
/// node is produced by OMPTaskgroupDirective::Create.
/// \param AStmt Associated statement; asserted to be a CapturedStmt.
StmtResult Sema::ActOnOpenMPTaskgroupDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
return OMPTaskgroupDirective::Create(Context, StartLoc, EndLoc, Clauses,
/// Semantic action for '#pragma omp flush'.
/// Diagnoses more than one memory-order clause (acq_rel / acquire / release)
/// and the combination of a flush list with an order clause, then builds the
/// node via OMPFlushDirective::Create.
/// \return StmtError() when the flush list and an order clause are both
/// present.
StmtResult Sema::ActOnOpenMPFlushDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc) {
// FC remembers the OMPC_flush clause; OrderClause remembers a clause used for
// the list-vs-order check below.
OMPFlushClause *FC = nullptr;
OMPClause *OrderClause = nullptr;
for (OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_flush)
FC = cast<OMPFlushClause>(C);
OrderClause = C;
// Kind and location of the first memory-order clause seen, used to diagnose
// any additional one.
OpenMPClauseKind MemOrderKind = OMPC_unknown;
SourceLocation MemOrderLoc;
for (const OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_acq_rel ||
C->getClauseKind() == OMPC_acquire ||
C->getClauseKind() == OMPC_release) {
if (MemOrderKind != OMPC_unknown) {
Diag(C->getBeginLoc(), diag::err_omp_several_mem_order_clauses)
<< getOpenMPDirectiveName(OMPD_flush) << 1
<< SourceRange(C->getBeginLoc(), C->getEndLoc());
Diag(MemOrderLoc, diag::note_omp_previous_mem_order_clause)
<< getOpenMPClauseName(MemOrderKind);
} else {
MemOrderKind = C->getClauseKind();
MemOrderLoc = C->getBeginLoc();
// A flush list and an order clause must not be combined.
if (FC && OrderClause) {
Diag(FC->getLParenLoc(), diag::err_omp_flush_order_clause_and_list)
<< getOpenMPClauseName(OrderClause->getClauseKind());
Diag(OrderClause->getBeginLoc(), diag::note_omp_flush_order_clause_here)
<< getOpenMPClauseName(OrderClause->getClauseKind());
return StmtError();
return OMPFlushDirective::Create(Context, StartLoc, EndLoc, Clauses);
/// Semantic action for '#pragma omp depobj'.
/// Requires the first clause to be the depobj clause and limits the total
/// number of clauses, then builds the node via OMPDepobjDirective::Create.
/// \return StmtError() when the clause list is empty, does not start with
/// OMPC_depobj, or has more than two entries.
StmtResult Sema::ActOnOpenMPDepobjDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc) {
// The depobj clause must be present and must come first.
if (Clauses.empty()) {
Diag(StartLoc, diag::err_omp_depobj_expected);
return StmtError();
} else if (Clauses[0]->getClauseKind() != OMPC_depobj) {
Diag(Clauses[0]->getBeginLoc(), diag::err_omp_depobj_expected);
return StmtError();
// Only depobj expression and another single clause is allowed.
if (Clauses.size() > 2) {
return StmtError();
// NOTE(review): the empty case already returned above, so 'size() < 1' looks
// unreachable here; possibly '< 2' was intended -- confirm.
} else if (Clauses.size() < 1) {
Diag(Clauses[0]->getEndLoc(), diag::err_omp_depobj_single_clause_expected);
return StmtError();
return OMPDepobjDirective::Create(Context, StartLoc, EndLoc, Clauses);
/// Semantic action for '#pragma omp scan'.
/// Checks the clause count, the enclosing scope, and that the parent region
/// does not already contain a scan directive, then builds the node via
/// OMPScanDirective::Create.
/// \return StmtError() if any of these checks fails.
StmtResult Sema::ActOnOpenMPScanDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc) {
// Check that exactly one clause is specified.
if (Clauses.size() != 1) {
// With no clauses the diagnostic location is EndLoc; otherwise it is the
// second clause, i.e. the first surplus one.
Diag(Clauses.empty() ? EndLoc : Clauses[1]->getBeginLoc(),
return StmtError();
// Check that scan directive is used in the scope of the OpenMP loop body.
if (Scope *S = DSAStack->getCurScope()) {
Scope *ParentS = S->getParent();
if (!ParentS || ParentS->getParent() != ParentS->getBreakParent() ||
return StmtError(Diag(StartLoc, diag::err_omp_orphaned_device_directive)
<< getOpenMPDirectiveName(OMPD_scan) << 5);
// Check that only one instance of scan directives is used in the same outer
// region.
if (DSAStack->doesParentHasScanDirective()) {
Diag(StartLoc, diag::err_omp_several_directives_in_region) << "scan";
<< "scan";
return StmtError();
return OMPScanDirective::Create(Context, StartLoc, EndLoc, Clauses);
/// Semantic action for '#pragma omp ordered'.
/// Classifies the clauses (depend(source), depend(sink), threads, simd),
/// diagnoses invalid combinations and nesting, and builds the node via
/// OMPOrderedDirective::Create.
/// \param AStmt Associated statement; may be null only when a depend clause is
///              present.
/// \return StmtError() if an error was diagnosed, or if there is neither an
/// associated statement nor a depend clause.
StmtResult Sema::ActOnOpenMPOrderedDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
// State gathered while scanning the clauses.
const OMPClause *DependFound = nullptr;
const OMPClause *DependSourceClause = nullptr;
const OMPClause *DependSinkClause = nullptr;
bool ErrorFound = false;
const OMPThreadsClause *TC = nullptr;
const OMPSIMDClause *SC = nullptr;
for (const OMPClause *C : Clauses) {
if (auto *DC = dyn_cast<OMPDependClause>(C)) {
DependFound = C;
if (DC->getDependencyKind() == OMPC_DEPEND_source) {
// A second depend(source) clause is diagnosed.
if (DependSourceClause) {
Diag(C->getBeginLoc(), diag::err_omp_more_one_clause)
<< getOpenMPDirectiveName(OMPD_ordered)
<< getOpenMPClauseName(OMPC_depend) << 2;
ErrorFound = true;
} else {
DependSourceClause = C;
// depend(source) after an earlier depend(sink) is diagnosed.
if (DependSinkClause) {
Diag(C->getBeginLoc(), diag::err_omp_depend_sink_source_not_allowed)
<< 0;
ErrorFound = true;
} else if (DC->getDependencyKind() == OMPC_DEPEND_sink) {
// depend(sink) after an earlier depend(source) is diagnosed.
if (DependSourceClause) {
Diag(C->getBeginLoc(), diag::err_omp_depend_sink_source_not_allowed)
<< 1;
ErrorFound = true;
DependSinkClause = C;
} else if (C->getClauseKind() == OMPC_threads) {
TC = cast<OMPThreadsClause>(C);
} else if (C->getClauseKind() == OMPC_simd) {
SC = cast<OMPSIMDClause>(C);
if (!ErrorFound && !SC &&
isOpenMPSimdDirective(DSAStack->getParentDirective())) {
// OpenMP [2.8.1,simd Construct, Restrictions]
// An ordered construct with the simd clause is the only OpenMP construct
// that can appear in the simd region.
Diag(StartLoc, diag::err_omp_prohibited_region_simd)
<< (LangOpts.OpenMP >= 50 ? 1 : 0);
ErrorFound = true;
} else if (DependFound && (TC || SC)) {
// A depend clause together with 'threads' or 'simd' is diagnosed.
Diag(DependFound->getBeginLoc(), diag::err_omp_depend_clause_thread_simd)
<< getOpenMPClauseName(TC ? TC->getClauseKind() : SC->getClauseKind());
ErrorFound = true;
} else if (DependFound && !DSAStack->getParentOrderedRegionParam().first) {
ErrorFound = true;
} else if (TC || Clauses.empty()) {
// With 'threads' or no clauses, a parent ordered-region parameter is
// diagnosed.
if (const Expr *Param = DSAStack->getParentOrderedRegionParam().first) {
SourceLocation ErrLoc = TC ? TC->getBeginLoc() : StartLoc;
Diag(ErrLoc, diag::err_omp_ordered_directive_with_param)
<< (TC != nullptr);
Diag(Param->getBeginLoc(), diag::note_omp_ordered_param) << 1;
ErrorFound = true;
if ((!AStmt && !DependFound) || ErrorFound)
return StmtError();
// OpenMP 5.0, 2.17.9, ordered Construct, Restrictions.
// During execution of an iteration of a worksharing-loop or a loop nest
// within a worksharing-loop, simd, or worksharing-loop SIMD region, a thread
// must not execute more than one ordered region corresponding to an ordered
// construct without a depend clause.
if (!DependFound) {
if (DSAStack->doesParentHasOrderedDirective()) {
Diag(StartLoc, diag::err_omp_several_directives_in_region) << "ordered";
<< "ordered";
return StmtError();
if (AStmt) {
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
return OMPOrderedDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt);
namespace {
/// Helper class for checking expression in 'omp atomic [update]'
/// construct.
/// It decomposes a candidate update statement into 'x', 'expr', the binary
/// operator and a synthesized update expression, which callers read back
/// through the accessors below.
class OpenMPAtomicUpdateChecker {
/// Error results for atomic update expressions.
enum ExprAnalysisErrorCode {
/// A statement is not an expression statement.
/// Expression is not builtin binary or unary operation.
/// Unary operation is not post-/pre- increment/decrement operation.
/// An expression is not of scalar type.
/// A binary operation is not an assignment operation.
/// RHS part of the binary operation is not a binary expression.
/// RHS part is not additive/multiplicative/shift/bitwise binary
/// expression.
/// RHS binary operation does not have reference to the updated LHS
/// part.
/// No error is found.
/// Reference to Sema.
Sema &SemaRef;
/// A location for note diagnostics (when error is found).
SourceLocation NoteLoc;
/// 'x' lvalue part of the source atomic expression.
Expr *X;
/// 'expr' rvalue part of the source atomic expression.
Expr *E;
/// Helper expression of the form
/// 'OpaqueValueExpr(x) binop OpaqueValueExpr(expr)' or
/// 'OpaqueValueExpr(expr) binop OpaqueValueExpr(x)'.
Expr *UpdateExpr;
/// Is 'x' a LHS in a RHS part of full update expression. It is
/// important for non-associative operations.
bool IsXLHSInRHSPart;
/// Binary operator kind of the update operation.
BinaryOperatorKind Op;
/// Source location of the update operator.
SourceLocation OpLoc;
/// true if the source expression is a postfix unary operation, false
/// if it is a prefix unary operation.
bool IsPostfixUpdate;
/// Construct a checker bound to \p SemaRef with no operands extracted yet.
OpenMPAtomicUpdateChecker(Sema &SemaRef)
: SemaRef(SemaRef), X(nullptr), E(nullptr), UpdateExpr(nullptr),
IsXLHSInRHSPart(false), Op(BO_PtrMemD), IsPostfixUpdate(false) {}
/// Check specified statement that it is suitable for 'atomic update'
/// constructs and extract 'x', 'expr' and Operation from the original
/// expression. If DiagId and NoteId == 0, then only check is performed
/// without error notification.
/// \param DiagId Diagnostic which should be emitted if error is found.
/// \param NoteId Diagnostic note for the main error message.
/// \return true if statement is not an update expression, false otherwise.
bool checkStatement(Stmt *S, unsigned DiagId = 0, unsigned NoteId = 0);
/// Return the 'x' lvalue part of the source atomic expression.
Expr *getX() const { return X; }
/// Return the 'expr' rvalue part of the source atomic expression.
Expr *getExpr() const { return E; }
/// Return the update expression used in calculation of the updated
/// value. Always has form 'OpaqueValueExpr(x) binop OpaqueValueExpr(expr)' or
/// 'OpaqueValueExpr(expr) binop OpaqueValueExpr(x)'.
Expr *getUpdateExpr() const { return UpdateExpr; }
/// Return true if 'x' is LHS in RHS part of full update expression,
/// false otherwise.
bool isXLHSInRHSPart() const { return IsXLHSInRHSPart; }
/// true if the source expression is a postfix unary operation, false
/// if it is a prefix unary operation.
bool isPostfixUpdate() const { return IsPostfixUpdate; }
/// Check a binary operator for the 'x = x binop expr' / 'x = expr binop x'
/// forms; the diagnostic ids have the same meaning as in checkStatement().
bool checkBinaryOperation(BinaryOperator *AtomicBinOp, unsigned DiagId = 0,
unsigned NoteId = 0);
} // namespace
/// Checks that \p AtomicBinOp is an assignment whose right-hand side is a
/// multiplicative, additive, shift or bitwise binary operation with 'x' as one
/// of its operands. On a match it records X, E, Op, OpLoc and
/// IsXLHSInRHSPart.
/// \param DiagId Error diagnostic id; emitted only when both ids are non-zero.
/// \param NoteId Note diagnostic id accompanying \p DiagId.
/// \return true if the expression does not have an accepted form.
bool OpenMPAtomicUpdateChecker::checkBinaryOperation(
BinaryOperator *AtomicBinOp, unsigned DiagId, unsigned NoteId) {
ExprAnalysisErrorCode ErrorFound = NoError;
SourceLocation ErrorLoc, NoteLoc;
SourceRange ErrorRange, NoteRange;
// Allowed constructs are:
// x = x binop expr;
// x = expr binop x;
if (AtomicBinOp->getOpcode() == BO_Assign) {
X = AtomicBinOp->getLHS();
if (const auto *AtomicInnerBinOp = dyn_cast<BinaryOperator>(
AtomicBinOp->getRHS()->IgnoreParenImpCasts())) {
if (AtomicInnerBinOp->isMultiplicativeOp() ||
AtomicInnerBinOp->isAdditiveOp() || AtomicInnerBinOp->isShiftOp() ||
AtomicInnerBinOp->isBitwiseOp()) {
Op = AtomicInnerBinOp->getOpcode();
OpLoc = AtomicInnerBinOp->getOperatorLoc();
Expr *LHS = AtomicInnerBinOp->getLHS();
Expr *RHS = AtomicInnerBinOp->getRHS();
// Profile 'x' and both operands so they can be compared by their
// FoldingSetNodeID values below.
llvm::FoldingSetNodeID XId, LHSId, RHSId;
X->IgnoreParenImpCasts()->Profile(XId, SemaRef.getASTContext(),
LHS->IgnoreParenImpCasts()->Profile(LHSId, SemaRef.getASTContext(),
RHS->IgnoreParenImpCasts()->Profile(RHSId, SemaRef.getASTContext(),
if (XId == LHSId) {
// x = x binop expr
E = RHS;
IsXLHSInRHSPart = true;
} else if (XId == RHSId) {
// x = expr binop x
E = LHS;
IsXLHSInRHSPart = false;
} else {
// Neither operand of the inner operation matches 'x'.
ErrorLoc = AtomicInnerBinOp->getExprLoc();
ErrorRange = AtomicInnerBinOp->getSourceRange();
NoteLoc = X->getExprLoc();
NoteRange = X->getSourceRange();
ErrorFound = NotAnUpdateExpression;
} else {
ErrorLoc = AtomicInnerBinOp->getExprLoc();
ErrorRange = AtomicInnerBinOp->getSourceRange();
NoteLoc = AtomicInnerBinOp->getOperatorLoc();
NoteRange = SourceRange(NoteLoc, NoteLoc);
ErrorFound = NotABinaryOperator;
} else {
NoteLoc = ErrorLoc = AtomicBinOp->getRHS()->getExprLoc();
NoteRange = ErrorRange = AtomicBinOp->getRHS()->getSourceRange();
ErrorFound = NotABinaryExpression;
} else {
ErrorLoc = AtomicBinOp->getExprLoc();
ErrorRange = AtomicBinOp->getSourceRange();
NoteLoc = AtomicBinOp->getOperatorLoc();
NoteRange = SourceRange(NoteLoc, NoteLoc);
ErrorFound = NotAnAssignmentOp;
// Diagnostics are emitted only when both ids were supplied.
if (ErrorFound != NoError && DiagId != 0 && NoteId != 0) {
SemaRef.Diag(ErrorLoc, DiagId) << ErrorRange;
SemaRef.Diag(NoteLoc, NoteId) << ErrorFound << NoteRange;
return true;
// In a dependent context the extracted operands are dropped.
if (SemaRef.CurContext->isDependentContext())
E = X = UpdateExpr = nullptr;
return ErrorFound != NoError;
/// Analyzes \p S as a candidate 'atomic update' expression. On success it
/// records X, E, Op, OpLoc, IsXLHSInRHSPart and IsPostfixUpdate and builds
/// UpdateExpr.
/// \param S      Statement to analyze.
/// \param DiagId Error diagnostic id; emitted only when both ids are non-zero.
/// \param NoteId Note diagnostic id accompanying \p DiagId.
/// \return true if \p S is not an accepted update expression or building the
/// update expression failed, false otherwise.
bool OpenMPAtomicUpdateChecker::checkStatement(Stmt *S, unsigned DiagId,
unsigned NoteId) {
ExprAnalysisErrorCode ErrorFound = NoError;
SourceLocation ErrorLoc, NoteLoc;
SourceRange ErrorRange, NoteRange;
// Allowed constructs are:
// x++;
// x--;
// ++x;
// --x;
// x binop= expr;
// x = x binop expr;
// x = expr binop x;
if (auto *AtomicBody = dyn_cast<Expr>(S)) {
AtomicBody = AtomicBody->IgnoreParenImpCasts();
// Instantiation-dependent expressions are let through the scalar-type check.
if (AtomicBody->getType()->isScalarType() ||
AtomicBody->isInstantiationDependent()) {
if (const auto *AtomicCompAssignOp = dyn_cast<CompoundAssignOperator>(
AtomicBody->IgnoreParenImpCasts())) {
// Check for Compound Assignment Operation
Op = BinaryOperator::getOpForCompoundAssignment(
OpLoc = AtomicCompAssignOp->getOperatorLoc();
E = AtomicCompAssignOp->getRHS();
X = AtomicCompAssignOp->getLHS()->IgnoreParens();
IsXLHSInRHSPart = true;
} else if (auto *AtomicBinOp = dyn_cast<BinaryOperator>(
AtomicBody->IgnoreParenImpCasts())) {
// Check for Binary Operation
if (checkBinaryOperation(AtomicBinOp, DiagId, NoteId))
return true;
} else if (const auto *AtomicUnaryOp = dyn_cast<UnaryOperator>(
AtomicBody->IgnoreParenImpCasts())) {
// Check for Unary Operation
if (AtomicUnaryOp->isIncrementDecrementOp()) {
// ++/-- is modelled as 'x + 1' / 'x - 1' with an integer constant 1 as
// 'expr'.
IsPostfixUpdate = AtomicUnaryOp->isPostfix();
Op = AtomicUnaryOp->isIncrementOp() ? BO_Add : BO_Sub;
OpLoc = AtomicUnaryOp->getOperatorLoc();
X = AtomicUnaryOp->getSubExpr()->IgnoreParens();
E = SemaRef.ActOnIntegerConstant(OpLoc, /*uint64_t Val=*/1).get();
IsXLHSInRHSPart = true;
} else {
ErrorFound = NotAnUnaryIncDecExpression;
ErrorLoc = AtomicUnaryOp->getExprLoc();
ErrorRange = AtomicUnaryOp->getSourceRange();
NoteLoc = AtomicUnaryOp->getOperatorLoc();
NoteRange = SourceRange(NoteLoc, NoteLoc);
} else if (!AtomicBody->isInstantiationDependent()) {
ErrorFound = NotABinaryOrUnaryExpression;
NoteLoc = ErrorLoc = AtomicBody->getExprLoc();
NoteRange = ErrorRange = AtomicBody->getSourceRange();
} else {
ErrorFound = NotAScalarType;
NoteLoc = ErrorLoc = AtomicBody->getBeginLoc();
NoteRange = ErrorRange = SourceRange(NoteLoc, NoteLoc);
} else {
ErrorFound = NotAnExpression;
NoteLoc = ErrorLoc = S->getBeginLoc();
NoteRange = ErrorRange = SourceRange(NoteLoc, NoteLoc);
// Diagnostics are emitted only when both ids were supplied.
if (ErrorFound != NoError && DiagId != 0 && NoteId != 0) {
SemaRef.Diag(ErrorLoc, DiagId) << ErrorRange;
SemaRef.Diag(NoteLoc, NoteId) << ErrorFound << NoteRange;
return true;
// In a dependent context the extracted operands are dropped.
if (SemaRef.CurContext->isDependentContext())
E = X = UpdateExpr = nullptr;
if (ErrorFound == NoError && E && X) {
// Build an update expression of form 'OpaqueValueExpr(x) binop
// OpaqueValueExpr(expr)' or 'OpaqueValueExpr(expr) binop
// OpaqueValueExpr(x)' and then cast it to the type of the 'x' expression.
auto *OVEX = new (SemaRef.getASTContext())
OpaqueValueExpr(X->getExprLoc(), X->getType(), VK_RValue);
auto *OVEExpr = new (SemaRef.getASTContext())
OpaqueValueExpr(E->getExprLoc(), E->getType(), VK_RValue);
ExprResult Update =
SemaRef.CreateBuiltinBinOp(OpLoc, Op, IsXLHSInRHSPart ? OVEX : OVEExpr,
if (Update.isInvalid())
return true;
Update = SemaRef.PerformImplicitConversion(Update.get(), X->getType(),
if (Update.isInvalid())
return true;
UpdateExpr = Update.get();
return ErrorFound != NoError;
/// Semantic action for '#pragma omp atomic'.
/// Determines the atomic-clause (read / write / update / capture, or none) and
/// the memory-order clause, diagnoses duplicates and incompatible
/// combinations, validates the associated statement against the form required
/// by the atomic-clause, and extracts the operands x, v, expr and the update
/// expression from it before building the node via
/// OMPAtomicDirective::Create.
/// \param AStmt Associated statement; cast to CapturedStmt.
/// \return StmtError() if \p AStmt is null or the statement does not have an
/// accepted form.
StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
// Register location of the first atomic directive.
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Kind and location of the first atomic-clause and of the first memory-order
// clause; OMPC_unknown means "not seen".
OpenMPClauseKind AtomicKind = OMPC_unknown;
SourceLocation AtomicKindLoc;
OpenMPClauseKind MemOrderKind = OMPC_unknown;
SourceLocation MemOrderLoc;
for (const OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_read || C->getClauseKind() == OMPC_write ||
C->getClauseKind() == OMPC_update ||
C->getClauseKind() == OMPC_capture) {
// A second atomic-clause is diagnosed.
if (AtomicKind != OMPC_unknown) {
Diag(C->getBeginLoc(), diag::err_omp_atomic_several_clauses)
<< SourceRange(C->getBeginLoc(), C->getEndLoc());
Diag(AtomicKindLoc, diag::note_omp_previous_mem_order_clause)
<< getOpenMPClauseName(AtomicKind);
} else {
AtomicKind = C->getClauseKind();
AtomicKindLoc = C->getBeginLoc();
if (C->getClauseKind() == OMPC_seq_cst ||
C->getClauseKind() == OMPC_acq_rel ||
C->getClauseKind() == OMPC_acquire ||
C->getClauseKind() == OMPC_release ||
C->getClauseKind() == OMPC_relaxed) {
// A second memory-order clause is diagnosed.
if (MemOrderKind != OMPC_unknown) {
Diag(C->getBeginLoc(), diag::err_omp_several_mem_order_clauses)
<< getOpenMPDirectiveName(OMPD_atomic) << 0
<< SourceRange(C->getBeginLoc(), C->getEndLoc());
Diag(MemOrderLoc, diag::note_omp_previous_mem_order_clause)
<< getOpenMPClauseName(MemOrderKind);
} else {
MemOrderKind = C->getClauseKind();
MemOrderLoc = C->getBeginLoc();
// OpenMP 5.0, 2.17.7 atomic Construct, Restrictions
// If atomic-clause is read then memory-order-clause must not be acq_rel or
// release.
// If atomic-clause is write then memory-order-clause must not be acq_rel or
// acquire.
// If atomic-clause is update or not present then memory-order-clause must not
// be acq_rel or acquire.
if ((AtomicKind == OMPC_read &&
(MemOrderKind == OMPC_acq_rel || MemOrderKind == OMPC_release)) ||
((AtomicKind == OMPC_write || AtomicKind == OMPC_update ||
AtomicKind == OMPC_unknown) &&
(MemOrderKind == OMPC_acq_rel || MemOrderKind == OMPC_acquire))) {
// Without an explicit atomic-clause the diagnostic points at the directive.
SourceLocation Loc = AtomicKindLoc;
if (AtomicKind == OMPC_unknown)
Loc = StartLoc;
Diag(Loc, diag::err_omp_atomic_incompatible_mem_order_clause)
<< getOpenMPClauseName(AtomicKind)
<< (AtomicKind == OMPC_unknown ? 1 : 0)
<< getOpenMPClauseName(MemOrderKind);
Diag(MemOrderLoc, diag::note_omp_previous_mem_order_clause)
<< getOpenMPClauseName(MemOrderKind);
// Look through an ExprWithCleanups wrapper to reach the statement itself.
Stmt *Body = CS->getCapturedStmt();
if (auto *EWC = dyn_cast<ExprWithCleanups>(Body))
Body = EWC->getSubExpr();
// Operands extracted from the associated statement.
Expr *X = nullptr;
Expr *V = nullptr;
Expr *E = nullptr;
Expr *UE = nullptr;
bool IsXLHSInRHSPart = false;
bool IsPostfixUpdate = false;
// OpenMP [2.12.6, atomic Construct]
// In the next expressions:
// * x and v (as applicable) are both l-value expressions with scalar type.
// * During the execution of an atomic region, multiple syntactic
// occurrences of x must designate the same storage location.
// * Neither of v and expr (as applicable) may access the storage location
// designated by x.
// * Neither of x and expr (as applicable) may access the storage location
// designated by v.
// * expr is an expression with scalar type.
// * binop is one of +, *, -, /, &, ^, |, <<, or >>.
// * binop, binop=, ++, and -- are not overloaded operators.
// * The expression x binop expr must be numerically equivalent to x binop
// (expr). This requirement is satisfied if the operators in expr have
// precedence greater than binop, or by using parentheses around expr or
// subexpressions of expr.
// * The expression expr binop x must be numerically equivalent to (expr)
// binop x. This requirement is satisfied if the operators in expr have
// precedence equal to or greater than binop, or by using parentheses around
// expr or subexpressions of expr.
// * For forms that allow multiple occurrences of x, the number of times
// that x is evaluated is unspecified.
if (AtomicKind == OMPC_read) {
enum {
} ErrorFound = NoError;
SourceLocation ErrorLoc, NoteLoc;
SourceRange ErrorRange, NoteRange;
// If clause is read:
// v = x;
if (const auto *AtomicBody = dyn_cast<Expr>(Body)) {
const auto *AtomicBinOp =
if (AtomicBinOp && AtomicBinOp->getOpcode() == BO_Assign) {
X = AtomicBinOp->getRHS()->IgnoreParenImpCasts();
V = AtomicBinOp->getLHS()->IgnoreParenImpCasts();
if ((X->isInstantiationDependent() || X->getType()->isScalarType()) &&
(V->isInstantiationDependent() || V->getType()->isScalarType())) {
if (!X->isLValue() || !V->isLValue()) {
// Point the note at whichever operand is not an lvalue.
const Expr *NotLValueExpr = X->isLValue() ? V : X;
ErrorFound = NotAnLValue;
ErrorLoc = AtomicBinOp->getExprLoc();
ErrorRange = AtomicBinOp->getSourceRange();
NoteLoc = NotLValueExpr->getExprLoc();
NoteRange = NotLValueExpr->getSourceRange();
} else if (!X->isInstantiationDependent() ||
!V->isInstantiationDependent()) {
// Point the note at whichever operand is not of scalar type.
const Expr *NotScalarExpr =
(X->isInstantiationDependent() || X->getType()->isScalarType())
? V
: X;
ErrorFound = NotAScalarType;
ErrorLoc = AtomicBinOp->getExprLoc();
ErrorRange = AtomicBinOp->getSourceRange();
NoteLoc = NotScalarExpr->getExprLoc();
NoteRange = NotScalarExpr->getSourceRange();
} else if (!AtomicBody->isInstantiationDependent()) {
ErrorFound = NotAnAssignmentOp;
ErrorLoc = AtomicBody->getExprLoc();
ErrorRange = AtomicBody->getSourceRange();
NoteLoc = AtomicBinOp ? AtomicBinOp->getOperatorLoc()
: AtomicBody->getExprLoc();
NoteRange = AtomicBinOp ? AtomicBinOp->getSourceRange()
: AtomicBody->getSourceRange();
} else {
ErrorFound = NotAnExpression;
NoteLoc = ErrorLoc = Body->getBeginLoc();
NoteRange = ErrorRange = SourceRange(NoteLoc, NoteLoc);
if (ErrorFound != NoError) {
Diag(ErrorLoc, diag::err_omp_atomic_read_not_expression_statement)
<< ErrorRange;
Diag(NoteLoc, diag::note_omp_atomic_read_write) << ErrorFound
<< NoteRange;
return StmtError();
// In a dependent context the extracted operands are dropped.
if (CurContext->isDependentContext())
V = X = nullptr;
} else if (AtomicKind == OMPC_write) {
enum {
} ErrorFound = NoError;
SourceLocation ErrorLoc, NoteLoc;
SourceRange ErrorRange, NoteRange;
// If clause is write:
// x = expr;
if (const auto *AtomicBody = dyn_cast<Expr>(Body)) {
const auto *AtomicBinOp =
if (AtomicBinOp && AtomicBinOp->getOpcode() == BO_Assign) {
X = AtomicBinOp->getLHS();
E = AtomicBinOp->getRHS();
if ((X->isInstantiationDependent() || X->getType()->isScalarType()) &&
(E->isInstantiationDependent() || E->getType()->isScalarType())) {
if (!X->isLValue()) {
ErrorFound = NotAnLValue;
ErrorLoc = AtomicBinOp->getExprLoc();
ErrorRange = AtomicBinOp->getSourceRange();
NoteLoc = X->getExprLoc();
NoteRange = X->getSourceRange();
} else if (!X->isInstantiationDependent() ||
!E->isInstantiationDependent()) {
// Point the note at whichever operand is not of scalar type.
const Expr *NotScalarExpr =
(X->isInstantiationDependent() || X->getType()->isScalarType())
? E
: X;
ErrorFound = NotAScalarType;
ErrorLoc = AtomicBinOp->getExprLoc();
ErrorRange = AtomicBinOp->getSourceRange();
NoteLoc = NotScalarExpr->getExprLoc();
NoteRange = NotScalarExpr->getSourceRange();
} else if (!AtomicBody->isInstantiationDependent()) {
ErrorFound = NotAnAssignmentOp;
ErrorLoc = AtomicBody->getExprLoc();
ErrorRange = AtomicBody->getSourceRange();
NoteLoc = AtomicBinOp ? AtomicBinOp->getOperatorLoc()
: AtomicBody->getExprLoc();
NoteRange = AtomicBinOp ? AtomicBinOp->getSourceRange()
: AtomicBody->getSourceRange();
} else {
ErrorFound = NotAnExpression;
NoteLoc = ErrorLoc = Body->getBeginLoc();
NoteRange = ErrorRange = SourceRange(NoteLoc, NoteLoc);
if (ErrorFound != NoError) {
Diag(ErrorLoc, diag::err_omp_atomic_write_not_expression_statement)
<< ErrorRange;
Diag(NoteLoc, diag::note_omp_atomic_read_write) << ErrorFound
<< NoteRange;
return StmtError();
// In a dependent context the extracted operands are dropped.
if (CurContext->isDependentContext())
E = X = nullptr;
} else if (AtomicKind == OMPC_update || AtomicKind == OMPC_unknown) {
// If clause is update:
// x++;
// x--;
// ++x;
// --x;
// x binop= expr;
// x = x binop expr;
// x = expr binop x;
// The error diagnostic differs depending on whether 'update' was written
// explicitly.
OpenMPAtomicUpdateChecker Checker(*this);
if (Checker.checkStatement(
Body, (AtomicKind == OMPC_update)
? diag::err_omp_atomic_update_not_expression_statement
: diag::err_omp_atomic_not_expression_statement,
return StmtError();
if (!CurContext->isDependentContext()) {
E = Checker.getExpr();
X = Checker.getX();
UE = Checker.getUpdateExpr();
IsXLHSInRHSPart = Checker.isXLHSInRHSPart();
} else if (AtomicKind == OMPC_capture) {
enum {
} ErrorFound = NoError;
SourceLocation ErrorLoc, NoteLoc;
SourceRange ErrorRange, NoteRange;
if (const auto *AtomicBody = dyn_cast<Expr>(Body)) {
// If clause is a capture:
// v = x++;
// v = x--;
// v = ++x;
// v = --x;
// v = x binop= expr;
// v = x = x binop expr;
// v = x = expr binop x;
const auto *AtomicBinOp =
if (AtomicBinOp && AtomicBinOp->getOpcode() == BO_Assign) {
// The left-hand side is 'v'; the right-hand side must itself be an
// update expression, which the checker decomposes.
V = AtomicBinOp->getLHS();
Body = AtomicBinOp->getRHS()->IgnoreParenImpCasts();
OpenMPAtomicUpdateChecker Checker(*this);
if (Checker.checkStatement(
Body, diag::err_omp_atomic_capture_not_expression_statement,
return StmtError();
E = Checker.getExpr();
X = Checker.getX();
UE = Checker.getUpdateExpr();
IsXLHSInRHSPart = Checker.isXLHSInRHSPart();
IsPostfixUpdate = Checker.isPostfixUpdate();
} else if (!AtomicBody->isInstantiationDependent()) {
ErrorLoc = AtomicBody->getExprLoc();
ErrorRange = AtomicBody->getSourceRange();
NoteLoc = AtomicBinOp ? AtomicBinOp->getOperatorLoc()
: AtomicBody->getExprLoc();
NoteRange = AtomicBinOp ? AtomicBinOp->getSourceRange()
: AtomicBody->getSourceRange();
ErrorFound = NotAnAssignmentOp;
if (ErrorFound != NoError) {
Diag(ErrorLoc, diag::err_omp_atomic_capture_not_expression_statement)
<< ErrorRange;
Diag(NoteLoc, diag::note_omp_atomic_capture) << ErrorFound << NoteRange;
return StmtError();
// In a dependent context the extracted operands are dropped.
if (CurContext->isDependentContext())
UE = V = E = X = nullptr;
} else {
// If clause is a capture:
// { v = x; x = expr; }
// { v = x; x++; }
// { v = x; x--; }
// { v = x; ++x; }
// { v = x; --x; }
// { v = x; x binop= expr; }
// { v = x; x = x binop expr; }
// { v = x; x = expr binop x; }
// { x++; v = x; }
// { x--; v = x; }
// { ++x; v = x; }
// { --x; v = x; }
// { x binop= expr; v = x; }
// { x = x binop expr; v = x; }
// { x = expr binop x; v = x; }
if (auto *CS = dyn_cast<CompoundStmt>(Body)) {
// Check that this is { expr1; expr2; }
if (CS->size() == 2) {
Stmt *First = CS->body_front();
Stmt *Second = CS->body_back();
if (auto *EWC = dyn_cast<ExprWithCleanups>(First))
First = EWC->getSubExpr()->IgnoreParenImpCasts();
if (auto *EWC = dyn_cast<ExprWithCleanups>(Second))
Second = EWC->getSubExpr()->IgnoreParenImpCasts();
// Need to find what subexpression is 'v' and what is 'x'.
// First attempt: treat the second statement as the update (no diagnostics
// are requested from the checker here).
OpenMPAtomicUpdateChecker Checker(*this);
bool IsUpdateExprFound = !Checker.checkStatement(Second);
BinaryOperator *BinOp = nullptr;
if (IsUpdateExprFound) {
BinOp = dyn_cast<BinaryOperator>(First);
IsUpdateExprFound = BinOp && BinOp->getOpcode() == BO_Assign;
if (IsUpdateExprFound && !CurContext->isDependentContext()) {
// { v = x; x++; }
// { v = x; x--; }
// { v = x; ++x; }
// { v = x; --x; }
// { v = x; x binop= expr; }
// { v = x; x = x binop expr; }
// { v = x; x = expr binop x; }
// Check that the first expression has form v = x.
Expr *PossibleX = BinOp->getRHS()->IgnoreParenImpCasts();
llvm::FoldingSetNodeID XId, PossibleXId;
Checker.getX()->Profile(XId, Context, /*Canonical=*/true);
PossibleX->Profile(PossibleXId, Context, /*Canonical=*/true);
IsUpdateExprFound = XId == PossibleXId;
if (IsUpdateExprFound) {
V = BinOp->getLHS();
X = Checker.getX();
E = Checker.getExpr();
UE = Checker.getUpdateExpr();
IsXLHSInRHSPart = Checker.isXLHSInRHSPart();
// 'v' is assigned before the update statement in this form.
IsPostfixUpdate = true;
// Second attempt: treat the first statement as the update.
if (!IsUpdateExprFound) {
IsUpdateExprFound = !Checker.checkStatement(First);
BinOp = nullptr;
if (IsUpdateExprFound) {
BinOp = dyn_cast<BinaryOperator>(Second);
IsUpdateExprFound = BinOp && BinOp->getOpcode() == BO_Assign;
if (IsUpdateExprFound && !CurContext->isDependentContext()) {
// { x++; v = x; }
// { x--; v = x; }
// { ++x; v = x; }
// { --x; v = x; }
// { x binop= expr; v = x; }
// { x = x binop expr; v = x; }
// { x = expr binop x; v = x; }
// Check that the second expression has form v = x.
Expr *PossibleX = BinOp->getRHS()->IgnoreParenImpCasts();
llvm::FoldingSetNodeID XId, PossibleXId;
Checker.getX()->Profile(XId, Context, /*Canonical=*/true);
PossibleX->Profile(PossibleXId, Context, /*Canonical=*/true);
IsUpdateExprFound = XId == PossibleXId;
if (IsUpdateExprFound) {
V = BinOp->getLHS();
X = Checker.getX();
E = Checker.getExpr();
UE = Checker.getUpdateExpr();
IsXLHSInRHSPart = Checker.isXLHSInRHSPart();
// 'v' is assigned after the update statement in this form.
IsPostfixUpdate = false;
// Last attempt: the plain 'read then write' pair.
if (!IsUpdateExprFound) {
// { v = x; x = expr; }
auto *FirstExpr = dyn_cast<Expr>(First);
auto *SecondExpr = dyn_cast<Expr>(Second);
if (!FirstExpr || !SecondExpr ||
!(FirstExpr->isInstantiationDependent() ||
SecondExpr->isInstantiationDependent())) {
auto *FirstBinOp = dyn_cast<BinaryOperator>(First);
if (!FirstBinOp || FirstBinOp->getOpcode() != BO_Assign) {
ErrorFound = NotAnAssignmentOp;
NoteLoc = ErrorLoc = FirstBinOp ? FirstBinOp->getOperatorLoc()
: First->getBeginLoc();
NoteRange = ErrorRange = FirstBinOp
? FirstBinOp->getSourceRange()
: SourceRange(ErrorLoc, ErrorLoc);
} else {
auto *SecondBinOp = dyn_cast<BinaryOperator>(Second);
if (!SecondBinOp || SecondBinOp->getOpcode() != BO_Assign) {
ErrorFound = NotAnAssignmentOp;
NoteLoc = ErrorLoc = SecondBinOp
? SecondBinOp->getOperatorLoc()
: Second->getBeginLoc();
NoteRange = ErrorRange =
SecondBinOp ? SecondBinOp->getSourceRange()
: SourceRange(ErrorLoc, ErrorLoc);
} else {
Expr *PossibleXRHSInFirst =
Expr *PossibleXLHSInSecond =
llvm::FoldingSetNodeID X1Id, X2Id;
PossibleXRHSInFirst->Profile(X1Id, Context,
PossibleXLHSInSecond->Profile(X2Id, Context,
IsUpdateExprFound = X1Id == X2Id;
if (IsUpdateExprFound) {
// No update expression is built for this form.
V = FirstBinOp->getLHS();
X = SecondBinOp->getLHS();
E = SecondBinOp->getRHS();
UE = nullptr;
IsXLHSInRHSPart = false;
IsPostfixUpdate = true;
} else {
ErrorFound = NotASpecificExpression;
ErrorLoc = FirstBinOp->getExprLoc();
ErrorRange = FirstBinOp->getSourceRange();
NoteLoc = SecondBinOp->getLHS()->getExprLoc();
NoteRange = SecondBinOp->getRHS()->getSourceRange();
} else {
NoteLoc = ErrorLoc = Body->getBeginLoc();
NoteRange = ErrorRange =
SourceRange(Body->getBeginLoc(), Body->getBeginLoc());
ErrorFound = NotTwoSubstatements;
} else {
NoteLoc = ErrorLoc = Body->getBeginLoc();
NoteRange = ErrorRange =
SourceRange(Body->getBeginLoc(), Body->getBeginLoc());
ErrorFound = NotACompoundStatement;
if (ErrorFound != NoError) {
Diag(ErrorLoc, diag::err_omp_atomic_capture_not_compound_statement)
<< ErrorRange;
Diag(NoteLoc, diag::note_omp_atomic_capture) << ErrorFound << NoteRange;
return StmtError();
// In a dependent context the extracted operands are dropped.
if (CurContext->isDependentContext())
UE = V = E = X = nullptr;
return OMPAtomicDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
/// Semantic action for '#pragma omp target'.
/// When the DSA stack reports an inner teams region, verifies per the nesting
/// rule quoted below that the target body consists of the teams construct
/// only, then builds the node via OMPTargetDirective::Create.
/// \param AStmt Associated statement; cast to CapturedStmt.
/// \return StmtError() if \p AStmt is null or the body contains something
/// other than the teams construct.
StmtResult Sema::ActOnOpenMPTargetDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Step inwards through the nested captured statements, one per capture level
// of the 'target' directive beyond the first.
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// OpenMP [2.16, Nesting of Regions]
// If specified, a teams construct must be contained within a target
// construct. That target construct must contain no statements or directives
// outside of the teams construct.
if (DSAStack->hasInnerTeamsRegion()) {
const Stmt *S = CS->IgnoreContainers(/*IgnoreCaptured=*/true);
bool OMPTeamsFound = true;
if (const auto *CS = dyn_cast<CompoundStmt>(S)) {
// Walk the statements of the compound body looking for one that breaks the
// rule.
auto I = CS->body_begin();
while (I != CS->body_end()) {
const auto *OED = dyn_cast<OMPExecutableDirective>(*I);
if (!OED || !isOpenMPTeamsDirective(OED->getDirectiveKind()) ||
OMPTeamsFound) {
OMPTeamsFound = false;
assert(I != CS->body_end() && "Not found statement");
S = *I;
} else {
// The body is a single statement: it must be a teams directive.
const auto *OED = dyn_cast<OMPExecutableDirective>(S);
OMPTeamsFound = OED && isOpenMPTeamsDirective(OED->getDirectiveKind());
if (!OMPTeamsFound) {
Diag(StartLoc, diag::err_omp_target_contains_not_only_teams);
Diag(S->getBeginLoc(), diag::note_omp_nested_statement_here)
<< isa<OMPExecutableDirective>(S);
return StmtError();
return OMPTargetDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt);
/// Semantic action for the OpenMP 'target parallel' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; cast to CapturedStmt when non-null.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \returns StmtError() when \p AStmt is null; otherwise the result of
/// OMPTargetParallelDirective::Create, which also receives the taskgroup
/// reduction reference and the cancel-region flag read from DSAStack.
Sema::ActOnOpenMPTargetParallelDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Descend through the nested captured statements, one step per capture level
// above the first.
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_parallel);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
return OMPTargetParallelDirective::Create(
Context, StartLoc, EndLoc, Clauses, AStmt,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
/// Semantic action for the OpenMP 'target parallel for' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; cast to CapturedStmt when non-null.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \param VarsWithImplicitDSA Forwarded to checkOpenMPLoop.
/// \returns StmtError() when \p AStmt is null, when checkOpenMPLoop reports
/// zero nested loops, or when finishing a linear clause fails; otherwise the
/// result of OMPTargetParallelForDirective::Create.
StmtResult Sema::ActOnOpenMPTargetParallelForDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Descend through the nested captured statements, one step per capture level
// above the first; the innermost CS is what checkOpenMPLoop analyses.
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_parallel_for);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' or 'ordered' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_target_parallel_for, getCollapseNumberExpr(Clauses),
getOrderedNumberExpr(Clauses), CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A zero loop count is treated as failure.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target parallel for loop exprs were not built");
// Linear clauses are only finalized outside dependent contexts.
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
return OMPTargetParallelForDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
/// Check whether the list of clauses contains a clause of the given kind.
///
/// \param Clauses Clauses to search.
/// \param K Clause kind to look for.
/// \returns true if at least one clause in \p Clauses reports kind \p K.
static bool hasClauses(ArrayRef<OMPClause *> Clauses,
const OpenMPClauseKind K) {
return llvm::any_of(
Clauses, [K](const OMPClause *C) { return C->getClauseKind() == K; });
/// Variadic overload: check whether the list of clauses contains a clause of
/// any of the given kinds.
///
/// \param Clauses Clauses to search.
/// \param K First clause kind to look for.
/// \param ClauseTypes Remaining clause kinds, checked recursively only if
/// \p K is not found (the || short-circuits).
/// \returns true if a clause of kind \p K or of any kind in \p ClauseTypes
/// is present.
template <typename... Params>
static bool hasClauses(ArrayRef<OMPClause *> Clauses, const OpenMPClauseKind K,
const Params... ClauseTypes) {
return hasClauses(Clauses, K) || hasClauses(Clauses, ClauseTypes...);
/// Semantic action for the OpenMP 'target data' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; asserted to be a CapturedStmt.
/// \param StartLoc Start location, also used for the error diagnostic.
/// \param EndLoc End location of the directive.
/// \returns StmtError() when \p AStmt is null or none of the required
/// clauses is present; otherwise the result of
/// OMPTargetDataDirective::Create.
StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
// OpenMP [2.12.2, target data Construct, Restrictions]
// At least one map, use_device_addr or use_device_ptr clause must appear on
// the directive.
// use_device_addr only satisfies the requirement when LangOpts.OpenMP >= 50.
if (!hasClauses(Clauses, OMPC_map, OMPC_use_device_ptr) &&
(LangOpts.OpenMP < 50 || !hasClauses(Clauses, OMPC_use_device_addr))) {
// Pick the clause list for the diagnostic to match the accepted set above.
// The two assignments are alternatives; without the 'else' the shorter
// pre-5.0 text would always be overwritten.
StringRef Expected;
if (LangOpts.OpenMP < 50)
Expected = "'map' or 'use_device_ptr'";
else
Expected = "'map', 'use_device_ptr', or 'use_device_addr'";
Diag(StartLoc, diag::err_omp_no_clause_for_directive)
<< Expected << getOpenMPDirectiveName(OMPD_target_data);
return StmtError();
return OMPTargetDataDirective::Create(Context, StartLoc, EndLoc, Clauses,
/// Semantic action for the OpenMP 'target enter data' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param StartLoc Start location, also used for the error diagnostic.
/// \param EndLoc End location of the directive.
/// \param AStmt Associated statement; cast to CapturedStmt when non-null.
/// \returns StmtError() when \p AStmt is null or no 'map' clause is present;
/// otherwise the result of OMPTargetEnterDataDirective::Create.
Sema::ActOnOpenMPTargetEnterDataDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc, Stmt *AStmt) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Descend through the nested captured statements, one step per capture level
// above the first.
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_enter_data);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// OpenMP [2.10.2, Restrictions, p. 99]
// At least one map clause must appear on the directive.
if (!hasClauses(Clauses, OMPC_map)) {
Diag(StartLoc, diag::err_omp_no_clause_for_directive)
<< "'map'" << getOpenMPDirectiveName(OMPD_target_enter_data);
return StmtError();
return OMPTargetEnterDataDirective::Create(Context, StartLoc, EndLoc, Clauses,
/// Semantic action for the OpenMP 'target exit data' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param StartLoc Start location, also used for the error diagnostic.
/// \param EndLoc End location of the directive.
/// \param AStmt Associated statement; cast to CapturedStmt when non-null.
/// \returns StmtError() when \p AStmt is null or no 'map' clause is present;
/// otherwise the result of OMPTargetExitDataDirective::Create.
Sema::ActOnOpenMPTargetExitDataDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc, Stmt *AStmt) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Descend through the nested captured statements, one step per capture level
// above the first.
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_exit_data);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// OpenMP [2.10.3, Restrictions, p. 102]
// At least one map clause must appear on the directive.
if (!hasClauses(Clauses, OMPC_map)) {
Diag(StartLoc, diag::err_omp_no_clause_for_directive)
<< "'map'" << getOpenMPDirectiveName(OMPD_target_exit_data);
return StmtError();
return OMPTargetExitDataDirective::Create(Context, StartLoc, EndLoc, Clauses,
/// Semantic action for the OpenMP 'target update' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param StartLoc Start location, also used for the error diagnostic.
/// \param EndLoc End location of the directive.
/// \param AStmt Associated statement; cast to CapturedStmt when non-null.
/// \returns StmtError() when \p AStmt is null or neither a 'to' nor a 'from'
/// clause is present; otherwise the result of
/// OMPTargetUpdateDirective::Create.
StmtResult Sema::ActOnOpenMPTargetUpdateDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc,
Stmt *AStmt) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Descend through the nested captured statements, one step per capture level
// above the first.
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_update);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// At least one motion clause ('to' or 'from') is required.
if (!hasClauses(Clauses, OMPC_to, OMPC_from)) {
Diag(StartLoc, diag::err_omp_at_least_one_motion_clause_required);
return StmtError();
return OMPTargetUpdateDirective::Create(Context, StartLoc, EndLoc, Clauses,
/// Semantic action for the OpenMP 'teams' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; cast to CapturedStmt when non-null.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \returns StmtError() when \p AStmt is null; otherwise the result of
/// OMPTeamsDirective::Create.
StmtResult Sema::ActOnOpenMPTeamsDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
return OMPTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt);
/// Semantic action for the OpenMP 'cancellation point' directive.
///
/// \param StartLoc Start location, also used for the error diagnostics.
/// \param EndLoc End location of the directive.
/// \param CancelRegion Kind of region the cancellation point applies to.
/// \returns StmtError() when DSAStack reports that the parent region is a
/// nowait or an ordered region; otherwise the result of
/// OMPCancellationPointDirective::Create.
Sema::ActOnOpenMPCancellationPointDirective(SourceLocation StartLoc,
SourceLocation EndLoc,
OpenMPDirectiveKind CancelRegion) {
// The streamed 0 distinguishes this directive in the shared diagnostics;
// ActOnOpenMPCancelDirective streams 1 into the same diagnostics.
if (DSAStack->isParentNowaitRegion()) {
Diag(StartLoc, diag::err_omp_parent_cancel_region_nowait) << 0;
return StmtError();
if (DSAStack->isParentOrderedRegion()) {
Diag(StartLoc, diag::err_omp_parent_cancel_region_ordered) << 0;
return StmtError();
return OMPCancellationPointDirective::Create(Context, StartLoc, EndLoc,
/// Semantic action for the OpenMP 'cancel' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param StartLoc Start location, also used for the error diagnostics.
/// \param EndLoc End location of the directive.
/// \param CancelRegion Kind of region being cancelled.
/// \returns StmtError() when DSAStack reports that the parent region is a
/// nowait or an ordered region; otherwise the result of
/// OMPCancelDirective::Create.
StmtResult Sema::ActOnOpenMPCancelDirective(ArrayRef<OMPClause *> Clauses,
SourceLocation StartLoc,
SourceLocation EndLoc,
OpenMPDirectiveKind CancelRegion) {
// The streamed 1 distinguishes this directive in the shared diagnostics;
// ActOnOpenMPCancellationPointDirective streams 0 into the same diagnostics.
if (DSAStack->isParentNowaitRegion()) {
Diag(StartLoc, diag::err_omp_parent_cancel_region_nowait) << 1;
return StmtError();
if (DSAStack->isParentOrderedRegion()) {
Diag(StartLoc, diag::err_omp_parent_cancel_region_ordered) << 1;
return StmtError();
return OMPCancelDirective::Create(Context, StartLoc, EndLoc, Clauses,
/// Diagnose 'grainsize' and 'num_tasks' clauses that appear together.
///
/// \param S Sema instance used to emit diagnostics.
/// \param Clauses Clauses to inspect.
/// \returns true if at least one conflict was diagnosed, false otherwise.
static bool checkGrainsizeNumTasksClauses(Sema &S,
ArrayRef<OMPClause *> Clauses) {
// First grainsize/num_tasks clause seen; later ones are compared against it.
const OMPClause *PrevClause = nullptr;
bool ErrorFound = false;
for (const OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_grainsize ||
C->getClauseKind() == OMPC_num_tasks) {
if (!PrevClause)
PrevClause = C;
else if (PrevClause->getClauseKind() != C->getClauseKind()) {
// Report the conflicting clause and point a note at the earlier one.
S.Diag(C->getBeginLoc(), diag::err_omp_clauses_mutually_exclusive)
<< getOpenMPClauseName(C->getClauseKind())
<< getOpenMPClauseName(PrevClause->getClauseKind());
S.Diag(PrevClause->getBeginLoc(), diag::note_omp_previous_clause)
<< getOpenMPClauseName(PrevClause->getClauseKind());
ErrorFound = true;
return ErrorFound;
/// Diagnose a 'reduction' clause used together with a 'nogroup' clause.
///
/// \param S Sema instance used to emit the diagnostic.
/// \param Clauses Clauses to inspect.
/// \returns true after emitting err_omp_reduction_with_nogroup when both
/// clause kinds were found, false otherwise.
static bool checkReductionClauseWithNogroup(Sema &S,
ArrayRef<OMPClause *> Clauses) {
const OMPClause *ReductionClause = nullptr;
const OMPClause *NogroupClause = nullptr;
for (const OMPClause *C : Clauses) {
if (C->getClauseKind() == OMPC_reduction) {
ReductionClause = C;
// NOTE(review): no statement is visible under this guard; presumably the
// scan stops early once both kinds are known - confirm against upstream.
if (NogroupClause)
if (C->getClauseKind() == OMPC_nogroup) {
NogroupClause = C;
// NOTE(review): same as above - the guarded statement is not visible.
if (ReductionClause)
if (ReductionClause && NogroupClause) {
// The error is reported at the reduction clause; the streamed range starts
// at the nogroup clause.
S.Diag(ReductionClause->getBeginLoc(), diag::err_omp_reduction_with_nogroup)
<< SourceRange(NogroupClause->getBeginLoc(),
return true;
return false;
/// Semantic action for the OpenMP 'taskloop' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; asserted to be a CapturedStmt.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \param VarsWithImplicitDSA Forwarded to checkOpenMPLoop.
/// \returns StmtError() when \p AStmt is null, when checkOpenMPLoop reports
/// zero nested loops, or when either clause-combination check fails;
/// otherwise the result of OMPTaskLoopDirective::Create.
StmtResult Sema::ActOnOpenMPTaskLoopDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' or 'ordered' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_taskloop, getCollapseNumberExpr(Clauses),
/*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A zero loop count is treated as failure.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// The grainsize clause and num_tasks clause are mutually exclusive and may
// not appear on the same taskloop directive.
if (checkGrainsizeNumTasksClauses(*this, Clauses))
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// If a reduction clause is present on the taskloop directive, the nogroup
// clause must not be specified.
if (checkReductionClauseWithNogroup(*this, Clauses))
return StmtError();
return OMPTaskLoopDirective::Create(Context, StartLoc, EndLoc,
NestedLoopCount, Clauses, AStmt, B,
/// Semantic action for the OpenMP 'taskloop simd' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; asserted to be a CapturedStmt.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \param VarsWithImplicitDSA Forwarded to checkOpenMPLoop.
/// \returns StmtError() when \p AStmt is null, when checkOpenMPLoop reports
/// zero nested loops, when finishing a linear clause fails, or when any of
/// the clause-combination checks fails; otherwise the result of
/// OMPTaskLoopSimdDirective::Create.
StmtResult Sema::ActOnOpenMPTaskLoopSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' or 'ordered' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_taskloop_simd, getCollapseNumberExpr(Clauses),
/*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A zero loop count is treated as failure.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
// Linear clauses are only finalized outside dependent contexts.
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// The grainsize clause and num_tasks clause are mutually exclusive and may
// not appear on the same taskloop directive.
if (checkGrainsizeNumTasksClauses(*this, Clauses))
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// If a reduction clause is present on the taskloop directive, the nogroup
// clause must not be specified.
if (checkReductionClauseWithNogroup(*this, Clauses))
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPTaskLoopSimdDirective::Create(Context, StartLoc, EndLoc,
NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for the OpenMP 'master taskloop' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; asserted to be a CapturedStmt.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \param VarsWithImplicitDSA Forwarded to checkOpenMPLoop.
/// \returns StmtError() when \p AStmt is null, when checkOpenMPLoop reports
/// zero nested loops, or when either clause-combination check fails;
/// otherwise the result of OMPMasterTaskLoopDirective::Create.
StmtResult Sema::ActOnOpenMPMasterTaskLoopDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' or 'ordered' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_master_taskloop, getCollapseNumberExpr(Clauses),
/*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A zero loop count is treated as failure.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// The grainsize clause and num_tasks clause are mutually exclusive and may
// not appear on the same taskloop directive.
if (checkGrainsizeNumTasksClauses(*this, Clauses))
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// If a reduction clause is present on the taskloop directive, the nogroup
// clause must not be specified.
if (checkReductionClauseWithNogroup(*this, Clauses))
return StmtError();
return OMPMasterTaskLoopDirective::Create(Context, StartLoc, EndLoc,
NestedLoopCount, Clauses, AStmt, B,
/// Semantic action for the OpenMP 'master taskloop simd' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; asserted to be a CapturedStmt.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \param VarsWithImplicitDSA Forwarded to checkOpenMPLoop.
/// \returns StmtError() when \p AStmt is null, when checkOpenMPLoop reports
/// zero nested loops, when finishing a linear clause fails, or when any of
/// the clause-combination checks fails; otherwise the result of
/// OMPMasterTaskLoopSimdDirective::Create.
StmtResult Sema::ActOnOpenMPMasterTaskLoopSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' or 'ordered' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_master_taskloop_simd, getCollapseNumberExpr(Clauses),
/*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A zero loop count is treated as failure.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
// Linear clauses are only finalized outside dependent contexts.
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// The grainsize clause and num_tasks clause are mutually exclusive and may
// not appear on the same taskloop directive.
if (checkGrainsizeNumTasksClauses(*this, Clauses))
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// If a reduction clause is present on the taskloop directive, the nogroup
// clause must not be specified.
if (checkReductionClauseWithNogroup(*this, Clauses))
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPMasterTaskLoopSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for the OpenMP 'parallel master taskloop' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; asserted and cast to CapturedStmt.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \param VarsWithImplicitDSA Forwarded to checkOpenMPLoop.
/// \returns StmtError() when \p AStmt is null, when checkOpenMPLoop reports
/// zero nested loops, or when either clause-combination check fails;
/// otherwise the result of OMPParallelMasterTaskLoopDirective::Create.
StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Descend one nested captured statement per iteration; the innermost CS is
// what checkOpenMPLoop analyses.
for (int ThisCaptureLevel =
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' or 'ordered' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_parallel_master_taskloop, getCollapseNumberExpr(Clauses),
/*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A zero loop count is treated as failure.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// The grainsize clause and num_tasks clause are mutually exclusive and may
// not appear on the same taskloop directive.
if (checkGrainsizeNumTasksClauses(*this, Clauses))
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// If a reduction clause is present on the taskloop directive, the nogroup
// clause must not be specified.
if (checkReductionClauseWithNogroup(*this, Clauses))
return StmtError();
return OMPParallelMasterTaskLoopDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
/// Semantic action for the OpenMP 'parallel master taskloop simd' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; asserted and cast to CapturedStmt.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \param VarsWithImplicitDSA Forwarded to checkOpenMPLoop.
/// \returns StmtError() when \p AStmt is null, when checkOpenMPLoop reports
/// zero nested loops, when finishing a linear clause fails, or when any of
/// the clause-combination checks fails; otherwise the result of
/// OMPParallelMasterTaskLoopSimdDirective::Create.
StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Descend one nested captured statement per iteration; the innermost CS is
// what checkOpenMPLoop analyses.
for (int ThisCaptureLevel =
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' or 'ordered' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_parallel_master_taskloop_simd, getCollapseNumberExpr(Clauses),
/*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A zero loop count is treated as failure.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
// Linear clauses are only finalized outside dependent contexts.
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// The grainsize clause and num_tasks clause are mutually exclusive and may
// not appear on the same taskloop directive.
if (checkGrainsizeNumTasksClauses(*this, Clauses))
return StmtError();
// OpenMP, [2.9.2 taskloop Construct, Restrictions]
// If a reduction clause is present on the taskloop directive, the nogroup
// clause must not be specified.
if (checkReductionClauseWithNogroup(*this, Clauses))
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPParallelMasterTaskLoopSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for the OpenMP 'distribute' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; asserted to be a CapturedStmt.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \param VarsWithImplicitDSA Forwarded to checkOpenMPLoop.
/// \returns StmtError() when \p AStmt is null or checkOpenMPLoop reports zero
/// nested loops; otherwise the result of OMPDistributeDirective::Create.
StmtResult Sema::ActOnOpenMPDistributeDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_distribute, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, AStmt,
*this, *DSAStack, VarsWithImplicitDSA, B);
// A zero loop count is treated as failure.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
return OMPDistributeDirective::Create(Context, StartLoc, EndLoc,
NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for the OpenMP 'distribute parallel for' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; cast to CapturedStmt when non-null.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \param VarsWithImplicitDSA Forwarded to checkOpenMPLoop.
/// \returns StmtError() when \p AStmt is null or checkOpenMPLoop reports zero
/// nested loops; otherwise the result of
/// OMPDistributeParallelForDirective::Create, which also receives the
/// taskgroup reduction reference and cancel-region flag read from DSAStack.
StmtResult Sema::ActOnOpenMPDistributeParallelForDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Descend one nested captured statement per iteration; the innermost CS is
// what checkOpenMPLoop analyses.
for (int ThisCaptureLevel =
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_distribute_parallel_for, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A zero loop count is treated as failure.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
return OMPDistributeParallelForDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
/// Semantic action for the OpenMP 'distribute parallel for simd' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; cast to CapturedStmt when non-null.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \param VarsWithImplicitDSA Forwarded to checkOpenMPLoop.
/// \returns StmtError() when \p AStmt is null, when checkOpenMPLoop reports
/// zero nested loops, when finishing a linear clause fails, or when
/// checkSimdlenSafelenSpecified reports a problem; otherwise the result of
/// OMPDistributeParallelForSimdDirective::Create.
StmtResult Sema::ActOnOpenMPDistributeParallelForSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Descend one nested captured statement per iteration; the innermost CS is
// what checkOpenMPLoop analyses.
for (int ThisCaptureLevel =
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A zero loop count is treated as failure.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
// Linear clauses are only finalized outside dependent contexts.
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPDistributeParallelForSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for the OpenMP 'distribute simd' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; cast to CapturedStmt when non-null.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \param VarsWithImplicitDSA Forwarded to checkOpenMPLoop.
/// \returns StmtError() when \p AStmt is null, when checkOpenMPLoop reports
/// zero nested loops, when finishing a linear clause fails, or when
/// checkSimdlenSafelenSpecified reports a problem; otherwise the result of
/// OMPDistributeSimdDirective::Create.
StmtResult Sema::ActOnOpenMPDistributeSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Descend through the nested captured statements, one step per capture level
// above the first; the innermost CS is what checkOpenMPLoop analyses.
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_distribute_simd);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_distribute_simd, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this,
*DSAStack, VarsWithImplicitDSA, B);
// A zero loop count is treated as failure.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
// Linear clauses are only finalized outside dependent contexts.
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPDistributeSimdDirective::Create(Context, StartLoc, EndLoc,
NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for the OpenMP 'target parallel for simd' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; cast to CapturedStmt when non-null.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \param VarsWithImplicitDSA Forwarded to checkOpenMPLoop.
/// \returns StmtError() when \p AStmt is null, when checkOpenMPLoop reports
/// zero nested loops, when finishing a linear clause fails, or when
/// checkSimdlenSafelenSpecified reports a problem; otherwise the result of
/// OMPTargetParallelForSimdDirective::Create.
StmtResult Sema::ActOnOpenMPTargetParallelForSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Descend through the nested captured statements, one step per capture level
// above the first. The levels are queried with this directive's own kind,
// the same kind that is passed to checkOpenMPLoop below.
for (int ThisCaptureLevel =
getOpenMPCaptureLevels(OMPD_target_parallel_for_simd);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' or 'ordered' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_target_parallel_for_simd, getCollapseNumberExpr(Clauses),
getOrderedNumberExpr(Clauses), CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A zero loop count is treated as failure.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target parallel for simd loop exprs were not built");
// Linear clauses are only finalized outside dependent contexts.
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPTargetParallelForSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for the OpenMP 'target simd' directive.
///
/// \param Clauses Clauses attached to the directive.
/// \param AStmt Associated statement; cast to CapturedStmt when non-null.
/// \param StartLoc Start location of the directive.
/// \param EndLoc End location of the directive.
/// \param VarsWithImplicitDSA Forwarded to checkOpenMPLoop.
/// \returns StmtError() when \p AStmt is null, when checkOpenMPLoop reports
/// zero nested loops, when finishing a linear clause fails, or when
/// checkSimdlenSafelenSpecified reports a problem; otherwise the result of
/// OMPTargetSimdDirective::Create.
StmtResult Sema::ActOnOpenMPTargetSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Descend through the nested captured statements, one step per capture level
// above the first; the innermost CS is what checkOpenMPLoop analyses.
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_simd);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' with number of loops, it will define the
// nested loops number.
// Note that the 'ordered' loop-count expression is passed here as well.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_target_simd, getCollapseNumberExpr(Clauses),
getOrderedNumberExpr(Clauses), CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A zero loop count is treated as failure.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target simd loop exprs were not built");
// Linear clauses are only finalized outside dependent contexts.
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPTargetSimdDirective::Create(Context, StartLoc, EndLoc,
NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for the OpenMP 'teams distribute' directive.
///
/// Analyses the loop nest of the associated statement with checkOpenMPLoop
/// and, on success, creates an OMPTeamsDistributeDirective.
///
/// \param Clauses Clauses attached to the directive; used for the collapse
/// loop count and forwarded to the created directive.
/// \param AStmt Associated statement. It is cast<> to CapturedStmt once it is
/// known to be non-null.
/// \param StartLoc Start location forwarded to the created directive.
/// \param EndLoc End location forwarded to the created directive.
/// \param VarsWithImplicitDSA Forwarded unchanged to checkOpenMPLoop.
/// \returns StmtError() if \p AStmt is null or checkOpenMPLoop reports zero
/// nested loops; otherwise the new directive.
///
/// NOTE(review): this listing looks truncated - block-closing braces are not
/// present here. Confirm against the complete file.
StmtResult Sema::ActOnOpenMPTeamsDistributeDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Step inward through the nested captured statements; ThisCaptureLevel counts
// down from the directive's capture-level count to 1.
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_teams_distribute);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount =
checkOpenMPLoop(OMPD_teams_distribute, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this,
*DSAStack, VarsWithImplicitDSA, B);
// A loop count of zero is treated as a failed loop analysis.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp teams distribute loop exprs were not built");
return OMPTeamsDistributeDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for the OpenMP 'teams distribute simd' directive.
///
/// Analyses the loop nest of the associated statement with checkOpenMPLoop
/// and, when every check passes, creates an OMPTeamsDistributeSimdDirective.
///
/// \param Clauses Clauses attached to the directive; used for the collapse
/// loop count, for finalizing linear clauses and for the simdlen/safelen
/// check.
/// \param AStmt Associated statement. It is cast<> to CapturedStmt once it is
/// known to be non-null.
/// \param StartLoc Start location forwarded to the created directive.
/// \param EndLoc End location forwarded to the created directive.
/// \param VarsWithImplicitDSA Forwarded unchanged to checkOpenMPLoop.
/// \returns StmtError() if \p AStmt is null, if checkOpenMPLoop reports zero
/// nested loops, if FinishOpenMPLinearClause returns true, or if
/// checkSimdlenSafelenSpecified returns true; otherwise the new directive.
///
/// NOTE(review): this listing looks truncated - the initializer of the
/// capture-level loop, the end of the FinishOpenMPLinearClause call and the
/// block-closing braces are not present here. Confirm against the complete
/// file.
StmtResult Sema::ActOnOpenMPTeamsDistributeSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Step inward through the nested captured statements while ThisCaptureLevel
// counts down to 1.
for (int ThisCaptureLevel =
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_teams_distribute_simd, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A loop count of zero is treated as a failed loop analysis.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp teams distribute simd loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPTeamsDistributeSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for the OpenMP 'teams distribute parallel for simd'
/// directive.
///
/// Analyses the loop nest of the associated statement with checkOpenMPLoop
/// and, when every check passes, creates an
/// OMPTeamsDistributeParallelForSimdDirective.
///
/// \param Clauses Clauses attached to the directive; used for the collapse
/// loop count, for finalizing linear clauses and for the simdlen/safelen
/// check.
/// \param AStmt Associated statement. It is cast<> to CapturedStmt once it is
/// known to be non-null.
/// \param StartLoc Start location forwarded to the created directive.
/// \param EndLoc End location forwarded to the created directive.
/// \param VarsWithImplicitDSA Forwarded unchanged to checkOpenMPLoop.
/// \returns StmtError() if \p AStmt is null, if checkOpenMPLoop reports zero
/// nested loops, if FinishOpenMPLinearClause returns true, or if
/// checkSimdlenSafelenSpecified returns true; otherwise the new directive.
///
/// NOTE(review): this listing looks truncated - the initializer of the
/// capture-level loop, the end of the FinishOpenMPLinearClause call and the
/// block-closing braces are not present here. Confirm against the complete
/// file.
StmtResult Sema::ActOnOpenMPTeamsDistributeParallelForSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Step inward through the nested captured statements while ThisCaptureLevel
// counts down to 1.
for (int ThisCaptureLevel =
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_teams_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A loop count of zero is treated as a failed loop analysis.
if (NestedLoopCount == 0)
return StmtError();
// NOTE(review): the assertion text below mentions only "omp for", unlike the
// sibling handlers whose messages name their own directive.
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPTeamsDistributeParallelForSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for the OpenMP 'teams distribute parallel for' directive.
///
/// Analyses the loop nest of the associated statement with checkOpenMPLoop
/// and, on success, creates an OMPTeamsDistributeParallelForDirective. The
/// taskgroup reduction reference and the cancel-region flag are read from
/// DSAStack and passed to the created directive.
///
/// \param Clauses Clauses attached to the directive; used for the collapse
/// loop count and forwarded to the created directive.
/// \param AStmt Associated statement. It is cast<> to CapturedStmt once it is
/// known to be non-null.
/// \param StartLoc Start location forwarded to the created directive.
/// \param EndLoc End location forwarded to the created directive.
/// \param VarsWithImplicitDSA Forwarded unchanged to checkOpenMPLoop.
/// \returns StmtError() if \p AStmt is null or checkOpenMPLoop reports zero
/// nested loops; otherwise the new directive.
///
/// NOTE(review): this listing looks truncated - the initializer of the
/// capture-level loop and the block-closing braces are not present here.
/// Confirm against the complete file.
StmtResult Sema::ActOnOpenMPTeamsDistributeParallelForDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Step inward through the nested captured statements while ThisCaptureLevel
// counts down to 1.
for (int ThisCaptureLevel =
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_teams_distribute_parallel_for, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A loop count of zero is treated as a failed loop analysis.
if (NestedLoopCount == 0)
return StmtError();
// NOTE(review): the assertion text below mentions only "omp for", unlike the
// sibling handlers whose messages name their own directive.
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp for loop exprs were not built");
return OMPTeamsDistributeParallelForDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
/// Semantic action for the OpenMP 'target teams' directive.
///
/// Unlike the loop-based handlers around it, this one performs no loop
/// analysis: it walks the nested captured statements and creates an
/// OMPTargetTeamsDirective.
///
/// \param Clauses Clauses forwarded to the created directive.
/// \param AStmt Associated statement. It is cast<> to CapturedStmt once it is
/// known to be non-null.
/// \param StartLoc Start location forwarded to the created directive.
/// \param EndLoc End location forwarded to the created directive.
/// \returns StmtError() if \p AStmt is null; otherwise the new directive.
///
/// NOTE(review): this listing looks truncated - the final argument(s) of the
/// Create call and the block-closing braces are not present here. Confirm
/// against the complete file.
StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Step inward through the nested captured statements; ThisCaptureLevel counts
// down from the directive's capture-level count to 1.
for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_teams);
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses,
/// Semantic action for the OpenMP 'target teams distribute' directive.
///
/// Analyses the loop nest of the associated statement with checkOpenMPLoop
/// and, on success, creates an OMPTargetTeamsDistributeDirective.
///
/// \param Clauses Clauses attached to the directive; used for the collapse
/// loop count and forwarded to the created directive.
/// \param AStmt Associated statement. It is cast<> to CapturedStmt once it is
/// known to be non-null.
/// \param StartLoc Start location forwarded to the created directive.
/// \param EndLoc End location forwarded to the created directive.
/// \param VarsWithImplicitDSA Forwarded unchanged to checkOpenMPLoop.
/// \returns StmtError() if \p AStmt is null or checkOpenMPLoop reports zero
/// nested loops; otherwise the new directive.
///
/// NOTE(review): this listing looks truncated - the initializer of the
/// capture-level loop and the block-closing braces are not present here.
/// Confirm against the complete file.
StmtResult Sema::ActOnOpenMPTargetTeamsDistributeDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Step inward through the nested captured statements while ThisCaptureLevel
// counts down to 1.
for (int ThisCaptureLevel =
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_target_teams_distribute, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A loop count of zero is treated as a failed loop analysis.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target teams distribute loop exprs were not built");
return OMPTargetTeamsDistributeDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for the OpenMP 'target teams distribute parallel for'
/// directive.
///
/// Analyses the loop nest of the associated statement with checkOpenMPLoop
/// and, when every check passes, creates an
/// OMPTargetTeamsDistributeParallelForDirective. The taskgroup reduction
/// reference and the cancel-region flag are read from DSAStack and passed to
/// the created directive.
///
/// \param Clauses Clauses attached to the directive; used for the collapse
/// loop count and for finalizing linear clauses.
/// \param AStmt Associated statement. It is cast<> to CapturedStmt once it is
/// known to be non-null.
/// \param StartLoc Start location forwarded to the created directive.
/// \param EndLoc End location forwarded to the created directive.
/// \param VarsWithImplicitDSA Forwarded unchanged to checkOpenMPLoop.
/// \returns StmtError() if \p AStmt is null, if checkOpenMPLoop reports zero
/// nested loops, or if FinishOpenMPLinearClause returns true; otherwise the
/// new directive.
///
/// NOTE(review): this listing looks truncated - the initializer of the
/// capture-level loop, the end of the FinishOpenMPLinearClause call and the
/// block-closing braces are not present here. Confirm against the complete
/// file.
StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Step inward through the nested captured statements while ThisCaptureLevel
// counts down to 1.
for (int ThisCaptureLevel =
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_target_teams_distribute_parallel_for, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A loop count of zero is treated as a failed loop analysis.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target teams distribute parallel for loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
return OMPTargetTeamsDistributeParallelForDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B,
DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
/// Semantic action for the OpenMP 'target teams distribute parallel for simd'
/// directive.
///
/// Computes a nested-loop count for the associated statement and, when every
/// check passes, creates an OMPTargetTeamsDistributeParallelForSimdDirective.
///
/// \param Clauses Clauses attached to the directive; used for finalizing
/// linear clauses and for the simdlen/safelen check.
/// \param AStmt Associated statement. It is cast<> to CapturedStmt once it is
/// known to be non-null.
/// \param StartLoc Start location forwarded to the created directive.
/// \param EndLoc End location forwarded to the created directive.
/// \param VarsWithImplicitDSA Passed on to the loop analysis.
/// \returns StmtError() if \p AStmt is null, if the nested-loop count is
/// zero, if FinishOpenMPLinearClause returns true, or if
/// checkSimdlenSafelenSpecified returns true; otherwise the new directive.
///
/// NOTE(review): this listing looks heavily truncated - the argument of
/// getOpenMPCaptureLevels, the callee and leading arguments of the
/// NestedLoopCount initializer (presumably checkOpenMPLoop, as in the sibling
/// handlers - verify), the tail of the assertion message, the end of the
/// FinishOpenMPLinearClause call and the block-closing braces are not present
/// here. Confirm against the complete file.
StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Step inward through the nested captured statements while ThisCaptureLevel
// counts down to 1.
for (int ThisCaptureLevel = getOpenMPCaptureLevels(
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount =
nullptr /*ordered not a clause on distribute*/, CS, *this,
*DSAStack, VarsWithImplicitDSA, B);
// A loop count of zero is treated as a failed loop analysis.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target teams distribute parallel for simd loop exprs were not "
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPTargetTeamsDistributeParallelForSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
/// Semantic action for the OpenMP 'target teams distribute simd' directive.
///
/// Analyses the loop nest of the associated statement with checkOpenMPLoop
/// and, when every check passes, creates an
/// OMPTargetTeamsDistributeSimdDirective.
///
/// \param Clauses Clauses attached to the directive; used for the collapse
/// loop count, for finalizing linear clauses and for the simdlen/safelen
/// check.
/// \param AStmt Associated statement. It is cast<> to CapturedStmt once it is
/// known to be non-null.
/// \param StartLoc Start location forwarded to the created directive.
/// \param EndLoc End location forwarded to the created directive.
/// \param VarsWithImplicitDSA Forwarded unchanged to checkOpenMPLoop.
/// \returns StmtError() if \p AStmt is null, if checkOpenMPLoop reports zero
/// nested loops, if FinishOpenMPLinearClause returns true, or if
/// checkSimdlenSafelenSpecified returns true; otherwise the new directive.
///
/// NOTE(review): this listing looks truncated - the initializer of the
/// capture-level loop, the end of the FinishOpenMPLinearClause call and the
/// block-closing braces are not present here. Confirm against the complete
/// file.
StmtResult Sema::ActOnOpenMPTargetTeamsDistributeSimdDirective(
ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
if (!AStmt)
return StmtError();
auto *CS = cast<CapturedStmt>(AStmt);
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
// Step inward through the nested captured statements while ThisCaptureLevel
// counts down to 1.
for (int ThisCaptureLevel =
ThisCaptureLevel > 1; --ThisCaptureLevel) {
CS = cast<CapturedStmt>(CS->getCapturedStmt());
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
OMPLoopDirective::HelperExprs B;
// In presence of clause 'collapse' with number of loops, it will
// define the nested loops number.
unsigned NestedLoopCount = checkOpenMPLoop(
OMPD_target_teams_distribute_simd, getCollapseNumberExpr(Clauses),
nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack,
VarsWithImplicitDSA, B);
// A loop count of zero is treated as a failed loop analysis.
if (NestedLoopCount == 0)
return StmtError();
assert((CurContext->isDependentContext() || B.builtAll()) &&
"omp target teams distribute simd loop exprs were not built");
if (!CurContext->isDependentContext()) {
// Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) {
if (auto *LC = dyn_cast<OMPLinearClause>(C))
if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
B.NumIterations, *this, CurScope,
return StmtError();
if (checkSimdlenSafelenSpecified(*this, Clauses))
return StmtError();
return OMPTargetTeamsDistributeSimdDirective::Create(
Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
/// Dispatches a clause that carries a single expression argument to the
/// handler for its clause kind.
///
/// \param Kind Kind of the clause being processed.
/// \param Expr The clause's expression argument; passed through to the
/// clause-specific handler.
/// \param StartLoc Start location of the clause.
/// \param LParenLoc Location of the opening parenthesis.
/// \param EndLoc End location of the clause.
/// \returns Whatever the clause-specific handler returns. Clause kinds that
/// are not handled here are a programming error and hit llvm_unreachable.
OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr,
                                             SourceLocation StartLoc,
                                             SourceLocation LParenLoc,
                                             SourceLocation EndLoc) {
  OMPClause *Res = nullptr;
  // Every handled case ends in 'break' so that control reaches 'return Res'
  // rather than falling through into the llvm_unreachable group below.
  switch (Kind) {
  case OMPC_final:
    Res = ActOnOpenMPFinalClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  case OMPC_num_threads:
    Res = ActOnOpenMPNumThreadsClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  case OMPC_safelen:
    Res = ActOnOpenMPSafelenClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  case OMPC_simdlen:
    Res = ActOnOpenMPSimdlenClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  case OMPC_allocator:
    Res = ActOnOpenMPAllocatorClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  case OMPC_collapse:
    Res = ActOnOpenMPCollapseClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  case OMPC_ordered:
    // Note the different argument order of this handler.
    Res = ActOnOpenMPOrderedClause(StartLoc, EndLoc, LParenLoc, Expr);
    break;
  case OMPC_num_teams:
    Res = ActOnOpenMPNumTeamsClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  case OMPC_thread_limit:
    Res = ActOnOpenMPThreadLimitClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  case OMPC_priority:
    Res = ActOnOpenMPPriorityClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  case OMPC_grainsize:
    Res = ActOnOpenMPGrainsizeClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  case OMPC_num_tasks:
    Res = ActOnOpenMPNumTasksClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  case OMPC_hint:
    Res = ActOnOpenMPHintClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  case OMPC_depobj:
    Res = ActOnOpenMPDepobjClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  case OMPC_detach:
    Res = ActOnOpenMPDetachClause(Expr, StartLoc, LParenLoc, EndLoc);
    break;
  // None of the remaining clause kinds is handled by this entry point.
  case OMPC_device:
  case OMPC_if:
  case OMPC_default:
  case OMPC_proc_bind:
  case OMPC_schedule:
  case OMPC_private:
  case OMPC_firstprivate:
  case OMPC_lastprivate:
  case OMPC_shared:
  case OMPC_reduction:
  case OMPC_task_reduction:
  case OMPC_in_reduction:
  case OMPC_linear:
  case OMPC_aligned:
  case OMPC_copyin:
  case OMPC_copyprivate:
  case OMPC_nowait:
  case OMPC_untied:
  case OMPC_mergeable:
  case OMPC_threadprivate:
  case OMPC_allocate:
  case OMPC_flush:
  case OMPC_read:
  case OMPC_write:
  case OMPC_update:
  case OMPC_capture:
  case OMPC_seq_cst:
  case OMPC_acq_rel:
  case OMPC_acquire:
  case OMPC_release:
  case OMPC_relaxed:
  case OMPC_depend:
  case OMPC_threads:
  case OMPC_simd:
  case OMPC_map:
  case OMPC_nogroup:
  case OMPC_dist_schedule:
  case OMPC_defaultmap:
  case OMPC_unknown:
  case OMPC_uniform:
  case OMPC_to:
  case OMPC_from:
  case OMPC_use_device_ptr:
  case OMPC_use_device_addr:
  case OMPC_is_device_ptr:
  case OMPC_unified_address:
  case OMPC_unified_shared_memory:
  case OMPC_reverse_offload:
  case OMPC_dynamic_allocators:
  case OMPC_atomic_default_mem_order:
  case OMPC_device_type:
  case OMPC_match:
  case OMPC_nontemporal:
  case OMPC_order:
  case OMPC_destroy:
  case OMPC_inclusive:
  case OMPC_exclusive:
  case OMPC_uses_allocators:
  case OMPC_affinity:
    llvm_unreachable("Clause is not allowed.");
  }
  return Res;
}
// An OpenMP directive such as 'target parallel' has two captured regions:
// for the 'target' and 'parallel' respectively. This function returns
// the region in which to capture expressions associated with a clause.
// A return value of OMPD_unknown signifies that the expression should not
// be captured.
static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
OpenMPDirectiveKind DKind, OpenMPClauseKind CKind, unsigned OpenMPVersion,
OpenMPDirectiveKind NameModifier = OMPD_unknown) {
OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
switch (CKind) {
case OMPC_if:
switch (DKind) {
case OMPD_target_parallel_for_simd:
if (OpenMPVersion >= 50 &&
(NameModifier == OMPD_unknown || NameModifier == OMPD_simd)) {
CaptureRegion = OMPD_parallel;
case OMPD_target_parallel:
case OMPD_target_parallel_for:
// If this clause applies to the nested 'parallel' region, capture within
// the 'target' region, otherwise do not capture.
if (NameModifier == OMPD_unknown || NameModifier == OMPD_parallel)
CaptureRegion = OMPD_target;
case OMPD_target_teams_distribute_parallel_for_simd:
if (OpenMPVersion >= 50 &&
(NameModifier == OMPD_unknown || NameModifier == OMPD_simd)) {
CaptureRegion = OMPD_parallel;
case OMPD_target_teams_distribute_parallel_for:
// If this clause applies to the nested 'parallel' region, capture within
// the 'teams' region, otherwise do not capture.
if (NameModifier == OMPD_unknown || NameModifier == OMPD_parallel)
CaptureRegion = OMPD_teams;
case OMPD_teams_distribute_parallel_for_simd:
if (OpenMPVersion >= 50 &&
(NameModifier == OMPD_unknown || NameModifier == OMPD_simd)) {
CaptureRegion = OMPD_parallel;
case OMPD_teams_distribute_parallel_for:
CaptureRegion = OMPD_teams;
case OMPD_target_update:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
CaptureRegion = OMPD_task;
case OMPD_parallel_master_taskloop:
if (NameModifier == OMPD_unknown || NameModifier == OMPD_taskloop)
CaptureRegion = OMPD_parallel;
case OMPD_parallel_master_taskloop_simd:
if ((OpenMPVersion <= 45 && NameModifier == OMPD_unknown) ||
NameModifier == OMPD_taskloop) {
CaptureRegion = OMPD_parallel;
if (OpenMPVersion <= 45)
if (NameModifier == OMPD_unknown || NameModifier == OMPD_simd)
CaptureRegion = OMPD_taskloop;
case OMPD_parallel_for_simd:
if (OpenMPVersion <= 45)
if (NameModifier == OMPD_unknown || NameModifier == OMPD_simd)
CaptureRegion = OMPD_parallel;
case OMPD_taskloop_simd:
case OMPD_master_taskloop_simd:
if (OpenMPVersion <= 45)
if (NameModifier == OMPD_unknown || NameModifier == OMPD_simd)
CaptureRegion = OMPD_taskloop;
case OMPD_distribute_parallel_for_simd:
if (OpenMPVersion <= 45)
if (NameModifier == OMPD_unknown || NameModifier == OMPD_simd)
CaptureRegion = OMPD_parallel;
case OMPD_target_simd:
if (OpenMPVersion >= 50 &&
(NameModifier == OMPD_unknown || NameModifier == OMPD_simd))
CaptureRegion = OMPD_target;
case OMPD_teams_distribute_simd:
case OMPD_target_teams_distribute_simd:
if (OpenMPVersion >= 50 &&
(NameModifier == OMPD_unknown || NameModifier == OMPD_simd))
CaptureRegion = OMPD_teams;
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_parallel_for:
case OMPD_target:
case OMPD_target_teams:
case OMPD_target_teams_distribute:
case OMPD_distribute_parallel_for:
case OMPD_task:
case OMPD_taskloop:
case OMPD_master_taskloop:
case OMPD_target_data:
case OMPD_simd:
case OMPD_for_simd:
case OMPD_distribute_simd:
// Do not capture if-clause expressions.
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_teams:
case OMPD_for:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_teams_distribute:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with if-clause");
case OMPD_unknown:
llvm_unreachable("Unknown OpenMP directive");
case OMPC_num_threads:
switch (DKind) {
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
CaptureRegion = OMPD_target;
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
CaptureRegion = OMPD_teams;
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
// Do not capture num_threads-clause expressions.
case OMPD_target_data:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target_update:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_teams:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_cancel:
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_teams:
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_distribute_simd:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with num_threads-clause");
case OMPD_unknown:
llvm_unreachable("Unknown OpenMP directive");
case OMPC_num_teams:
switch (DKind) {
case OMPD_target_teams:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
CaptureRegion = OMPD_target;
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_teams:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
// Do not capture num_teams-clause expressions.
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_target_data:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target_update:
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_distribute_simd:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with num_teams-clause");
case OMPD_unknown:
llvm_unreachable("Unknown OpenMP directive");
case OMPC_thread_limit:
switch (DKind) {
case OMPD_target_teams:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
CaptureRegion = OMPD_target;
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_teams:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
// Do not capture thread_limit-clause expressions.
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_target_data:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target_update:
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_distribute_simd:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with thread_limit-clause");
case OMPD_unknown:
llvm_unreachable("Unknown OpenMP directive");
case OMPC_schedule:
switch (DKind) {
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
CaptureRegion = OMPD_parallel;
case OMPD_for:
case OMPD_for_simd:
// Do not capture schedule-clause expressions.
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_target_data:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target_update:
case OMPD_teams:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_parallel:
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_distribute_simd:
case OMPD_target_teams:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with schedule clause");
case OMPD_unknown:
llvm_unreachable("Unknown OpenMP directive");
case OMPC_dist_schedule:
switch (DKind) {
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
CaptureRegion = OMPD_teams;
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_distribute:
case OMPD_distribute_simd:
// Do not capture dist_schedule-clause expressions.
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_target_parallel_for_simd:
case OMPD_target_parallel_for:
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_target_data:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target_update:
case OMPD_teams:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_parallel:
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_target_teams:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with schedule clause");
case OMPD_unknown:
llvm_unreachable("Unknown OpenMP directive");
case OMPC_device:
switch (DKind) {
case OMPD_target_update:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_teams:
case OMPD_target_parallel:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
CaptureRegion = OMPD_task;
case OMPD_target_data:
// Do not capture device-clause expressions.
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_teams:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_distribute_simd:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with num_teams-clause");
case OMPD_unknown:
llvm_unreachable("Unknown OpenMP directive");
case OMPC_grainsize:
case OMPC_num_tasks:
case OMPC_final:
case OMPC_priority:
switch (DKind) {
case OMPD_task:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
CaptureRegion = OMPD_parallel;
case OMPD_target_update:
case OMPD_target_enter_data:
case OMPD_target_exit_data:
case OMPD_target:
case OMPD_target_simd:
case OMPD_target_teams:
case OMPD_target_parallel:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
case OMPD_target_data:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_teams:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_cancel:
case OMPD_parallel:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_cancellation_point:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_simd:
case OMPD_for:
case OMPD_for_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskgroup:
case OMPD_distribute:
case OMPD_ordered:
case OMPD_atomic:
case OMPD_distribute_simd:
case OMPD_requires:
llvm_unreachable("Unexpected OpenMP directive with grainsize-clause");
case OMPD_unknown:
llvm_unreachable("Unknown OpenMP directive");
case OMPC_firstprivate:
case OMPC_lastprivate:
case OMPC_reduction:
case OMPC_task_reduction:
case OMPC_in_reduction:
case OMPC_linear:
case OMPC_default:
case OMPC_proc_bind:
case OMPC_safelen:
case OMPC_simdlen:
case OMPC_allocator:
case OMPC_collapse:
case OMPC_private:
case OMPC_shared:
case OMPC_aligned:
case OMPC_copyin:
case OMPC_copyprivate:
case OMPC_ordered:
case OMPC_nowait:
case OMPC_untied:
case OMPC_mergeable:
case OMPC_threadprivate:
case OMPC_allocate:
case OMPC_flush:
case OMPC_depobj:
case OMPC_read:
case OMPC_write:
case OMPC_update:
case OMPC_capture:
case OMPC_seq_cst:
case OMPC_acq_rel:
case OMPC_acquire:
case OMPC_release:
case OMPC_relaxed:
case OMPC_depend:
case OMPC_threads:
case OMPC_simd:
case OMPC_map:
case OMPC_nogroup:
case OMPC_hint:
case OMPC_defaultmap:
case OMPC_unknown:
case OMPC_uniform:
case OMPC_to:
case OMPC_from:
case OMPC_use_device_ptr:
case OMPC_use_device_addr:
case OMPC_is_device_ptr:
case OMPC_unified_address:
case OMPC_unified_shared_memory:
case OMPC_reverse_offload:
case OMPC_dynamic_allocators:
case OMPC_atomic_default_mem_order:
case OMPC_device_type:
case OMPC_match:
case OMPC_nontemporal:
case OMPC_order:
case OMPC_destroy:
case OMPC_detach:
case OMPC_inclusive:
case OMPC_exclusive:
case OMPC_uses_allocators:
case OMPC_affinity:
llvm_unreachable("Unexpected OpenMP clause.");
return CaptureRegion;
/// Semantic action for the OpenMP 'if' clause.
///
/// \param NameModifier Directive kind given as the clause's name modifier; it
/// is forwarded to getOpenMPCaptureRegionForClause and stored in the clause.
/// \param Condition The clause condition expression.
/// \param StartLoc, LParenLoc, NameModifierLoc, ColonLoc, EndLoc Source
/// locations that are stored in the resulting clause.
/// \returns A newly allocated OMPIfClause, or nullptr when
/// CheckBooleanCondition reports the condition as invalid.
OMPClause *Sema::ActOnOpenMPIfClause(OpenMPDirectiveKind NameModifier,
Expr *Condition, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation NameModifierLoc,
SourceLocation ColonLoc,
SourceLocation EndLoc) {
Expr *ValExpr = Condition;
Stmt *HelperValStmt = nullptr;
OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
// The condition is only checked and captured when it is not value-, type- or
// instantiation-dependent and contains no unexpanded parameter pack.
if (!Condition->isValueDependent() && !Condition->isTypeDependent() &&
!Condition->isInstantiationDependent() &&
!Condition->containsUnexpandedParameterPack()) {
ExprResult Val = CheckBooleanCondition(StartLoc, Condition);
if (Val.isInvalid())
return nullptr;
ValExpr = Val.get();
// Look up the capture region for this directive / name-modifier pair.
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
CaptureRegion = getOpenMPCaptureRegionForClause(
DKind, OMPC_if, LangOpts.OpenMP, NameModifier);
// With a known capture region outside a dependent context, wrap the
// condition as a full expression, run it through tryBuildCapture and turn the
// collected Captures into the helper statement via buildPreInits.
if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
return new (Context)
OMPIfClause(NameModifier, ValExpr, HelperValStmt, CaptureRegion, StartLoc,
LParenLoc, NameModifierLoc, ColonLoc, EndLoc);
/// Semantic action for the OpenMP 'final' clause.
///
/// \param Condition The clause condition expression.
/// \param StartLoc, LParenLoc, EndLoc Source locations stored in the clause.
/// \returns A newly allocated OMPFinalClause, or nullptr when
/// CheckBooleanCondition reports the condition as invalid.
OMPClause *Sema::ActOnOpenMPFinalClause(Expr *Condition,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = Condition;
Stmt *HelperValStmt = nullptr;
OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
// The condition is only checked and captured when it is not value-, type- or
// instantiation-dependent and contains no unexpanded parameter pack.
if (!Condition->isValueDependent() && !Condition->isTypeDependent() &&
!Condition->isInstantiationDependent() &&
!Condition->containsUnexpandedParameterPack()) {
ExprResult Val = CheckBooleanCondition(StartLoc, Condition);
if (Val.isInvalid())
return nullptr;
// Unlike the 'if' clause handler, the checked condition is wrapped as a full
// expression right away.
ValExpr = MakeFullExpr(Val.get()).get();
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
CaptureRegion =
getOpenMPCaptureRegionForClause(DKind, OMPC_final, LangOpts.OpenMP);
if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
// NOTE(review): ValExpr already went through MakeFullExpr above, so this
// second wrap looks redundant — confirm before touching it.
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
return new (Context) OMPFinalClause(ValExpr, HelperValStmt, CaptureRegion,
StartLoc, LParenLoc, EndLoc);
/// Converts \p Op through PerformContextualImplicitConversion using a local
/// diagnoser that reports OpenMP-specific messages.
///
/// \param Loc Location passed to the conversion and used for diagnostics.
/// \param Op Expression to convert; a null pointer yields ExprError().
/// \returns The result of PerformContextualImplicitConversion.
///
/// NOTE(review): the access specifier and constructor header of the local
/// class are not visible in this copy — confirm against the upstream file.
ExprResult Sema::PerformOpenMPImplicitIntegerConversion(SourceLocation Loc,
Expr *Op) {
if (!Op)
return ExprError();
// Local diagnoser: each override maps one conversion outcome to an
// err_omp_* / note_omp_* diagnostic.
class IntConvertDiagnoser : public ICEConvertDiagnoser {
: ICEConvertDiagnoser(/*AllowScopedEnumerations*/ false, false, true) {}
// Reported with err_omp_not_integral.
SemaDiagnosticBuilder diagnoseNotInt(Sema &S, SourceLocation Loc,
QualType T) override {
return S.Diag(Loc, diag::err_omp_not_integral) << T;
// Reported with err_omp_incomplete_type.
SemaDiagnosticBuilder diagnoseIncomplete(Sema &S, SourceLocation Loc,
QualType T) override {
return S.Diag(Loc, diag::err_omp_incomplete_type) << T;
// Reported with err_omp_explicit_conversion, naming both types.
SemaDiagnosticBuilder diagnoseExplicitConv(Sema &S, SourceLocation Loc,
QualType T,
QualType ConvTy) override {
return S.Diag(Loc, diag::err_omp_explicit_conversion) << T << ConvTy;
// Note attached at the location of the conversion function.
SemaDiagnosticBuilder noteExplicitConv(Sema &S, CXXConversionDecl *Conv,
QualType ConvTy) override {
return S.Diag(Conv->getLocation(), diag::note_omp_conversion_here)
<< ConvTy->isEnumeralType() << ConvTy;
// Reported with err_omp_ambiguous_conversion.
SemaDiagnosticBuilder diagnoseAmbiguous(Sema &S, SourceLocation Loc,
QualType T) override {
return S.Diag(Loc, diag::err_omp_ambiguous_conversion) << T;
// Note attached at the location of each candidate conversion function.
SemaDiagnosticBuilder noteAmbiguous(Sema &S, CXXConversionDecl *Conv,
QualType ConvTy) override {
return S.Diag(Conv->getLocation(), diag::note_omp_conversion_here)
<< ConvTy->isEnumeralType() << ConvTy;
// This override is not expected to be called; it is marked unreachable.
SemaDiagnosticBuilder diagnoseConversion(Sema &, SourceLocation, QualType,
QualType) override {
llvm_unreachable("conversion functions are permitted");
} ConvertDiagnoser;
return PerformContextualImplicitConversion(Loc, Op, ConvertDiagnoser);
/// Checks that \p ValExpr converts to an integer and, when it is an integer
/// constant expression with a signed value, that the value is non-negative
/// (or strictly positive if \p StrictlyPositive is set).
///
/// \param ValExpr In/out: replaced by the converted expression and, when a
/// capture is built, by the result of tryBuildCapture.
/// \param SemaRef Sema instance used for conversion, diagnostics and captures.
/// \param CKind Clause kind named in the diagnostic and used for the
/// capture-region lookup.
/// \param StrictlyPositive Selects the "strictly positive" form of the check.
/// \param BuildCapture When true, \p CaptureRegion and \p HelperValStmt are
/// written through, so callers must pass non-null pointers in that case.
/// \param DKind Directive kind used for the capture-region lookup.
/// \param CaptureRegion Out: capture region computed for \p DKind / \p CKind.
/// \param HelperValStmt Out: statement produced by buildPreInits.
/// \returns false when the integer conversion fails or the negative-value
/// diagnostic is emitted, true otherwise.
static bool
isNonNegativeIntegerValue(Expr *&ValExpr, Sema &SemaRef, OpenMPClauseKind CKind,
bool StrictlyPositive, bool BuildCapture = false,
OpenMPDirectiveKind DKind = OMPD_unknown,
OpenMPDirectiveKind *CaptureRegion = nullptr,
Stmt **HelperValStmt = nullptr) {
// Checks are only performed on expressions that are not dependent.
if (!ValExpr->isTypeDependent() && !ValExpr->isValueDependent() &&
!ValExpr->isInstantiationDependent()) {
SourceLocation Loc = ValExpr->getExprLoc();
ExprResult Value =
SemaRef.PerformOpenMPImplicitIntegerConversion(Loc, ValExpr);
if (Value.isInvalid())
return false;
ValExpr = Value.get();
// The expression must evaluate to a non-negative integer value.
// Only signed constant results are range-checked; non-constant expressions
// pass through without a value check.
llvm::APSInt Result;
if (ValExpr->isIntegerConstantExpr(Result, SemaRef.Context) &&
Result.isSigned() &&
!((!StrictlyPositive && Result.isNonNegative()) ||
(StrictlyPositive && Result.isStrictlyPositive()))) {
SemaRef.Diag(Loc, diag::err_omp_negative_expression_in_clause)
<< getOpenMPClauseName(CKind) << (StrictlyPositive ? 1 : 0)
<< ValExpr->getSourceRange();
return false;
if (!BuildCapture)
return true;
// From here on the out-parameters are dereferenced unconditionally.
*CaptureRegion =
getOpenMPCaptureRegionForClause(DKind, CKind, SemaRef.LangOpts.OpenMP);
if (*CaptureRegion != OMPD_unknown &&
!SemaRef.CurContext->isDependentContext()) {
ValExpr = SemaRef.MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get();
*HelperValStmt = buildPreInits(SemaRef.Context, Captures);
return true;
/// Semantic action for the OpenMP 'num_threads' clause.
///
/// \param NumThreads The clause argument expression.
/// \param StartLoc, LParenLoc, EndLoc Source locations stored in the clause.
/// \returns A newly allocated OMPNumThreadsClause, or nullptr when
/// isNonNegativeIntegerValue rejects the expression.
OMPClause *Sema::ActOnOpenMPNumThreadsClause(Expr *NumThreads,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = NumThreads;
Stmt *HelperValStmt = nullptr;
// OpenMP [2.5, Restrictions]
// The num_threads expression must evaluate to a positive integer value.
// NOTE(review): the trailing argument(s) of the call below are not visible in
// this copy — confirm against the upstream file.
if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_num_threads,
return nullptr;
// The capture is built here rather than inside isNonNegativeIntegerValue.
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
OpenMPDirectiveKind CaptureRegion =
getOpenMPCaptureRegionForClause(DKind, OMPC_num_threads, LangOpts.OpenMP);
if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
return new (Context) OMPNumThreadsClause(
ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc);
/// Verifies that \p E is an integer constant expression whose value is
/// non-negative, or strictly positive when \p StrictlyPositive is set.
///
/// \param E Expression to verify; a null pointer yields ExprError().
/// \param CKind Clause kind named in the diagnostic; OMPC_aligned,
/// OMPC_collapse and OMPC_ordered get extra handling below.
/// \param StrictlyPositive Selects the "strictly positive" form of the check.
/// \returns \p E unchanged when it is dependent or holds an unexpanded
/// parameter pack, the verified expression on success, ExprError() otherwise.
ExprResult Sema::VerifyPositiveIntegerConstantInClause(Expr *E,
OpenMPClauseKind CKind,
bool StrictlyPositive) {
if (!E)
return ExprError();
// Dependent expressions are passed through without any check.
if (E->isValueDependent() || E->isTypeDependent() ||
E->isInstantiationDependent() || E->containsUnexpandedParameterPack())
return E;
llvm::APSInt Result;
ExprResult ICE = VerifyIntegerConstantExpression(E, &Result);
if (ICE.isInvalid())
return ExprError();
if ((StrictlyPositive && !Result.isStrictlyPositive()) ||
(!StrictlyPositive && !Result.isNonNegative())) {
Diag(E->getExprLoc(), diag::err_omp_negative_expression_in_clause)
<< getOpenMPClauseName(CKind) << (StrictlyPositive ? 1 : 0)
<< E->getSourceRange();
return ExprError();
// For 'aligned', a value that is not a power of two is diagnosed and the
// expression is rejected, even though the diagnostic id is named warn_*.
if (CKind == OMPC_aligned && !Result.isPowerOf2()) {
Diag(E->getExprLoc(), diag::warn_omp_alignment_not_power_of_two)
<< E->getSourceRange();
return ExprError();
// NOTE(review): the statements guarded by the two conditions below are not
// visible in this copy — confirm against the upstream file.
if (CKind == OMPC_collapse && DSAStack->getAssociatedLoops() == 1)
else if (CKind == OMPC_ordered)
return ICE;
/// Semantic action for the OpenMP 'safelen' clause.
///
/// \param Len The clause argument expression.
/// \param StartLoc, LParenLoc, EndLoc Source locations stored in the clause.
/// \returns A newly allocated OMPSafelenClause, or nullptr when
/// VerifyPositiveIntegerConstantInClause rejects \p Len.
OMPClause *Sema::ActOnOpenMPSafelenClause(Expr *Len, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
// OpenMP [2.8.1, simd construct, Description]
// The parameter of the safelen clause must be a constant
// positive integer expression.
ExprResult Safelen = VerifyPositiveIntegerConstantInClause(Len, OMPC_safelen);
if (Safelen.isInvalid())
return nullptr;
return new (Context)
OMPSafelenClause(Safelen.get(), StartLoc, LParenLoc, EndLoc);
/// Semantic action for the OpenMP 'simdlen' clause.
///
/// \param Len The clause argument expression.
/// \param StartLoc, LParenLoc, EndLoc Source locations stored in the clause.
/// \returns A newly allocated OMPSimdlenClause, or nullptr when
/// VerifyPositiveIntegerConstantInClause rejects \p Len.
OMPClause *Sema::ActOnOpenMPSimdlenClause(Expr *Len, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
// OpenMP [2.8.1, simd construct, Description]
// The parameter of the simdlen clause must be a constant
// positive integer expression.
ExprResult Simdlen = VerifyPositiveIntegerConstantInClause(Len, OMPC_simdlen);
if (Simdlen.isInvalid())
return nullptr;
return new (Context)
OMPSimdlenClause(Simdlen.get(), StartLoc, LParenLoc, EndLoc);
/// Tries to find omp_allocator_handle_t type.
///
/// Returns true immediately when \p Stack already holds a non-null
/// allocator-handle type. Otherwise every allocator kind below
/// OMPAllocateDeclAttr::OMPUserDefinedMemAlloc is looked up by name in the
/// translation-unit scope, a reference expression is built for it, and it is
/// registered on \p Stack through setAllocator.
///
/// \param S Sema instance used for lookup, expression building, diagnostics.
/// \param Loc Location used for the lookups and for the diagnostic.
/// \param Stack Data-sharing stack that caches the type and the allocators.
/// \returns false, after emitting err_omp_implied_type_not_found, when
/// ErrorFound was set; true otherwise.
///
/// NOTE(review): the initialisers of `Allocator` and `AllocatorType`, and the
/// statements that follow each `ErrorFound = true`, are not visible in this
/// copy — confirm against the upstream file.
static bool findOMPAllocatorHandleT(Sema &S, SourceLocation Loc,
DSAStackTy *Stack) {
QualType OMPAllocatorHandleT = Stack->getOMPAllocatorHandleT();
if (!OMPAllocatorHandleT.isNull())
return true;
// Build the predefined allocator expressions.
bool ErrorFound = false;
for (int I = 0; I < OMPAllocateDeclAttr::OMPUserDefinedMemAlloc; ++I) {
auto AllocatorKind = static_cast<OMPAllocateDeclAttr::AllocatorTypeTy>(I);
StringRef Allocator =
DeclarationName AllocatorName = &S.getASTContext().Idents.get(Allocator);
// The lookup accepts any kind of name; a result that is not a ValueDecl
// (or no result at all) counts as an error.
auto *VD = dyn_cast_or_null<ValueDecl>(
S.LookupSingleName(S.TUScope, AllocatorName, Loc, Sema::LookupAnyName));
if (!VD) {
ErrorFound = true;
QualType AllocatorType =
ExprResult Res = S.BuildDeclRefExpr(VD, AllocatorType, VK_LValue, Loc);
if (!Res.isUsable()) {
ErrorFound = true;
// The first allocator found fixes the handle type; all later ones must
// have the same type.
if (OMPAllocatorHandleT.isNull())
OMPAllocatorHandleT = AllocatorType;
if (!S.getASTContext().hasSameType(OMPAllocatorHandleT, AllocatorType)) {
ErrorFound = true;
Stack->setAllocator(AllocatorKind, Res.get());
if (ErrorFound) {
S.Diag(Loc, diag::err_omp_implied_type_not_found)
<< "omp_allocator_handle_t";
return false;
return true;
/// Semantic action for the OpenMP 'allocator' clause.
///
/// \param A The allocator expression; it is dereferenced unconditionally
/// (A->getExprLoc()), so it must not be null.
/// \param StartLoc, LParenLoc, EndLoc Source locations stored in the clause.
/// \returns A newly allocated OMPAllocatorClause, or nullptr when
/// findOMPAllocatorHandleT fails or one of the conversions is invalid.
OMPClause *Sema::ActOnOpenMPAllocatorClause(Expr *A, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
// OpenMP [2.11.3, allocate Directive, Description]
// allocator is an expression of omp_allocator_handle_t type.
if (!findOMPAllocatorHandleT(*this, A->getExprLoc(), DSAStack))
return nullptr;
ExprResult Allocator = DefaultLvalueConversion(A);
if (Allocator.isInvalid())
return nullptr;
// NOTE(review): the remaining arguments of PerformImplicitConversion are not
// visible in this copy — confirm against the upstream file.
Allocator = PerformImplicitConversion(Allocator.get(),
if (Allocator.isInvalid())
return nullptr;
return new (Context)
OMPAllocatorClause(Allocator.get(), StartLoc, LParenLoc, EndLoc);
/// Semantic action for the OpenMP 'collapse' clause.
///
/// \param NumForLoops The clause argument expression.
/// \param StartLoc, LParenLoc, EndLoc Source locations stored in the clause.
/// \returns A newly allocated OMPCollapseClause, or nullptr when
/// VerifyPositiveIntegerConstantInClause rejects \p NumForLoops.
OMPClause *Sema::ActOnOpenMPCollapseClause(Expr *NumForLoops,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
// OpenMP [2.7.1, loop construct, Description]
// OpenMP [2.8.1, simd construct, Description]
// OpenMP [2.9.6, distribute construct, Description]
// The parameter of the collapse clause must be a constant
// positive integer expression.
ExprResult NumForLoopsResult =
VerifyPositiveIntegerConstantInClause(NumForLoops, OMPC_collapse);
if (NumForLoopsResult.isInvalid())
return nullptr;
return new (Context)
OMPCollapseClause(NumForLoopsResult.get(), StartLoc, LParenLoc, EndLoc);
/// Semantic action for the OpenMP 'ordered' clause, with or without a
/// loop-count argument.
///
/// \param StartLoc, EndLoc, LParenLoc Source locations stored in the clause.
/// \param NumForLoops Optional loop-count expression. It is only used when it
/// is non-null and \p LParenLoc is valid; otherwise it is reset to nullptr.
/// \returns The clause created by OMPOrderedClause::Create, or nullptr when
/// VerifyPositiveIntegerConstantInClause rejects \p NumForLoops.
OMPClause *Sema::ActOnOpenMPOrderedClause(SourceLocation StartLoc,
SourceLocation EndLoc,
SourceLocation LParenLoc,
Expr *NumForLoops) {
// OpenMP [2.7.1, loop construct, Description]
// OpenMP [2.8.1, simd construct, Description]
// OpenMP [2.9.6, distribute construct, Description]
// The parameter of the ordered clause must be a constant
// positive integer expression if any.
if (NumForLoops && LParenLoc.isValid()) {
ExprResult NumForLoopsResult =
VerifyPositiveIntegerConstantInClause(NumForLoops, OMPC_ordered);
if (NumForLoopsResult.isInvalid())
return nullptr;
NumForLoops = NumForLoopsResult.get();
} else {
NumForLoops = nullptr;
// The loop count handed to Create comes from the data-sharing stack when an
// argument is present and is 0 otherwise.
auto *Clause = OMPOrderedClause::Create(
Context, NumForLoops, NumForLoops ? DSAStack->getAssociatedLoops() : 0,
StartLoc, LParenLoc, EndLoc);
// Record on the data-sharing stack that the current region is ordered.
DSAStack->setOrderedRegion(/*IsOrdered=*/true, NumForLoops, Clause);
return Clause;
/// Dispatches a clause that takes one enumerated argument to its dedicated
/// handler.
///
/// Handled kinds: default, proc_bind, atomic_default_mem_order, order and
/// update. \p Argument is cast to the enumeration each handler expects. Every
/// other listed clause kind ends in llvm_unreachable.
///
/// \param Kind Clause kind selecting the handler.
/// \param Argument Raw value of the clause argument.
/// \param ArgumentLoc Location of the argument.
/// \param StartLoc, LParenLoc, EndLoc Source locations forwarded to the
/// handler.
/// \returns The clause produced by the selected handler (may be nullptr).
///
/// NOTE(review): no `break` is visible after the handled cases and the first
/// argument of the atomic_default_mem_order call is missing in this copy —
/// confirm against the upstream file.
OMPClause *Sema::ActOnOpenMPSimpleClause(
OpenMPClauseKind Kind, unsigned Argument, SourceLocation ArgumentLoc,
SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) {
OMPClause *Res = nullptr;
switch (Kind) {
case OMPC_default:
Res = ActOnOpenMPDefaultClause(static_cast<DefaultKind>(Argument),
ArgumentLoc, StartLoc, LParenLoc, EndLoc);
case OMPC_proc_bind:
Res = ActOnOpenMPProcBindClause(static_cast<ProcBindKind>(Argument),
ArgumentLoc, StartLoc, LParenLoc, EndLoc);
case OMPC_atomic_default_mem_order:
Res = ActOnOpenMPAtomicDefaultMemOrderClause(
ArgumentLoc, StartLoc, LParenLoc, EndLoc);
case OMPC_order:
Res = ActOnOpenMPOrderClause(static_cast<OpenMPOrderClauseKind>(Argument),
ArgumentLoc, StartLoc, LParenLoc, EndLoc);
case OMPC_update:
Res = ActOnOpenMPUpdateClause(static_cast<OpenMPDependClauseKind>(Argument),
ArgumentLoc, StartLoc, LParenLoc, EndLoc);
// The remaining clause kinds are listed explicitly and treated as
// unreachable for this entry point.
case OMPC_if:
case OMPC_final:
case OMPC_num_threads:
case OMPC_safelen:
case OMPC_simdlen:
case OMPC_allocator:
case OMPC_collapse:
case OMPC_schedule:
case OMPC_private:
case OMPC_firstprivate:
case OMPC_lastprivate:
case OMPC_shared:
case OMPC_reduction:
case OMPC_task_reduction:
case OMPC_in_reduction:
case OMPC_linear:
case OMPC_aligned:
case OMPC_copyin:
case OMPC_copyprivate:
case OMPC_ordered:
case OMPC_nowait:
case OMPC_untied:
case OMPC_mergeable:
case OMPC_threadprivate:
case OMPC_allocate:
case OMPC_flush:
case OMPC_depobj:
case OMPC_read:
case OMPC_write:
case OMPC_capture:
case OMPC_seq_cst:
case OMPC_acq_rel:
case OMPC_acquire:
case OMPC_release:
case OMPC_relaxed:
case OMPC_depend:
case OMPC_device:
case OMPC_threads:
case OMPC_simd:
case OMPC_map:
case OMPC_num_teams:
case OMPC_thread_limit:
case OMPC_priority:
case OMPC_grainsize:
case OMPC_nogroup:
case OMPC_num_tasks:
case OMPC_hint:
case OMPC_dist_schedule:
case OMPC_defaultmap:
case OMPC_unknown:
case OMPC_uniform:
case OMPC_to:
case OMPC_from:
case OMPC_use_device_ptr:
case OMPC_use_device_addr:
case OMPC_is_device_ptr:
case OMPC_unified_address:
case OMPC_unified_shared_memory:
case OMPC_reverse_offload:
case OMPC_dynamic_allocators:
case OMPC_device_type:
case OMPC_match:
case OMPC_nontemporal:
case OMPC_destroy:
case OMPC_detach:
case OMPC_inclusive:
case OMPC_exclusive:
case OMPC_uses_allocators:
case OMPC_affinity:
llvm_unreachable("Clause is not allowed.");
return Res;
/// Builds a quoted, human-readable list of the value names of clause \p K for
/// the indices in [\p First, \p Last), as used in the
/// err_omp_unexpected_clause_value diagnostics of this file.
///
/// Names come from getOpenMPSimpleClauseTypeName and are wrapped in single
/// quotes. Entries are separated by ", ", with " or " before the last one.
/// The separator choice is offset by the number of excluded values still
/// counted in `Skipped`.
///
/// \param K Clause kind whose value names are listed.
/// \param First First index (inclusive).
/// \param Last Last index (exclusive).
/// \param Exclude Indices that take part in the exclusion test below.
/// \returns The assembled list as a std::string.
///
/// NOTE(review): the body of the `std::find` branch is not visible in this
/// copy, so how excluded indices are skipped cannot be confirmed from here.
static std::string
getListOfPossibleValues(OpenMPClauseKind K, unsigned First, unsigned Last,
ArrayRef<unsigned> Exclude = llvm::None) {
SmallString<256> Buffer;
llvm::raw_svector_ostream Out(Buffer);
unsigned Skipped = Exclude.size();
auto S = Exclude.begin(), E = Exclude.end();
for (unsigned I = First; I < Last; ++I) {
if (std::find(S, E, I) != E) {
Out << "'" << getOpenMPSimpleClauseTypeName(K, I) << "'";
// " or " goes before the final entry, ", " between all earlier ones.
if (I + Skipped + 2 == Last)
Out << " or ";
else if (I + Skipped + 1 != Last)
Out << ", ";
return std::string(Out.str());
/// Semantic action for the OpenMP 'default' clause.
///
/// \param Kind Parsed default kind; OMP_DEFAULT_unknown is diagnosed with
/// err_omp_unexpected_clause_value and yields nullptr.
/// \param KindKwLoc Location of the kind keyword, used for the diagnostic.
/// \param StartLoc, LParenLoc, EndLoc Source locations stored in the clause.
/// \returns A newly allocated OMPDefaultClause, or nullptr on error.
///
/// NOTE(review): the /*Last=*/ argument of getListOfPossibleValues and the
/// bodies of the three switch cases are not visible in this copy — confirm
/// against the upstream file.
OMPClause *Sema::ActOnOpenMPDefaultClause(DefaultKind Kind,
SourceLocation KindKwLoc,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (Kind == OMP_DEFAULT_unknown) {
Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_default, /*First=*/0,
<< getOpenMPClauseName(OMPC_default);
return nullptr;
// Per-kind handling for none / shared / firstprivate.
switch (Kind) {
case OMP_DEFAULT_none:
case OMP_DEFAULT_shared:
case OMP_DEFAULT_firstprivate:
llvm_unreachable("DSA unexpected in OpenMP default clause");
return new (Context)
OMPDefaultClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc);
/// Semantic action for the OpenMP 'proc_bind' clause.
///
/// \param Kind Parsed proc_bind kind; OMP_PROC_BIND_unknown is diagnosed with
/// err_omp_unexpected_clause_value and yields nullptr.
/// \param KindKwLoc Location of the kind keyword, used for the diagnostic.
/// \param StartLoc, LParenLoc, EndLoc Source locations stored in the clause.
/// \returns A newly allocated OMPProcBindClause, or nullptr on error.
///
/// NOTE(review): the range arguments of getListOfPossibleValues are not
/// visible in this copy — confirm against the upstream file.
OMPClause *Sema::ActOnOpenMPProcBindClause(ProcBindKind Kind,
SourceLocation KindKwLoc,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (Kind == OMP_PROC_BIND_unknown) {
Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_proc_bind,
<< getOpenMPClauseName(OMPC_proc_bind);
return nullptr;
return new (Context)
OMPProcBindClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc);
/// Semantic action for the OpenMP 'atomic_default_mem_order' clause.
///
/// \param Kind Parsed memory-order kind;
/// OMPC_ATOMIC_DEFAULT_MEM_ORDER_unknown is diagnosed with
/// err_omp_unexpected_clause_value and yields nullptr.
/// \param KindKwLoc Location of the kind keyword, used for the diagnostic.
/// \param StartLoc, LParenLoc, EndLoc Source locations stored in the clause.
/// \returns A newly allocated OMPAtomicDefaultMemOrderClause, or nullptr on
/// error.
///
/// NOTE(review): the /*Last=*/ argument of getListOfPossibleValues is not
/// visible in this copy — confirm against the upstream file.
OMPClause *Sema::ActOnOpenMPAtomicDefaultMemOrderClause(
OpenMPAtomicDefaultMemOrderClauseKind Kind, SourceLocation KindKwLoc,
SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) {
if (Kind == OMPC_ATOMIC_DEFAULT_MEM_ORDER_unknown) {
Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(
OMPC_atomic_default_mem_order, /*First=*/0,
<< getOpenMPClauseName(OMPC_atomic_default_mem_order);
return nullptr;
return new (Context) OMPAtomicDefaultMemOrderClause(Kind, KindKwLoc, StartLoc,
LParenLoc, EndLoc);
/// Semantic action for the OpenMP 'order' clause.
///
/// \param Kind Parsed order kind; OMPC_ORDER_unknown is diagnosed with
/// err_omp_unexpected_clause_value and yields nullptr.
/// \param KindKwLoc Location of the kind keyword, used for the diagnostic.
/// \param StartLoc, LParenLoc, EndLoc Source locations stored in the clause.
/// \returns A newly allocated OMPOrderClause, or nullptr on error.
///
/// NOTE(review): the /*Last=*/ argument of getListOfPossibleValues is not
/// visible in this copy — confirm against the upstream file.
OMPClause *Sema::ActOnOpenMPOrderClause(OpenMPOrderClauseKind Kind,
SourceLocation KindKwLoc,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (Kind == OMPC_ORDER_unknown) {
// Compile-time guard: the 'unknown' enumerator must be greater than 0.
static_assert(OMPC_ORDER_unknown > 0,
"OMPC_ORDER_unknown not greater than 0");
Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_order, /*First=*/0,
<< getOpenMPClauseName(OMPC_order);
return nullptr;
return new (Context)
OMPOrderClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc);
/// Semantic action for the OpenMP 'update' clause that carries a dependence
/// kind argument.
///
/// \param Kind Parsed dependence kind. unknown, source, sink and depobj are
/// rejected with err_omp_unexpected_clause_value.
/// \param KindKwLoc Location of the kind keyword, used for the diagnostic.
/// \param StartLoc, LParenLoc, EndLoc Source locations for the clause.
/// \returns The clause built by OMPUpdateClause::Create, or nullptr on error.
///
/// NOTE(review): the tail of the `Except` initialiser and the last argument(s)
/// of OMPUpdateClause::Create are not visible in this copy — confirm against
/// the upstream file.
OMPClause *Sema::ActOnOpenMPUpdateClause(OpenMPDependClauseKind Kind,
SourceLocation KindKwLoc,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (Kind == OMPC_DEPEND_unknown || Kind == OMPC_DEPEND_source ||
Kind == OMPC_DEPEND_sink || Kind == OMPC_DEPEND_depobj) {
// The list of accepted values is taken from the 'depend' clause, minus the
// kinds in Except, but the diagnostic names the 'update' clause.
unsigned Except[] = {OMPC_DEPEND_source, OMPC_DEPEND_sink,
Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_depend, /*First=*/0,
/*Last=*/OMPC_DEPEND_unknown, Except)
<< getOpenMPClauseName(OMPC_update);
return nullptr;
return OMPUpdateClause::Create(Context, StartLoc, LParenLoc, KindKwLoc, Kind,
/// Dispatches a clause that takes enumerated argument(s) plus one expression
/// to its dedicated handler.
///
/// Handled kinds: schedule, if, dist_schedule, defaultmap and device. Every
/// other listed clause kind ends in llvm_unreachable.
///
/// \param Kind Clause kind selecting the handler.
/// \param Argument Raw enumerated arguments. schedule expects
/// NumberOfElements entries; if and device expect exactly one (asserted).
/// \param Expr The expression argument of the clause.
/// \param StartLoc, LParenLoc Source locations forwarded to the handler.
/// \param ArgumentLoc Locations matching the entries of \p Argument.
/// \param DelimLoc Location of the delimiter, forwarded to the schedule, if
/// and dist_schedule handlers.
/// \param EndLoc End location forwarded to the handler.
/// \returns The clause produced by the selected handler (may be nullptr).
///
/// NOTE(review): no `break` is visible after the handled cases, and the
/// schedule and defaultmap calls look incomplete in this copy — confirm
/// against the upstream file.
OMPClause *Sema::ActOnOpenMPSingleExprWithArgClause(
OpenMPClauseKind Kind, ArrayRef<unsigned> Argument, Expr *Expr,
SourceLocation StartLoc, SourceLocation LParenLoc,
ArrayRef<SourceLocation> ArgumentLoc, SourceLocation DelimLoc,
SourceLocation EndLoc) {
OMPClause *Res = nullptr;
switch (Kind) {
case OMPC_schedule:
// Index names for the entries of Argument / ArgumentLoc.
enum { Modifier1, Modifier2, ScheduleKind, NumberOfElements };
assert(Argument.size() == NumberOfElements &&
ArgumentLoc.size() == NumberOfElements);
Res = ActOnOpenMPScheduleClause(
static_cast<OpenMPScheduleClauseKind>(Argument[ScheduleKind]), Expr,
StartLoc, LParenLoc, ArgumentLoc[Modifier1], ArgumentLoc[Modifier2],
ArgumentLoc[ScheduleKind], DelimLoc, EndLoc);
case OMPC_if:
// The single argument is the directive-name modifier.
assert(Argument.size() == 1 && ArgumentLoc.size() == 1);
Res = ActOnOpenMPIfClause(static_cast<OpenMPDirectiveKind>(Argument.back()),
Expr, StartLoc, LParenLoc, ArgumentLoc.back(),
DelimLoc, EndLoc);
case OMPC_dist_schedule:
Res = ActOnOpenMPDistScheduleClause(
static_cast<OpenMPDistScheduleClauseKind>(Argument.back()), Expr,
StartLoc, LParenLoc, ArgumentLoc.back(), DelimLoc, EndLoc);
case OMPC_defaultmap:
// Index names for the entries of ArgumentLoc.
enum { Modifier, DefaultmapKind };
Res = ActOnOpenMPDefaultmapClause(
StartLoc, LParenLoc, ArgumentLoc[Modifier], ArgumentLoc[DefaultmapKind],
case OMPC_device:
assert(Argument.size() == 1 && ArgumentLoc.size() == 1);
Res = ActOnOpenMPDeviceClause(
static_cast<OpenMPDeviceClauseModifier>(Argument.back()), Expr,
StartLoc, LParenLoc, ArgumentLoc.back(), EndLoc);
// The remaining clause kinds are listed explicitly and treated as
// unreachable for this entry point.
case OMPC_final:
case OMPC_num_threads:
case OMPC_safelen:
case OMPC_simdlen:
case OMPC_allocator:
case OMPC_collapse:
case OMPC_default:
case OMPC_proc_bind:
case OMPC_private:
case OMPC_firstprivate:
case OMPC_lastprivate:
case OMPC_shared:
case OMPC_reduction:
case OMPC_task_reduction:
case OMPC_in_reduction:
case OMPC_linear:
case OMPC_aligned:
case OMPC_copyin:
case OMPC_copyprivate:
case OMPC_ordered:
case OMPC_nowait:
case OMPC_untied:
case OMPC_mergeable:
case OMPC_threadprivate:
case OMPC_allocate:
case OMPC_flush:
case OMPC_depobj:
case OMPC_read:
case OMPC_write:
case OMPC_update:
case OMPC_capture:
case OMPC_seq_cst:
case OMPC_acq_rel:
case OMPC_acquire:
case OMPC_release:
case OMPC_relaxed:
case OMPC_depend:
case OMPC_threads:
case OMPC_simd:
case OMPC_map:
case OMPC_num_teams:
case OMPC_thread_limit:
case OMPC_priority:
case OMPC_grainsize:
case OMPC_nogroup:
case OMPC_num_tasks:
case OMPC_hint:
case OMPC_unknown:
case OMPC_uniform:
case OMPC_to:
case OMPC_from:
case OMPC_use_device_ptr:
case OMPC_use_device_addr:
case OMPC_is_device_ptr:
case OMPC_unified_address:
case OMPC_unified_shared_memory:
case OMPC_reverse_offload:
case OMPC_dynamic_allocators:
case OMPC_atomic_default_mem_order:
case OMPC_device_type:
case OMPC_match:
case OMPC_nontemporal:
case OMPC_order:
case OMPC_destroy:
case OMPC_detach:
case OMPC_inclusive:
case OMPC_exclusive:
case OMPC_uses_allocators:
case OMPC_affinity:
llvm_unreachable("Clause is not allowed.");
return Res;
/// Diagnoses an unrecognised 'schedule' clause modifier.
///
/// \param S Sema instance used for diagnostics.
/// \param M1 Modifier under test.
/// \param M2 The other modifier; it only influences which values are
/// excluded from the suggestion list.
/// \param M1Loc Location of \p M1; the check only fires when it is valid.
/// \param M2Loc Location of \p M2 (not used in the visible code).
/// \returns true when err_omp_unexpected_clause_value was emitted for \p M1,
/// false otherwise.
///
/// NOTE(review): the statements guarded by the two `if (M2 == ...)` tests and
/// the trailing arguments of getListOfPossibleValues are not visible in this
/// copy — confirm against the upstream file.
static bool checkScheduleModifiers(Sema &S, OpenMPScheduleClauseModifier M1,
OpenMPScheduleClauseModifier M2,
SourceLocation M1Loc, SourceLocation M2Loc) {
// A modifier counts as "written but unknown" when its kind is unknown while
// its source location is valid.
if (M1 == OMPC_SCHEDULE_MODIFIER_unknown && M1Loc.isValid()) {
SmallVector<unsigned, 2> Excluded;
if (M2 == OMPC_SCHEDULE_MODIFIER_nonmonotonic)
if (M2 == OMPC_SCHEDULE_MODIFIER_monotonic)
S.Diag(M1Loc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_schedule,
/*First=*/OMPC_SCHEDULE_MODIFIER_unknown + 1,
<< getOpenMPClauseName(OMPC_schedule);
return true;
return false;
/// Semantic action for the OpenMP 'schedule' clause.
///
/// Validates both modifiers, rejects conflicting modifier combinations and an
/// unknown schedule kind, and, for OpenMP versions below 5.0, applies the
/// nonmonotonic restriction. If a chunk size is present it is converted to
/// an integer, range-checked when constant, and otherwise captured when a
/// capture region applies.
///
/// \param M1, M2 The two schedule modifiers.
/// \param Kind The schedule kind.
/// \param ChunkSize Optional chunk-size expression; may be null.
/// \param StartLoc, LParenLoc, M1Loc, M2Loc, KindLoc, CommaLoc, EndLoc Source
/// locations used for diagnostics and stored in the clause.
/// \returns A newly allocated OMPScheduleClause, or nullptr after any
/// diagnostic.
///
/// NOTE(review): several calls in this function (both getListOfPossibleValues
/// calls and the nonmonotonic Diag) look incomplete in this copy — confirm
/// against the upstream file.
OMPClause *Sema::ActOnOpenMPScheduleClause(
OpenMPScheduleClauseModifier M1, OpenMPScheduleClauseModifier M2,
OpenMPScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc,
SourceLocation LParenLoc, SourceLocation M1Loc, SourceLocation M2Loc,
SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc) {
// Each modifier is validated in turn, with the other one passed as context.
if (checkScheduleModifiers(*this, M1, M2, M1Loc, M2Loc) ||
checkScheduleModifiers(*this, M2, M1, M2Loc, M1Loc))
return nullptr;
// OpenMP, 2.7.1, Loop Construct, Restrictions
// Either the monotonic modifier or the nonmonotonic modifier can be specified
// but not both.
// A repeated (non-unknown) modifier is rejected by the same test.
if ((M1 == M2 && M1 != OMPC_SCHEDULE_MODIFIER_unknown) ||
(M1 == OMPC_SCHEDULE_MODIFIER_monotonic &&
M2 == OMPC_SCHEDULE_MODIFIER_nonmonotonic) ||
(M1 == OMPC_SCHEDULE_MODIFIER_nonmonotonic &&
M2 == OMPC_SCHEDULE_MODIFIER_monotonic)) {
Diag(M2Loc, diag::err_omp_unexpected_schedule_modifier)
<< getOpenMPSimpleClauseTypeName(OMPC_schedule, M2)
<< getOpenMPSimpleClauseTypeName(OMPC_schedule, M1);
return nullptr;
// Unknown schedule kind: the list of suggested values depends on whether
// neither modifier location is valid.
if (Kind == OMPC_SCHEDULE_unknown) {
std::string Values;
if (M1Loc.isInvalid() && M2Loc.isInvalid()) {
unsigned Exclude[] = {OMPC_SCHEDULE_unknown};
Values = getListOfPossibleValues(OMPC_schedule, /*First=*/0,
} else {
Values = getListOfPossibleValues(OMPC_schedule, /*First=*/0,
Diag(KindLoc, diag::err_omp_unexpected_clause_value)
<< Values << getOpenMPClauseName(OMPC_schedule);
return nullptr;
// OpenMP, 2.7.1, Loop Construct, Restrictions
// The nonmonotonic modifier can only be specified with schedule(dynamic) or
// schedule(guided).
// OpenMP 5.0 does not have this restriction.
if (LangOpts.OpenMP < 50 &&
(M1 == OMPC_SCHEDULE_MODIFIER_nonmonotonic ||
M2 == OMPC_SCHEDULE_MODIFIER_nonmonotonic) &&
Kind != OMPC_SCHEDULE_dynamic && Kind != OMPC_SCHEDULE_guided) {
Diag(M1 == OMPC_SCHEDULE_MODIFIER_nonmonotonic ? M1Loc : M2Loc,
return nullptr;
Expr *ValExpr = ChunkSize;
Stmt *HelperValStmt = nullptr;
if (ChunkSize) {
// Dependent chunk sizes are stored as written, without checks.
if (!ChunkSize->isValueDependent() && !ChunkSize->isTypeDependent() &&
!ChunkSize->isInstantiationDependent() &&
!ChunkSize->containsUnexpandedParameterPack()) {
SourceLocation ChunkSizeLoc = ChunkSize->getBeginLoc();
ExprResult Val =
PerformOpenMPImplicitIntegerConversion(ChunkSizeLoc, ChunkSize);
if (Val.isInvalid())
return nullptr;
ValExpr = Val.get();
// OpenMP [2.7.1, Restrictions]
// chunk_size must be a loop invariant integer expression with a positive
// value.
// Constant chunk sizes are range-checked here. Non-constant ones are
// captured when the directive has a capture region for 'schedule'.
llvm::APSInt Result;
if (ValExpr->isIntegerConstantExpr(Result, Context)) {
if (Result.isSigned() && !Result.isStrictlyPositive()) {
Diag(ChunkSizeLoc, diag::err_omp_negative_expression_in_clause)
<< "schedule" << 1 << ChunkSize->getSourceRange();
return nullptr;
} else if (getOpenMPCaptureRegionForClause(
DSAStack->getCurrentDirective(), OMPC_schedule,
LangOpts.OpenMP) != OMPD_unknown &&
!CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
return new (Context)
OMPScheduleClause(StartLoc, LParenLoc, KindLoc, CommaLoc, EndLoc, Kind,
ValExpr, HelperValStmt, M1, M1Loc, M2, M2Loc);
/// Acts on an OpenMP clause that takes no arguments by forwarding to the
/// clause-specific ActOnOpenMP*Clause routine selected by \p Kind.
///
/// \param Kind Kind of the clause being processed.
/// \param StartLoc Location forwarded unchanged to the selected routine.
/// \param EndLoc Location forwarded unchanged to the selected routine.
/// \returns The clause stored in Res, which starts out as nullptr.
// NOTE(review): no `break` statements or closing braces are visible in this
// text, so as written every case would fall through to the llvm_unreachable
// below. Presumably they were lost when the text was extracted - confirm
// against the upstream file before relying on this listing.
OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind,
SourceLocation StartLoc,
SourceLocation EndLoc) {
OMPClause *Res = nullptr;
switch (Kind) {
// Clause kinds accepted here: each has a dedicated routine that receives only
// the two source locations.
case OMPC_ordered:
Res = ActOnOpenMPOrderedClause(StartLoc, EndLoc);
case OMPC_nowait:
Res = ActOnOpenMPNowaitClause(StartLoc, EndLoc);
case OMPC_untied:
Res = ActOnOpenMPUntiedClause(StartLoc, EndLoc);
case OMPC_mergeable:
Res = ActOnOpenMPMergeableClause(StartLoc, EndLoc);
case OMPC_read:
Res = ActOnOpenMPReadClause(StartLoc, EndLoc);
case OMPC_write:
Res = ActOnOpenMPWriteClause(StartLoc, EndLoc);
case OMPC_update:
Res = ActOnOpenMPUpdateClause(StartLoc, EndLoc);
case OMPC_capture:
Res = ActOnOpenMPCaptureClause(StartLoc, EndLoc);
case OMPC_seq_cst:
Res = ActOnOpenMPSeqCstClause(StartLoc, EndLoc);
case OMPC_acq_rel:
Res = ActOnOpenMPAcqRelClause(StartLoc, EndLoc);
case OMPC_acquire:
Res = ActOnOpenMPAcquireClause(StartLoc, EndLoc);
case OMPC_release:
Res = ActOnOpenMPReleaseClause(StartLoc, EndLoc);
case OMPC_relaxed:
Res = ActOnOpenMPRelaxedClause(StartLoc, EndLoc);
case OMPC_threads:
Res = ActOnOpenMPThreadsClause(StartLoc, EndLoc);
case OMPC_simd:
Res = ActOnOpenMPSIMDClause(StartLoc, EndLoc);
case OMPC_nogroup:
Res = ActOnOpenMPNogroupClause(StartLoc, EndLoc);
case OMPC_unified_address:
Res = ActOnOpenMPUnifiedAddressClause(StartLoc, EndLoc);
case OMPC_unified_shared_memory:
Res = ActOnOpenMPUnifiedSharedMemoryClause(StartLoc, EndLoc);
case OMPC_reverse_offload:
Res = ActOnOpenMPReverseOffloadClause(StartLoc, EndLoc);
case OMPC_dynamic_allocators:
Res = ActOnOpenMPDynamicAllocatorsClause(StartLoc, EndLoc);
case OMPC_destroy:
Res = ActOnOpenMPDestroyClause(StartLoc, EndLoc);
// All remaining clause kinds are listed explicitly (there is no `default`
// label) and lead to the llvm_unreachable at the end of the list.
case OMPC_if:
case OMPC_final:
case OMPC_num_threads:
case OMPC_safelen:
case OMPC_simdlen:
case OMPC_allocator:
case OMPC_collapse:
case OMPC_schedule:
case OMPC_private:
case OMPC_firstprivate:
case OMPC_lastprivate:
case OMPC_shared:
case OMPC_reduction:
case OMPC_task_reduction:
case OMPC_in_reduction:
case OMPC_linear:
case OMPC_aligned:
case OMPC_copyin:
case OMPC_copyprivate:
case OMPC_default:
case OMPC_proc_bind:
case OMPC_threadprivate:
case OMPC_allocate:
case OMPC_flush:
case OMPC_depobj:
case OMPC_depend:
case OMPC_device:
case OMPC_map:
case OMPC_num_teams:
case OMPC_thread_limit:
case OMPC_priority:
case OMPC_grainsize:
case OMPC_num_tasks:
case OMPC_hint:
case OMPC_dist_schedule:
case OMPC_defaultmap:
case OMPC_unknown:
case OMPC_uniform:
case OMPC_to:
case OMPC_from:
case OMPC_use_device_ptr:
case OMPC_use_device_addr:
case OMPC_is_device_ptr:
case OMPC_atomic_default_mem_order:
case OMPC_device_type:
case OMPC_match:
case OMPC_nontemporal:
case OMPC_order:
case OMPC_detach:
case OMPC_inclusive:
case OMPC_exclusive:
case OMPC_uses_allocators:
case OMPC_affinity:
llvm_unreachable("Clause is not allowed.");
return Res;
/// Builds the AST node for an OpenMP 'nowait' clause.
/// \returns A new OMPNowaitClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPNowaitClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPNowaitClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'untied' clause.
/// \returns A new OMPUntiedClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPUntiedClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPUntiedClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'mergeable' clause.
/// \returns A new OMPMergeableClause constructed with placement-new on
/// Context from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPMergeableClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPMergeableClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'read' clause.
/// \returns A new OMPReadClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPReadClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPReadClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'write' clause.
/// \returns A new OMPWriteClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPWriteClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPWriteClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'update' clause.
/// Unlike the neighbouring clause builders, this one goes through the
/// OMPUpdateClause::Create factory instead of placement-new.
/// \returns The result of OMPUpdateClause::Create for \p StartLoc and
/// \p EndLoc.
OMPClause *Sema::ActOnOpenMPUpdateClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return OMPUpdateClause::Create(Context, StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'capture' clause.
/// \returns A new OMPCaptureClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPCaptureClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPCaptureClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'seq_cst' clause.
/// \returns A new OMPSeqCstClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPSeqCstClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPSeqCstClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'acq_rel' clause.
/// \returns A new OMPAcqRelClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPAcqRelClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPAcqRelClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'acquire' clause.
/// \returns A new OMPAcquireClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPAcquireClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPAcquireClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'release' clause.
/// \returns A new OMPReleaseClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPReleaseClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPReleaseClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'relaxed' clause.
/// \returns A new OMPRelaxedClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPRelaxedClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPRelaxedClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'threads' clause.
/// \returns A new OMPThreadsClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPThreadsClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPThreadsClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'simd' clause.
/// \returns A new OMPSIMDClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPSIMDClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPSIMDClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'nogroup' clause.
/// \returns A new OMPNogroupClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPNogroupClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPNogroupClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'unified_address' clause.
/// \returns A new OMPUnifiedAddressClause constructed with placement-new on
/// Context from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPUnifiedAddressClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPUnifiedAddressClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'unified_shared_memory' clause.
/// \returns A new OMPUnifiedSharedMemoryClause constructed with placement-new
/// on Context from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPUnifiedSharedMemoryClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPUnifiedSharedMemoryClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'reverse_offload' clause.
/// \returns A new OMPReverseOffloadClause constructed with placement-new on
/// Context from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPReverseOffloadClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPReverseOffloadClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'dynamic_allocators' clause.
/// \returns A new OMPDynamicAllocatorsClause constructed with placement-new
/// on Context from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPDynamicAllocatorsClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPDynamicAllocatorsClause(StartLoc, EndLoc);
/// Builds the AST node for an OpenMP 'destroy' clause.
/// \returns A new OMPDestroyClause constructed with placement-new on Context
/// from \p StartLoc and \p EndLoc.
OMPClause *Sema::ActOnOpenMPDestroyClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPDestroyClause(StartLoc, EndLoc);
/// Acts on an OpenMP clause that carries a list of variables by forwarding to
/// the clause-specific ActOnOpenMP*Clause routine selected by \p Kind.
///
/// \param Kind Kind of the clause being processed.
/// \param VarList Expressions listed in the clause.
/// \param DepModOrTailExpr Extra expression forwarded to the linear, aligned,
/// depend, allocate and affinity routines.
/// \param Locs Supplies StartLoc, LParenLoc and EndLoc; passed as a whole to
/// the map, to, from, use_device_ptr, use_device_addr and is_device_ptr
/// routines.
/// \param ColonLoc Colon location forwarded to routines that accept one.
/// \param ReductionOrMapperIdScopeSpec Scope specifier forwarded to the
/// reduction-style and map/to/from routines.
/// \param ReductionOrMapperId Name info forwarded alongside the scope
/// specifier.
/// \param ExtraModifier Integer that is range-checked by an assert and then
/// cast to the clause-specific modifier enum (lastprivate, reduction, linear,
/// depend, map).
/// \param MapTypeModifiers Modifiers forwarded to the map routine.
/// \param MapTypeModifiersLoc Locations forwarded to the map routine.
/// \param IsMapTypeImplicit Flag forwarded to the map routine.
/// \param ExtraModifierLoc Location forwarded together with the cast
/// modifier.
/// \returns The clause stored in Res, which starts out as nullptr.
// NOTE(review): no `break` statements, closing braces, or the trailing
// argument lines of the task_reduction / in_reduction calls are visible in
// this text; presumably they were lost when the text was extracted - confirm
// against the upstream file.
OMPClause *Sema::ActOnOpenMPVarListClause(
OpenMPClauseKind Kind, ArrayRef<Expr *> VarList, Expr *DepModOrTailExpr,
const OMPVarListLocTy &Locs, SourceLocation ColonLoc,
CXXScopeSpec &ReductionOrMapperIdScopeSpec,
DeclarationNameInfo &ReductionOrMapperId, int ExtraModifier,
ArrayRef<OpenMPMapModifierKind> MapTypeModifiers,
ArrayRef<SourceLocation> MapTypeModifiersLoc, bool IsMapTypeImplicit,
SourceLocation ExtraModifierLoc) {
// Unpack the individual locations used by most of the routines below.
SourceLocation StartLoc = Locs.StartLoc;
SourceLocation LParenLoc = Locs.LParenLoc;
SourceLocation EndLoc = Locs.EndLoc;
OMPClause *Res = nullptr;
switch (Kind) {
case OMPC_private:
Res = ActOnOpenMPPrivateClause(VarList, StartLoc, LParenLoc, EndLoc);
case OMPC_firstprivate:
Res = ActOnOpenMPFirstprivateClause(VarList, StartLoc, LParenLoc, EndLoc);
case OMPC_lastprivate:
// ExtraModifier must lie in [0, OMPC_LASTPRIVATE_unknown] before the cast.
assert(0 <= ExtraModifier && ExtraModifier <= OMPC_LASTPRIVATE_unknown &&
"Unexpected lastprivate modifier.");
Res = ActOnOpenMPLastprivateClause(
VarList, static_cast<OpenMPLastprivateModifier>(ExtraModifier),
ExtraModifierLoc, ColonLoc, StartLoc, LParenLoc, EndLoc);
case OMPC_shared:
Res = ActOnOpenMPSharedClause(VarList, StartLoc, LParenLoc, EndLoc);
case OMPC_reduction:
// NOTE(review): the assert message below says "lastprivate" although this is
// the reduction case - looks like a copy-paste slip in the message text.
assert(0 <= ExtraModifier && ExtraModifier <= OMPC_REDUCTION_unknown &&
"Unexpected lastprivate modifier.");
Res = ActOnOpenMPReductionClause(
VarList, static_cast<OpenMPReductionClauseModifier>(ExtraModifier),
StartLoc, LParenLoc, ExtraModifierLoc, ColonLoc, EndLoc,
ReductionOrMapperIdScopeSpec, ReductionOrMapperId);
case OMPC_task_reduction:
Res = ActOnOpenMPTaskReductionClause(VarList, StartLoc, LParenLoc, ColonLoc,
EndLoc, ReductionOrMapperIdScopeSpec,
case OMPC_in_reduction:
Res = ActOnOpenMPInReductionClause(VarList, StartLoc, LParenLoc, ColonLoc,
EndLoc, ReductionOrMapperIdScopeSpec,
case OMPC_linear:
// ExtraModifier must lie in [0, OMPC_LINEAR_unknown] before the cast.
assert(0 <= ExtraModifier && ExtraModifier <= OMPC_LINEAR_unknown &&
"Unexpected linear modifier.");
Res = ActOnOpenMPLinearClause(
VarList, DepModOrTailExpr, StartLoc, LParenLoc,
static_cast<OpenMPLinearClauseKind>(ExtraModifier), ExtraModifierLoc,
ColonLoc, EndLoc);
case OMPC_aligned:
Res = ActOnOpenMPAlignedClause(VarList, DepModOrTailExpr, StartLoc,
LParenLoc, ColonLoc, EndLoc);
case OMPC_copyin:
Res = ActOnOpenMPCopyinClause(VarList, StartLoc, LParenLoc, EndLoc);
case OMPC_copyprivate:
Res = ActOnOpenMPCopyprivateClause(VarList, StartLoc, LParenLoc, EndLoc);
case OMPC_flush:
Res = ActOnOpenMPFlushClause(VarList, StartLoc, LParenLoc, EndLoc);
case OMPC_depend:
// ExtraModifier must lie in [0, OMPC_DEPEND_unknown] before the cast.
assert(0 <= ExtraModifier && ExtraModifier <= OMPC_DEPEND_unknown &&
"Unexpected depend modifier.");
Res = ActOnOpenMPDependClause(
DepModOrTailExpr, static_cast<OpenMPDependClauseKind>(ExtraModifier),
ExtraModifierLoc, ColonLoc, VarList, StartLoc, LParenLoc, EndLoc);
case OMPC_map:
// ExtraModifier must lie in [0, OMPC_MAP_unknown] before the cast.
assert(0 <= ExtraModifier && ExtraModifier <= OMPC_MAP_unknown &&
"Unexpected map modifier.");
Res = ActOnOpenMPMapClause(
MapTypeModifiers, MapTypeModifiersLoc, ReductionOrMapperIdScopeSpec,
ReductionOrMapperId, static_cast<OpenMPMapClauseKind>(ExtraModifier),
IsMapTypeImplicit, ExtraModifierLoc, ColonLoc, VarList, Locs);
case OMPC_to:
Res = ActOnOpenMPToClause(VarList, ReductionOrMapperIdScopeSpec,
ReductionOrMapperId, Locs);
case OMPC_from:
Res = ActOnOpenMPFromClause(VarList, ReductionOrMapperIdScopeSpec,
ReductionOrMapperId, Locs);
case OMPC_use_device_ptr:
Res = ActOnOpenMPUseDevicePtrClause(VarList, Locs);
case OMPC_use_device_addr:
Res = ActOnOpenMPUseDeviceAddrClause(VarList, Locs);
case OMPC_is_device_ptr:
Res = ActOnOpenMPIsDevicePtrClause(VarList, Locs);
case OMPC_allocate:
Res = ActOnOpenMPAllocateClause(DepModOrTailExpr, VarList, StartLoc,
LParenLoc, ColonLoc, EndLoc);
case OMPC_nontemporal:
Res = ActOnOpenMPNontemporalClause(VarList, StartLoc, LParenLoc, EndLoc);
case OMPC_inclusive:
Res = ActOnOpenMPInclusiveClause(VarList, StartLoc, LParenLoc, EndLoc);
case OMPC_exclusive:
Res = ActOnOpenMPExclusiveClause(VarList, StartLoc, LParenLoc, EndLoc);
case OMPC_affinity:
Res = ActOnOpenMPAffinityClause(StartLoc, LParenLoc, ColonLoc, EndLoc,
DepModOrTailExpr, VarList);
// All remaining clause kinds are listed explicitly (there is no `default`
// label) and lead to the llvm_unreachable at the end of the list.
case OMPC_if:
case OMPC_depobj:
case OMPC_final:
case OMPC_num_threads:
case OMPC_safelen:
case OMPC_simdlen:
case OMPC_allocator:
case OMPC_collapse:
case OMPC_default:
case OMPC_proc_bind:
case OMPC_schedule:
case OMPC_ordered:
case OMPC_nowait:
case OMPC_untied:
case OMPC_mergeable:
case OMPC_threadprivate:
case OMPC_read:
case OMPC_write:
case OMPC_update:
case OMPC_capture:
case OMPC_seq_cst:
case OMPC_acq_rel:
case OMPC_acquire:
case OMPC_release:
case OMPC_relaxed:
case OMPC_device:
case OMPC_threads:
case OMPC_simd:
case OMPC_num_teams:
case OMPC_thread_limit:
case OMPC_priority:
case OMPC_grainsize:
case OMPC_nogroup:
case OMPC_num_tasks:
case OMPC_hint:
case OMPC_dist_schedule:
case OMPC_defaultmap:
case OMPC_unknown:
case OMPC_uniform:
case OMPC_unified_address:
case OMPC_unified_shared_memory:
case OMPC_reverse_offload:
case OMPC_dynamic_allocators:
case OMPC_atomic_default_mem_order:
case OMPC_device_type:
case OMPC_match:
case OMPC_order:
case OMPC_destroy:
case OMPC_detach:
case OMPC_uses_allocators:
llvm_unreachable("Clause is not allowed.");
return Res;
/// Builds an expression that refers to the captured variable \p Capture.
///
/// \param Capture Declaration the resulting expression refers to.
/// \param VK Requested value kind; anything other than VK_LValue triggers an
/// lvalue conversion when the built expression is a glvalue.
/// \param OK Object kind; OK_Ordinary outside of C++ adds a dereference.
/// \param Loc Location used for the expressions that are built.
/// \returns The built expression, or ExprError() as soon as one of the
/// intermediate results is not usable.
ExprResult Sema::getOpenMPCapturedExpr(VarDecl *Capture, ExprValueKind VK,
ExprObjectKind OK, SourceLocation Loc) {
// Always start from an lvalue reference to the capture, using its
// non-reference type.
ExprResult Res = BuildDeclRefExpr(
Capture, Capture->getType().getNonReferenceType(), VK_LValue, Loc);
if (!Res.isUsable())
return ExprError();
// For ordinary objects when not compiling C++, wrap the reference in a unary
// dereference.
if (OK == OK_Ordinary && !getLangOpts().CPlusPlus) {
Res = CreateBuiltinUnaryOp(Loc, UO_Deref, Res.get());
if (!Res.isUsable())
return ExprError();
// The caller did not ask for an lvalue: convert a glvalue result.
if (VK != VK_LValue && Res.get()->isGLValue()) {
Res = DefaultLvalueConversion(Res.get());
if (!Res.isUsable())
return ExprError();
return Res;
/// Performs semantic analysis of the variable list of an OpenMP 'private'
/// clause.
///
/// Each expression in \p VarList is resolved with getPrivateItem and checked
/// (complete type, const-qualification, conflicting data-sharing attribute,
/// variably modified type on tasking directives, conflict with a mappable
/// expression on target directives). A helper private variable is then built
/// for it and the item is recorded on DSAStack as OMPC_private.
///
/// \param VarList Expressions listed in the clause; each must be non-null.
/// \param StartLoc Location forwarded to OMPPrivateClause::Create.
/// \param LParenLoc Location forwarded to OMPPrivateClause::Create.
/// \param EndLoc Location forwarded to OMPPrivateClause::Create.
/// \returns nullptr when Vars is empty, otherwise the result of
/// OMPPrivateClause::Create.
// NOTE(review): several statements (e.g. the bodies of some `if`s, closing
// braces and the lambda header near checkMappableExprComponentListsForDecl)
// are not visible in this text - presumably lost in extraction; confirm
// against the upstream file.
OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
SmallVector<Expr *, 8> PrivateCopies;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP private clause.");
// getPrivateItem receives ELoc, ERange and SimpleRefExpr together with the
// Sema object; Res.first is the referenced declaration.
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
QualType Type = D->getType();
// VD is null when the declaration is not a VarDecl.
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [, Restrictions, C/C++, p.3]
// A variable that appears in a private clause must not have an incomplete
// type or a reference type.
if (RequireCompleteType(ELoc, Type, diag::err_omp_private_incomplete_type))
Type = Type.getNonReferenceType();
// OpenMP 5.0 [2.19.3, List Item Privatization, Restrictions]
// A variable that is privatized must not have a const-qualified type
// unless it is of class type with a mutable member. This restriction does
// not apply to the firstprivate clause.
// OpenMP 3.1 [, private clause, Restrictions]
// A variable that appears in a private clause must not have a
// const-qualified type unless it is of class type with a mutable member.
if (rejectConstNotMutableType(*this, D, Type, OMPC_private, ELoc))
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct]
// Variables with the predetermined data-sharing attributes may not be
// listed in data-sharing attributes clauses, except for the cases
// listed below. For these exceptions only, listing a predetermined
// variable in a data-sharing attribute clause is allowed and overrides
// the variable's predetermined data-sharing attributes.
DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, /*FromParent=*/false);
// Any known attribute other than 'private' is reported as a conflict.
if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_private) {
Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_private);
reportOriginalDsa(*this, DSAStack, D, DVar);
OpenMPDirectiveKind CurrDir = DSAStack->getCurrentDirective();
// Variably modified types are not supported for tasks.
if (!Type->isAnyPointerType() && Type->isVariablyModifiedType() &&
isOpenMPTaskingDirective(CurrDir)) {
Diag(ELoc, diag::err_omp_variably_modified_type_not_supported)
<< getOpenMPClauseName(OMPC_private) << Type
<< getOpenMPDirectiveName(CurrDir);
// IsDecl selects between the "previous declaration" and "defined here" note.
bool IsDecl =
!VD ||
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
// OpenMP 4.5 [, Restrictions, p.3]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct
// OpenMP 5.0 [, Restrictions, p.7]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct unless the construct is a
// combined construct.
if ((LangOpts.OpenMP <= 45 && isOpenMPTargetExecutionDirective(CurrDir)) ||
CurrDir == OMPD_target) {
// ConflictKind is filled in by the callback with the clause kind in which
// the declaration was found.
OpenMPClauseKind ConflictKind;
if (DSAStack->checkMappableExprComponentListsForDecl(
VD, /*CurrentRegionOnly=*/true,
OpenMPClauseKind WhereFoundClauseKind) -> bool {
ConflictKind = WhereFoundClauseKind;
return true;
})) {
Diag(ELoc, diag::err_omp_variable_in_given_clause_and_dsa)
<< getOpenMPClauseName(OMPC_private)
<< getOpenMPClauseName(ConflictKind)
<< getOpenMPDirectiveName(CurrDir);
reportOriginalDsa(*this, DSAStack, D, DVar);
// OpenMP [, Restrictions, C/C++, p.1]
// A variable of class type (or array thereof) that appears in a private
// clause requires an accessible, unambiguous default constructor for the
// class type.
// Generate helper private variable and initialize it with the default
// value. The address of the original variable is replaced by the address of
// the new private variable in CodeGen. This new variable is not added to
// IdResolver, so the code in the OpenMP region uses original variable for
// proper diagnostics.
Type = Type.getUnqualifiedType();
VarDecl *VDPrivate =
buildVarDecl(*this, ELoc, Type, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr,
VD ? cast<DeclRefExpr>(SimpleRefExpr) : nullptr);
if (VDPrivate->isInvalidDecl())
DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr(
*this, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc);
// A capture is only built for non-VarDecl items outside dependent contexts.
DeclRefExpr *Ref = nullptr;
if (!VD && !CurContext->isDependentContext())
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false);
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_private, Ref);
// Record the original expression for VarDecls / dependent contexts, the
// capture otherwise.
Vars.push_back((VD || CurContext->isDependentContext())
? RefExpr->IgnoreParens()
: Ref);
if (Vars.empty())
return nullptr;
return OMPPrivateClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars,
namespace {
/// Helper object that, unless \p IsIgnored is set, maps
/// diag::warn_uninit_self_reference_in_init to diag::Severity::Ignored at the
/// given location when it is constructed.
// NOTE(review): the destructor body and any access specifiers are not visible
// in this text; presumably the destructor restores the severity at SavedLoc -
// confirm against the upstream file.
class DiagsUninitializedSeveretyRAII {
// Diagnostics engine whose severity mapping is changed.
DiagnosticsEngine &Diags;
// Location passed to the constructor.
SourceLocation SavedLoc;
// When true, neither the constructor nor the destructor enters its guarded
// branch.
bool IsIgnored = false;
DiagsUninitializedSeveretyRAII(DiagnosticsEngine &Diags, SourceLocation Loc,
bool IsIgnored)
: Diags(Diags), SavedLoc(Loc), IsIgnored(IsIgnored) {
if (!IsIgnored) {
Diags.setSeverity(/*Diag*/ diag::warn_uninit_self_reference_in_init,
/*Map*/ diag::Severity::Ignored, Loc);
~DiagsUninitializedSeveretyRAII() {
if (!IsIgnored)
/// Performs semantic analysis of the variable list of an OpenMP
/// 'firstprivate' clause.
///
/// The clause is treated as implicit when \p StartLoc, \p LParenLoc and
/// \p EndLoc are all invalid. In that case diagnostics use the construct
/// location from DSAStack, and the data-sharing checks guarded by
/// `!IsImplicitClause` are not entered.
///
/// \param VarList Expressions listed in the clause; each must be non-null.
/// \param StartLoc Location forwarded to OMPFirstprivateClause::Create.
/// \param LParenLoc Location forwarded to OMPFirstprivateClause::Create.
/// \param EndLoc Location forwarded to OMPFirstprivateClause::Create.
/// \returns nullptr when Vars is empty, otherwise the result of
/// OMPFirstprivateClause::Create, which also receives the pre-init statement
/// built from ExprCaptures.
// NOTE(review): several statements (bodies of some `if`s, closing braces,
// lambda headers and trailing call arguments) are not visible in this text -
// presumably lost in extraction; confirm against the upstream file.
OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
SmallVector<Expr *, 8> PrivateCopies;
SmallVector<Expr *, 8> Inits;
SmallVector<Decl *, 4> ExprCaptures;
// No valid location at all means the clause was not spelled in the source.
bool IsImplicitClause =
StartLoc.isInvalid() && LParenLoc.isInvalid() && EndLoc.isInvalid();
SourceLocation ImplicitClauseLoc = DSAStack->getConstructLoc();
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP firstprivate clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
// For an implicit clause, report at the construct location instead.
ELoc = IsImplicitClause ? ImplicitClauseLoc : ELoc;
QualType Type = D->getType();
// VD is null when the declaration is not a VarDecl.
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [, Restrictions, C/C++, p.3]
// A variable that appears in a private clause must not have an incomplete
// type or a reference type.
if (RequireCompleteType(ELoc, Type,
Type = Type.getNonReferenceType();
// OpenMP [, Restrictions, C/C++, p.1]
// A variable of class type (or array thereof) that appears in a private
// clause requires an accessible, unambiguous copy constructor for the
// class type.
QualType ElemType = Context.getBaseElementType(Type).getNonReferenceType();
// If an implicit firstprivate variable found it was checked already.
DSAStackTy::DSAVarData TopDVar;
if (!IsImplicitClause) {
DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(D, /*FromParent=*/false);
// Keep the top-level attribute: DVar is reassigned by later checks.
TopDVar = DVar;
OpenMPDirectiveKind CurrDir = DSAStack->getCurrentDirective();
bool IsConstant = ElemType.isConstant(Context);
// OpenMP [2.4.13, Data-sharing Attribute Clauses]
// A list item that specifies a given variable may not appear in more
// than one clause on the same directive, except that a variable may be
// specified in both firstprivate and lastprivate clauses.
// OpenMP 4.5 [2.10.8, Distribute Construct, p.3]
// A list item may appear in a firstprivate or lastprivate clause but not
// both.
if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_firstprivate &&
(isOpenMPDistributeDirective(CurrDir) ||
DVar.CKind != OMPC_lastprivate) &&
DVar.RefExpr) {
Diag(ELoc, diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_firstprivate);
reportOriginalDsa(*this, DSAStack, D, DVar);
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct]
// Variables with the predetermined data-sharing attributes may not be
// listed in data-sharing attributes clauses, except for the cases
// listed below. For these exceptions only, listing a predetermined
// variable in a data-sharing attribute clause is allowed and overrides
// the variable's predetermined data-sharing attributes.
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct, C/C++, p.2]
// Variables with const-qualified type having no mutable member may be
// listed in a firstprivate clause, even if they are static data members.
if (!(IsConstant || (VD && VD->isStaticDataMember())) && !DVar.RefExpr &&
DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_shared) {
Diag(ELoc, diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_firstprivate);
reportOriginalDsa(*this, DSAStack, D, DVar);
// OpenMP [, Restrictions, p.2]
// A list item that is private within a parallel region must not appear
// in a firstprivate clause on a worksharing construct if any of the
// worksharing regions arising from the worksharing construct ever bind
// to any of the parallel regions arising from the parallel construct.
// OpenMP 4.5 [, Restrictions, p.3]
// A list item that is private within a teams region must not appear in a
// firstprivate clause on a distribute construct if any of the distribute
// regions arising from the distribute construct ever bind to any of the
// teams regions arising from the teams construct.
// OpenMP 4.5 [, Restrictions, p.3]
// A list item that appears in a reduction clause of a teams construct
// must not appear in a firstprivate clause on a distribute construct if
// any of the distribute regions arising from the distribute construct
// ever bind to any of the teams regions arising from the teams construct.
if ((isOpenMPWorksharingDirective(CurrDir) ||
isOpenMPDistributeDirective(CurrDir)) &&
!isOpenMPParallelDirective(CurrDir) &&
!isOpenMPTeamsDirective(CurrDir)) {
DVar = DSAStack->getImplicitDSA(D, true);
if (DVar.CKind != OMPC_shared &&
(isOpenMPParallelDirective(DVar.DKind) ||
isOpenMPTeamsDirective(DVar.DKind) ||
DVar.DKind == OMPD_unknown)) {
Diag(ELoc, diag::err_omp_required_access)
<< getOpenMPClauseName(OMPC_firstprivate)
<< getOpenMPClauseName(OMPC_shared);
reportOriginalDsa(*this, DSAStack, D, DVar);
// OpenMP [, Restrictions, p.3]
// A list item that appears in a reduction clause of a parallel construct
// must not appear in a firstprivate clause on a worksharing or task
// construct if any of the worksharing or task regions arising from the
// worksharing or task construct ever bind to any of the parallel regions
// arising from the parallel construct.
// OpenMP [, Restrictions, p.4]
// A list item that appears in a reduction clause in worksharing
// construct must not appear in a firstprivate clause in a task construct
// encountered during execution of any of the worksharing regions arising
// from the worksharing construct.
if (isOpenMPTaskingDirective(CurrDir)) {
DVar = DSAStack->hasInnermostDSA(
D, [](OpenMPClauseKind C) { return C == OMPC_reduction; },
[](OpenMPDirectiveKind K) {
return isOpenMPParallelDirective(K) ||
isOpenMPWorksharingDirective(K) ||
if (DVar.CKind == OMPC_reduction &&
(isOpenMPParallelDirective(DVar.DKind) ||
isOpenMPWorksharingDirective(DVar.DKind) ||
isOpenMPTeamsDirective(DVar.DKind))) {
Diag(ELoc, diag::err_omp_parallel_reduction_in_task_firstprivate)
<< getOpenMPDirectiveName(DVar.DKind);
reportOriginalDsa(*this, DSAStack, D, DVar);
// OpenMP 4.5 [, Restrictions, p.3]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct
// OpenMP 5.0 [, Restrictions, p.7]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct unless the construct is a
// combined construct.
if ((LangOpts.OpenMP <= 45 &&
isOpenMPTargetExecutionDirective(CurrDir)) ||
CurrDir == OMPD_target) {
// ConflictKind is filled in by the callback with the clause kind in which
// the declaration was found.
OpenMPClauseKind ConflictKind;
if (DSAStack->checkMappableExprComponentListsForDecl(
VD, /*CurrentRegionOnly=*/true,
OpenMPClauseKind WhereFoundClauseKind) {
ConflictKind = WhereFoundClauseKind;
return true;
})) {
Diag(ELoc, diag::err_omp_variable_in_given_clause_and_dsa)
<< getOpenMPClauseName(OMPC_firstprivate)
<< getOpenMPClauseName(ConflictKind)
<< getOpenMPDirectiveName(DSAStack->getCurrentDirective());
reportOriginalDsa(*this, DSAStack, D, DVar);
// Variably modified types are not supported for tasks.
if (!Type->isAnyPointerType() && Type->isVariablyModifiedType() &&
isOpenMPTaskingDirective(DSAStack->getCurrentDirective())) {
Diag(ELoc, diag::err_omp_variably_modified_type_not_supported)
<< getOpenMPClauseName(OMPC_firstprivate) << Type
<< getOpenMPDirectiveName(DSAStack->getCurrentDirective());
// IsDecl selects between the "previous declaration" and "defined here" note.
bool IsDecl =
!VD ||
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
Type = Type.getUnqualifiedType();
VarDecl *VDPrivate =
buildVarDecl(*this, ELoc, Type, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr,
VD ? cast<DeclRefExpr>(SimpleRefExpr) : nullptr);
// Generate helper private variable and initialize it with the value of the
// original variable. The address of the original variable is replaced by
// the address of the new private variable in the CodeGen. This new variable
// is not added to IdResolver, so the code in the OpenMP region uses
// original variable for proper diagnostics and variable capturing.
Expr *VDInitRefExpr = nullptr;
// For arrays generate initializer for single element and replace it by the
// original array element in CodeGen.
if (Type->isArrayType()) {
VarDecl *VDInit =
buildVarDecl(*this, RefExpr->getExprLoc(), ElemType, D->getName());
VDInitRefExpr = buildDeclRefExpr(*this, VDInit, ElemType, ELoc);
Expr *Init = DefaultLvalueConversion(VDInitRefExpr).get();
ElemType = ElemType.getUnqualifiedType();
VarDecl *VDInitTemp = buildVarDecl(*this, RefExpr->getExprLoc(), ElemType,
InitializedEntity Entity =
// Copy-initialization sequence for a single element from Init.
InitializationKind Kind = InitializationKind::CreateCopy(ELoc, ELoc);
InitializationSequence InitSeq(*this, Entity, Kind, Init);
ExprResult Result = InitSeq.Perform(*this, Entity, Kind, Init);
if (Result.isInvalid())
// Remove temp variable declaration.
} else {
VarDecl *VDInit = buildVarDecl(*this, RefExpr->getExprLoc(), Type,
VDInitRefExpr = buildDeclRefExpr(*this, VDInit, RefExpr->getType(),
if (VDPrivate->isInvalidDecl()) {
if (IsImplicitClause) {
DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr(
*this, VDPrivate, RefExpr->getType().getUnqualifiedType(),
// For non-VarDecl items outside dependent contexts, reuse the private copy
// recorded by a lastprivate clause or build a new capture with an
// initializer.
DeclRefExpr *Ref = nullptr;
if (!VD && !CurContext->isDependentContext()) {
if (TopDVar.CKind == OMPC_lastprivate) {
Ref = TopDVar.PrivateCopy;
} else {
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
if (!isOpenMPCapturedDecl(D))
// Only explicitly written clauses are recorded on the DSA stack here.
if (!IsImplicitClause)
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_firstprivate, Ref);
Vars.push_back((VD || CurContext->isDependentContext())
? RefExpr->IgnoreParens()
: Ref);
if (Vars.empty())
return nullptr;
return OMPFirstprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc,
Vars, PrivateCopies, Inits,
buildPreInits(Context, ExprCaptures));
/// Performs semantic analysis of the variable list of an OpenMP 'lastprivate'
/// clause.
///
/// An unknown modifier with a valid modifier location is diagnosed up front
/// and yields nullptr. Otherwise each list item is resolved with
/// getPrivateItem and checked. A pseudo source and a pseudo destination
/// variable of the base element type are then built for it, together with an
/// assignment between them.
///
/// \param VarList Expressions listed in the clause; each must be non-null.
/// \param LPKind Modifier of the clause.
/// \param LPKindLoc Location of the modifier.
/// \param ColonLoc Location of the colon; asserted valid when the modifier is
/// diagnosed as unknown.
/// \param StartLoc Location forwarded to OMPLastprivateClause::Create.
/// \param LParenLoc Location forwarded to OMPLastprivateClause::Create.
/// \param EndLoc Location forwarded to OMPLastprivateClause::Create.
/// \returns nullptr for an unknown modifier or when Vars is empty, otherwise
/// the result of OMPLastprivateClause::Create.
// NOTE(review): several statements (bodies of some `if`s, closing braces and
// trailing call arguments) are not visible in this text - presumably lost in
// extraction; confirm against the upstream file.
OMPClause *Sema::ActOnOpenMPLastprivateClause(
ArrayRef<Expr *> VarList, OpenMPLastprivateModifier LPKind,
SourceLocation LPKindLoc, SourceLocation ColonLoc, SourceLocation StartLoc,
SourceLocation LParenLoc, SourceLocation EndLoc) {
// A modifier was written (valid location) but was not recognised.
if (LPKind == OMPC_LASTPRIVATE_unknown && LPKindLoc.isValid()) {
assert(ColonLoc.isValid() && "Colon location must be valid.");
Diag(LPKindLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_lastprivate, /*First=*/0,
<< getOpenMPClauseName(OMPC_lastprivate);
return nullptr;
SmallVector<Expr *, 8> Vars;
SmallVector<Expr *, 8> SrcExprs;
SmallVector<Expr *, 8> DstExprs;
SmallVector<Expr *, 8> AssignmentOps;
SmallVector<Decl *, 4> ExprCaptures;
SmallVector<Expr *, 4> ExprPostUpdates;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP lastprivate clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
QualType Type = D->getType();
// VD is null when the declaration is not a VarDecl.
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [, Restrictions, C/C++, p.2]
// A variable that appears in a lastprivate clause must not have an
// incomplete type or a reference type.
if (RequireCompleteType(ELoc, Type,
Type = Type.getNonReferenceType();
// OpenMP 5.0 [2.19.3, List Item Privatization, Restrictions]
// A variable that is privatized must not have a const-qualified type
// unless it is of class type with a mutable member. This restriction does
// not apply to the firstprivate clause.
// OpenMP 3.1 [, lastprivate clause, Restrictions]
// A variable that appears in a lastprivate clause must not have a
// const-qualified type unless it is of class type with a mutable member.
if (rejectConstNotMutableType(*this, D, Type, OMPC_lastprivate, ELoc))
// OpenMP 5.0 [ lastprivate Clause, Restrictions]
// A list item that appears in a lastprivate clause with the conditional
// modifier must be a scalar variable.
if (LPKind == OMPC_LASTPRIVATE_conditional && !Type->isScalarType()) {
Diag(ELoc, diag::err_omp_lastprivate_conditional_non_scalar);
// IsDecl selects between the "previous declaration" and "defined here" note.
bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) ==
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
OpenMPDirectiveKind CurrDir = DSAStack->getCurrentDirective();
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct]
// Variables with the predetermined data-sharing attributes may not be
// listed in data-sharing attributes clauses, except for the cases
// listed below.
// OpenMP 4.5 [2.10.8, Distribute Construct, p.3]
// A list item may appear in a firstprivate or lastprivate clause but not
// both.
DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, /*FromParent=*/false);
if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_lastprivate &&
(isOpenMPDistributeDirective(CurrDir) ||
DVar.CKind != OMPC_firstprivate) &&
(DVar.CKind != OMPC_private || DVar.RefExpr != nullptr)) {
Diag(ELoc, diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_lastprivate);
reportOriginalDsa(*this, DSAStack, D, DVar);
// OpenMP [, Restrictions, p.2]
// A list item that is private within a parallel region, or that appears in
// the reduction clause of a parallel construct, must not appear in a
// lastprivate clause on a worksharing construct if any of the corresponding
// worksharing regions ever binds to any of the corresponding parallel
// regions.
// Keep the top-level attribute: DVar is reassigned by the check below.
DSAStackTy::DSAVarData TopDVar = DVar;
if (isOpenMPWorksharingDirective(CurrDir) &&
!isOpenMPParallelDirective(CurrDir) &&
!isOpenMPTeamsDirective(CurrDir)) {
DVar = DSAStack->getImplicitDSA(D, true);
if (DVar.CKind != OMPC_shared) {
Diag(ELoc, diag::err_omp_required_access)
<< getOpenMPClauseName(OMPC_lastprivate)
<< getOpenMPClauseName(OMPC_shared);
reportOriginalDsa(*this, DSAStack, D, DVar);
// OpenMP [, Restrictions, C++, p.1,2]
// A variable of class type (or array thereof) that appears in a
// lastprivate clause requires an accessible, unambiguous default
// constructor for the class type, unless the list item is also specified
// in a firstprivate clause.
// A variable of class type (or array thereof) that appears in a
// lastprivate clause requires an accessible, unambiguous copy assignment
// operator for the class type.
Type = Context.getBaseElementType(Type).getNonReferenceType();
// Pseudo source variable uses the unqualified element type; the pseudo
// destination keeps the qualifiers of Type.
VarDecl *SrcVD = buildVarDecl(*this, ERange.getBegin(),
Type.getUnqualifiedType(), ".lastprivate.src",
D->hasAttrs() ? &D->getAttrs() : nullptr);
DeclRefExpr *PseudoSrcExpr =
buildDeclRefExpr(*this, SrcVD, Type.getUnqualifiedType(), ELoc);
VarDecl *DstVD =
buildVarDecl(*this, ERange.getBegin(), Type, ".lastprivate.dst",
D->hasAttrs() ? &D->getAttrs() : nullptr);
DeclRefExpr *PseudoDstExpr = buildDeclRefExpr(*this, DstVD, Type, ELoc);
// For arrays generate assignment operation for single element and replace
// it by the original array element in CodeGen.
ExprResult AssignmentOp = BuildBinOp(/*S=*/nullptr, ELoc, BO_Assign,
PseudoDstExpr, PseudoSrcExpr);
if (AssignmentOp.isInvalid())
AssignmentOp =
ActOnFinishFullExpr(AssignmentOp.get(), ELoc, /*DiscardedValue*/ false);
if (AssignmentOp.isInvalid())
// For non-VarDecl items outside dependent contexts, reuse the private copy
// recorded by a firstprivate clause or build a new capture without an
// initializer.
DeclRefExpr *Ref = nullptr;
if (!VD && !CurContext->isDependentContext()) {
if (TopDVar.CKind == OMPC_firstprivate) {
Ref = TopDVar.PrivateCopy;
} else {
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false);
if (!isOpenMPCapturedDecl(D))
if (TopDVar.CKind == OMPC_firstprivate ||
(!isOpenMPCapturedDecl(D) &&
Ref->getDecl()->hasAttr<OMPCaptureNoInitAttr>())) {
// Build an assignment to SimpleRefExpr as the post-update expression.
ExprResult RefRes = DefaultLvalueConversion(Ref);
if (!RefRes.isUsable())
ExprResult PostUpdateRes =
BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign, SimpleRefExpr,
if (!PostUpdateRes.isUsable())
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_lastprivate, Ref);
Vars.push_back((VD || CurContext->isDependentContext())
? RefExpr->IgnoreParens()
: Ref);
if (Vars.empty())
return nullptr;
return OMPLastprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc,
Vars, SrcExprs, DstExprs, AssignmentOps,
LPKind, LPKindLoc, ColonLoc,
buildPreInits(Context, ExprCaptures),
buildPostUpdate(*this, ExprPostUpdates));
/// Semantic analysis for the OpenMP 'shared' clause.
///
/// Walks \p VarList, resolves every list item through getPrivateItem(),
/// diagnoses items that already carry a different explicit data-sharing
/// attribute, and records OMPC_shared on the DSA stack for the others.
///
/// \param VarList   List items written in the clause.
/// \param StartLoc  Forwarded unchanged to OMPSharedClause::Create.
/// \param LParenLoc Forwarded unchanged to OMPSharedClause::Create.
/// \param EndLoc    Forwarded unchanged to OMPSharedClause::Create.
/// \returns The clause built from the collected items, or nullptr when no item
/// was collected.
// NOTE(review): this listing appears to have lost some statements and closing
// braces (e.g. the bodies of `if (Res.second)` and `if (!D)` below) -- confirm
// against the upstream file before building.
OMPClause *Sema::ActOnOpenMPSharedClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
// The message used to name the 'lastprivate' clause, a copy-paste slip that
// pointed at the wrong clause handler when the assertion fired.
assert(RefExpr && "NULL expr in OpenMP shared clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [Data-sharing Attribute Rules for Variables Referenced
// in a Construct]
// Variables with the predetermined data-sharing attributes may not be
// listed in data-sharing attributes clauses, except for the cases
// listed below. For these exceptions only, listing a predetermined
// variable in a data-sharing attribute clause is allowed and overrides
// the variable's predetermined data-sharing attributes.
DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, /*FromParent=*/false);
// Only an explicit attribute (one with a referencing expression) that is
// something other than 'shared' is reported as a conflict.
if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_shared &&
DVar.RefExpr) {
Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_shared);
reportOriginalDsa(*this, DSAStack, D, DVar);
DeclRefExpr *Ref = nullptr;
// Non-variable declarations that are captured get an explicit capture
// expression, except in dependent contexts.
if (!VD && isOpenMPCapturedDecl(D) && !CurContext->isDependentContext())
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_shared, Ref);
// Store the capture when one was built; otherwise keep the written
// expression with its parentheses stripped.
Vars.push_back((VD || !Ref || CurContext->isDependentContext())
? RefExpr->IgnoreParens()
: Ref);
if (Vars.empty())
return nullptr;
return OMPSharedClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars);
namespace {
/// Statement visitor that answers whether an expression tree references a
/// variable for which the DSA stack reports a data-sharing attribute other
/// than an implicit 'shared'. Every Visit* method returns true as soon as such
/// a reference is found.
// NOTE(review): access specifiers, closing braces and the tail of the hasDSA()
// call are not present in this listing -- confirm against the upstream file.
class DSARefChecker : public StmtVisitor<DSARefChecker, bool> {
/// DSA stack that is queried for every referenced variable.
DSAStackTy *Stack;
/// Classifies a single declaration reference.
bool VisitDeclRefExpr(DeclRefExpr *E) {
if (auto *VD = dyn_cast<VarDecl>(E->getDecl())) {
DSAStackTy::DSAVarData DVar = Stack->getTopDSA(VD, /*FromParent=*/false);
// 'shared' without a referencing expression is not treated as a hit.
if (DVar.CKind == OMPC_shared && !DVar.RefExpr)
return false;
// Any other known attribute is a hit.
if (DVar.CKind != OMPC_unknown)
return true;
// Nothing known at the top: ask the stack for a private-like attribute.
DSAStackTy::DSAVarData DVarPrivate = Stack->hasDSA(
VD, isOpenMPPrivate, [](OpenMPDirectiveKind) { return true; },
return DVarPrivate.CKind != OMPC_unknown;
return false;
/// Generic statements are a hit when any non-null child is a hit.
bool VisitStmt(Stmt *S) {
for (Stmt *Child : S->children()) {
if (Child && Visit(Child))
return true;
return false;
/// \param S DSA stack to consult; stored, not copied.
explicit DSARefChecker(DSAStackTy *S) : Stack(S) {}
} // namespace
namespace {
// Transform MemberExpression for specified FieldDecl of current class to
// DeclRefExpr to specified OMPCapturedExprDecl.
// NOTE(review): access specifiers and closing braces are not present in this
// listing -- confirm against the upstream file.
class TransformExprToCaptures : public TreeTransform<TransformExprToCaptures> {
typedef TreeTransform<TransformExprToCaptures> BaseTransform;
/// Member declaration whose accesses through 'this' are rewritten.
ValueDecl *Field = nullptr;
/// Capture built by the most recent matching TransformMemberExpr() call;
/// stays null until a match is seen.
DeclRefExpr *CapturedExpr = nullptr;
/// \param SemaRef   Sema instance handed to the TreeTransform base.
/// \param FieldDecl Member declaration to look for.
TransformExprToCaptures(Sema &SemaRef, ValueDecl *FieldDecl)
: BaseTransform(SemaRef), Field(FieldDecl), CapturedExpr(nullptr) {}
/// Replaces `this->Field` (ignoring parentheses and implicit casts on the
/// base) with a capture built by buildCapture(); every other member
/// expression is forwarded to the base transform.
ExprResult TransformMemberExpr(MemberExpr *E) {
if (isa<CXXThisExpr>(E->getBase()->IgnoreParenImpCasts()) &&
E->getMemberDecl() == Field) {
CapturedExpr = buildCapture(SemaRef, Field, E, /*WithInit=*/false);
return CapturedExpr;
return BaseTransform::TransformMemberExpr(E);
/// Returns the capture produced so far, or nullptr if none was built.
DeclRefExpr *getCapturedExpr() { return CapturedExpr; }
} // namespace
/// Runs \p Gen over every declaration of every set in \p Lookups, in order,
/// and returns the first result that converts to true.
///
/// \tparam T Result type; must be testable in a condition and
///           default-constructible.
/// \tparam U Type of one lookup set; iterated with a range-for.
/// \param Lookups Lookup sets to scan.
/// \param Gen     Callback applied to each declaration (cast to ValueDecl).
/// \returns The first truthy result of \p Gen, or T() when there is none.
template <typename T, typename U>
static T filterLookupForUDReductionAndMapper(
SmallVectorImpl<U> &Lookups, const llvm::function_ref<T(ValueDecl *)> Gen) {
for (U &Set : Lookups) {
for (auto *D : Set) {
if (T Res = Gen(cast<ValueDecl>(D)))
return Res;
return T();
/// Searches the redeclarations of \p D for one that is visible.
///
/// Must only be called when \p D itself is not visible (asserted).
///
/// \returns The first redeclaration for which LookupResult::isVisible() holds,
/// or nullptr when there is none.
// NOTE(review): the statement guarded by `if (RD == D)` is missing from this
// listing; presumably it skips D itself -- confirm against the upstream file.
static NamedDecl *findAcceptableDecl(Sema &SemaRef, NamedDecl *D) {
assert(!LookupResult::isVisible(SemaRef, D) && "not in slow case");
for (auto RD : D->redecls()) {
// Don't bother with extra checks if we already know this one isn't visible.
if (RD == D)
auto ND = cast<NamedDecl>(RD);
if (LookupResult::isVisible(SemaRef, ND))
return ND;
return nullptr;
/// Argument-dependent lookup of the name \p Id for an operand of type \p Ty.
///
/// Computes the namespaces associated with \p Ty and looks \p Id up in each
/// of them, looking through using-shadow declarations and replacing
/// declarations that are not visible with a visible redeclaration where one
/// exists.
///
/// \param SemaRef Sema instance used for the lookups.
/// \param Id      Name to look up.
/// \param Loc     Location used for the placeholder operand expression.
/// \param Ty      Type whose associated namespaces are searched.
/// \param Lookups Receiver for the lookup results.
// NOTE(review): the tail of this function (including whatever stores results
// into Lookups) and several statement bodies are missing from this listing --
// confirm against the upstream file.
static void
argumentDependentLookup(Sema &SemaRef, const DeclarationNameInfo &Id,
SourceLocation Loc, QualType Ty,
SmallVectorImpl<UnresolvedSet<8>> &Lookups) {
// Find all of the associated namespaces and classes based on the
// arguments we have.
Sema::AssociatedNamespaceSet AssociatedNamespaces;
Sema::AssociatedClassSet AssociatedClasses;
// Placeholder lvalue of type Ty that stands in for the call argument.
OpaqueValueExpr OVE(Loc, Ty, VK_LValue);
SemaRef.FindAssociatedClassesAndNamespaces(Loc, &OVE, AssociatedNamespaces,
// C++ [basic.lookup.argdep]p3:
// Let X be the lookup set produced by unqualified lookup (3.4.1)
// and let Y be the lookup set produced by argument dependent
// lookup (defined as follows). If X contains [...] then Y is
// empty. Otherwise Y is the set of declarations found in the
// namespaces associated with the argument types as described
// below. The set of declarations found by the lookup of the name
// is the union of X and Y.
// Here, we compute Y and add its members to the overloaded
// candidate set.
for (auto *NS : AssociatedNamespaces) {
// When considering an associated namespace, the lookup is the
// same as the lookup performed when the associated namespace is
// used as a qualifier, except that:
// -- Any using-directives in the associated namespace are
// ignored.
// -- Any namespace-scope friend functions declared in
// associated classes are visible within their respective
// namespaces even if they are not visible during an ordinary
// lookup (11.4).
DeclContext::lookup_result R = NS->lookup(Id.getName());
for (auto *D : R) {
// Look through a using-shadow declaration to the declaration it names.
auto *Underlying = D;
if (auto *USD = dyn_cast<UsingShadowDecl>(D))
Underlying = USD->getTargetDecl();
if (!isa<OMPDeclareReductionDecl>(Underlying) &&
// A declaration that is not visible is swapped for a visible redeclaration,
// and the underlying declaration is recomputed for the replacement.
if (!SemaRef.isVisible(D)) {
D = findAcceptableDecl(SemaRef, D);
if (!D)
if (auto *USD = dyn_cast<UsingShadowDecl>(D))
Underlying = USD->getTargetDecl();
/// Tries to resolve \p ReductionId to a user-defined reduction for type
/// \p Ty.
///
/// Candidates come from scope-based lookup when \p S is non-null, otherwise
/// from the declarations of \p UnresolvedReduction (when it is non-null);
/// later from qualified lookup inside a record type and, in C++, from
/// argument-dependent lookup.
///
/// \param SemaRef              Sema instance used for lookups and diagnostics.
/// \param Loc                  Location used for lookups and built expressions.
/// \param Range                Source range attached to the "not resolved"
///                             diagnostic.
/// \param S                    Innermost scope to start lookup from; may be
///                             null.
/// \param ReductionIdScopeSpec Nested-name-specifier written before the
///                             reduction identifier.
/// \param ReductionId          Name of the reduction.
/// \param Ty                   Type of the reduction list item.
/// \param BasePath             Filled with the derived-to-base path when the
///                             match is found through a base class.
/// \param UnresolvedReduction  Previously built unresolved lookup expression,
///                             or null.
/// \returns ExprError() when the scope specifier is invalid, or when it is set
/// and nothing matched (after emitting
/// err_omp_not_resolved_reduction_identifier); an UnresolvedLookupExpr in the
/// dependent cases; a DeclRefExpr to the matching declaration when one is
/// found; ExprEmpty() otherwise.
// NOTE(review): a number of statements and closing braces are missing from
// this listing (e.g. the code that appends new sets to Lookups) -- confirm
// against the upstream file.
static ExprResult
buildDeclareReductionRef(Sema &SemaRef, SourceLocation Loc, SourceRange Range,
Scope *S, CXXScopeSpec &ReductionIdScopeSpec,
const DeclarationNameInfo &ReductionId, QualType Ty,
CXXCastPath &BasePath, Expr *UnresolvedReduction) {
if (ReductionIdScopeSpec.isInvalid())
return ExprError();
SmallVector<UnresolvedSet<8>, 4> Lookups;
if (S) {
// Scope-based lookup: after each successful lookup, move S outwards past
// the scope that declares the representative declaration and try again.
LookupResult Lookup(SemaRef, ReductionId, Sema::LookupOMPReductionName);
while (S && SemaRef.LookupParsedName(Lookup, S, &ReductionIdScopeSpec)) {
NamedDecl *D = Lookup.getRepresentativeDecl();
do {
S = S->getParent();
} while (S && !S->isDeclScope(D));
if (S)
S = S->getParent();
Lookups.back().append(Lookup.begin(), Lookup.end());
} else if (auto *ULE =
cast_or_null<UnresolvedLookupExpr>(UnresolvedReduction)) {
// No scope available: reuse the declarations recorded in the unresolved
// lookup expression. PrevD tracks the previously seen declaration.
Decl *PrevD = nullptr;
for (NamedDecl *D : ULE->decls()) {
if (D == PrevD)
else if (auto *DRD = dyn_cast<OMPDeclareReductionDecl>(D))
PrevD = D;
// Dependent context, dependent list-item type, or a dependent candidate:
// resolution is postponed by returning an unresolved lookup expression that
// carries all candidates.
if (SemaRef.CurContext->isDependentContext() || Ty->isDependentType() ||
Ty->isInstantiationDependentType() ||
Ty->containsUnexpandedParameterPack() ||
filterLookupForUDReductionAndMapper<bool>(Lookups, [](ValueDecl *D) {
return !D->isInvalidDecl() &&
(D->getType()->isDependentType() ||
D->getType()->isInstantiationDependentType() ||
})) {
UnresolvedSet<8> ResSet;
for (const UnresolvedSet<8> &Set : Lookups) {
if (Set.empty())
ResSet.append(Set.begin(), Set.end());
// The last item marks the end of all declarations at the specified scope.
ResSet.addDecl(Set[Set.size() - 1]);
return UnresolvedLookupExpr::Create(
SemaRef.Context, /*NamingClass=*/nullptr,
ReductionIdScopeSpec.getWithLocInContext(SemaRef.Context), ReductionId,
/*ADL=*/true, /*Overloaded=*/true, ResSet.begin(), ResSet.end());
// Lookup inside the classes.
// C++ [over.match.oper]p3:
// For a unary operator @ with an operand of a type whose
// cv-unqualified version is T1, and for a binary operator @ with
// a left operand of a type whose cv-unqualified version is T1 and
// a right operand of a type whose cv-unqualified version is T2,
// three sets of candidate functions, designated member
// candidates, non-member candidates and built-in candidates, are
// constructed as follows:
// -- If T1 is a complete class type or a class currently being
// defined, the set of member candidates is the result of the
// qualified lookup of T1::operator@; otherwise,
// the set of member candidates is empty.
LookupResult Lookup(SemaRef, ReductionId, Sema::LookupOMPReductionName);
if (const auto *TyRec = Ty->getAs<RecordType>()) {
// Complete the type if it can be completed.
// If the type is neither complete nor being defined, bail out now.
if (SemaRef.isCompleteType(Loc, Ty) || TyRec->isBeingDefined() ||
TyRec->getDecl()->getDefinition()) {
SemaRef.LookupQualifiedName(Lookup, TyRec->getDecl());
if (Lookup.empty()) {
Lookups.back().append(Lookup.begin(), Lookup.end());
// Perform ADL.
if (SemaRef.getLangOpts().CPlusPlus)
argumentDependentLookup(SemaRef, ReductionId, Loc, Ty, Lookups);
// First choice: a valid candidate whose type is exactly Ty.
if (auto *VD = filterLookupForUDReductionAndMapper<ValueDecl *>(
Lookups, [&SemaRef, Ty](ValueDecl *D) -> ValueDecl * {
if (!D->isInvalidDecl() &&
SemaRef.Context.hasSameType(D->getType(), Ty))
return D;
return nullptr;
return SemaRef.BuildDeclRefExpr(VD, VD->getType().getNonReferenceType(),
VK_LValue, Loc);
// Second choice (C++ only): a valid candidate declared for a base class of
// Ty. It is accepted only when the base path is unambiguous and the base is
// accessible; BasePath then receives the derived-to-base path.
if (SemaRef.getLangOpts().CPlusPlus) {
if (auto *VD = filterLookupForUDReductionAndMapper<ValueDecl *>(
Lookups, [&SemaRef, Ty, Loc](ValueDecl *D) -> ValueDecl * {
if (!D->isInvalidDecl() &&
SemaRef.IsDerivedFrom(Loc, Ty, D->getType()) &&
return D;
return nullptr;
})) {
CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
if (SemaRef.IsDerivedFrom(Loc, Ty, VD->getType(), Paths)) {
if (!Paths.isAmbiguous(SemaRef.Context.getCanonicalType(
VD->getType().getUnqualifiedType()))) {
if (SemaRef.CheckBaseClassAccess(
Loc, VD->getType(), Ty, Paths.front(),
/*DiagID=*/0) != Sema::AR_inaccessible) {
SemaRef.BuildBasePathArray(Paths, BasePath);
return SemaRef.BuildDeclRefExpr(
VD, VD->getType().getNonReferenceType(), VK_LValue, Loc);
// Nothing matched. An explicitly qualified identifier is diagnosed here; an
// unqualified one is reported to the caller as "not found" via ExprEmpty().
if (ReductionIdScopeSpec.isSet()) {
SemaRef.Diag(Loc, diag::err_omp_not_resolved_reduction_identifier)
<< Ty << Range;
return ExprError();
return ExprEmpty();
namespace {
/// Data for the reduction-based clauses.
///
/// Holds a set of parallel expression lists that are filled through the
/// push() overloads.
// NOTE(review): the bodies of the constructor and of both push() overloads are
// largely missing from this listing (only the inscan checks and one assert
// survive) -- confirm against the upstream file.
struct ReductionData {
/// List of original reduction items.
SmallVector<Expr *, 8> Vars;
/// List of private copies of the reduction items.
SmallVector<Expr *, 8> Privates;
/// LHS expressions for the reduction_op expressions.
SmallVector<Expr *, 8> LHSs;
/// RHS expressions for the reduction_op expressions.
SmallVector<Expr *, 8> RHSs;
/// Reduction operation expression.
SmallVector<Expr *, 8> ReductionOps;
/// inscan copy operation expressions.
SmallVector<Expr *, 8> InscanCopyOps;
/// inscan copy temp array expressions for prefix sums.
SmallVector<Expr *, 8> InscanCopyArrayTemps;
/// inscan copy temp array element expressions for prefix sums.
SmallVector<Expr *, 8> InscanCopyArrayElems;
/// Taskgroup descriptors for the corresponding reduction items in
/// in_reduction clauses.
SmallVector<Expr *, 8> TaskgroupDescriptors;
/// List of captures for clause.
SmallVector<Decl *, 4> ExprCaptures;
/// List of postupdate expressions.
SmallVector<Expr *, 4> ExprPostUpdates;
/// Reduction modifier. Compared against OMPC_REDUCTION_inscan below to
/// select the inscan-specific handling.
unsigned RedModifier = 0;
/// A size is always required, so default construction is disabled.
ReductionData() = delete;
/// Reserves required memory for the reduction data.
/// \param Size     Expected number of reduction items.
/// \param Modifier Reduction modifier, stored in RedModifier.
ReductionData(unsigned Size, unsigned Modifier = 0) : RedModifier(Modifier) {
if (RedModifier == OMPC_REDUCTION_inscan) {
/// Stores reduction item and reduction operation only (required for dependent
/// reduction item).
void push(Expr *Item, Expr *ReductionOp) {
if (RedModifier == OMPC_REDUCTION_inscan) {
/// Stores reduction data.
/// The three Copy* arguments must be null unless the modifier is inscan
/// (asserted in the non-inscan branch).
void push(Expr *Item, Expr *Private, Expr *LHS, Expr *RHS, Expr *ReductionOp,
Expr *TaskgroupDescriptor, Expr *CopyOp, Expr *CopyArrayTemp,
Expr *CopyArrayElem) {
if (RedModifier == OMPC_REDUCTION_inscan) {
} else {
assert(CopyOp == nullptr && CopyArrayTemp == nullptr &&
CopyArrayElem == nullptr &&
"Copy operation must be used for inscan reductions only.");
} // namespace
/// Checks whether the array section \p OASE has compile-time constant extents
/// so that its private copy does not need a variable-length array.
///
/// The right-most section may have any constant length; every enclosing
/// section must have a constant length of exactly 1.
///
/// \param Context       AST context used to evaluate length expressions.
/// \param OASE          Right-most array section of the list item.
/// \param SingleElement Set to true when the right-most section denotes
///                      exactly one element.
/// \param ArraySizes    Receiver for the constant dimension sizes.
/// \returns false when a length is missing on a section written with a colon,
/// when a length is not an integer constant, or when an enclosing section has
/// a length other than 1; true otherwise.
// NOTE(review): no writes to ArraySizes and several closing braces are present
// in this listing; they appear to have been dropped -- confirm against the
// upstream file.
static bool checkOMPArraySectionConstantForReduction(
ASTContext &Context, const OMPArraySectionExpr *OASE, bool &SingleElement,
SmallVectorImpl<llvm::APSInt> &ArraySizes) {
const Expr *Length = OASE->getLength();
if (Length == nullptr) {
// For array sections of the form [1:] or [:], we would need to analyze
// the lower bound...
if (OASE->getColonLocFirst().isValid())
return false;
// This is an array subscript which has implicit length 1!
SingleElement = true;
} else {
Expr::EvalResult Result;
if (!Length->EvaluateAsInt(Result, Context))
return false;
llvm::APSInt ConstantLengthValue = Result.Val.getInt();
SingleElement = (ConstantLengthValue.getSExtValue() == 1);
// Get the base of this array section and walk up from there.
const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
// We require length = 1 for all array sections except the right-most to
// guarantee that the memory region is contiguous and has no holes in it.
while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base)) {
Length = TempOASE->getLength();
if (Length == nullptr) {
// For array sections of the form [1:] or [:], we would need to analyze
// the lower bound...
// The colon must be tested on the section being inspected (TempOASE), not
// on the right-most section that was already handled above.
if (TempOASE->getColonLocFirst().isValid())
return false;
// This is an array subscript which has implicit length 1!
} else {
Expr::EvalResult Result;
if (!Length->EvaluateAsInt(Result, Context))
return false;
llvm::APSInt ConstantLengthValue = Result.Val.getInt();
if (ConstantLengthValue.getSExtValue() != 1)
return false;
Base = TempOASE->getBase()->IgnoreParenImpCasts();
// If we have a single element, we don't need to add the implicit lengths.
if (!SingleElement) {
while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) {
// Has implicit length 1!
Base = TempASE->getBase()->IgnoreParenImpCasts();
// This array section can be privatized as a single value or as a constant
// sized array.
return true;
static bool actOnOMPReductionKindClause(
Sema &S, DSAStackTy *Stack, OpenMPClauseKind ClauseKind,
ArrayRef<Expr *> VarList, SourceLocation StartLoc, SourceLocation LParenLoc,
SourceLocation ColonLoc, SourceLocation EndLoc,
CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId,
ArrayRef<Expr *> UnresolvedReductions, ReductionData &RD) {
DeclarationName DN = ReductionId.getName();
OverloadedOperatorKind OOK = DN.getCXXOverloadedOperator();
BinaryOperatorKind BOK = BO_Comma;
ASTContext &Context = S.Context;
// OpenMP [, reduction clause]
// C
// reduction-identifier is either an identifier or one of the following
// operators: +, -, *, &, |, ^, && and ||
// C++
// reduction-identifier is either an id-expression or one of the following
// operators: +, -, *, &, |, ^, && and ||
switch (OOK) {
case OO_Plus:
case OO_Minus:
BOK = BO_Add;
case OO_Star:
BOK = BO_Mul;
case OO_Amp:
BOK = BO_And;
case OO_Pipe:
BOK = BO_Or;
case OO_Caret:
BOK = BO_Xor;
case OO_AmpAmp:
BOK = BO_LAnd;
case OO_PipePipe:
case OO_New:
case OO_Delete:
case OO_Array_New:
case OO_Array_Delete:
case OO_Slash:
case OO_Percent:
case OO_Tilde:
case OO_Exclaim:
case OO_Equal:
case OO_Less:
case OO_Greater:
case OO_LessEqual:
case OO_GreaterEqual:
case OO_PlusEqual:
case OO_MinusEqual:
case OO_StarEqual:
case OO_SlashEqual:
case OO_PercentEqual:
case OO_CaretEqual:
case OO_AmpEqual:
case OO_PipeEqual:
case OO_LessLess:
case OO_GreaterGreater:
case OO_LessLessEqual:
case OO_GreaterGreaterEqual:
case OO_EqualEqual:
case OO_ExclaimEqual:
case OO_Spaceship:
case OO_PlusPlus:
case OO_MinusMinus:
case OO_Comma:
case OO_ArrowStar:
case OO_Arrow:
case OO_Call:
case OO_Subscript:
case OO_Conditional:
case OO_Coawait:
llvm_unreachable("Unexpected reduction identifier");
case OO_None:
if (IdentifierInfo *II = DN.getAsIdentifierInfo()) {
if (II->isStr("max"))
else if (II->isStr("min"))
SourceRange ReductionIdRange;
if (ReductionIdScopeSpec.isValid())
auto IR = UnresolvedReductions.begin(), ER = UnresolvedReductions.end();
bool FirstIter = true;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "nullptr expr in OpenMP reduction clause.");
// OpenMP [2.1, C/C++]
// A list item is a variable or array section, subject to the restrictions
// specified in Section 2.4 on page 42 and in each of the sections
// describing clauses and directives for which a list appears.
// OpenMP [, Restrictions, p.1]
// A variable that is part of another variable (as an array or
// structure element) cannot appear in a private clause.
if (!FirstIter && IR != ER)
FirstIter = false;
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(S, SimpleRefExpr, ELoc, ERange,
if (Res.second) {
// Try to find 'declare reduction' corresponding construct before using
// builtin/overloaded operators.
QualType Type = Context.DependentTy;
CXXCastPath BasePath;
ExprResult DeclareReductionRef = buildDeclareReductionRef(
S, ELoc, ERange, Stack->getCurScope(), ReductionIdScopeSpec,
ReductionId, Type, BasePath, IR == ER ? nullptr : *IR);
Expr *ReductionOp = nullptr;
if (S.CurContext->isDependentContext() &&
(DeclareReductionRef.isUnset() ||
ReductionOp = DeclareReductionRef.get();
// It will be analyzed later.
RD.push(RefExpr, ReductionOp);
ValueDecl *D = Res.first;
if (!D)
Expr *TaskgroupDescriptor = nullptr;
QualType Type;
auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr->IgnoreParens());
auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr->IgnoreParens());
if (ASE) {
Type = ASE->getType().getNonReferenceType();
} else if (OASE) {
QualType BaseType =
if (const auto *ATy = BaseType->getAsArrayTypeUnsafe())
Type = ATy->getElementType();
Type = BaseType->getPointeeType();
Type = Type.getNonReferenceType();
} else {
Type = Context.getBaseElementType(D->getType().getNonReferenceType());
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [, Restrictions, C/C++, p.3]
// A variable that appears in a private clause must not have an incomplete
// type or a reference type.
if (S.RequireCompleteType(ELoc, D->getType(),
// OpenMP [, reduction clause, Restrictions]
// A list item that appears in a reduction clause must not be
// const-qualified.
if (rejectConstNotMutableType(S, D, Type, ClauseKind, ELoc,
/*AcceptIfMutable*/ false, ASE || OASE))
OpenMPDirectiveKind CurrDir = Stack->getCurrentDirective();
// OpenMP [, Restrictions, C/C++, p.4]
// If a list-item is a reference type then it must bind to the same object
// for all threads of the team.
if (!ASE && !OASE) {
if (VD) {
VarDecl *VDDef = VD->getDefinition();
if (VD->getType()->isReferenceType() && VDDef && VDDef->hasInit()) {
DSARefChecker Check(Stack);
if (Check.Visit(VDDef->getInit())) {
S.Diag(ELoc, diag::err_omp_reduction_ref_type_arg)
<< getOpenMPClauseName(ClauseKind) << ERange;
S.Diag(VDDef->getLocation(), diag::note_defined_here) << VDDef;
// OpenMP [, Data-sharing Attribute Rules for Variables Referenced
// in a Construct]
// Variables with the predetermined data-sharing attributes may not be
// listed in data-sharing attributes clauses, except for the cases
// listed below. For these exceptions only, listing a predetermined
// variable in a data-sharing attribute clause is allowed and overrides
// the variable's predetermined data-sharing attributes.
// OpenMP [, Restrictions, p.3]
// Any number of reduction clauses can be specified on the directive,
// but a list item can appear only once in the reduction clauses for that
// directive.
DSAStackTy::DSAVarData DVar = Stack->getTopDSA(D, /*FromParent=*/false);
if (DVar.CKind == OMPC_reduction) {
S.Diag(ELoc, diag::err_omp_once_referenced)
<< getOpenMPClauseName(ClauseKind);
if (DVar.RefExpr)
S.Diag(DVar.RefExpr->getExprLoc(), diag::note_omp_referenced);
if (DVar.CKind != OMPC_unknown) {
S.Diag(ELoc, diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_reduction);
reportOriginalDsa(S, Stack, D, DVar);
// OpenMP [, Restrictions, p.1]
// A list item that appears in a reduction clause of a worksharing
// construct must be shared in the parallel regions to which any of the
// worksharing regions arising from the worksharing construct bind.
if (isOpenMPWorksharingDirective(CurrDir) &&
!isOpenMPParallelDirective(CurrDir) &&
!isOpenMPTeamsDirective(CurrDir)) {
DVar = Stack->getImplicitDSA(D, true);
if (DVar.CKind != OMPC_shared) {
S.Diag(ELoc, diag::err_omp_required_access)
<< getOpenMPClauseName(OMPC_reduction)
<< getOpenMPClauseName(OMPC_shared);
reportOriginalDsa(S, Stack, D, DVar);
// Try to find 'declare reduction' corresponding construct before using
// builtin/overloaded operators.
CXXCastPath BasePath;
ExprResult DeclareReductionRef = buildDeclareReductionRef(
S, ELoc, ERange, Stack->getCurScope(), ReductionIdScopeSpec,
ReductionId, Type, BasePath, IR == ER ? nullptr : *IR);
if (DeclareReductionRef.isInvalid())
if (S.CurContext->isDependentContext() &&
(DeclareReductionRef.isUnset() ||
isa<UnresolvedLookupExpr>(DeclareReductionRef.get()))) {
RD.push(RefExpr, DeclareReductionRef.get());
if (BOK == BO_Comma && DeclareReductionRef.isUnset()) {
// Not allowed reduction identifier is found.
<< Type << ReductionIdRange;
// OpenMP [, reduction clause, Restrictions]
// The type of a list item that appears in a reduction clause must be valid
// for the reduction-identifier. For a max or min reduction in C, the type
// of the list item must be an allowed arithmetic data type: char, int,
// float, double, or _Bool, possibly modified with long, short, signed, or
// unsigned. For a max or min reduction in C++, the type of the list item
// must be an allowed arithmetic data type: char, wchar_t, int, float,
// double, or bool, possibly modified with long, short, signed, or unsigned.
if (DeclareReductionRef.isUnset()) {
if ((BOK == BO_GT || BOK == BO_LT) &&
!(Type->isScalarType() ||
(S.getLangOpts().CPlusPlus && Type->isArithmeticType()))) {
S.Diag(ELoc, diag::err_omp_clause_not_arithmetic_type_arg)
<< getOpenMPClauseName(ClauseKind) << S.getLangOpts().CPlusPlus;
if (!ASE && !OASE) {
bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) ==
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
if ((BOK == BO_OrAssign || BOK == BO_AndAssign || BOK == BO_XorAssign) &&
!S.getLangOpts().CPlusPlus && Type->isFloatingType()) {
S.Diag(ELoc, diag::err_omp_clause_floating_type_arg)
<< getOpenMPClauseName(ClauseKind);
if (!ASE && !OASE) {
bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) ==
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
Type = Type.getNonLValueExprType(Context).getUnqualifiedType();
VarDecl *LHSVD = buildVarDecl(S, ELoc, Type, ".reduction.lhs",
D->hasAttrs() ? &D->getAttrs() : nullptr);
VarDecl *RHSVD = buildVarDecl(S, ELoc, Type, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr);
QualType PrivateTy = Type;
// Try if we can determine constant lengths for all array sections and avoid
// the VLA.
bool ConstantLengthOASE = false;
if (OASE) {
bool SingleElement;
llvm::SmallVector<llvm::APSInt, 4> ArraySizes;
ConstantLengthOASE = checkOMPArraySectionConstantForReduction(
Context, OASE, SingleElement, ArraySizes);
// If we don't have a single element, we must emit a constant array type.
if (ConstantLengthOASE && !SingleElement) {
for (llvm::APSInt &Size : ArraySizes)
PrivateTy = Context.getConstantArrayType(PrivateTy, Size, nullptr,
if ((OASE && !ConstantLengthOASE) ||
(!OASE && !ASE &&
D->getType().getNonReferenceType()->isVariablyModifiedType())) {
if (!Context.getTargetInfo().isVLASupported()) {
if (isOpenMPTargetExecutionDirective(Stack->getCurrentDirective())) {
S.Diag(ELoc, diag::err_omp_reduction_vla_unsupported) << !!OASE;
S.Diag(ELoc, diag::note_vla_unsupported);
} else {
S.targetDiag(ELoc, diag::err_omp_reduction_vla_unsupported) << !!OASE;
S.targetDiag(ELoc, diag::note_vla_unsupported);
// For arrays/array sections only:
// Create pseudo array type for private copy. The size for this array will
// be generated during codegen.
// For array subscripts or single variables Private Ty is the same as Type
// (type of the variable or single array element).
PrivateTy = Context.getVariableArrayType(
new (Context) OpaqueValueExpr(ELoc, Context.getSizeType(), VK_RValue),
ArrayType::Normal, /*IndexTypeQuals=*/0, SourceRange());
} else if (!ASE && !OASE &&
Context.getAsArrayType(D->getType().getNonReferenceType())) {
PrivateTy = D->getType().getNonReferenceType();
// Private copy.
VarDecl *PrivateVD =
buildVarDecl(S, ELoc, PrivateTy, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr,
VD ? cast<DeclRefExpr>(SimpleRefExpr) : nullptr);
// Add initializer for private variable.
Expr *Init = nullptr;
DeclRefExpr *LHSDRE = buildDeclRefExpr(S, LHSVD, Type, ELoc);
DeclRefExpr *RHSDRE = buildDeclRefExpr(S, RHSVD, Type, ELoc);
if (DeclareReductionRef.isUsable()) {
auto *DRDRef = DeclareReductionRef.getAs<DeclRefExpr>();
auto *DRD = cast<OMPDeclareReductionDecl>(DRDRef->getDecl());
if (DRD->getInitializer()) {
Init = DRDRef;
} else {
switch (BOK) {
case BO_Add:
case BO_Xor:
case BO_Or:
case BO_LOr:
// '+', '-', '^', '|', '||' reduction ops - initializer is '0'.
if (Type->isScalarType() || Type->isAnyComplexType())
Init = S.ActOnIntegerConstant(ELoc, /*Val=*/0).get();
case BO_Mul:
case BO_LAnd:
if (Type->isScalarType() || Type->isAnyComplexType()) {
// '*' and '&&' reduction ops - initializer is '1'.
Init = S.ActOnIntegerConstant(ELoc, /*Val=*/1).get();
case BO_And: {
// '&' reduction op - initializer is '~0'.
QualType OrigType = Type;
if (auto *ComplexTy = OrigType->getAs<ComplexType>())
Type = ComplexTy->getElementType();
if (Type->isRealFloatingType()) {
llvm::APFloat InitValue = llvm::APFloat::getAllOnesValue(
Init = FloatingLiteral::Create(Context, InitValue, /*isexact=*/true,
Type, ELoc);
} else if (Type->isScalarType()) {
uint64_t Size = Context.getTypeSize(Type);
QualType IntTy = Context.getIntTypeForBitwidth(Size, /*Signed=*/0);
llvm::APInt InitValue = llvm::APInt::getAllOnesValue(Size);
Init = IntegerLiteral::Create(Context, InitValue, IntTy, ELoc);
if (Init && OrigType->isAnyComplexType()) {
// Init = 0xFFFF + 0xFFFFi;
auto *Im = new (Context) ImaginaryLiteral(Init, OrigType);
Init = S.CreateBuiltinBinOp(ELoc, BO_Add, Init, Im).get();
Type = OrigType;
case BO_LT:
case BO_GT: {
// 'min' reduction op - initializer is 'Largest representable number in
// the reduction list item type'.
// 'max' reduction op - initializer is 'Least representable number in
// the reduction list item type'.
if (Type->isIntegerType() || Type->isPointerType()) {
bool IsSigned = Type->hasSignedIntegerRepresentation();
uint64_t Size = Context.getTypeSize(Type);
QualType IntTy =
Context.getIntTypeForBitwidth(Size, /*Signed=*/IsSigned);
llvm::APInt InitValue =
(BOK != BO_LT) ? IsSigned ? llvm::APInt::getSignedMinValue(Size)
: llvm::APInt::getMinValue(Size)
: IsSigned ? llvm::APInt::getSignedMaxValue(Size)
: llvm::APInt::getMaxValue(Size);
Init = IntegerLiteral::Create(Context, InitValue, IntTy, ELoc);
if (Type->isPointerType()) {
// Cast to pointer type.
ExprResult CastExpr = S.BuildCStyleCastExpr(
ELoc, Context.getTrivialTypeSourceInfo(Type, ELoc), ELoc, Init);
if (CastExpr.isInvalid())
Init = CastExpr.get();
} else if (Type->isRealFloatingType()) {
llvm::APFloat InitValue = llvm::APFloat::getLargest(
Context.getFloatTypeSemantics(Type), BOK != BO_LT);
Init = FloatingLiteral::Create(Context, InitValue, /*isexact=*/true,
Type, ELoc);
case BO_PtrMemD:
case BO_PtrMemI:
case BO_MulAssign:
case BO_Div:
case BO_Rem:
case BO_Sub:
case BO_Shl:
case BO_Shr:
case BO_LE:
case BO_GE:
case BO_EQ:
case BO_NE:
case BO_Cmp:
case BO_AndAssign:
case BO_XorAssign:
case BO_OrAssign:
case BO_Assign:
case BO_AddAssign:
case BO_SubAssign:
case BO_DivAssign:
case BO_RemAssign:
case BO_ShlAssign:
case BO_ShrAssign:
case BO_Comma:
llvm_unreachable("Unexpected reduction operation");
if (Init && DeclareReductionRef.isUnset()) {
S.AddInitializerToDecl(RHSVD, Init, /*DirectInit=*/false);
// Store initializer for single element in private copy. Will be used
// during codegen.
} else if (!Init) {
// Store initializer for single element in private copy. Will be used
// during codegen.
if (RHSVD->isInvalidDecl())
if (!RHSVD->hasInit() &&
(DeclareReductionRef.isUnset() || !S.LangOpts.CPlusPlus)) {
S.Diag(ELoc, diag::err_omp_reduction_id_not_compatible)
<< Type << ReductionIdRange;
bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) ==
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
DeclRefExpr *PrivateDRE = buildDeclRefExpr(S, PrivateVD, PrivateTy, ELoc);
ExprResult ReductionOp;
if (DeclareReductionRef.isUsable()) {
QualType RedTy = DeclareReductionRef.get()->getType();
QualType PtrRedTy = Context.getPointerType(RedTy);
ExprResult LHS = S.CreateBuiltinUnaryOp(ELoc, UO_AddrOf, LHSDRE);
ExprResult RHS = S.CreateBuiltinUnaryOp(ELoc, UO_AddrOf, RHSDRE);
if (!BasePath.empty()) {
LHS = S.DefaultLvalueConversion(LHS.get());
RHS = S.DefaultLvalueConversion(RHS.get());
LHS = ImplicitCastExpr::Create(Context, PtrRedTy,
CK_UncheckedDerivedToBase, LHS.get(),
&BasePath, LHS.get()->getValueKind());
RHS = ImplicitCastExpr::Create(Context, PtrRedTy,
CK_UncheckedDerivedToBase, RHS.get(),
&BasePath, RHS.get()->getValueKind());
FunctionProtoType::ExtProtoInfo EPI;
QualType Params[] = {PtrRedTy, PtrRedTy};
QualType FnTy = Context.getFunctionType(Context.VoidTy, Params, EPI);
auto *OVE = new (Context) OpaqueValueExpr(
ELoc, Context.getPointerType(FnTy), VK_RValue, OK_Ordinary,
Expr *Args[] = {LHS.get(), RHS.get()};
ReductionOp =
CallExpr::Create(Context, OVE, Args, Context.VoidTy, VK_RValue, ELoc);
} else {
ReductionOp = S.BuildBinOp(
Stack->getCurScope(), ReductionId.getBeginLoc(), BOK, LHSDRE, RHSDRE);
if (ReductionOp.isUsable()) {
if (BOK != BO_LT && BOK != BO_GT) {
ReductionOp =
S.BuildBinOp(Stack->getCurScope(), ReductionId.getBeginLoc(),
BO_Assign, LHSDRE, ReductionOp.get());
} else {
auto *ConditionalOp = new (Context)
ConditionalOperator(ReductionOp.get(), ELoc, LHSDRE, ELoc, RHSDRE,
Type, VK_LValue, OK_Ordinary);
ReductionOp =
S.BuildBinOp(Stack->getCurScope(), ReductionId.getBeginLoc(),
BO_Assign, LHSDRE, ConditionalOp);
if (ReductionOp.isUsable())
ReductionOp = S.ActOnFinishFullExpr(ReductionOp.get(),
/*DiscardedValue*/ false);
if (!ReductionOp.isUsable())
// Add copy operations for inscan reductions.
// LHS = RHS;
ExprResult CopyOpRes, TempArrayRes, TempArrayElem;
if (ClauseKind == OMPC_reduction &&
RD.RedModifier == OMPC_REDUCTION_inscan) {
ExprResult RHS = S.DefaultLvalueConversion(RHSDRE);
CopyOpRes = S.BuildBinOp(Stack->getCurScope(), ELoc, BO_Assign, LHSDRE,
if (!CopyOpRes.isUsable())
CopyOpRes =
S.ActOnFinishFullExpr(CopyOpRes.get(), /*DiscardedValue=*/true);
if (!CopyOpRes.isUsable())
// For simd directive and simd-based directives in simd mode no need to
// construct temp array, need just a single temp element.
if (Stack->getCurrentDirective() == OMPD_simd ||
(S.getLangOpts().OpenMPSimd &&
isOpenMPSimdDirective(Stack->getCurrentDirective()))) {
VarDecl *TempArrayVD =
buildVarDecl(S, ELoc, PrivateTy, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr);
// Add a constructor to the temp decl.
TempArrayRes = buildDeclRefExpr(S, TempArrayVD, PrivateTy, ELoc);
} else {
// Build temp array for prefix sum.
auto *Dim = new (S.Context)
OpaqueValueExpr(ELoc, S.Context.getSizeType(), VK_RValue);
QualType ArrayTy =
S.Context.getVariableArrayType(PrivateTy, Dim, ArrayType::Normal,
/*IndexTypeQuals=*/0, {ELoc, ELoc});
VarDecl *TempArrayVD =
buildVarDecl(S, ELoc, ArrayTy, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr);
// Add a constructor to the temp decl.
TempArrayRes = buildDeclRefExpr(S, TempArrayVD, ArrayTy, ELoc);
TempArrayElem =
auto *Idx = new (S.Context)
OpaqueValueExpr(ELoc, S.Context.getSizeType(), VK_RValue);
TempArrayElem = S.CreateBuiltinArraySubscriptExpr(TempArrayElem.get(),
ELoc, Idx, ELoc);
// OpenMP [, Restrictions, p.2]
// A list item that appears in an in_reduction clause of a task construct
// must appear in a task_reduction clause of a construct associated with a
// taskgroup region that includes the participating task in its taskgroup
// set. The construct associated with the innermost region that meets this
// condition must specify the same reduction-identifier as the in_reduction
// clause.
if (ClauseKind == OMPC_in_reduction) {
SourceRange ParentSR;
BinaryOperatorKind ParentBOK;
const Expr *ParentReductionOp = nullptr;
Expr *ParentBOKTD = nullptr, *ParentReductionOpTD = nullptr;
DSAStackTy::DSAVarData ParentBOKDSA =
Stack->getTopMostTaskgroupReductionData(D, ParentSR, ParentBOK,
DSAStackTy::DSAVarData ParentReductionOpDSA =
D, ParentSR, ParentReductionOp, ParentReductionOpTD);
bool IsParentBOK = ParentBOKDSA.DKind != OMPD_unknown;
bool IsParentReductionOp = ParentReductionOpDSA.DKind != OMPD_unknown;
if ((DeclareReductionRef.isUnset() && IsParentReductionOp) ||
(DeclareReductionRef.isUsable() && IsParentBOK) ||
(IsParentBOK && BOK != ParentBOK) || IsParentReductionOp) {
bool EmitError = true;
if (IsParentReductionOp && DeclareReductionRef.isUsable()) {
llvm::FoldingSetNodeID RedId, ParentRedId;
ParentReductionOp->Profile(ParentRedId, Context, /*Canonical=*/true);
DeclareReductionRef.get()->Profile(RedId, Context,
EmitError = RedId != ParentRedId;
if (EmitError) {
<< ReductionIdRange << RefExpr->getSourceRange();
<< ParentSR
<< (IsParentBOK ? ParentBOKDSA.RefExpr
: ParentReductionOpDSA.RefExpr)
TaskgroupDescriptor = IsParentBOK ? ParentBOKTD : ParentReductionOpTD;
DeclRefExpr *Ref = nullptr;
Expr *VarsExpr = RefExpr->IgnoreParens();
if (!VD && !S.CurContext->isDependentContext()) {
if (ASE || OASE) {
TransformExprToCaptures RebuildToCapture(S, D);
VarsExpr =
Ref = RebuildToCapture.getCapturedExpr();
} else {
VarsExpr = Ref = buildCapture(S, D, SimpleRefExpr, /*WithInit=*/false);
if (!S.isOpenMPCapturedDecl(D)) {
if (Ref->getDecl()->hasAttr<OMPCaptureNoInitAttr>()) {
ExprResult RefRes = S.DefaultLvalueConversion(Ref);
if (!RefRes.isUsable())
ExprResult PostUpdateRes =
S.BuildBinOp(Stack->getCurScope(), ELoc, BO_Assign, SimpleRefExpr,
if (!PostUpdateRes.isUsable())
if (isOpenMPTaskingDirective(Stack->getCurrentDirective()) ||
Stack->getCurrentDirective() == OMPD_taskgroup) {
<< RefExpr->getSourceRange();
// All reduction items are still marked as reduction (to do not increase
// code base size).
unsigned Modifier = RD.RedModifier;
// Consider task_reductions as reductions with task modifier. Required for
// correct analysis of in_reduction clauses.
if (CurrDir == OMPD_taskgroup && ClauseKind == OMPC_task_reduction)
Modifier = OMPC_REDUCTION_task;
Stack->addDSA(D, RefExpr->IgnoreParens(), OMPC_reduction, Ref, Modifier);
if (Modifier == OMPC_REDUCTION_task &&
(CurrDir == OMPD_taskgroup ||
((isOpenMPParallelDirective(CurrDir) ||
isOpenMPWorksharingDirective(CurrDir)) &&
!isOpenMPSimdDirective(CurrDir)))) {
if (DeclareReductionRef.isUsable())
Stack->addTaskgroupReductionData(D, ReductionIdRange,
Stack->addTaskgroupReductionData(D, ReductionIdRange, BOK);
RD.push(VarsExpr, PrivateDRE, LHSDRE, RHSDRE, ReductionOp.get(),
TaskgroupDescriptor, CopyOpRes.get(), TempArrayRes.get(),
return RD.Vars.empty();
/// Handles a parsed OpenMP 'reduction' clause.
///
/// \param VarList List items named in the clause.
/// \param Modifier Reduction modifier. OMPC_REDUCTION_unknown combined with a
/// valid \p ModifierLoc is diagnosed as an unexpected clause value.
/// \param UnresolvedReductions Forwarded unchanged to
/// actOnOMPReductionKindClause.
/// \returns The newly created OMPReductionClause, or nullptr when the modifier
/// is rejected or actOnOMPReductionKindClause returns true.
OMPClause *Sema::ActOnOpenMPReductionClause(
ArrayRef<Expr *> VarList, OpenMPReductionClauseModifier Modifier,
SourceLocation StartLoc, SourceLocation LParenLoc,
SourceLocation ModifierLoc, SourceLocation ColonLoc, SourceLocation EndLoc,
CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId,
ArrayRef<Expr *> UnresolvedReductions) {
// A modifier was written (its location is valid) but it is not a known one.
if (ModifierLoc.isValid() && Modifier == OMPC_REDUCTION_unknown) {
Diag(LParenLoc, diag::err_omp_unexpected_clause_value)
<< getListOfPossibleValues(OMPC_reduction, /*First=*/0,
<< getOpenMPClauseName(OMPC_reduction);
return nullptr;
// OpenMP 5.0, reduction Clause, Restrictions
// A reduction clause with the inscan reduction-modifier may only appear on a
// worksharing-loop construct, a worksharing-loop SIMD construct, a simd
// construct, a parallel worksharing-loop construct or a parallel
// worksharing-loop SIMD construct.
// The check below rejects 'inscan' on any current directive other than for,
// for simd, simd, parallel for and parallel for simd.
if (Modifier == OMPC_REDUCTION_inscan &&
(DSAStack->getCurrentDirective() != OMPD_for &&
DSAStack->getCurrentDirective() != OMPD_for_simd &&
DSAStack->getCurrentDirective() != OMPD_simd &&
DSAStack->getCurrentDirective() != OMPD_parallel_for &&
DSAStack->getCurrentDirective() != OMPD_parallel_for_simd)) {
Diag(ModifierLoc, diag::err_omp_wrong_inscan_reduction);
return nullptr;
// RD accumulates the per-item expressions that are handed to the clause
// factory below.
ReductionData RD(VarList.size(), Modifier);
// A true result from the shared reduction analysis means no clause is built.
if (actOnOMPReductionKindClause(*this, DSAStack, OMPC_reduction, VarList,
StartLoc, LParenLoc, ColonLoc, EndLoc,
ReductionIdScopeSpec, ReductionId,
UnresolvedReductions, RD))
return nullptr;
return OMPReductionClause::Create(
Context, StartLoc, LParenLoc, ModifierLoc, ColonLoc, EndLoc, Modifier,
RD.Vars, ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId,
RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps, RD.InscanCopyOps,
RD.InscanCopyArrayTemps, RD.InscanCopyArrayElems,
buildPreInits(Context, RD.ExprCaptures),
buildPostUpdate(*this, RD.ExprPostUpdates));
/// Handles a parsed OpenMP 'task_reduction' clause.
///
/// Runs the shared reduction analysis (actOnOMPReductionKindClause) with
/// OMPC_task_reduction and, unless that analysis returns true, builds an
/// OMPTaskReductionClause from the collected ReductionData.
///
/// \returns The new clause, or nullptr when the analysis returns true.
OMPClause *Sema::ActOnOpenMPTaskReductionClause(
ArrayRef<Expr *> VarList, SourceLocation StartLoc, SourceLocation LParenLoc,
SourceLocation ColonLoc, SourceLocation EndLoc,
CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId,
ArrayRef<Expr *> UnresolvedReductions) {
// No modifier is passed here, unlike the plain 'reduction' clause handler.
ReductionData RD(VarList.size());
if (actOnOMPReductionKindClause(*this, DSAStack, OMPC_task_reduction, VarList,
StartLoc, LParenLoc, ColonLoc, EndLoc,
ReductionIdScopeSpec, ReductionId,
UnresolvedReductions, RD))
return nullptr;
return OMPTaskReductionClause::Create(
Context, StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars,
ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId,
RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps,
buildPreInits(Context, RD.ExprCaptures),
buildPostUpdate(*this, RD.ExprPostUpdates));
/// Handles a parsed OpenMP 'in_reduction' clause.
///
/// Runs the shared reduction analysis (actOnOMPReductionKindClause) with
/// OMPC_in_reduction and, unless that analysis returns true, builds an
/// OMPInReductionClause. In addition to the expressions passed for
/// 'task_reduction', the collected RD.TaskgroupDescriptors are handed to the
/// clause.
///
/// \returns The new clause, or nullptr when the analysis returns true.
OMPClause *Sema::ActOnOpenMPInReductionClause(
ArrayRef<Expr *> VarList, SourceLocation StartLoc, SourceLocation LParenLoc,
SourceLocation ColonLoc, SourceLocation EndLoc,
CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId,
ArrayRef<Expr *> UnresolvedReductions) {
ReductionData RD(VarList.size());
if (actOnOMPReductionKindClause(*this, DSAStack, OMPC_in_reduction, VarList,
StartLoc, LParenLoc, ColonLoc, EndLoc,
ReductionIdScopeSpec, ReductionId,
UnresolvedReductions, RD))
return nullptr;
return OMPInReductionClause::Create(
Context, StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars,
ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId,
RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps, RD.TaskgroupDescriptors,
buildPreInits(Context, RD.ExprCaptures),
buildPostUpdate(*this, RD.ExprPostUpdates));
/// Checks that a 'linear' clause modifier is acceptable.
///
/// The modifier is rejected when it is OMPC_LINEAR_unknown, or when the
/// language is not C++ and the modifier is anything other than 'val'.
///
/// \param LinKind Modifier to check.
/// \param LinLoc Location used for the diagnostic.
/// \returns true if the modifier was rejected (a diagnostic has been emitted),
/// false otherwise.
bool Sema::CheckOpenMPLinearModifier(OpenMPLinearClauseKind LinKind,
SourceLocation LinLoc) {
if ((!LangOpts.CPlusPlus && LinKind != OMPC_LINEAR_val) ||
LinKind == OMPC_LINEAR_unknown) {
// The C++ flag is streamed into the diagnostic as its argument.
Diag(LinLoc, diag::err_omp_wrong_linear_modifier) << LangOpts.CPlusPlus;
return true;
return false;
/// Checks the type of a list item used in a 'linear' clause.
///
/// \param D Declaration of the list item; may be null (it is only used for
/// the const-qualification check and for the trailing note).
/// \param ELoc Location used for diagnostics.
/// \param LinKind Linear modifier in effect.
/// \param Type Type of the list item.
/// \param IsDeclareSimd When true, the const-qualification restriction is not
/// applied.
/// \returns true if a restriction is violated, false otherwise.
bool Sema::CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc,
OpenMPLinearClauseKind LinKind, QualType Type,
bool IsDeclareSimd) {
// D is allowed to be null here, hence dyn_cast_or_null.
const auto *VD = dyn_cast_or_null<VarDecl>(D);
// A variable must not have an incomplete type or a reference type.
if (RequireCompleteType(ELoc, Type, diag::err_omp_linear_incomplete_type))
return true;
// The 'uval' and 'ref' modifiers are only accepted on reference types.
if ((LinKind == OMPC_LINEAR_uval || LinKind == OMPC_LINEAR_ref) &&
!Type->isReferenceType()) {
Diag(ELoc, diag::err_omp_wrong_linear_modifier_non_reference)
<< Type << getOpenMPSimpleClauseTypeName(OMPC_linear, LinKind);
return true;
// The remaining checks look through the reference.
Type = Type.getNonReferenceType();
// OpenMP 5.0 [2.19.3, List Item Privatization, Restrictions]
// A variable that is privatized must not have a const-qualified type
// unless it is of class type with a mutable member. This restriction does
// not apply to the firstprivate clause, nor to the linear clause on
// declarative directives (like declare simd).
if (!IsDeclareSimd &&
rejectConstNotMutableType(*this, D, Type, OMPC_linear, ELoc))
return true;
// A list item must be of integral or pointer type.
// Dependent types and the 'ref' modifier are exempt from this check; a null
// type pointer is always rejected.
Type = Type.getUnqualifiedType().getCanonicalType();
const auto *Ty = Type.getTypePtrOrNull();
if (!Ty || (LinKind != OMPC_LINEAR_ref && !Ty->isDependentType() &&
!Ty->isIntegralType(Context) && !Ty->isPointerType())) {
Diag(ELoc, diag::err_omp_linear_expected_int_or_ptr) << Type;
if (D) {
// Choose the note wording: "previous declaration" when D is not a VarDecl
// or is only a declaration, "defined here" otherwise.
bool IsDecl =
!VD ||
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
return true;
return false;
/// Handles a parsed OpenMP 'linear' clause.
///
/// For each list item this builds a private copy and a ".linear.start"
/// variable holding the initial value, and records the item as OMPC_linear on
/// the DSA stack. If a non-dependent step expression is given, it is converted
/// to an integer, saved into a ".linear.step" variable, and a zero constant
/// step is warned about.
///
/// \param VarList List items named in the clause.
/// \param Step Optional linear-step expression (may be null).
/// \param LinKind Linear modifier; replaced by 'val' if it is rejected.
/// \returns The new OMPLinearClause, or nullptr if no item was accepted or the
/// step conversion failed.
OMPClause *Sema::ActOnOpenMPLinearClause(
ArrayRef<Expr *> VarList, Expr *Step, SourceLocation StartLoc,
SourceLocation LParenLoc, OpenMPLinearClauseKind LinKind,
SourceLocation LinLoc, SourceLocation ColonLoc, SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
SmallVector<Expr *, 8> Privates;
SmallVector<Expr *, 8> Inits;
SmallVector<Decl *, 4> ExprCaptures;
SmallVector<Expr *, 4> ExprPostUpdates;
// A rejected modifier has already been diagnosed by the check; continue the
// analysis as if 'val' had been written.
if (CheckOpenMPLinearModifier(LinKind, LinLoc))
LinKind = OMPC_LINEAR_val;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP linear clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
QualType Type = D->getType();
// VD is null when the item is not a VarDecl.
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [, linear clause]
// A list-item cannot appear in more than one linear clause.
// A list-item that appears in a linear clause cannot appear in any
// other data-sharing attribute clause.
DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, /*FromParent=*/false);
if (DVar.RefExpr) {
Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_linear);
reportOriginalDsa(*this, DSAStack, D, DVar);
if (CheckOpenMPLinearDecl(D, ELoc, LinKind, Type))
// From here on work with the canonical, unqualified, non-reference type.
Type = Type.getNonReferenceType().getUnqualifiedType().getCanonicalType();
// Build private copy of original var.
VarDecl *Private =
buildVarDecl(*this, ELoc, Type, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr,
VD ? cast<DeclRefExpr>(SimpleRefExpr) : nullptr);
DeclRefExpr *PrivateRef = buildDeclRefExpr(*this, Private, Type, ELoc);
// Build var to save initial value.
VarDecl *Init = buildVarDecl(*this, ELoc, Type, ".linear.start");
Expr *InitExpr;
DeclRefExpr *Ref = nullptr;
// Items that are not VarDecls are accessed through a capture outside of
// dependent contexts.
if (!VD && !CurContext->isDependentContext()) {
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false);
if (!isOpenMPCapturedDecl(D)) {
if (Ref->getDecl()->hasAttr<OMPCaptureNoInitAttr>()) {
// Build "SimpleRefExpr = Ref" to copy the captured value back.
ExprResult RefRes = DefaultLvalueConversion(Ref);
if (!RefRes.isUsable())
ExprResult PostUpdateRes =
BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign,
SimpleRefExpr, RefRes.get());
if (!PostUpdateRes.isUsable())
// Select the expression used to initialize ".linear.start".
if (LinKind == OMPC_LINEAR_uval)
InitExpr = VD ? VD->getInit() : SimpleRefExpr;
InitExpr = VD ? SimpleRefExpr : Ref;
AddInitializerToDecl(Init, DefaultLvalueConversion(InitExpr).get(),
DeclRefExpr *InitRef = buildDeclRefExpr(*this, Init, Type, ELoc);
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_linear, Ref);
// Keep the original expression for VarDecls and in dependent contexts;
// otherwise refer to the capture.
Vars.push_back((VD || CurContext->isDependentContext())
? RefExpr->IgnoreParens()
: Ref);
// No list item was accepted: nothing to build.
if (Vars.empty())
return nullptr;
Expr *StepExpr = Step;
Expr *CalcStepExpr = nullptr;
// The step is only processed here when it is present and not dependent in
// any way.
if (Step && !Step->isValueDependent() && !Step->isTypeDependent() &&
!Step->isInstantiationDependent() &&
!Step->containsUnexpandedParameterPack()) {
SourceLocation StepLoc = Step->getBeginLoc();
ExprResult Val = PerformOpenMPImplicitIntegerConversion(StepLoc, Step);
if (Val.isInvalid())
return nullptr;
StepExpr = Val.get();
// Build var to save the step value.
VarDecl *SaveVar =
buildVarDecl(*this, StepLoc, StepExpr->getType(), ".linear.step");
ExprResult SaveRef =
buildDeclRefExpr(*this, SaveVar, StepExpr->getType(), StepLoc);
// CalcStep is the assignment ".linear.step = StepExpr".
ExprResult CalcStep =
BuildBinOp(CurScope, StepLoc, BO_Assign, SaveRef.get(), StepExpr);
CalcStep = ActOnFinishFullExpr(CalcStep.get(), /*DiscardedValue*/ false);
// Warn about zero linear step (it would be probably better specified as
// making corresponding variables 'const').
llvm::APSInt Result;
bool IsConstant = StepExpr->isIntegerConstantExpr(Result, Context);
// Neither negative nor strictly positive, i.e. the constant step is zero.
if (IsConstant && !Result.isNegative() && !Result.isStrictlyPositive())
Diag(StepLoc, diag::warn_omp_linear_step_zero) << Vars[0]
<< (Vars.size() > 1);
if (!IsConstant && CalcStep.isUsable()) {
// Calculate the step beforehand instead of doing this on each iteration.
// (This is not used if the number of iterations may be kfold-ed).
CalcStepExpr = CalcStep.get();
return OMPLinearClause::Create(Context, StartLoc, LParenLoc, LinKind, LinLoc,
ColonLoc, EndLoc, Vars, Privates, Inits,
StepExpr, CalcStepExpr,
buildPreInits(Context, ExprCaptures),
buildPostUpdate(*this, ExprPostUpdates));
/// Builds the per-variable update and final expressions of a 'linear' clause.
///
/// \param Clause Clause whose variables, privates, inits and step are read.
/// \param IV Expression passed to buildCounterUpdate as the multiplier for the
/// update expression ("Var = InitExpr + IV * Step").
/// \param NumIterations Multiplier used for the final expression.
/// \param SemaRef Semantic analyzer used to build expressions.
/// \param S Scope passed to buildCounterUpdate.
/// \param Stack Data-sharing attribute stack queried for the current directive
/// and loop control variables.
/// \returns true if an error was found for any list item, false otherwise.
static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV,
Expr *NumIterations, Sema &SemaRef,
Scope *S, DSAStackTy *Stack) {
// Walk the vars and build update/final expressions for the CodeGen.
SmallVector<Expr *, 8> Updates;
SmallVector<Expr *, 8> Finals;
SmallVector<Expr *, 8> UsedExprs;
Expr *Step = Clause.getStep();
Expr *CalcStep = Clause.getCalcStep();
// OpenMP [, linear clause]
// If linear-step is not specified it is assumed to be 1.
if (!Step)
Step = SemaRef.ActOnIntegerConstant(SourceLocation(), 1).get();
else if (CalcStep)
// CalcStep is cast to BinaryOperator and its left-hand side is used as the
// step (presumably the variable the precomputed step was stored in).
Step = cast<BinaryOperator>(CalcStep)->getLHS();
bool HasErrors = false;
// Inits and privates are read through these iterators alongside the
// variable list.
auto CurInit = Clause.inits().begin();
auto CurPrivate = Clause.privates().begin();
OpenMPLinearClauseKind LinKind = Clause.getModifier();
for (Expr *RefExpr : Clause.varlists()) {
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange);
ValueDecl *D = Res.first;
if (Res.second || !D) {
HasErrors = true;
// Info.first is tested below to tell loop control variables apart from
// other list items.
auto &&Info = Stack->isLoopControlVariable(D);
// OpenMP [2.15.11, distribute simd Construct]
// A list item may not appear in a linear clause, unless it is the loop
// iteration variable.
if (isOpenMPDistributeDirective(Stack->getCurrentDirective()) &&
isOpenMPSimdDirective(Stack->getCurrentDirective()) && !Info.first) {
HasErrors = true;
Expr *InitExpr = *CurInit;
// Build privatized reference to the current linear var.
auto *DE = cast<DeclRefExpr>(SimpleRefExpr);
Expr *CapturedRef;
if (LinKind == OMPC_LINEAR_uval)
CapturedRef = cast<VarDecl>(DE->getDecl())->getInit();
CapturedRef =
buildDeclRefExpr(SemaRef, cast<VarDecl>(DE->getDecl()),
DE->getType().getUnqualifiedType(), DE->getExprLoc(),
// Build update: Var = InitExpr + IV * Step
ExprResult Update;
if (!Info.first)
Update = buildCounterUpdate(
SemaRef, S, RefExpr->getExprLoc(), *CurPrivate, InitExpr, IV, Step,
/*Subtract=*/false, /*IsNonRectangularLB=*/false);
Update = *CurPrivate;
Update = SemaRef.ActOnFinishFullExpr(Update.get(), DE->getBeginLoc(),
/*DiscardedValue*/ false);
// Build final: Var = InitExpr + NumIterations * Step
ExprResult Final;
if (!Info.first)
Final =
buildCounterUpdate(SemaRef, S, RefExpr->getExprLoc(), CapturedRef,
InitExpr, NumIterations, Step, /*Subtract=*/false,
Final = *CurPrivate;
Final = SemaRef.ActOnFinishFullExpr(Final.get(), DE->getBeginLoc(),
/*DiscardedValue*/ false);
if (!Update.isUsable() || !Final.isUsable()) {
HasErrors = true;
} else {
if (!Info.first)
// NOTE(review): this local 'S' shadows the 'Scope *S' parameter.
if (Expr *S = Clause.getStep())
// Fill the remaining part with the nullptr.
// Pads UsedExprs up to varlist_size() + 1 entries.
UsedExprs.append(Clause.varlist_size() + 1 - UsedExprs.size(), nullptr);
return HasErrors;
/// Handles a parsed OpenMP 'aligned' clause.
///
/// Each list item must have array or pointer type (looking through
/// references) and may appear in at most one 'aligned' clause. The optional
/// alignment must be a positive integer constant.
///
/// \param VarList List items named in the clause.
/// \param Alignment Optional alignment expression (may be null).
/// \returns The new OMPAlignedClause, or nullptr if the alignment is invalid
/// or no list item was accepted.
OMPClause *Sema::ActOnOpenMPAlignedClause(
ArrayRef<Expr *> VarList, Expr *Alignment, SourceLocation StartLoc,
SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
// NOTE(review): the message names the 'linear' clause although this is the
// 'aligned' clause handler; looks like a copy-paste leftover.
assert(RefExpr && "NULL expr in OpenMP linear clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
QualType QType = D->getType();
// VD is null when the item is not a VarDecl.
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [2.8.1, simd construct, Restrictions]
// The type of list items appearing in the aligned clause must be
// array, pointer, reference to array, or reference to pointer.
QType = QType.getNonReferenceType().getUnqualifiedType().getCanonicalType();
const Type *Ty = QType.getTypePtrOrNull();
if (!Ty || (!Ty->isArrayType() && !Ty->isPointerType())) {
Diag(ELoc, diag::err_omp_aligned_expected_array_or_ptr)
<< QType << getLangOpts().CPlusPlus << ERange;
// Choose the note wording: "previous declaration" when D is not a VarDecl
// or is only a declaration, "defined here" otherwise.
bool IsDecl =
!VD ||
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
// OpenMP [2.8.1, simd construct, Restrictions]
// A list-item cannot appear in more than one aligned clause.
// A non-null result is the earlier reference to the same item.
if (const Expr *PrevRef = DSAStack->addUniqueAligned(D, SimpleRefExpr)) {
Diag(ELoc, diag::err_omp_used_in_clause_twice)
<< 0 << getOpenMPClauseName(OMPC_aligned) << ERange;
Diag(PrevRef->getExprLoc(), diag::note_omp_explicit_dsa)
<< getOpenMPClauseName(OMPC_aligned);
DeclRefExpr *Ref = nullptr;
// Only non-VarDecl items that are OpenMP-captured get a capture reference.
if (!VD && isOpenMPCapturedDecl(D))
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
(VD || !Ref) ? RefExpr->IgnoreParens() : Ref)
// OpenMP [2.8.1, simd construct, Description]
// The parameter of the aligned clause, alignment, must be a constant
// positive integer expression.
// If no optional parameter is specified, implementation-defined default
// alignments for SIMD instructions on the target platforms are assumed.
if (Alignment != nullptr) {
ExprResult AlignResult =
VerifyPositiveIntegerConstantInClause(Alignment, OMPC_aligned);
if (AlignResult.isInvalid())
return nullptr;
// Continue with the verified expression.
Alignment = AlignResult.get();
if (Vars.empty())
return nullptr;
return OMPAlignedClause::Create(Context, StartLoc, LParenLoc, ColonLoc,
EndLoc, Vars, Alignment);
/// Handles a parsed OpenMP 'copyin' clause.
///
/// Each list item must be a reference to a threadprivate variable. For every
/// accepted item a pseudo source variable (".copyin.src"), a pseudo
/// destination variable (".copyin.dst") and an assignment between them are
/// built on the element type, and the variable is recorded as OMPC_copyin on
/// the DSA stack.
///
/// \param VarList List items named in the clause.
/// \returns The new OMPCopyinClause, or nullptr if no item was accepted.
OMPClause *Sema::ActOnOpenMPCopyinClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
SmallVector<Expr *, 8> SrcExprs;
SmallVector<Expr *, 8> DstExprs;
SmallVector<Expr *, 8> AssignmentOps;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP copyin clause.");
if (isa<DependentScopeDeclRefExpr>(RefExpr)) {
// It will be analyzed later.
SourceLocation ELoc = RefExpr->getExprLoc();
// OpenMP [2.1, C/C++]
// A list item is a variable name.
// OpenMP [, Restrictions, p.1]
// A list item that appears in a copyin clause must be threadprivate.
// Anything other than a DeclRefExpr naming a VarDecl is diagnosed.
auto *DE = dyn_cast<DeclRefExpr>(RefExpr);
if (!DE || !isa<VarDecl>(DE->getDecl())) {
Diag(ELoc, diag::err_omp_expected_var_name_member_expr)
<< 0 << RefExpr->getSourceRange();
Decl *D = DE->getDecl();
auto *VD = cast<VarDecl>(D);
QualType Type = VD->getType();
if (Type->isDependentType() || Type->isInstantiationDependentType()) {
// It will be analyzed later.
// OpenMP [, Restrictions, C/C++, p.1]
// A list item that appears in a copyin clause must be threadprivate.
if (!DSAStack->isThreadPrivate(VD)) {
Diag(ELoc, diag::err_omp_required_access)
<< getOpenMPClauseName(OMPC_copyin)
<< getOpenMPDirectiveName(OMPD_threadprivate);
// OpenMP [, Restrictions, C/C++, p.2]
// A variable of class type (or array thereof) that appears in a
// copyin clause requires an accessible, unambiguous copy assignment
// operator for the class type.
// The pseudo variables below use the base element type of the item; the
// source additionally drops qualifiers.
QualType ElemType = Context.getBaseElementType(Type).getNonReferenceType();
VarDecl *SrcVD =
buildVarDecl(*this, DE->getBeginLoc(), ElemType.getUnqualifiedType(),
".copyin.src", VD->hasAttrs() ? &VD->getAttrs() : nullptr);
DeclRefExpr *PseudoSrcExpr = buildDeclRefExpr(
*this, SrcVD, ElemType.getUnqualifiedType(), DE->getExprLoc());
VarDecl *DstVD =
buildVarDecl(*this, DE->getBeginLoc(), ElemType, ".copyin.dst",
VD->hasAttrs() ? &VD->getAttrs() : nullptr);
DeclRefExpr *PseudoDstExpr =
buildDeclRefExpr(*this, DstVD, ElemType, DE->getExprLoc());
// For arrays generate assignment operation for single element and replace
// it by the original array element in CodeGen.
ExprResult AssignmentOp =
BuildBinOp(/*S=*/nullptr, DE->getExprLoc(), BO_Assign, PseudoDstExpr,
if (AssignmentOp.isInvalid())
AssignmentOp = ActOnFinishFullExpr(AssignmentOp.get(), DE->getExprLoc(),
/*DiscardedValue*/ false);
if (AssignmentOp.isInvalid())
DSAStack->addDSA(VD, DE, OMPC_copyin);
// No list item was accepted: nothing to build.
if (Vars.empty())
return nullptr;
return OMPCopyinClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars,
SrcExprs, DstExprs, AssignmentOps);
/// Handles a parsed OpenMP 'copyprivate' clause.
///
/// Items that are not threadprivate variables are checked against their
/// explicit and implicit data-sharing attributes. Variably modified
/// non-pointer types are rejected. For every accepted item a pseudo source
/// variable (".copyprivate.src"), a pseudo destination variable
/// (".copyprivate.dst") and an assignment between them are built.
///
/// \param VarList List items named in the clause.
/// \returns The new OMPCopyprivateClause, or nullptr if no item was accepted.
OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
SmallVector<Expr *, 8> SrcExprs;
SmallVector<Expr *, 8> DstExprs;
SmallVector<Expr *, 8> AssignmentOps;
for (Expr *RefExpr : VarList) {
// NOTE(review): the message names the 'linear' clause although this is the
// 'copyprivate' clause handler; looks like a copy-paste leftover.
assert(RefExpr && "NULL expr in OpenMP linear clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
QualType Type = D->getType();
// VD is null when the item is not a VarDecl.
auto *VD = dyn_cast<VarDecl>(D);
// OpenMP [, Restrictions, p.2]
// A list item that appears in a copyprivate clause may not appear in a
// private or firstprivate clause on the single construct.
// Threadprivate variables bypass the data-sharing checks below.
if (!VD || !DSAStack->isThreadPrivate(VD)) {
DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(D, /*FromParent=*/false);
if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_copyprivate &&
DVar.RefExpr) {
Diag(ELoc, diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_copyprivate);
reportOriginalDsa(*this, DSAStack, D, DVar);
// OpenMP [, Restrictions, p.1]
// All list items that appear in a copyprivate clause must be either
// threadprivate or private in the enclosing context.
// With no explicit attribute, fall back to the implicit one; implicitly
// shared items are diagnosed.
if (DVar.CKind == OMPC_unknown) {
DVar = DSAStack->getImplicitDSA(D, false);
if (DVar.CKind == OMPC_shared) {
Diag(ELoc, diag::err_omp_required_access)
<< getOpenMPClauseName(OMPC_copyprivate)
<< "threadprivate or private in the enclosing context";
reportOriginalDsa(*this, DSAStack, D, DVar);
// Variably modified types are not supported.
// Pointer types are exempt from this check.
if (!Type->isAnyPointerType() && Type->isVariablyModifiedType()) {
Diag(ELoc, diag::err_omp_variably_modified_type_not_supported)
<< getOpenMPClauseName(OMPC_copyprivate) << Type
<< getOpenMPDirectiveName(DSAStack->getCurrentDirective());
// Choose the note wording: "previous declaration" when D is not a VarDecl
// or is only a declaration, "defined here" otherwise.
bool IsDecl =
!VD ||
VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly;
IsDecl ? diag::note_previous_decl : diag::note_defined_here)
<< D;
// OpenMP [, Restrictions, C/C++, p.2]
// A variable of class type (or array thereof) that appears in a
// copyprivate clause requires an accessible, unambiguous copy assignment
// operator for the class type.
Type = Context.getBaseElementType(Type.getNonReferenceType())
VarDecl *SrcVD =
buildVarDecl(*this, RefExpr->getBeginLoc(), Type, ".copyprivate.src",
D->hasAttrs() ? &D->getAttrs() : nullptr);
DeclRefExpr *PseudoSrcExpr = buildDeclRefExpr(*this, SrcVD, Type, ELoc);
VarDecl *DstVD =
buildVarDecl(*this, RefExpr->getBeginLoc(), Type, ".copyprivate.dst",
D->hasAttrs() ? &D->getAttrs() : nullptr);
DeclRefExpr *PseudoDstExpr = buildDeclRefExpr(*this, DstVD, Type, ELoc);
// Build the assignment ".copyprivate.dst = .copyprivate.src".
ExprResult AssignmentOp = BuildBinOp(
DSAStack->getCurScope(), ELoc, BO_Assign, PseudoDstExpr, PseudoSrcExpr);
if (AssignmentOp.isInvalid())
AssignmentOp =
ActOnFinishFullExpr(AssignmentOp.get(), ELoc, /*DiscardedValue*/ false);
if (AssignmentOp.isInvalid())
// No need to mark vars as copyprivate, they are already threadprivate or
// implicitly private.
assert(VD || isOpenMPCapturedDecl(D));
VD ? RefExpr->IgnoreParens()
: buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false));
// No list item was accepted: nothing to build.
if (Vars.empty())
return nullptr;
return OMPCopyprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc,
Vars, SrcExprs, DstExprs, AssignmentOps);
/// Handles the list of a parsed OpenMP 'flush' directive.
///
/// No per-item analysis is performed here; the list is passed to the clause
/// unchanged.
///
/// \param VarList List items to flush.
/// \returns The new OMPFlushClause, or nullptr if \p VarList is empty.
OMPClause *Sema::ActOnOpenMPFlushClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (VarList.empty())
return nullptr;
return OMPFlushClause::Create(Context, StartLoc, LParenLoc, EndLoc, VarList);
/// Tries to find the omp_depend_t type.
///
/// \param S Semantic analyzer used for the name lookup and diagnostics.
/// \param Loc Location used for the lookup and for the diagnostic.
/// \param Stack DSA stack queried for an already known omp_depend_t type.
/// \param Diagnose If true, emit err_omp_implied_type_not_found when the type
/// cannot be found.
/// \returns true if the type is known or was found, false otherwise.
static bool findOMPDependT(Sema &S, SourceLocation Loc, DSAStackTy *Stack,
bool Diagnose = true) {
// Use the type recorded on the stack if there is one.
QualType OMPDependT = Stack->getOMPDependT();
if (!OMPDependT.isNull())
return true;
// Otherwise look the name up as a type in the current scope.
IdentifierInfo *II = &S.PP.getIdentifierTable().get("omp_depend_t");
ParsedType PT = S.getTypeName(*II, Loc, S.getCurScope());
if (!PT.getAsOpaquePtr() || PT.get().isNull()) {
if (Diagnose)
S.Diag(Loc, diag::err_omp_implied_type_not_found) << "omp_depend_t";
return false;
return true;
/// Handles the depobj argument of an OpenMP 'depobj' directive.
///
/// Diagnoses a non-dependent expression whose type is not compatible with
/// omp_depend_t (when that type could be found) and an expression that is not
/// an lvalue.
///
/// \param Depobj The depobj expression (may be null).
/// \returns nullptr if \p Depobj is null; otherwise the new OMPDepobjClause.
OMPClause *Sema::ActOnOpenMPDepobjClause(Expr *Depobj, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (!Depobj)
return nullptr;
// Called with the default Diagnose argument, so a missing omp_depend_t is
// reported by the helper.
bool OMPDependTFound = findOMPDependT(*this, StartLoc, DSAStack);
// OpenMP 5.0, depobj Construct
// depobj is an lvalue expression of type omp_depend_t.
// The type comparison is skipped for dependent expressions and when
// omp_depend_t was not found.
if (!Depobj->isTypeDependent() && !Depobj->isValueDependent() &&
!Depobj->isInstantiationDependent() &&
!Depobj->containsUnexpandedParameterPack() &&
(OMPDependTFound &&
!Context.typesAreCompatible(DSAStack->getOMPDependT(), Depobj->getType(),
/*CompareUnqualified=*/true))) {
Diag(Depobj->getExprLoc(), diag::err_omp_expected_omp_depend_t_lvalue)
<< 0 << Depobj->getType() << Depobj->getSourceRange();
if (!Depobj->isLValue()) {
Diag(Depobj->getExprLoc(), diag::err_omp_expected_omp_depend_t_lvalue)
<< 1 << Depobj->getSourceRange();
return OMPDepobjClause::Create(Context, StartLoc, LParenLoc, EndLoc, Depobj);
OMPClause *
Sema::ActOnOpenMPDependClause(Expr *DepModifier, OpenMPDependClauseKind DepKind,
SourceLocation DepLoc, SourceLocation ColonLoc,
ArrayRef<Expr *> VarList, SourceLocation StartLoc,
SourceLocation LParenLoc, SourceLocation EndLoc) {
if (DSAStack->getCurrentDirective() == OMPD_ordered &&
DepKind != OMPC_DEPEND_source && DepKind != OMPC_DEPEND_sink) {
Diag(DepLoc, diag::err_omp_unexpected_clause_value)
<< "'source' or 'sink'" << getOpenMPClauseName(OMPC_depend);
return nullptr;
if ((DSAStack->getCurrentDirective() != OMPD_ordered ||
DSAStack->getCurrentDirective() == OMPD_depobj) &&
(DepKind == OMPC_DEPEND_unknown || DepKind == OMPC_DEPEND_source ||
DepKind == OMPC_DEPEND_sink ||
((LangOpts.OpenMP < 50 ||
DSAStack->getCurrentDirective() == OMPD_depobj) &&
DepKind == OMPC_DEPEND_depobj))) {
SmallVector<unsigned, 3> Except;
if (LangOpts.OpenMP < 50 || DSAStack->getCurrentDirective() == OMPD_depobj)
std::string Expected = (LangOpts.OpenMP >= 50 && !DepModifier)
? "depend modifier(iterator) or "
: "";
Diag(DepLoc, diag::err_omp_unexpected_clause_value)
<< Expected + getListOfPossibleValues(OMPC_depend, /*First=*/0,
<< getOpenMPClauseName(OMPC_depend);
return nullptr;
if (DepModifier &&
(DepKind == OMPC_DEPEND_source || DepKind == OMPC_DEPEND_sink)) {
return nullptr;
if (DepModifier &&
Diag(DepModifier->getExprLoc(), diag::err_omp_depend_modifier_not_iterator);
SmallVector<Expr *, 8> Vars;
DSAStackTy::OperatorOffsetTy OpsOffs;
llvm::APSInt DepCounter(/*BitWidth=*/32);
llvm::APSInt TotalDepCount(/*BitWidth=*/32);
if (DepKind == OMPC_DEPEND_sink || DepKind == OMPC_DEPEND_source) {
if (const Expr *OrderedCountExpr =
DSAStack->getParentOrderedRegionParam().first) {
TotalDepCount = OrderedCountExpr->EvaluateKnownConstInt(Context);
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP shared clause.");
if (isa<DependentScopeDeclRefExpr>(RefExpr)) {
// It will be analyzed later.
SourceLocation ELoc = RefExpr->getExprLoc();
Expr *SimpleExpr = RefExpr->IgnoreParenCasts();
if (DepKind == OMPC_DEPEND_sink) {
if (DSAStack->getParentOrderedRegionParam().first &&
DepCounter >= TotalDepCount) {
Diag(ELoc, diag::err_omp_depend_sink_unexpected_expr);
// OpenMP [2.13.9, Summary]
// depend(dependence-type : vec), where dependence-type is:
// 'sink' and where vec is the iteration vector, which has the form:
// x1 [+- d1], x2 [+- d2 ], . . . , xn [+- dn]
// where n is the value specified by the ordered clause in the loop
// directive, xi denotes the loop iteration variable of the i-th nested
// loop associated with the loop directive, and di is a constant
// non-negative integer.
if (CurContext->isDependentContext()) {
// It will be analyzed later.
SimpleExpr = SimpleExpr->IgnoreImplicit();
OverloadedOperatorKind OOK = OO_None;
SourceLocation OOLoc;
Expr *LHS = SimpleExpr;
Expr *RHS = nullptr;
if (auto *BO = dyn_cast<BinaryOperator>(SimpleExpr)) {
OOK = BinaryOperator::getOverloadedOperator(BO->getOpcode());
OOLoc = BO->getOperatorLoc();
LHS = BO->getLHS()->IgnoreParenImpCasts();
RHS = BO->getRHS()->IgnoreParenImpCasts();
} else if (auto *OCE = dyn_cast<CXXOperatorCallExpr>(SimpleExpr)) {
OOK = OCE->getOperator();
OOLoc = OCE->getOperatorLoc();
LHS = OCE->getArg(/*Arg=*/0)->IgnoreParenImpCasts();
RHS = OCE->getArg(/*Arg=*/1)->IgnoreParenImpCasts();
} else if (auto *MCE = dyn_cast<CXXMemberCallExpr>(SimpleExpr)) {
OOK = MCE->getMethodDecl()
OOLoc = MCE->getCallee()->getExprLoc();
LHS = MCE->getImplicitObjectArgument()->IgnoreParenImpCasts();
RHS = MCE->getArg(/*Arg=*/0)->IgnoreParenImpCasts();
SourceLocation ELoc;
SourceRange ERange;
auto Res = getPrivateItem(*this, LHS, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
if (OOK != OO_Plus && OOK != OO_Minus && (RHS || OOK != OO_None)) {
Diag(OOLoc, diag::err_omp_depend_sink_expected_plus_minus);
if (RHS) {
ExprResult RHSRes = VerifyPositiveIntegerConstantInClause(
RHS, OMPC_depend, /*StrictlyPositive=*/false);
if (RHSRes.isInvalid())
if (!CurContext->isDependentContext() &&
DSAStack->getParentOrderedRegionParam().first &&
DepCounter != DSAStack->isParentLoopControlVariable(D).first) {
const ValueDecl *VD =
if (VD)
Diag(ELoc, diag::err_omp_depend_sink_expected_loop_iteration)
<< 1 << VD;
Diag(ELoc, diag::err_omp_depend_sink_expected_loop_iteration) << 0;
OpsOffs.emplace_back(RHS, OOK);
} else {
bool OMPDependTFound = LangOpts.OpenMP >= 50;
if (OMPDependTFound)
OMPDependTFound = findOMPDependT(*this, StartLoc, DSAStack,
DepKind == OMPC_DEPEND_depobj);
if (DepKind == OMPC_DEPEND_depobj) {
// OpenMP 5.0, 2.17.11 depend Clause, Restrictions, C/C++
// List items used in depend clauses with the depobj dependence type
// must be expressions of the omp_depend_t type.
if (!RefExpr->isValueDependent() && !RefExpr->isTypeDependent() &&
!RefExpr->isInstantiationDependent() &&
!RefExpr->containsUnexpandedParameterPack() &&
(OMPDependTFound &&
RefExpr->getType()))) {
Diag(ELoc, diag::err_omp_expected_omp_depend_t_lvalue)
<< 0 << RefExpr->getType() << RefExpr->getSourceRange();
if (!RefExpr->isLValue()) {
Diag(ELoc, diag::err_omp_expected_omp_depend_t_lvalue)
<< 1 << RefExpr->getType() << RefExpr->getSourceRange();
} else {
// OpenMP 5.0 [2.17.11, Restrictions]
// List items used in depend clauses cannot be zero-length array
// sections.
QualType ExprTy = RefExpr->getType().getNonReferenceType();
const auto *OASE = dyn_cast<OMPArraySectionExpr>(SimpleExpr);
if (OASE) {
QualType BaseType =
if (const auto *ATy = BaseType->getAsArrayTypeUnsafe())
ExprTy = ATy->getElementType();
ExprTy = BaseType->getPointeeType();
ExprTy = ExprTy.getNonReferenceType();
const Expr *Length = OASE->getLength();
Expr::EvalResult Result;
if (Length && !Length->isValueDependent() &&
Length->EvaluateAsInt(Result, Context) &&
Result.Val.getInt().isNullValue()) {
<< SimpleExpr->getSourceRange();
// OpenMP 5.0, 2.17.11 depend Clause, Restrictions, C/C++
// List items used in depend clauses with the in, out, inout or
// mutexinoutset dependence types cannot be expressions of the
// omp_depend_t type.
if (!RefExpr->isValueDependent() && !RefExpr->isTypeDependent() &&
!RefExpr->isInstantiationDependent() &&
!RefExpr->containsUnexpandedParameterPack() &&
(OMPDependTFound &&
DSAStack->getOMPDependT().getTypePtr() == ExprTy.getTypePtr())) {
Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item)
<< (LangOpts.OpenMP >= 50 ? 1 : 0) << 1
<< RefExpr->getSourceRange();
auto *ASE = dyn_cast<ArraySubscriptExpr>(SimpleExpr);
if (!RefExpr->IgnoreParenImpCasts()->isLValue() ||
(ASE && !ASE->getBase()->isTypeDependent() &&
->isPointerType() &&
!ASE->getBase()->getType().getNonReferenceType()->isArrayType())) {
Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item)
<< (LangOpts.OpenMP >= 50 ? 1 : 0)
<< (LangOpts.OpenMP >= 50 ? 1 : 0) << RefExpr->getSourceRange();
ExprResult Res;
Sema::TentativeAnalysisScope Trap(*this);
Res = CreateBuiltinUnaryOp(ELoc, UO_AddrOf,
if (!Res.isUsable() && !isa<OMPArraySectionExpr>(SimpleExpr) &&
!isa<OMPArrayShapingExpr>(SimpleExpr)) {
Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item)
<< (LangOpts.OpenMP >= 50 ? 1 : 0)
<< (LangOpts.OpenMP >= 50 ? 1 : 0) << RefExpr->getSourceRange();
if (!CurContext->isDependentContext() && DepKind == OMPC_DEPEND_sink &&
TotalDepCount > VarList.size() &&
DSAStack->getParentOrderedRegionParam().first &&
DSAStack->getParentLoopControlVariable(VarList.size() + 1)) {
Diag(EndLoc, diag::err_omp_depend_sink_expected_loop_iteration)
<< 1 << DSAStack->getParentLoopControlVariable(VarList.size() + 1);
if (DepKind != OMPC_DEPEND_source && DepKind != OMPC_DEPEND_sink &&
return nullptr;
auto *C = OMPDependClause::Create(Context, StartLoc, LParenLoc, EndLoc,
DepModifier, DepKind, DepLoc, ColonLoc,
Vars, TotalDepCount.getZExtValue());
if ((DepKind == OMPC_DEPEND_sink || DepKind == OMPC_DEPEND_source) &&
DSAStack->addDoacrossDependClause(C, OpsOffs);
return C;
/// Semantic action for the OpenMP 'device' clause.
///
/// Checks the optional device modifier and the device expression and, when no
/// error was recorded, allocates an OMPDeviceClause in the AST context.
///
/// \param Modifier Device modifier given in the clause.
/// \param Device Expression designating the device.
/// \param StartLoc Start location forwarded to the clause node.
/// \param LParenLoc Location of the opening parenthesis.
/// \param ModifierLoc Location of the modifier; expected to be invalid for
/// OpenMP versions below 5.0 (see the assertion below).
/// \param EndLoc End location forwarded to the clause node.
/// \returns The new clause, or nullptr if an error was found.
OMPClause *Sema::ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier,
Expr *Device, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation ModifierLoc,
SourceLocation EndLoc) {
// A modifier location may only be present from OpenMP 5.0 on.
assert((ModifierLoc.isInvalid() || LangOpts.OpenMP >= 50) &&
"Unexpected device modifier in OpenMP < 50.");
bool ErrorFound = false;
// A modifier was written but not recognized: report it together with the
// list of values accepted for the 'device' clause.
if (ModifierLoc.isValid() && Modifier == OMPC_DEVICE_unknown) {
std::string Values =
getListOfPossibleValues(OMPC_device, /*First=*/0, OMPC_DEVICE_unknown);
Diag(ModifierLoc, diag::err_omp_unexpected_clause_value)
<< Values << getOpenMPClauseName(OMPC_device);
ErrorFound = true;
Expr *ValExpr = Device;
Stmt *HelperValStmt = nullptr;
// OpenMP [2.9.1, Restrictions]
// The device expression must evaluate to a non-negative integer value.
ErrorFound = !isNonNegativeIntegerValue(ValExpr, *this, OMPC_device,
/*StrictlyPositive=*/false) ||
if (ErrorFound)
return nullptr;
// Determine the region (if any) in which the clause expression has to be
// captured for the current directive.
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
OpenMPDirectiveKind CaptureRegion =
getOpenMPCaptureRegionForClause(DKind, OMPC_device, LangOpts.OpenMP);
// Outside of dependent contexts, build the capture of the expression and the
// pre-init statement that goes with it.
if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
return new (Context)
OMPDeviceClause(Modifier, ValExpr, HelperValStmt, CaptureRegion, StartLoc,
LParenLoc, ModifierLoc, EndLoc);
/// Check whether the type \a QTy may be used for a mapped list item.
///
/// \param SL Location at which diagnostics are reported.
/// \param SR Source range attached to the diagnostics.
/// \param SemaRef Sema instance used to emit diagnostics.
/// \param Stack Data-sharing attribute stack (not referenced by the
/// statements visible here).
/// \param QTy Type to check.
/// \param FullCheck Enables the additional non-trivial-type warning.
/// \returns false if \a QTy is incomplete (an error is emitted), true
/// otherwise.
static bool checkTypeMappable(SourceLocation SL, SourceRange SR, Sema &SemaRef,
DSAStackTy *Stack, QualType QTy,
bool FullCheck = true) {
NamedDecl *ND;
// Incomplete types cannot be mapped.
if (QTy->isIncompleteType(&ND)) {
SemaRef.Diag(SL, diag::err_incomplete_type) << QTy << SR;
return false;
// Only a warning is issued on this path; the function still reports success.
if (FullCheck && !SemaRef.CurContext->isDependentContext() &&
SemaRef.Diag(SL, diag::warn_omp_non_trivial_type_mapped) << QTy << SR;
return true;
/// Return true if it can be proven that the provided array expression
/// (array section or array subscript) does NOT specify the whole size of the
/// array whose base type is \a BaseQTy.
///
/// \param SemaRef Sema instance; its ASTContext is used for constant
/// evaluation.
/// \param E Array subscript or array section expression to inspect.
/// \param BaseQTy Type of the base that \a E indexes into.
/// \returns true only when it is proven that less than the whole dimension is
/// referenced; false when the whole dimension is covered or when nothing can
/// be proven.
static bool checkArrayExpressionDoesNotReferToWholeSize(Sema &SemaRef,
const Expr *E,
QualType BaseQTy) {
const auto *OASE = dyn_cast<OMPArraySectionExpr>(E);
// If this is an array subscript, it refers to the whole size if the size of
// the dimension is constant and equals 1. Also, an array section assumes the
// format of an array subscript if no colon is used.
if (isa<ArraySubscriptExpr>(E) ||
(OASE && OASE->getColonLocFirst().isInvalid())) {
if (const auto *ATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr()))
return ATy->getSize().getSExtValue() != 1;
// Size can't be evaluated statically.
return false;
assert(OASE && "Expecting array section if not an array subscript.");
const Expr *LowerBound = OASE->getLowerBound();
const Expr *Length = OASE->getLength();
// If there is a lower bound that does not evaluate to zero, we are not
// covering the whole dimension.
if (LowerBound) {
Expr::EvalResult Result;
if (!LowerBound->EvaluateAsInt(Result, SemaRef.getASTContext()))
return false; // Can't get the integer value as a constant.
llvm::APSInt ConstLowerBound = Result.Val.getInt();
if (ConstLowerBound.getSExtValue())
return true;
// If we don't have a length, we are covering the whole dimension.
if (!Length)
return false;
// If the base is a pointer, we don't have a way to get the size of the
// pointee.
if (BaseQTy->isPointerType())
return false;
// We can only check if the length is the same as the size of the dimension
// if we have a constant array.
const auto *CATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr());
if (!CATy)
return false;
Expr::EvalResult Result;
if (!Length->EvaluateAsInt(Result, SemaRef.getASTContext()))
return false; // Can't get the integer value as a constant.
llvm::APSInt ConstLength = Result.Val.getInt();
// The section is partial exactly when its constant length differs from the
// constant size of the array dimension.
return CATy->getSize().getSExtValue() != ConstLength.getSExtValue();
// Return true if it can be proven that the provided array expression (array
// section or array subscript) does NOT specify a single element of the array
// whose base type is \a BaseQTy.
//
// \param SemaRef Sema instance; its ASTContext is used for constant
// evaluation.
// \param E Array subscript or array section expression to inspect.
// \param BaseQTy Type of the base that \a E indexes into.
// \returns true only when it is proven that more or less than one element is
// referenced; false for a single element or when nothing can be proven.
static bool checkArrayExpressionDoesNotReferToUnitySize(Sema &SemaRef,
const Expr *E,
QualType BaseQTy) {
const auto *OASE = dyn_cast<OMPArraySectionExpr>(E);
// An array subscript always refers to a single element. Also, an array
// section assumes the format of an array subscript if no colon is used.
if (isa<ArraySubscriptExpr>(E) ||
(OASE && OASE->getColonLocFirst().isInvalid()))
return false;
assert(OASE && "Expecting array section if not an array subscript.");
const Expr *Length = OASE->getLength();
// If we don't have a length we have to check if the array has unitary size
// for this dimension. Also, we should always expect a length if the base type
// is pointer.
if (!Length) {
if (const auto *ATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr()))
return ATy->getSize().getSExtValue() != 1;
// We cannot assume anything.
return false;
// Check if the length evaluates to 1.
Expr::EvalResult Result;
if (!Length->EvaluateAsInt(Result, SemaRef.getASTContext()))
return false; // Can't get the integer value as a constant.
llvm::APSInt ConstLength = Result.Val.getInt();
return ConstLength.getSExtValue() != 1;
// The base of the elements of a list in a map clause has to be either:
// - a reference to variable or field.
// - a member expression.
// - an array expression.
// E.g. if we have the expression 'r.S.Arr[:12]', we want to retrieve the
// reference to 'r'.
// If we have:
// struct SS {
// Bla S;
// foo() {
// #pragma omp target map (S.Arr[:12]);
// }
// }
// We want to retrieve the member expression 'this->S';
// OpenMP 5.0 [2.19.7.1, map Clause, Restrictions, p.2]
// If a list item is an array section, it must specify contiguous storage.
// For this restriction it is sufficient that we make sure only references
// to variables or fields and array expressions, and that no array sections
// exist except in the rightmost expression (unless they cover the whole
// dimension of the array). E.g. these would be invalid:
// r.ArrS[3:5].Arr[6:7]
// r.ArrS[3:5].x
// but these would be valid:
// r.ArrS[3].Arr[6:7]
// r.ArrS[3].x
namespace {
// Statement visitor that walks a mappable expression towards its base. Each
// Visit* method returns a bool (the visitor's result type) and records the
// components it passes in 'Components'.
class MapBaseChecker final : public StmtVisitor<MapBaseChecker, bool> {
// Sema instance used for diagnostics, language options and the ASTContext.
Sema &SemaRef;
// Kind of the clause being checked; used when naming the clause in
// diagnostics.
OpenMPClauseKind CKind = OMPC_unknown;
// Caller-owned list to which the visited components are appended.
OMPClauseMappableExprCommon::MappableExprComponentList &Components;
// When set, the guarded diagnostics in the visitors are suppressed.
bool NoDiagnose = false;
// Base expression found so far; stays null until a visitor finds one.
const Expr *RelevantExpr = nullptr;
// Whether unity-size / whole-size array sections are still acceptable at the
// current position of the walk. The visitors clear these flags as they go.
bool AllowUnitySizeArraySection = true;
bool AllowWholeSizeArraySection = true;
// Location and range used when reporting diagnostics.
SourceLocation ELoc;
SourceRange ERange;
// Report that the expression is not valid for the clause. The diagnostic
// depends on the OpenMP version: before 5.0 only the expression range is
// streamed, from 5.0 on the clause name is streamed as well.
void emitErrorMsg() {
// If nothing else worked, this is not a valid map clause expression.
if (SemaRef.getLangOpts().OpenMP < 50) {
<< ERange;
} else {
SemaRef.Diag(ELoc, diag::err_omp_non_lvalue_in_map_or_motion_clauses)
<< getOpenMPClauseName(CKind) << ERange;
// A reference to a declaration terminates the walk: if it names a variable it
// becomes the base expression and is recorded as a component together with
// its declaration. Returns false for references to anything but a VarDecl.
bool VisitDeclRefExpr(DeclRefExpr *DRE) {
if (!isa<VarDecl>(DRE->getDecl())) {
return false;
// No base may have been found before reaching the innermost reference.
assert(!RelevantExpr && "RelevantExpr is expected to be nullptr");
RelevantExpr = DRE;
// Record the component.
Components.emplace_back(DRE, DRE->getDecl());
return true;
// Handle a member access. The member must be a non-bit-field data member of a
// non-union type. On success the access is recorded as a component and the
// walk continues with the base, unless a base expression was already found.
bool VisitMemberExpr(MemberExpr *ME) {
Expr *E = ME;
Expr *BaseE = ME->getBase()->IgnoreParenCasts();
if (isa<CXXThisExpr>(BaseE)) {
assert(!RelevantExpr && "RelevantExpr is expected to be nullptr");
// We found a base expression: this->Val.
RelevantExpr = ME;
} else {
// Otherwise keep walking through the base of the member access.
E = BaseE;
// Only accesses to data fields are accepted.
if (!isa<FieldDecl>(ME->getMemberDecl())) {
if (!NoDiagnose) {
SemaRef.Diag(ELoc, diag::err_omp_expected_access_to_data_field)
<< ME->getSourceRange();
return false;
if (RelevantExpr)
return false;
return Visit(E);
auto *FD = cast<FieldDecl>(ME->getMemberDecl());
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C/C++, p.3]
// A bit-field cannot appear in a map clause.
if (FD->isBitField()) {
if (!NoDiagnose) {
SemaRef.Diag(ELoc, diag::err_omp_bit_fields_forbidden_in_clause)
<< ME->getSourceRange() << getOpenMPClauseName(CKind);
return false;
if (RelevantExpr)
return false;
return Visit(E);
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C++, p.1]
// If the type of a list item is a reference to a type T then the type
// will be considered to be T for all purposes of this clause.
QualType CurType = BaseE->getType().getNonReferenceType();
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C/C++, p.2]
// A list item cannot be a variable that is a member of a structure with
// a union type.
if (CurType->isUnionType()) {
if (!NoDiagnose) {
SemaRef.Diag(ELoc, diag::err_omp_union_type_not_allowed)
<< ME->getSourceRange();
return false;
return RelevantExpr || Visit(E);
// If we got a member expression, we should not expect any array section
// before that:
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.7]
// If a list item is an element of a structure, only the rightmost symbol
// of the variable reference can be an array section.
AllowUnitySizeArraySection = false;
AllowWholeSizeArraySection = false;
// Record the component.
Components.emplace_back(ME, FD);
return RelevantExpr || Visit(E);
// Handle an array subscript. The base must have pointer or array type. The
// subscript is recorded as a component without an associated declaration and
// the walk continues with the base, unless a base expression was already
// found.
bool VisitArraySubscriptExpr(ArraySubscriptExpr *AE) {
Expr *E = AE->getBase()->IgnoreParenImpCasts();
if (!E->getType()->isAnyPointerType() && !E->getType()->isArrayType()) {
if (!NoDiagnose) {
SemaRef.Diag(ELoc, diag::err_omp_expected_base_var_name)
<< 0 << AE->getSourceRange();
return false;
return RelevantExpr || Visit(E);
// If we got an array subscript that expresses the whole dimension we
// can have any array expressions before. If it is only expressing part of
// the dimension, we can only have unitary-size array expressions.
if (checkArrayExpressionDoesNotReferToWholeSize(SemaRef, AE,
AllowWholeSizeArraySection = false;
// Subscript applied directly to 'this': the constant index is inspected, and
// 'this' becomes the base expression.
if (const auto *TE = dyn_cast<CXXThisExpr>(E->IgnoreParenCasts())) {
Expr::EvalResult Result;
if (!AE->getIdx()->isValueDependent() &&
AE->getIdx()->EvaluateAsInt(Result, SemaRef.getASTContext()) &&
!Result.Val.getInt().isNullValue()) {
assert(!RelevantExpr && "RelevantExpr is expected to be nullptr");
RelevantExpr = TE;
// Record the component - we don't have any declaration associated.
Components.emplace_back(AE, nullptr);
return RelevantExpr || Visit(E);
// Handle an OpenMP array section. The base must have pointer or array type
// and the section has to be compatible with the sections that are still
// allowed at this position (see AllowWholeSizeArraySection and
// AllowUnitySizeArraySection). The section is recorded as a component without
// an associated declaration and the walk continues with the base, unless a
// base expression was already found.
bool VisitOMPArraySectionExpr(OMPArraySectionExpr *OASE) {
assert(!NoDiagnose && "Array sections cannot be implicitly mapped.");
Expr *E = OASE->getBase()->IgnoreParenImpCasts();
QualType CurType =
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C++, p.1]
// If the type of a list item is a reference to a type T then the type
// will be considered to be T for all purposes of this clause.
if (CurType->isReferenceType())
CurType = CurType->getPointeeType();
bool IsPointer = CurType->isAnyPointerType();
if (!IsPointer && !CurType->isArrayType()) {
SemaRef.Diag(ELoc, diag::err_omp_expected_base_var_name)
<< 0 << OASE->getSourceRange();
return false;
// Classify the section: provably not the whole dimension / provably not a
// single element.
bool NotWhole =
checkArrayExpressionDoesNotReferToWholeSize(SemaRef, OASE, CurType);
bool NotUnity =
checkArrayExpressionDoesNotReferToUnitySize(SemaRef, OASE, CurType);
if (AllowWholeSizeArraySection) {
// Any array section is currently allowed. Allowing a whole size array
// section implies allowing a unity array section as well.
// If this array section refers to the whole dimension we can still
// accept other array sections before this one, except if the base is a
// pointer. Otherwise, only unitary sections are accepted.
if (NotWhole || IsPointer)
AllowWholeSizeArraySection = false;
} else if (AllowUnitySizeArraySection && NotUnity) {
// A unity or whole array section is not allowed and that is not
// compatible with the properties of the current array section.
ELoc, diag::err_array_section_does_not_specify_contiguous_storage)
<< OASE->getSourceRange();
return false;
// Section applied directly to 'this': the constant length and lower bound
// are inspected, and 'this' becomes the base expression.
if (const auto *TE = dyn_cast<CXXThisExpr>(E)) {
Expr::EvalResult ResultR;
Expr::EvalResult ResultL;
if (!OASE->getLength()->isValueDependent() &&
OASE->getLength()->EvaluateAsInt(ResultR, SemaRef.getASTContext()) &&
!ResultR.Val.getInt().isOneValue()) {
if (OASE->getLowerBound() && !OASE->getLowerBound()->isValueDependent() &&
SemaRef.getASTContext()) &&
!ResultL.Val.getInt().isNullValue()) {
assert(!RelevantExpr && "RelevantExpr is expected to be nullptr");
RelevantExpr = TE;
// Record the component - we don't have any declaration associated.
Components.emplace_back(OASE, nullptr);
return RelevantExpr || Visit(E);
// Handle an OpenMP array shaping expression: record it as a component and
// continue the walk with its base (parentheses and implicit casts stripped).
bool VisitOMPArrayShapingExpr(OMPArrayShapingExpr *E) {
Expr *Base = E->getBase();
// Record the component - we don't have any declaration associated.
Components.emplace_back(E, nullptr);
return Visit(Base->IgnoreParenImpCasts());
// Handle a unary operator. Only an lvalue dereference under OpenMP 5.0 or
// later is accepted; anything else makes the visit fail.
bool VisitUnaryOperator(UnaryOperator *UO) {
if (SemaRef.getLangOpts().OpenMP < 50 || !UO->isLValue() ||
UO->getOpcode() != UO_Deref) {
return false;
if (!RelevantExpr) {
// Record the component if we haven't found the base decl yet.
Components.emplace_back(UO, nullptr);
// Continue with the dereferenced operand unless a base was already found.
return RelevantExpr || Visit(UO->getSubExpr()->IgnoreParenImpCasts());
// Handle a binary operator. Only operators of pointer type under OpenMP 5.0
// or later are accepted; anything else makes the visit fail.
bool VisitBinaryOperator(BinaryOperator *BO) {
if (SemaRef.getLangOpts().OpenMP < 50 || !BO->getType()->isPointerType()) {
return false;
// Pointer arithmetic is the only thing we expect to happen here, so after we
// make sure the binary operator has pointer type, the only thing we need to
// do is to visit the subtree that has the same type as the root (so that we
// know the other subtree is just an offset).
Expr *LE = BO->getLHS()->IgnoreParenImpCasts();
Expr *RE = BO->getRHS()->IgnoreParenImpCasts();
Components.emplace_back(BO, nullptr);
assert((LE->getType().getTypePtr() == BO->getType().getTypePtr() ||
RE->getType().getTypePtr() == BO->getType().getTypePtr()) &&
"Either LHS or RHS have base decl inside");
// Prefer the left operand when its type matches the operator's type,
// otherwise follow the right operand.
if (BO->getType().getTypePtr() == LE->getType().getTypePtr())
return RelevantExpr || Visit(LE);
return RelevantExpr || Visit(RE);
// 'this' terminates the walk: it becomes the base expression and is recorded
// as a component without an associated declaration.
bool VisitCXXThisExpr(CXXThisExpr *CTE) {
assert(!RelevantExpr && "RelevantExpr is expected to be nullptr");
RelevantExpr = CTE;
Components.emplace_back(CTE, nullptr);
return true;
// Fallback for every statement kind without a dedicated visitor: the visit
// fails.
bool VisitStmt(Stmt *) {
return false;
// Return the base expression recorded by the visitors, or null if none was
// recorded.
const Expr *getFoundBase() const {
return RelevantExpr;
// Build a checker for one expression.
// \param SemaRef Sema instance used for diagnostics and evaluation.
// \param CKind Kind of the clause the expression appears in.
// \param Components Caller-owned list that receives the visited components.
// \param NoDiagnose Suppresses the diagnostics guarded by this flag.
// \param ELoc Location used for diagnostics (copied into the checker).
// \param ERange Source range used for diagnostics (copied into the checker).
explicit MapBaseChecker(
Sema &SemaRef, OpenMPClauseKind CKind,
OMPClauseMappableExprCommon::MappableExprComponentList &Components,
bool NoDiagnose, SourceLocation &ELoc, SourceRange &ERange)
: SemaRef(SemaRef), CKind(CKind), Components(Components),
NoDiagnose(NoDiagnose), ELoc(ELoc), ERange(ERange) {}
} // namespace
/// Return the expression of the base of the mappable expression or null if it
/// cannot be determined and do all the necessary checks to see if the expression
/// is valid as a standalone mappable expression. In the process, record all the
/// components of the expression.
///
/// \param SemaRef Sema instance handed to the checker.
/// \param E Expression to analyze; outer parentheses are ignored.
/// \param CurComponents Receives the components recorded during the walk.
/// \param CKind Kind of the clause the expression appears in.
/// \param NoDiagnose Forwarded to the checker to suppress its guarded
/// diagnostics.
/// \returns The base expression found by MapBaseChecker, or nullptr when the
/// visit fails.
static const Expr *checkMapClauseExpressionBase(
Sema &SemaRef, Expr *E,
OMPClauseMappableExprCommon::MappableExprComponentList &CurComponents,
OpenMPClauseKind CKind, bool NoDiagnose) {
SourceLocation ELoc = E->getExprLoc();
SourceRange ERange = E->getSourceRange();
MapBaseChecker Checker(SemaRef, CKind, CurComponents, NoDiagnose, ELoc,
if (Checker.Visit(E->IgnoreParens()))
return Checker.getFoundBase();
return nullptr;
// Return true if expression E associated with value VD has conflicts with other
// map information.
//
// \param SemaRef Sema instance used for diagnostics.
// \param DSAS Stack whose recorded component lists for VD are compared
// against CurComponents.
// \param VD Base declaration of the expression; must not be null.
// \param E Expression under test; must not be null.
// \param CurrentRegionOnly Passed on to the stack query; when set, the result
// of that query is returned directly and the enclosing-environment check at
// the end is skipped.
// \param CurComponents Components of E; its last entry is expected to be
// associated with VD.
// \param CKind Clause kind (map, to or from); selects the diagnostic.
static bool checkMapConflicts(
Sema &SemaRef, DSAStackTy *DSAS, const ValueDecl *VD, const Expr *E,
bool CurrentRegionOnly,
OMPClauseMappableExprCommon::MappableExprComponentListRef CurComponents,
OpenMPClauseKind CKind) {
assert(VD && E);
SourceLocation ELoc = E->getExprLoc();
SourceRange ERange = E->getSourceRange();
// In order to easily check the conflicts we need to match each component of
// the expression under test with the components of the expressions that are
// already in the stack.
assert(!CurComponents.empty() && "Map clause expression with no components!");
assert(CurComponents.back().getAssociatedDeclaration() == VD &&
"Map clause expression with unexpected base!");
// Variables to help detecting enclosing problems in data environment nests.
bool IsEnclosedByDataEnvironmentExpr = false;
const Expr *EnclosingExpr = nullptr;
// The callback below is handed each recorded component list for VD; a true
// result from it signals a conflict.
bool FoundError = DSAS->checkMappableExprComponentListsForDecl(
VD, CurrentRegionOnly,
[&IsEnclosedByDataEnvironmentExpr, &SemaRef, VD, CurrentRegionOnly, ELoc,
ERange, CKind, &EnclosingExpr,
OpenMPClauseKind) {
assert(!StackComponents.empty() &&
"Map clause expression with no components!");
assert(StackComponents.back().getAssociatedDeclaration() == VD &&
"Map clause expression with unexpected base!");
// The whole expression in the stack.
const Expr *RE = StackComponents.front().getAssociatedExpression();
// Expressions must start from the same base. Here we detect at which
// point both expressions diverge from each other and see if we can
// detect if the memory referred to by both expressions is contiguous and
// does not overlap.
// Both lists are walked in reverse, i.e. starting from the base.
auto CI = CurComponents.rbegin();
auto CE = CurComponents.rend();
auto SI = StackComponents.rbegin();
auto SE = StackComponents.rend();
for (; CI != CE && SI != SE; ++CI, ++SI) {
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.3]
// At most one list item can be an array item derived from a given
// variable in map clauses of the same construct.
if (CurrentRegionOnly &&
(isa<ArraySubscriptExpr>(CI->getAssociatedExpression()) ||
isa<OMPArraySectionExpr>(CI->getAssociatedExpression()) ||
isa<OMPArrayShapingExpr>(CI->getAssociatedExpression())) &&
(isa<ArraySubscriptExpr>(SI->getAssociatedExpression()) ||
isa<OMPArraySectionExpr>(SI->getAssociatedExpression()) ||
isa<OMPArrayShapingExpr>(SI->getAssociatedExpression()))) {
<< CI->getAssociatedExpression()->getSourceRange();
<< SI->getAssociatedExpression()->getSourceRange();
return true;
// Do both expressions have the same kind?
if (CI->getAssociatedExpression()->getStmtClass() !=
// Are we dealing with different variables/fields?
if (CI->getAssociatedDeclaration() != SI->getAssociatedDeclaration())
// Check if the extra components of the expressions in the enclosing
// data environment are redundant for the current base declaration.
// If they are, the maps completely overlap, which is legal.
for (; SI != SE; ++SI) {
QualType Type;
if (const auto *ASE =
dyn_cast<ArraySubscriptExpr>(SI->getAssociatedExpression())) {
Type = ASE->getBase()->IgnoreParenImpCasts()->getType();
} else if (const auto *OASE = dyn_cast<OMPArraySectionExpr>(
SI->getAssociatedExpression())) {
const Expr *E = OASE->getBase()->IgnoreParenImpCasts();
Type =
} else if (const auto *OASE = dyn_cast<OMPArrayShapingExpr>(
SI->getAssociatedExpression())) {
Type = OASE->getBase()->getType()->getPointeeType();
if (Type.isNull() || Type->isAnyPointerType() ||
SemaRef, SI->getAssociatedExpression(), Type))
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.4]
// List items of map clauses in the same construct must not share
// original storage.
// If the expressions are exactly the same or one is a subset of the
// other, it means they are sharing storage.
if (CI == CE && SI == SE) {
if (CurrentRegionOnly) {
if (CKind == OMPC_map) {
SemaRef.Diag(ELoc, diag::err_omp_map_shared_storage) << ERange;
} else {
assert(CKind == OMPC_to || CKind == OMPC_from);
SemaRef.Diag(ELoc, diag::err_omp_once_referenced_in_target_update)
<< ERange;
SemaRef.Diag(RE->getExprLoc(), diag::note_used_here)
<< RE->getSourceRange();
return true;
// If we find the same expression in the enclosing data environment,
// that is legal.
IsEnclosedByDataEnvironmentExpr = true;
return false;
QualType DerivedType =
SourceLocation DerivedLoc =
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C++, p.1]
// If the type of a list item is a reference to a type T then the type
// will be considered to be T for all purposes of this clause.
DerivedType = DerivedType.getNonReferenceType();
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, C/C++, p.1]
// A variable for which the type is pointer and an array section
// derived from that variable must not appear as list items of map
// clauses of the same construct.
// Also, cover one of the cases in:
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.5]
// If any part of the original storage of a list item has corresponding
// storage in the device data environment, all of the original storage
// must have corresponding storage in the device data environment.
if (DerivedType->isAnyPointerType()) {
if (CI == CE || SI == SE) {
<< DerivedLoc;
SemaRef.Diag(RE->getExprLoc(), diag::note_used_here)
<< RE->getSourceRange();
return true;
if (CI->getAssociatedExpression()->getStmtClass() !=
SI->getAssociatedExpression()->getStmtClass() ||
CI->getAssociatedDeclaration()->getCanonicalDecl() ==
SI->getAssociatedDeclaration()->getCanonicalDecl()) {
assert(CI != CE && SI != SE);
SemaRef.Diag(DerivedLoc, diag::err_omp_same_pointer_dereferenced)
<< DerivedLoc;
SemaRef.Diag(RE->getExprLoc(), diag::note_used_here)
<< RE->getSourceRange();
return true;
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.4]
// List items of map clauses in the same construct must not share
// original storage.
// An expression is a subset of the other.
if (CurrentRegionOnly && (CI == CE || SI == SE)) {
if (CKind == OMPC_map) {
if (CI != CE || SI != SE) {
// Allow constructs like this: map(s, s.ptr[0:1]), where s.ptr is
// a pointer.
// Scan the longer of the two lists for its first component that has
// an associated declaration.
auto Begin =
CI != CE ? CurComponents.begin() : StackComponents.begin();
auto End = CI != CE ? CurComponents.end() : StackComponents.end();
auto It = Begin;
while (It != End && !It->getAssociatedDeclaration())
std::advance(It, 1);
assert(It != End &&
"Expected at least one component with the declaration.");
if (It != Begin && It->getAssociatedDeclaration()
->isAnyPointerType()) {
IsEnclosedByDataEnvironmentExpr = false;
EnclosingExpr = nullptr;
return false;
SemaRef.Diag(ELoc, diag::err_omp_map_shared_storage) << ERange;
} else {
assert(CKind == OMPC_to || CKind == OMPC_from);
SemaRef.Diag(ELoc, diag::err_omp_once_referenced_in_target_update)
<< ERange;
SemaRef.Diag(RE->getExprLoc(), diag::note_used_here)
<< RE->getSourceRange();
return true;
// The current expression uses the same base as other expression in the
// data environment but does not contain it completely.
if (!CurrentRegionOnly && SI != SE)
EnclosingExpr = RE;
// The current expression is a subset of the expression in the data
// environment.
IsEnclosedByDataEnvironmentExpr |=
(!CurrentRegionOnly && CI != CE && SI == SE);
return false;
// When only the current region is of interest, the enclosing-environment
// check below does not apply.
if (CurrentRegionOnly)
return FoundError;
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.5]
// If any part of the original storage of a list item has corresponding
// storage in the device data environment, all of the original storage must
// have corresponding storage in the device data environment.
// OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.6]
// If a list item is an element of a structure, and a different element of
// the structure has a corresponding list item in the device data environment
// prior to a task encountering the construct associated with the map clause,
// then the list item must also have a corresponding list item in the device
// data environment prior to the task encountering the construct.
if (EnclosingExpr && !IsEnclosedByDataEnvironmentExpr) {
<< ERange;
SemaRef.Diag(EnclosingExpr->getExprLoc(), diag::note_used_here)
<< EnclosingExpr->getSourceRange();
return true;
return FoundError;
// Look up the user-defined mapper given the mapper name and mapped type, and
// build a reference to it.
//
// \param SemaRef Sema instance used for lookup and diagnostics.
// \param S Scope to start the lookup from; when null, the candidates are
// taken from UnresolvedMapper instead.
// \param MapperIdScopeSpec Scope specifier written before the mapper name.
// \param MapperId Name of the mapper; "default" is treated as the implicit
// name when no scope specifier is set.
// \param Type Mapped type; for array types the element type is used.
// \param UnresolvedMapper Previously built unresolved lookup expression, or
// null.
// \returns ExprError() on an invalid scope specifier, a wrong mapped type or
// a named mapper that cannot be found; an UnresolvedLookupExpr when the
// lookup has to be deferred; a reference to the matching mapper declaration
// when one is found; ExprEmpty() when no mapper was requested and none was
// found.
static ExprResult buildUserDefinedMapperRef(Sema &SemaRef, Scope *S,
CXXScopeSpec &MapperIdScopeSpec,
const DeclarationNameInfo &MapperId,
QualType Type,
Expr *UnresolvedMapper) {
if (MapperIdScopeSpec.isInvalid())
return ExprError();
// Get the actual type for the array type.
if (Type->isArrayType()) {
assert(Type->getAsArrayTypeUnsafe() && "Expect to get a valid array type");
Type = Type->getAsArrayTypeUnsafe()->getElementType().getCanonicalType();
// Find all user-defined mappers with the given MapperId.
SmallVector<UnresolvedSet<8>, 4> Lookups;
LookupResult Lookup(SemaRef, MapperId, Sema::LookupOMPMapperName);
if (S) {
// Walk outwards through the scopes, collecting one result set per
// successful lookup.
while (S && SemaRef.LookupParsedName(Lookup, S, &MapperIdScopeSpec)) {
NamedDecl *D = Lookup.getRepresentativeDecl();
// Move to the parent of the scope that declares D before looking again.
while (S && !S->isDeclScope(D))
S = S->getParent();
if (S)
S = S->getParent();
Lookups.back().append(Lookup.begin(), Lookup.end());
} else if (auto *ULE = cast_or_null<UnresolvedLookupExpr>(UnresolvedMapper)) {
// Extract the user-defined mappers with the given MapperId.
for (NamedDecl *D : ULE->decls()) {
auto *DMD = cast<OMPDeclareMapperDecl>(D);
assert(DMD && "Expect valid OMPDeclareMapperDecl during instantiation.");
// Defer the lookup for dependent types. The results will be passed through
// UnresolvedMapper on instantiation.
if (SemaRef.CurContext->isDependentContext() || Type->isDependentType() ||
Type->isInstantiationDependentType() ||
Type->containsUnexpandedParameterPack() ||
filterLookupForUDReductionAndMapper<bool>(Lookups, [](ValueDecl *D) {
return !D->isInvalidDecl() &&
(D->getType()->isDependentType() ||
D->getType()->isInstantiationDependentType() ||
})) {
// Merge the collected sets into one and wrap them in an unresolved lookup
// expression.
UnresolvedSet<8> URS;
for (const UnresolvedSet<8> &Set : Lookups) {
if (Set.empty())
URS.append(Set.begin(), Set.end());
return UnresolvedLookupExpr::Create(
SemaRef.Context, /*NamingClass=*/nullptr,
MapperIdScopeSpec.getWithLocInContext(SemaRef.Context), MapperId,
/*ADL=*/false, /*Overloaded=*/true, URS.begin(), URS.end());
SourceLocation Loc = MapperId.getLoc();
// [OpenMP 5.0], 2.19.7.3 declare mapper Directive, Restrictions
// The type must be of struct, union or class type in C and C++
// The restriction is only enforced when a mapper was named explicitly.
if (!Type->isStructureOrClassType() && !Type->isUnionType() &&
(MapperIdScopeSpec.isSet() || MapperId.getAsString() != "default")) {
SemaRef.Diag(Loc, diag::err_omp_mapper_wrong_type);
return ExprError();
// Perform argument dependent lookup.
if (SemaRef.getLangOpts().CPlusPlus && !MapperIdScopeSpec.isSet())
argumentDependentLookup(SemaRef, MapperId, Loc, Type, Lookups);
// Return the first user-defined mapper with the desired type.
if (auto *VD = filterLookupForUDReductionAndMapper<ValueDecl *>(
Lookups, [&SemaRef, Type](ValueDecl *D) -> ValueDecl * {
if (!D->isInvalidDecl() &&
SemaRef.Context.hasSameType(D->getType(), Type))
return D;
return nullptr;
return SemaRef.BuildDeclRefExpr(VD, Type, VK_LValue, Loc);
// Find the first user-defined mapper with a type derived from the desired
// type.
if (auto *VD = filterLookupForUDReductionAndMapper<ValueDecl *>(
Lookups, [&SemaRef, Type, Loc](ValueDecl *D) -> ValueDecl * {
if (!D->isInvalidDecl() &&
SemaRef.IsDerivedFrom(Loc, Type, D->getType()) &&
return D;
return nullptr;
})) {
// Accept the candidate only if the base class is unambiguous and accessible.
CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
if (SemaRef.IsDerivedFrom(Loc, Type, VD->getType(), Paths)) {
if (!Paths.isAmbiguous(SemaRef.Context.getCanonicalType(
VD->getType().getUnqualifiedType()))) {
if (SemaRef.CheckBaseClassAccess(
Loc, VD->getType(), Type, Paths.front(),
/*DiagID=*/0) != Sema::AR_inaccessible) {
return SemaRef.BuildDeclRefExpr(VD, Type, VK_LValue, Loc);
// Report error if a mapper is specified, but cannot be found.
if (MapperIdScopeSpec.isSet() || MapperId.getAsString() != "default") {
SemaRef.Diag(Loc, diag::err_omp_invalid_mapper)
<< Type << MapperId.getName();
return ExprError();
// No mapper was requested explicitly and none was found.
return ExprEmpty();
namespace {
// Utility struct that gathers all the related lists associated with a mappable
// expression.
struct MappableVarListInfo {
// The list of expressions.
ArrayRef<Expr *> VarList;
// The list of processed expressions.
SmallVector<Expr *, 16> ProcessedVarList;
// The mappable components for each expression.
OMPClauseMappableExprCommon::MappableExprComponentLists VarComponents;
// The base declaration of the variable.
SmallVector<ValueDecl *, 16> VarBaseDeclarations;
// The reference to the user-defined mapper associated with every expression.
SmallVector<Expr *, 16> UDMapperList;
// Store the incoming variable list; VarList is a non-owning view of it.
MappableVarListInfo(ArrayRef<Expr *> VarList) : VarList(VarList) {
// We have a list of components and base declarations for each entry in the
// variable list.
// Check the validity of the provided variable list for the provided clause kind
// \a CKind. In the check process the valid expressions, mappable expression
// components, variables, and user-defined mappers are extracted and used to
// fill \a ProcessedVarList, \a VarComponents, \a VarBaseDeclarations, and \a
// UDMapperList in MVLI. \a MapType, \a IsMapTypeImplicit, \a MapperIdScopeSpec,
// and \a MapperId are expected to be valid if the clause kind is 'map'.
//
// Only the 'map', 'to' and 'from' clause kinds are accepted (asserted below).
// All diagnostics are emitted through \a SemaRef; \a StartLoc is the location
// used for the invalid-map-type diagnostics.
//
// NOTE(review): this listing appears to have lost lines (closing braces,
// 'continue' statements, the heads of some assert/Diag calls and a few call
// arguments). The comments below describe only what is still visible; confirm
// the control flow against the upstream file before relying on it.
static void checkMappableExpressionList(
Sema &SemaRef, DSAStackTy *DSAS, OpenMPClauseKind CKind,
MappableVarListInfo &MVLI, SourceLocation StartLoc,
CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo MapperId,
ArrayRef<Expr *> UnresolvedMappers,
OpenMPMapClauseKind MapType = OMPC_MAP_unknown,
bool IsMapTypeImplicit = false) {
// We only expect mappable expressions in 'to', 'from', and 'map' clauses.
assert((CKind == OMPC_map || CKind == OMPC_to || CKind == OMPC_from) &&
"Unexpected clause kind with mappable expressions!");
// If the identifier of user-defined mapper is not specified, it is "default".
// We do not change the actual name in this clause to distinguish whether a
// mapper is specified explicitly, i.e., it is not explicitly specified when
// MapperId.getName() is empty.
// NOTE(review): the statements that use DeclNames to install the "default"
// name are not visible in this listing.
if (!MapperId.getName() || MapperId.getName().isEmpty()) {
auto &DeclNames = SemaRef.getASTContext().DeclarationNames;
// Iterators to find the current unresolved mapper expression.
auto UMIt = UnresolvedMappers.begin(), UMEnd = UnresolvedMappers.end();
bool UpdateUMIt = false;
Expr *UnresolvedMapper = nullptr;
// Keep track of the mappable components and base declarations in this clause.
// Each entry in the list is going to have a list of components associated. We
// record each set of the components so that we can build the clause later on.
// In the end we should have the same amount of declarations and component
// lists.
for (Expr *RE : MVLI.VarList) {
assert(RE && "Null expr in omp to/from/map clause");
SourceLocation ELoc = RE->getExprLoc();
// Find the current unresolved mapper expression.
// NOTE(review): the iterator advance and the opening 'assert(' of the size
// check below are not visible here.
if (UpdateUMIt && UMIt != UMEnd) {
UMIt != UMEnd &&
"Expect the size of UnresolvedMappers to match with that of VarList");
UpdateUMIt = true;
if (UMIt != UMEnd)
UnresolvedMapper = *UMIt;
const Expr *VE = RE->IgnoreParenLValueCasts();
// Dependent expressions cannot be checked yet; only the mapper reference is
// built for them.
if (VE->isValueDependent() || VE->isTypeDependent() ||
VE->isInstantiationDependent() ||
VE->containsUnexpandedParameterPack()) {
// Try to find the associated user-defined mapper.
ExprResult ER = buildUserDefinedMapperRef(
SemaRef, DSAS->getCurScope(), MapperIdScopeSpec, MapperId,
VE->getType().getCanonicalType(), UnresolvedMapper);
if (ER.isInvalid())
// We can only analyze this information once the missing information is
// resolved.
Expr *SimpleExpr = RE->IgnoreParenCasts();
// A list item must be an lvalue; the diagnostic chosen depends on whether the
// OpenMP version is below 5.0.
if (!RE->isLValue()) {
if (SemaRef.getLangOpts().OpenMP < 50) {
// NOTE(review): the call head for this diagnostic is not visible.
ELoc, diag::err_omp_expected_named_var_member_or_array_expression)
<< RE->getSourceRange();
} else {
SemaRef.Diag(ELoc, diag::err_omp_non_lvalue_in_map_or_motion_clauses)
<< getOpenMPClauseName(CKind) << RE->getSourceRange();
OMPClauseMappableExprCommon::MappableExprComponentList CurComponents;
ValueDecl *CurDeclaration = nullptr;
// Obtain the array or member expression bases if required. Also, fill the
// components array with all the components identified in the process.
const Expr *BE = checkMapClauseExpressionBase(
SemaRef, SimpleExpr, CurComponents, CKind, /*NoDiagnose=*/false);
if (!BE)
assert(!CurComponents.empty() &&
"Invalid mappable expression information.");
// Special handling when the base of the expression is 'this'.
if (const auto *TE = dyn_cast<CXXThisExpr>(BE)) {
// Add store "this" pointer to class in DSAStackTy for future checking
// Try to find the associated user-defined mapper.
ExprResult ER = buildUserDefinedMapperRef(
SemaRef, DSAS->getCurScope(), MapperIdScopeSpec, MapperId,
VE->getType().getCanonicalType(), UnresolvedMapper);
if (ER.isInvalid())
// Skip restriction checking for variable or field declarations
MVLI.VarComponents.resize(MVLI.VarComponents.size() + 1);
// For the following checks, we rely on the base declaration which is
// expected to be associated with the last component. The declaration is
// expected to be a variable or a field (if 'this' is being mapped).
CurDeclaration = CurComponents.back().getAssociatedDeclaration();
assert(CurDeclaration && "Null decl on map clause.");
// NOTE(review): the opening 'assert(' of the next check is not visible.
CurDeclaration->isCanonicalDecl() &&
"Expecting components to have associated only canonical declarations.");
auto *VD = dyn_cast<VarDecl>(CurDeclaration);
const auto *FD = dyn_cast<FieldDecl>(CurDeclaration);
assert((VD || FD) && "Only variables or fields are expected here!");
// OpenMP 4.5 [, map Clause, Restrictions, p.10]
// threadprivate variables cannot appear in a map clause.
// OpenMP 4.5 [2.10.5, target update Construct]
// threadprivate variables cannot appear in a from clause.
if (VD && DSAS->isThreadPrivate(VD)) {
DSAStackTy::DSAVarData DVar = DSAS->getTopDSA(VD, /*FromParent=*/false);
SemaRef.Diag(ELoc, diag::err_omp_threadprivate_in_clause)
<< getOpenMPClauseName(CKind);
reportOriginalDsa(SemaRef, DSAS, VD, DVar);
// OpenMP 4.5 [, map Clause, Restrictions, p.9]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct.
// Check conflicts with other map clause expressions. We check the conflicts
// with the current construct separately from the enclosing data
// environment, because the restrictions are different. We only have to
// check conflicts across regions for the map clauses.
if (checkMapConflicts(SemaRef, DSAS, CurDeclaration, SimpleExpr,
/*CurrentRegionOnly=*/true, CurComponents, CKind))
if (CKind == OMPC_map &&
checkMapConflicts(SemaRef, DSAS, CurDeclaration, SimpleExpr,
/*CurrentRegionOnly=*/false, CurComponents, CKind))
// OpenMP 4.5 [2.10.5, target update Construct]
// OpenMP 4.5 [, map Clause, Restrictions, C++, p.1]
// If the type of a list item is a reference to a type T then the type will
// be considered to be T for all purposes of this clause.
// Locate the first component that has an associated declaration.
// NOTE(review): the range argument of find_if is not visible.
auto I = llvm::find_if(
[](const OMPClauseMappableExprCommon::MappableComponent &MC) {
return MC.getAssociatedDeclaration();
assert(I != CurComponents.end() && "Null decl on map clause.");
// Determine the type to be checked for mappability, depending on whether the
// list item is an array subscript, an array section, an array shaping
// expression, or something else.
QualType Type;
auto *ASE = dyn_cast<ArraySubscriptExpr>(VE->IgnoreParens());
auto *OASE = dyn_cast<OMPArraySectionExpr>(VE->IgnoreParens());
auto *OAShE = dyn_cast<OMPArrayShapingExpr>(VE->IgnoreParens());
if (ASE) {
Type = ASE->getType().getNonReferenceType();
} else if (OASE) {
// NOTE(review): the initializer of BaseType and the 'else' between the two
// assignments below are not visible.
QualType BaseType =
if (const auto *ATy = BaseType->getAsArrayTypeUnsafe())
Type = ATy->getElementType();
Type = BaseType->getPointeeType();
Type = Type.getNonReferenceType();
} else if (OAShE) {
Type = OAShE->getBase()->getType()->getPointeeType();
} else {
Type = VE->getType();
// OpenMP 4.5 [2.10.5, target update Construct, Restrictions, p.4]
// A list item in a to or from clause must have a mappable type.
// OpenMP 4.5 [, map Clause, Restrictions, p.9]
// A list item must have a mappable type.
if (!checkTypeMappable(VE->getExprLoc(), VE->getSourceRange(), SemaRef,
DSAS, Type))
Type = I->getAssociatedDeclaration()->getType().getNonReferenceType();
// The remaining restrictions depend on the current directive and apply to
// 'map' clauses only.
if (CKind == OMPC_map) {
// target enter data
// OpenMP [2.10.2, Restrictions, p. 99]
// A map-type must be specified in all map clauses and must be either
// to or alloc.
OpenMPDirectiveKind DKind = DSAS->getCurrentDirective();
if (DKind == OMPD_target_enter_data &&
!(MapType == OMPC_MAP_to || MapType == OMPC_MAP_alloc)) {
SemaRef.Diag(StartLoc, diag::err_omp_invalid_map_type_for_directive)
<< (IsMapTypeImplicit ? 1 : 0)
<< getOpenMPSimpleClauseTypeName(OMPC_map, MapType)
<< getOpenMPDirectiveName(DKind);
// target exit_data
// OpenMP [2.10.3, Restrictions, p. 102]
// A map-type must be specified in all map clauses and must be either
// from, release, or delete.
if (DKind == OMPD_target_exit_data &&
!(MapType == OMPC_MAP_from || MapType == OMPC_MAP_release ||
MapType == OMPC_MAP_delete)) {
SemaRef.Diag(StartLoc, diag::err_omp_invalid_map_type_for_directive)
<< (IsMapTypeImplicit ? 1 : 0)
<< getOpenMPSimpleClauseTypeName(OMPC_map, MapType)
<< getOpenMPDirectiveName(DKind);
// target, target data
// OpenMP 5.0 [2.12.2, Restrictions, p. 163]
// OpenMP 5.0 [2.12.5, Restrictions, p. 174]
// A map-type in a map clause must be to, from, tofrom or alloc
if ((DKind == OMPD_target_data ||
isOpenMPTargetExecutionDirective(DKind)) &&
!(MapType == OMPC_MAP_to || MapType == OMPC_MAP_from ||
MapType == OMPC_MAP_tofrom || MapType == OMPC_MAP_alloc)) {
SemaRef.Diag(StartLoc, diag::err_omp_invalid_map_type_for_directive)
<< (IsMapTypeImplicit ? 1 : 0)
<< getOpenMPSimpleClauseTypeName(OMPC_map, MapType)
<< getOpenMPDirectiveName(DKind);
// OpenMP 4.5 [, Restrictions, p.3]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct
// OpenMP 5.0 [, Restrictions, p.7]
// A list item cannot appear in both a map clause and a data-sharing
// attribute clause on the same construct unless the construct is a
// combined construct.
if (VD && ((SemaRef.LangOpts.OpenMP <= 45 &&
isOpenMPTargetExecutionDirective(DKind)) ||
DKind == OMPD_target)) {
DSAStackTy::DSAVarData DVar = DSAS->getTopDSA(VD, /*FromParent=*/false);
if (isOpenMPPrivate(DVar.CKind)) {
SemaRef.Diag(ELoc, diag::err_omp_variable_in_given_clause_and_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_map)
<< getOpenMPDirectiveName(DSAS->getCurrentDirective());
reportOriginalDsa(SemaRef, DSAS, CurDeclaration, DVar);
// Try to find the associated user-defined mapper.
ExprResult ER = buildUserDefinedMapperRef(
SemaRef, DSAS->getCurScope(), MapperIdScopeSpec, MapperId,
Type.getCanonicalType(), UnresolvedMapper);
if (ER.isInvalid())
// Save the current expression.
// Store the components in the stack so that they can be used to check
// against other clauses later on.
// NOTE(review): the trailing argument(s) of this call are not visible.
DSAS->addMappableExpressionComponents(CurDeclaration, CurComponents,
// Save the components and declaration to create the clause. For purposes of
// the clause creation, any component list that has base 'this' uses
// null as base declaration.
MVLI.VarComponents.resize(MVLI.VarComponents.size() + 1);
MVLI.VarBaseDeclarations.push_back(isa<MemberExpr>(BE) ? nullptr
: CurDeclaration);
// Builds an OMPMapClause from the parsed map-type modifiers, the map type and
// the variable list.
//
// A modifier other than OMPC_MAP_MODIFIER_unknown that is already present in
// the local Modifiers array is diagnosed with
// err_omp_duplicate_map_type_modifier at its own location. The variable list
// is validated by checkMappableExpressionList with clause kind OMPC_map, and
// the clause is created from the lists collected in MVLI.
//
// NOTE(review): the rest of the Modifiers initializer, and any 'continue' /
// '++Count' statements inside the loop, are not visible in this listing;
// confirm against upstream.
OMPClause *Sema::ActOnOpenMPMapClause(
ArrayRef<OpenMPMapModifierKind> MapTypeModifiers,
ArrayRef<SourceLocation> MapTypeModifiersLoc,
CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId,
OpenMPMapClauseKind MapType, bool IsMapTypeImplicit, SourceLocation MapLoc,
SourceLocation ColonLoc, ArrayRef<Expr *> VarList,
const OMPVarListLocTy &Locs, ArrayRef<Expr *> UnresolvedMappers) {
OpenMPMapModifierKind Modifiers[] = {OMPC_MAP_MODIFIER_unknown,
SourceLocation ModifiersLoc[NumberOfOMPMapClauseModifiers];
// Process map-type-modifiers, flag errors for duplicate modifiers.
unsigned Count = 0;
for (unsigned I = 0, E = MapTypeModifiers.size(); I < E; ++I) {
if (MapTypeModifiers[I] != OMPC_MAP_MODIFIER_unknown &&
llvm::find(Modifiers, MapTypeModifiers[I]) != std::end(Modifiers)) {
Diag(MapTypeModifiersLoc[I], diag::err_omp_duplicate_map_type_modifier);
assert(Count < NumberOfOMPMapClauseModifiers &&
"Modifiers exceed the allowed number of map type modifiers");
// Record the accepted modifier together with its source location.
Modifiers[Count] = MapTypeModifiers[I];
ModifiersLoc[Count] = MapTypeModifiersLoc[I];
MappableVarListInfo MVLI(VarList);
checkMappableExpressionList(*this, DSAStack, OMPC_map, MVLI, Locs.StartLoc,
MapperIdScopeSpec, MapperId, UnresolvedMappers,
MapType, IsMapTypeImplicit);
// We need to produce a map clause even if we don't have variables so that
// other diagnostics related with non-existing map clauses are accurate.
return OMPMapClause::Create(Context, Locs, MVLI.ProcessedVarList,
MVLI.VarBaseDeclarations, MVLI.VarComponents,
MVLI.UDMapperList, Modifiers, ModifiersLoc,
MapperId, MapType, IsMapTypeImplicit, MapLoc);
// Validates the type named in a 'declare reduction' directive.
//
// Returns the type obtained from \a ParsedType, or a null QualType when that
// type is null or is rejected. A rejected type is diagnosed at \a TyLoc with
// err_omp_reduction_wrong_type, using selector 0 for a qualified type, 1 for
// a function type, 2 for a reference type and 3 for an array type.
QualType Sema::ActOnOpenMPDeclareReductionType(SourceLocation TyLoc,
TypeResult ParsedType) {
QualType ReductionType = GetTypeFromParser(ParsedType.get());
if (ReductionType.isNull())
return QualType();
// [OpenMP 4.0], 2.15 declare reduction Directive, Restrictions, C\C++
// A type name in a declare reduction directive cannot be a function type, an
// array type, a reference type, or a type qualified with const, volatile or
// restrict.
if (ReductionType.hasQualifiers()) {
Diag(TyLoc, diag::err_omp_reduction_wrong_type) << 0;
return QualType();
if (ReductionType->isFunctionType()) {
Diag(TyLoc, diag::err_omp_reduction_wrong_type) << 1;
return QualType();
if (ReductionType->isReferenceType()) {
Diag(TyLoc, diag::err_omp_reduction_wrong_type) << 2;
return QualType();
if (ReductionType->isArrayType()) {
Diag(TyLoc, diag::err_omp_reduction_wrong_type) << 3;
return QualType();
return ReductionType;
// Starts a 'declare reduction' directive: creates one OMPDeclareReductionDecl
// per entry of \a ReductionTypes (pair of type and location) and returns them
// as a declaration group.
//
// Previously declared reductions with the same name are collected into
// PreviousRedeclTypes, keyed by canonical type; a new type that is already
// present is diagnosed with err_omp_declare_reduction_redefinition plus a
// note at the previous location. Each created declaration receives PrevDRD
// as its previous declaration.
//
// NOTE(review): several statements are truncated in this listing (the tail of
// the LookupResult constructor and of FilterLookupForScope, the argument of
// cast<OMPDeclareReductionDecl>(...), the right-hand sides of several
// assignments, and the statements guarded by 'if (Invalid)'). Confirm against
// upstream before relying on the control flow.
Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveStart(
Scope *S, DeclContext *DC, DeclarationName Name,
ArrayRef<std::pair<QualType, SourceLocation>> ReductionTypes,
AccessSpecifier AS, Decl *PrevDeclInScope) {
SmallVector<Decl *, 8> Decls;
LookupResult Lookup(*this, Name, SourceLocation(), LookupOMPReductionName,
// [OpenMP 4.0], 2.15 declare reduction Directive, Restrictions
// A reduction-identifier may not be re-declared in the current scope for the
// same type or for a type that is compatible according to the base language
// rules.
llvm::DenseMap<QualType, SourceLocation> PreviousRedeclTypes;
OMPDeclareReductionDecl *PrevDRD = nullptr;
bool InCompoundScope = true;
// With a scope available, previous declarations are found through name
// lookup; otherwise the chain starting at PrevDeclInScope is walked (see the
// 'else if' branch below).
if (S != nullptr) {
// Find previous declaration with the same name not referenced in other
// declarations.
FunctionScopeInfo *ParentFn = getEnclosingFunction();
InCompoundScope =
(ParentFn != nullptr) && !ParentFn->CompoundScopes.empty();
LookupName(Lookup, S);
FilterLookupForScope(Lookup, DC, S, /*ConsiderLinkage=*/false,
// Maps each found declaration to whether another declaration refers to it as
// its previous declaration in scope.
llvm::DenseMap<OMPDeclareReductionDecl *, bool> UsedAsPrevious;
LookupResult::Filter Filter = Lookup.makeFilter();
while (Filter.hasNext()) {
auto *PrevDecl = cast<OMPDeclareReductionDecl>(;
if (InCompoundScope) {
auto I = UsedAsPrevious.find(PrevDecl);
if (I == UsedAsPrevious.end())
UsedAsPrevious[PrevDecl] = false;
if (OMPDeclareReductionDecl *D = PrevDecl->getPrevDeclInScope())
UsedAsPrevious[D] = true;
PreviousRedeclTypes[PrevDecl->getType().getCanonicalType()] =
if (InCompoundScope) {
// Pick a declaration that no other declaration uses as its previous one.
for (const auto &PrevData : UsedAsPrevious) {
if (!PrevData.second) {
PrevDRD = PrevData.first;
} else if (PrevDeclInScope != nullptr) {
auto *PrevDRDInScope = PrevDRD =
do {
PreviousRedeclTypes[PrevDRDInScope->getType().getCanonicalType()] =
PrevDRDInScope = PrevDRDInScope->getPrevDeclInScope();
} while (PrevDRDInScope != nullptr);
// Create one declaration per requested reduction type, diagnosing
// redefinitions for an already-seen canonical type.
for (const auto &TyData : ReductionTypes) {
const auto I = PreviousRedeclTypes.find(TyData.first.getCanonicalType());
bool Invalid = false;
if (I != PreviousRedeclTypes.end()) {
Diag(TyData.second, diag::err_omp_declare_reduction_redefinition)
<< TyData.first;
Diag(I->second, diag::note_previous_definition);
Invalid = true;
PreviousRedeclTypes[TyData.first.getCanonicalType()] = TyData.second;
auto *DRD = OMPDeclareReductionDecl::Create(Context, DC, TyData.second,
Name, TyData.first, PrevDRD);
if (Invalid)
PrevDRD = DRD;
return DeclGroupPtrTy::make(
DeclGroupRef::Create(Context, Decls.begin(), Decls.size()));
// Prepares for parsing the combiner expression of a 'declare reduction'
// directive.
//
// \a D must be an OMPDeclareReductionDecl (enforced by cast<>). The function
// creates the implicit variables 'omp_in' and 'omp_out' of the reduction
// type, pushes them on the scope chains when \a S is non-null, and stores
// references to both in the declaration via setCombinerData.
//
// NOTE(review): the statements between 'Enter new function scope' and the
// scope handling, an 'else' before 'CurContext = DRD;', and the body of the
// 'else' branch near the end are not visible in this listing.
void Sema::ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D) {
auto *DRD = cast<OMPDeclareReductionDecl>(D);
// Enter new function scope.
if (S != nullptr)
PushDeclContext(S, DRD);
CurContext = DRD;
QualType ReductionType = DRD->getType();
// Create 'T* omp_parm;T omp_in;'. All references to 'omp_in' will
// be replaced by '*omp_parm' during codegen. This required because 'omp_in'
// uses semantics of argument handles by value, but it should be passed by
// reference. C lang does not support references, so pass all parameters as
// pointers.
// Create 'T omp_in;' variable.
VarDecl *OmpInParm =
buildVarDecl(*this, D->getLocation(), ReductionType, "omp_in");
// Create 'T* omp_parm;T omp_out;'. All references to 'omp_out' will
// be replaced by '*omp_parm' during codegen. This required because 'omp_out'
// uses semantics of argument handles by value, but it should be passed by
// reference. C lang does not support references, so pass all parameters as
// pointers.
// Create 'T omp_out;' variable.
VarDecl *OmpOutParm =
buildVarDecl(*this, D->getLocation(), ReductionType, "omp_out");
if (S != nullptr) {
PushOnScopeChains(OmpInParm, S);
PushOnScopeChains(OmpOutParm, S);
} else {
// Build references to both variables and attach them to the declaration.
Expr *InE =
::buildDeclRefExpr(*this, OmpInParm, ReductionType, D->getLocation());
Expr *OutE =
::buildDeclRefExpr(*this, OmpOutParm, ReductionType, D->getLocation());
DRD->setCombinerData(InE, OutE);
// Finishes the combiner of a 'declare reduction' directive. \a D must be an
// OMPDeclareReductionDecl (enforced by cast<>).
// NOTE(review): the statements guarded by the null check on \a Combiner and
// the remainder of the function are not visible in this listing; confirm
// against upstream.
void Sema::ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, Expr *Combiner) {
auto *DRD = cast<OMPDeclareReductionDecl>(D);
if (Combiner != nullptr)
// Prepares for parsing the initializer clause of a 'declare reduction'
// directive.
//
// \a D must be an OMPDeclareReductionDecl (enforced by cast<>). The function
// creates the implicit variables 'omp_priv' and 'omp_orig' of the reduction
// type, pushes them on the scope chains when \a S is non-null, stores
// references to both via setInitializerData, and returns the 'omp_priv'
// variable.
//
// NOTE(review): the statements between 'Enter new function scope' and the
// scope handling, an 'else' before 'CurContext = DRD;', and the body of the
// 'else' branch near the end are not visible in this listing.
VarDecl *Sema::ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D) {
auto *DRD = cast<OMPDeclareReductionDecl>(D);
// Enter new function scope.
if (S != nullptr)
PushDeclContext(S, DRD);
CurContext = DRD;
QualType ReductionType = DRD->getType();
// Create 'T* omp_parm;T omp_priv;'. All references to 'omp_priv' will
// be replaced by '*omp_parm' during codegen. This required because 'omp_priv'
// uses semantics of argument handles by value, but it should be passed by
// reference. C lang does not support references, so pass all parameters as
// pointers.
// Create 'T omp_priv;' variable.
VarDecl *OmpPrivParm =
buildVarDecl(*this, D->getLocation(), ReductionType, "omp_priv");
// Create 'T* omp_parm;T omp_orig;'. All references to 'omp_orig' will
// be replaced by '*omp_parm' during codegen. This required because 'omp_orig'
// uses semantics of argument handles by value, but it should be passed by
// reference. C lang does not support references, so pass all parameters as
// pointers.
// Create 'T omp_orig;' variable.
VarDecl *OmpOrigParm =
buildVarDecl(*this, D->getLocation(), ReductionType, "omp_orig");
if (S != nullptr) {
PushOnScopeChains(OmpPrivParm, S);
PushOnScopeChains(OmpOrigParm, S);
} else {
// Build references to both variables and attach them to the declaration.
Expr *OrigE =
::buildDeclRefExpr(*this, OmpOrigParm, ReductionType, D->getLocation());
Expr *PrivE =
::buildDeclRefExpr(*this, OmpPrivParm, ReductionType, D->getLocation());
DRD->setInitializerData(OrigE, PrivE);
return OmpPrivParm;
// Finishes the initializer clause of a 'declare reduction' directive.
//
// When \a Initializer is non-null it is stored on the declaration with kind
// OMPDeclareReductionDecl::CallInit. Otherwise, when \a OmpPrivParm has an
// initializer, a DirectInit or CopyInit kind is selected.
//
// NOTE(review): the head of the second setInitializer call (including the
// condition that selects DirectInit vs. CopyInit) and the body of the final
// 'else' are not visible in this listing; confirm against upstream.
void Sema::ActOnOpenMPDeclareReductionInitializerEnd(Decl *D, Expr *Initializer,
VarDecl *OmpPrivParm) {
auto *DRD = cast<OMPDeclareReductionDecl>(D);
if (Initializer != nullptr) {
DRD->setInitializer(Initializer, OMPDeclareReductionDecl::CallInit);
} else if (OmpPrivParm->hasInit()) {
? OMPDeclareReductionDecl::DirectInit
: OMPDeclareReductionDecl::CopyInit);
} else {
// Finishes a 'declare reduction' directive and returns \a DeclReductions
// unchanged.
//
// For every declaration in the group: when \a IsValid is true and a scope is
// available, the declaration is pushed on the scope chains.
//
// NOTE(review): the trailing argument of PushOnScopeChains and the body of the
// 'else' branch (the invalid case) are not visible in this listing.
Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveEnd(
Scope *S, DeclGroupPtrTy DeclReductions, bool IsValid) {
for (Decl *D : DeclReductions.get()) {
if (IsValid) {
if (S)
PushOnScopeChains(cast<OMPDeclareReductionDecl>(D), S,
} else {
return DeclReductions;
// Computes the type of the variable declared in a 'declare mapper' directive
// from its declarator.
//
// Returns 'true' (converted to the TypeResult return type) when the
// declarator has an invalid type; otherwise returns the parsed type created
// from the declarator's type and type source info.
//
// NOTE(review): the statements inside the C++-only branch are not visible in
// this listing.
TypeResult Sema::ActOnOpenMPDeclareMapperVarDecl(Scope *S, Declarator &D) {
TypeSourceInfo *TInfo = GetTypeForDeclarator(D, S);
QualType T = TInfo->getType();
if (D.isInvalidType())
return true;
if (getLangOpts().CPlusPlus) {
// Check that there are no default arguments (C++ only).
return CreateParsedType(T, TInfo);
// Validates the type named in a 'declare mapper' directive.
//
// \a ParsedType must be usable and must yield a non-null type (both are
// asserted). Returns that type when it is a struct, class or union type;
// otherwise emits err_omp_mapper_wrong_type at \a TyLoc and returns a null
// QualType.
QualType Sema::ActOnOpenMPDeclareMapperType(SourceLocation TyLoc,
TypeResult ParsedType) {
assert(ParsedType.isUsable() && "Expect usable parsed mapper type");
QualType MapperType = GetTypeFromParser(ParsedType.get());
assert(!MapperType.isNull() && "Expect valid mapper type");
// [OpenMP 5.0], declare mapper Directive, Restrictions
// The type must be of struct, union or class type in C and C++
if (!MapperType->isStructureOrClassType() && !MapperType->isUnionType()) {
Diag(TyLoc, diag::err_omp_mapper_wrong_type);
return QualType();
return MapperType;
// Starts a 'declare mapper' directive: creates and returns an
// OMPDeclareMapperDecl for \a MapperType named \a Name, with mapper variable
// name \a VN.
//
// Previously declared mappers with the same name are collected into
// PreviousRedeclTypes, keyed by canonical type; if \a MapperType is already
// present, err_omp_declare_mapper_redefinition is emitted at \a StartLoc with
// a note at the previous location. The new declaration receives PrevDMD as
// its previous declaration, and CurContext is set to the new declaration.
//
// NOTE(review): several statements are truncated in this listing (the tail of
// the LookupResult constructor and of FilterLookupForScope, the argument of
// cast<OMPDeclareMapperDecl>(...), the right-hand sides of several
// assignments, and the statements guarded by 'if (Invalid)'). Confirm against
// upstream before relying on the control flow.
OMPDeclareMapperDecl *Sema::ActOnOpenMPDeclareMapperDirectiveStart(
Scope *S, DeclContext *DC, DeclarationName Name, QualType MapperType,
SourceLocation StartLoc, DeclarationName VN, AccessSpecifier AS,
Decl *PrevDeclInScope) {
LookupResult Lookup(*this, Name, SourceLocation(), LookupOMPMapperName,
// [OpenMP 5.0], declare mapper Directive, Restrictions
// A mapper-identifier may not be redeclared in the current scope for the
// same type or for a type that is compatible according to the base language
// rules.
llvm::DenseMap<QualType, SourceLocation> PreviousRedeclTypes;
OMPDeclareMapperDecl *PrevDMD = nullptr;
bool InCompoundScope = true;
// With a scope available, previous declarations are found through name
// lookup; otherwise the chain starting at PrevDeclInScope is walked (see the
// 'else if' branch below).
if (S != nullptr) {
// Find previous declaration with the same name not referenced in other
// declarations.
FunctionScopeInfo *ParentFn = getEnclosingFunction();
InCompoundScope =
(ParentFn != nullptr) && !ParentFn->CompoundScopes.empty();
LookupName(Lookup, S);
FilterLookupForScope(Lookup, DC, S, /*ConsiderLinkage=*/false,
// Maps each found declaration to whether another declaration refers to it as
// its previous declaration in scope.
llvm::DenseMap<OMPDeclareMapperDecl *, bool> UsedAsPrevious;
LookupResult::Filter Filter = Lookup.makeFilter();
while (Filter.hasNext()) {
auto *PrevDecl = cast<OMPDeclareMapperDecl>(;
if (InCompoundScope) {
auto I = UsedAsPrevious.find(PrevDecl);
if (I == UsedAsPrevious.end())
UsedAsPrevious[PrevDecl] = false;
if (OMPDeclareMapperDecl *D = PrevDecl->getPrevDeclInScope())
UsedAsPrevious[D] = true;
PreviousRedeclTypes[PrevDecl->getType().getCanonicalType()] =
if (InCompoundScope) {
// Pick a declaration that no other declaration uses as its previous one.
for (const auto &PrevData : UsedAsPrevious) {
if (!PrevData.second) {
PrevDMD = PrevData.first;
} else if (PrevDeclInScope) {
auto *PrevDMDInScope = PrevDMD =
do {
PreviousRedeclTypes[PrevDMDInScope->getType().getCanonicalType()] =
PrevDMDInScope = PrevDMDInScope->getPrevDeclInScope();
} while (PrevDMDInScope != nullptr);
// Diagnose a redefinition of the mapper for an already-seen canonical type.
const auto I = PreviousRedeclTypes.find(MapperType.getCanonicalType());
bool Invalid = false;
if (I != PreviousRedeclTypes.end()) {
Diag(StartLoc, diag::err_omp_declare_mapper_redefinition)
<< MapperType << Name;
Diag(I->second, diag::note_previous_definition);
Invalid = true;
auto *DMD = OMPDeclareMapperDecl::Create(Context, DC, StartLoc, Name,
MapperType, VN, PrevDMD);
if (Invalid)
// Enter new function scope.
CurContext = DMD;
return DMD;
// Creates the variable named \a VN of type \a MapperType for a 'declare
// mapper' directive, pushes it on the scope chains when \a S is non-null, and
// builds a reference expression to it.
// NOTE(review): no use of MapperVarRefExpr or of \a DMD is visible in this
// listing; a trailing statement may have been lost — confirm against upstream.
void Sema::ActOnOpenMPDeclareMapperDirectiveVarDecl(OMPDeclareMapperDecl *DMD,
Scope *S,
QualType MapperType,
SourceLocation StartLoc,
DeclarationName VN) {
VarDecl *VD = buildVarDecl(*this, StartLoc, MapperType, VN.getAsString());
if (S)
PushOnScopeChains(VD, S);
Expr *MapperVarRefExpr = buildDeclRefExpr(*this, VD, MapperType, StartLoc);
// Finishes a 'declare mapper' directive and returns a declaration group made
// from \a D.
//
// When \a D is non-null it is pushed on the scope chains (without adding it
// to the context) if a scope is available, and its clauses are created from
// \a ClauseList.
//
// NOTE(review): the return-type line preceding the qualified function name is
// not visible in this listing, and the extent of the 'if (D)' body cannot be
// determined because closing braces are missing.
Sema::ActOnOpenMPDeclareMapperDirectiveEnd(OMPDeclareMapperDecl *D, Scope *S,
ArrayRef<OMPClause *> ClauseList) {
if (D) {
if (S)
PushOnScopeChains(D, S, /*AddToContext=*/false);
D->CreateClauses(Context, ClauseList);
return DeclGroupPtrTy::make(DeclGroupRef(D));
// Builds an OMPNumTeamsClause for the expression \a NumTeams.
//
// Returns nullptr when isNonNegativeIntegerValue rejects the expression.
// When the current directive has a capture region for 'num_teams' and the
// current context is not dependent, the expression is wrapped in a full
// expression, captured, and the pre-init statement for the captures is
// attached to the clause.
//
// NOTE(review): the trailing argument(s) of the isNonNegativeIntegerValue
// call are not visible in this listing.
OMPClause *Sema::ActOnOpenMPNumTeamsClause(Expr *NumTeams,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = NumTeams;
Stmt *HelperValStmt = nullptr;
// OpenMP [teams Construct, Restrictions]
// The num_teams expression must evaluate to a positive integer value.
if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_num_teams,
return nullptr;
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
OpenMPDirectiveKind CaptureRegion =
getOpenMPCaptureRegionForClause(DKind, OMPC_num_teams, LangOpts.OpenMP);
if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
return new (Context) OMPNumTeamsClause(ValExpr, HelperValStmt, CaptureRegion,
StartLoc, LParenLoc, EndLoc);
// Builds an OMPThreadLimitClause for the expression \a ThreadLimit.
//
// Returns nullptr when isNonNegativeIntegerValue rejects the expression.
// When the current directive has a capture region for 'thread_limit' and the
// current context is not dependent, the expression is wrapped in a full
// expression, captured, and the pre-init statement for the captures is
// attached to the clause.
//
// NOTE(review): the trailing argument(s) of the isNonNegativeIntegerValue
// call are not visible in this listing.
OMPClause *Sema::ActOnOpenMPThreadLimitClause(Expr *ThreadLimit,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = ThreadLimit;
Stmt *HelperValStmt = nullptr;
// OpenMP [teams Construct, Restrictions]
// The thread_limit expression must evaluate to a positive integer value.
if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_thread_limit,
return nullptr;
OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause(
DKind, OMPC_thread_limit, LangOpts.OpenMP);
if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
return new (Context) OMPThreadLimitClause(
ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc);
// Builds an OMPPriorityClause for the expression \a Priority.
//
// The expression is validated by isNonNegativeIntegerValue with
// StrictlyPositive=false and BuildCapture=true; the helper receives the
// addresses of CaptureRegion and HelperValStmt, whose resulting values are
// passed on to the clause. Returns nullptr when validation fails.
OMPClause *Sema::ActOnOpenMPPriorityClause(Expr *Priority,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = Priority;
Stmt *HelperValStmt = nullptr;
OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
// OpenMP [2.9.1, task Construct]
// The priority-value is a non-negative numerical scalar expression.
if (!isNonNegativeIntegerValue(
ValExpr, *this, OMPC_priority,
/*StrictlyPositive=*/false, /*BuildCapture=*/true,
DSAStack->getCurrentDirective(), &CaptureRegion, &HelperValStmt))
return nullptr;
return new (Context) OMPPriorityClause(ValExpr, HelperValStmt, CaptureRegion,
StartLoc, LParenLoc, EndLoc);
// Builds an OMPGrainsizeClause for the expression \a Grainsize.
//
// The expression is validated by isNonNegativeIntegerValue with
// StrictlyPositive=true and BuildCapture=true; the helper receives the
// addresses of CaptureRegion and HelperValStmt, whose resulting values are
// passed on to the clause. Returns nullptr when validation fails.
OMPClause *Sema::ActOnOpenMPGrainsizeClause(Expr *Grainsize,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = Grainsize;
Stmt *HelperValStmt = nullptr;
OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
// OpenMP [2.9.2, taskloop Construct]
// The parameter of the grainsize clause must be a positive integer
// expression.
if (!isNonNegativeIntegerValue(
ValExpr, *this, OMPC_grainsize,
/*StrictlyPositive=*/true, /*BuildCapture=*/true,
DSAStack->getCurrentDirective(), &CaptureRegion, &HelperValStmt))
return nullptr;
return new (Context) OMPGrainsizeClause(ValExpr, HelperValStmt, CaptureRegion,
StartLoc, LParenLoc, EndLoc);
// Builds an OMPNumTasksClause for the expression \a NumTasks.
//
// The expression is validated by isNonNegativeIntegerValue with
// StrictlyPositive=true and BuildCapture=true; the helper receives the
// addresses of CaptureRegion and HelperValStmt, whose resulting values are
// passed on to the clause. Returns nullptr when validation fails.
OMPClause *Sema::ActOnOpenMPNumTasksClause(Expr *NumTasks,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
Expr *ValExpr = NumTasks;
Stmt *HelperValStmt = nullptr;
OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
// OpenMP [2.9.2, taskloop Construct]
// The parameter of the num_tasks clause must be a positive integer
// expression.
if (!isNonNegativeIntegerValue(
ValExpr, *this, OMPC_num_tasks,
/*StrictlyPositive=*/true, /*BuildCapture=*/true,
DSAStack->getCurrentDirective(), &CaptureRegion, &HelperValStmt))
return nullptr;
return new (Context) OMPNumTasksClause(ValExpr, HelperValStmt, CaptureRegion,
StartLoc, LParenLoc, EndLoc);
// Builds an OMPHintClause for the expression \a Hint.
//
// The expression is checked by VerifyPositiveIntegerConstantInClause for the
// 'hint' clause; nullptr is returned when that check yields an invalid
// result, otherwise the clause is created from the verified expression.
OMPClause *Sema::ActOnOpenMPHintClause(Expr *Hint, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
// OpenMP [2.13.2, critical construct, Description]
// ... where hint-expression is an integer constant expression that evaluates
// to a valid lock hint.
ExprResult HintExpr = VerifyPositiveIntegerConstantInClause(Hint, OMPC_hint);
if (HintExpr.isInvalid())
return nullptr;
return new (Context)
OMPHintClause(HintExpr.get(), StartLoc, LParenLoc, EndLoc);
/// Tries to find omp_event_handle_t type.
///
/// Returns true immediately when \p Stack already holds a non-null
/// omp_event_handle_t type. Otherwise the identifier "omp_event_handle_t" is
/// looked up as a type name at \p Loc in the current scope of \p S; if no
/// type is found, err_omp_implied_type_not_found is emitted and false is
/// returned.
///
/// NOTE(review): in this listing the type found by the lookup is never stored
/// back on \p Stack before returning true, so the early-return check above
/// could not succeed as a result of this function. A statement may have been
/// lost here — confirm against upstream.
static bool findOMPEventHandleT(Sema &S, SourceLocation Loc,
DSAStackTy *Stack) {
QualType OMPEventHandleT = Stack->getOMPEventHandleT();
if (!OMPEventHandleT.isNull())
return true;
IdentifierInfo *II = &S.PP.getIdentifierTable().get("omp_event_handle_t");
ParsedType PT = S.getTypeName(*II, Loc, S.getCurScope());
if (!PT.getAsOpaquePtr() || PT.get().isNull()) {
S.Diag(Loc, diag::err_omp_implied_type_not_found) << "omp_event_handle_t";
return false;
return true;
// Builds an OMPDetachClause for the event-handle expression \a Evt.
//
// For a non-dependent expression the following checks are made, each
// returning nullptr on failure:
//  - the omp_event_handle_t type must be found (findOMPEventHandleT);
//  - the expression, ignoring parentheses and implicit casts, must be a
//    DeclRefExpr that refers to a VarDecl;
//  - the variable's type must match omp_event_handle_t (ignoring qualifiers)
//    and must not be constant;
//  - the variable's top data-sharing attribute, if any explicit one exists,
//    must be 'firstprivate'.
// NOTE(review): where the dependent-expression guard ends cannot be determined
// from this listing because closing braces are missing.
OMPClause *Sema::ActOnOpenMPDetachClause(Expr *Evt, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
if (!Evt->isValueDependent() && !Evt->isTypeDependent() &&
!Evt->isInstantiationDependent() &&
!Evt->containsUnexpandedParameterPack()) {
if (!findOMPEventHandleT(*this, Evt->getExprLoc(), DSAStack))
return nullptr;
// OpenMP 5.0, 2.10.1 task Construct.
// event-handle is a variable of the omp_event_handle_t type.
auto *Ref = dyn_cast<DeclRefExpr>(Evt->IgnoreParenImpCasts());
if (!Ref) {
Diag(Evt->getExprLoc(), diag::err_omp_var_expected)
<< "omp_event_handle_t" << 0 << Evt->getSourceRange();
return nullptr;
auto *VD = dyn_cast_or_null<VarDecl>(Ref->getDecl());
if (!VD) {
Diag(Evt->getExprLoc(), diag::err_omp_var_expected)
<< "omp_event_handle_t" << 0 << Evt->getSourceRange();
return nullptr;
// The variable must have the (unqualified) event handle type and must be
// non-constant.
if (!Context.hasSameUnqualifiedType(DSAStack->getOMPEventHandleT(),
VD->getType()) ||
VD->getType().isConstant(Context)) {
Diag(Evt->getExprLoc(), diag::err_omp_var_expected)
<< "omp_event_handle_t" << 1 << VD->getType()
<< Evt->getSourceRange();
return nullptr;
// OpenMP 5.0, 2.10.1 task Construct
// [detach clause]... The event-handle will be considered as if it was
// specified on a firstprivate clause.
DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(VD, /*FromParent=*/false);
if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_firstprivate &&
DVar.RefExpr) {
Diag(Evt->getExprLoc(), diag::err_omp_wrong_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_firstprivate);
reportOriginalDsa(*this, DSAStack, VD, DVar);
return nullptr;
return new (Context) OMPDetachClause(Evt, StartLoc, LParenLoc, EndLoc);
// Builds an OMPDistScheduleClause for schedule kind \a Kind and the optional
// \a ChunkSize expression.
//
// Returns nullptr when:
//  - \a Kind is OMPC_DIST_SCHEDULE_unknown (diagnosed at \a KindLoc with the
//    name of dist_schedule kind 0 as the expected value);
//  - the implicit integer conversion of a non-dependent chunk size fails;
//  - the chunk size is an integer constant expression whose value is signed
//    and not strictly positive.
// A non-constant chunk size is captured (with a pre-init statement) when the
// current directive has a capture region for 'dist_schedule' and the current
// context is not dependent.
OMPClause *Sema::ActOnOpenMPDistScheduleClause(
OpenMPDistScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc,
SourceLocation LParenLoc, SourceLocation KindLoc, SourceLocation CommaLoc,
SourceLocation EndLoc) {
if (Kind == OMPC_DIST_SCHEDULE_unknown) {
// Build the quoted list of expected values for the diagnostic.
std::string Values;
Values += "'";
Values += getOpenMPSimpleClauseTypeName(OMPC_dist_schedule, 0);
Values += "'";
Diag(KindLoc, diag::err_omp_unexpected_clause_value)
<< Values << getOpenMPClauseName(OMPC_dist_schedule);
return nullptr;
Expr *ValExpr = ChunkSize;
Stmt *HelperValStmt = nullptr;
if (ChunkSize) {
// Dependent chunk sizes are passed through unchecked.
if (!ChunkSize->isValueDependent() && !ChunkSize->isTypeDependent() &&
!ChunkSize->isInstantiationDependent() &&
!ChunkSize->containsUnexpandedParameterPack()) {
SourceLocation ChunkSizeLoc = ChunkSize->getBeginLoc();
ExprResult Val =
PerformOpenMPImplicitIntegerConversion(ChunkSizeLoc, ChunkSize);
if (Val.isInvalid())
return nullptr;
ValExpr = Val.get();
// OpenMP [2.7.1, Restrictions]
// chunk_size must be a loop invariant integer expression with a positive
// value.
llvm::APSInt Result;
if (ValExpr->isIntegerConstantExpr(Result, Context)) {
if (Result.isSigned() && !Result.isStrictlyPositive()) {
Diag(ChunkSizeLoc, diag::err_omp_negative_expression_in_clause)
<< "dist_schedule" << ChunkSize->getSourceRange();
return nullptr;
} else if (getOpenMPCaptureRegionForClause(
DSAStack->getCurrentDirective(), OMPC_dist_schedule,
LangOpts.OpenMP) != OMPD_unknown &&
!CurContext->isDependentContext()) {
ValExpr = MakeFullExpr(ValExpr).get();
llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
HelperValStmt = buildPreInits(Context, Captures);
return new (Context)
OMPDistScheduleClause(StartLoc, LParenLoc, KindLoc, CommaLoc, EndLoc,
Kind, ValExpr, HelperValStmt);
// Builds an OMPDefaultmapClause for modifier \a M and variable category
// \a Kind.
//
// Validation differs by OpenMP version: one path is taken when the version is
// below 5.0, the other otherwise. In the 5.0+ path the modifier must be known
// and the kind must either be known or, for OpenMP >= 5.0, be absent (invalid
// \a KindLoc); violations are diagnosed with err_omp_unexpected_clause_value
// listing the accepted values, and nullptr is returned. A second defaultmap
// clause for the same category is diagnosed with
// err_omp_one_defaultmap_each_category. When \a Kind is
// OMPC_DEFAULTMAP_unknown, the modifier is recorded for the aggregate, scalar
// and pointer categories; otherwise only for \a Kind.
//
// NOTE(review): in the pre-5.0 path the first part of the condition, the
// inner 'if' that selects between the modifier and kind diagnostics, and the
// trailing arguments of both getOpenMPSimpleClauseTypeName calls are not
// visible in this listing; confirm against upstream.
OMPClause *Sema::ActOnOpenMPDefaultmapClause(
OpenMPDefaultmapClauseModifier M, OpenMPDefaultmapClauseKind Kind,
SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation MLoc,
SourceLocation KindLoc, SourceLocation EndLoc) {
if (getLangOpts().OpenMP < 50) {
Kind != OMPC_DEFAULTMAP_scalar) {
std::string Value;
SourceLocation Loc;
Value += "'";
Value += getOpenMPSimpleClauseTypeName(OMPC_defaultmap,
Loc = MLoc;
} else {
Value += getOpenMPSimpleClauseTypeName(OMPC_defaultmap,
Loc = KindLoc;
Value += "'";
Diag(Loc, diag::err_omp_unexpected_clause_value)
<< Value << getOpenMPClauseName(OMPC_defaultmap);
return nullptr;
} else {
bool isDefaultmapModifier = (M != OMPC_DEFAULTMAP_MODIFIER_unknown);
// A missing kind is accepted for OpenMP >= 5.0 when no kind location exists.
bool isDefaultmapKind = (Kind != OMPC_DEFAULTMAP_unknown) ||
(LangOpts.OpenMP >= 50 && KindLoc.isInvalid());
if (!isDefaultmapKind || !isDefaultmapModifier) {
std::string ModifierValue = "'alloc', 'from', 'to', 'tofrom', "
"'firstprivate', 'none', 'default'";
std::string KindValue = "'scalar', 'aggregate', 'pointer'";
// Diagnose whichever of the two parts (or both) is invalid.
if (!isDefaultmapKind && isDefaultmapModifier) {
Diag(KindLoc, diag::err_omp_unexpected_clause_value)
<< KindValue << getOpenMPClauseName(OMPC_defaultmap);
} else if (isDefaultmapKind && !isDefaultmapModifier) {
Diag(MLoc, diag::err_omp_unexpected_clause_value)
<< ModifierValue << getOpenMPClauseName(OMPC_defaultmap);
} else {
Diag(MLoc, diag::err_omp_unexpected_clause_value)
<< ModifierValue << getOpenMPClauseName(OMPC_defaultmap);
Diag(KindLoc, diag::err_omp_unexpected_clause_value)
<< KindValue << getOpenMPClauseName(OMPC_defaultmap);
return nullptr;
// OpenMP [5.0, 2.12.5, Restrictions, p. 174]
// At most one defaultmap clause for each category can appear on the
// directive.
if (DSAStack->checkDefaultmapCategory(Kind)) {
Diag(StartLoc, diag::err_omp_one_defaultmap_each_category);
return nullptr;
if (Kind == OMPC_DEFAULTMAP_unknown) {
// Variable category is not specified - mark all categories.
DSAStack->setDefaultDMAAttr(M, OMPC_DEFAULTMAP_aggregate, StartLoc);
DSAStack->setDefaultDMAAttr(M, OMPC_DEFAULTMAP_scalar, StartLoc);
DSAStack->setDefaultDMAAttr(M, OMPC_DEFAULTMAP_pointer, StartLoc);
} else {
DSAStack->setDefaultDMAAttr(M, Kind, StartLoc);
return new (Context)
OMPDefaultmapClause(StartLoc, LParenLoc, MLoc, KindLoc, EndLoc, Kind, M);
// Checks whether a 'declare target' directive may start at \a Loc.
//
// Returns false, after emitting err_omp_region_not_file_context, when the
// current lexical context is none of: a file context, an extern "C" or
// extern "C++" context, a CXXRecordDecl, a ClassTemplateDecl, or a class
// template (partial) specialization. Returns true otherwise.
//
// NOTE(review): no update of DeclareTargetNestingLevel is visible here,
// although ActOnFinishOpenMPDeclareTargetDirective asserts it is positive; a
// statement may have been lost — confirm against upstream.
bool Sema::ActOnStartOpenMPDeclareTargetDirective(SourceLocation Loc) {
DeclContext *CurLexicalContext = getCurLexicalContext();
if (!CurLexicalContext->isFileContext() &&
!CurLexicalContext->isExternCContext() &&
!CurLexicalContext->isExternCXXContext() &&
!isa<CXXRecordDecl>(CurLexicalContext) &&
!isa<ClassTemplateDecl>(CurLexicalContext) &&
!isa<ClassTemplatePartialSpecializationDecl>(CurLexicalContext) &&
!isa<ClassTemplateSpecializationDecl>(CurLexicalContext)) {
Diag(Loc, diag::err_omp_region_not_file_context);
return false;
return true;
// Finishes a 'declare target' directive. Asserts that at least one such
// directive is currently open (DeclareTargetNestingLevel > 0).
// NOTE(review): the remainder of the body is not visible in this listing;
// confirm against upstream.
void Sema::ActOnFinishOpenMPDeclareTargetDirective() {
assert(DeclareTargetNestingLevel > 0 &&
"Unexpected ActOnFinishOpenMPDeclareTargetDirective");
// Looks up the name \a Id (optionally qualified by \a ScopeSpec) named in a
// 'declare target' directive and returns the declaration found.
//
// Returns nullptr when the lookup is ambiguous, when it does not yield a
// single result (after attempting typo correction and diagnosing), or when
// the result is not a variable, function or function template
// (err_omp_invalid_target_decl). If the canonical declaration was already
// present in \a SameDirectiveDecls, err_omp_declare_target_multiple is
// emitted, but the declaration is still returned.
NamedDecl *
Sema::lookupOpenMPDeclareTargetName(Scope *CurScope, CXXScopeSpec &ScopeSpec,
const DeclarationNameInfo &Id,
NamedDeclSetType &SameDirectiveDecls) {
LookupResult Lookup(*this, Id, LookupOrdinaryName);
LookupParsedName(Lookup, CurScope, &ScopeSpec, true);
if (Lookup.isAmbiguous())
return nullptr;
if (!Lookup.isSingleResult()) {
// Try typo correction restricted by VarOrFuncDeclFilterCCC; a successful
// correction is diagnosed but still results in nullptr.
VarOrFuncDeclFilterCCC CCC(*this);
if (TypoCorrection Corrected =
CorrectTypo(Id, LookupOrdinaryName, CurScope, nullptr, CCC,
CTK_ErrorRecovery)) {
diagnoseTypo(Corrected, PDiag(diag::err_undeclared_var_use_suggest)
<< Id.getName());
checkDeclIsAllowedInOpenMPTarget(nullptr, Corrected.getCorrectionDecl());
return nullptr;
Diag(Id.getLoc(), diag::err_undeclared_var_use) << Id.getName();
return nullptr;
NamedDecl *ND = Lookup.getAsSingle<NamedDecl>();
if (!isa<VarDecl>(ND) && !isa<FunctionDecl>(ND) &&
!isa<FunctionTemplateDecl>(ND)) {
Diag(Id.getLoc(), diag::err_omp_invalid_target_decl) << Id.getName();
return nullptr;
// A failed insert means the same declaration was already listed.
if (!SameDirectiveDecls.insert(cast<NamedDecl>(ND->getCanonicalDecl())))
Diag(Id.getLoc(), diag::err_omp_declare_target_multiple) << Id.getName();
return ND;
/// Handles a declaration \p ND named in a declare target directive at \p Loc
/// with map type \p MT and device type \p DT.
///
/// Warns (OpenMP >= 5.0) when the declaration was already used or referenced,
/// diagnoses a device-type mismatch against a previously recorded device type,
/// creates an implicit OMPDeclareTargetDeclAttr when no map type was recorded
/// yet, and diagnoses err_omp_declare_target_to_and_link when a different map
/// type was recorded before.
// NOTE(review): the initializers of 'DevTy' and 'Res' are truncated in this
// copy (each declaration ends at '='), and closing braces are absent; the
// statement attaching the created attribute to ND is also not visible.
void Sema::ActOnOpenMPDeclareTargetName(
NamedDecl *ND, SourceLocation Loc, OMPDeclareTargetDeclAttr::MapTypeTy MT,
OMPDeclareTargetDeclAttr::DevTypeTy DT) {
assert((isa<VarDecl>(ND) || isa<FunctionDecl>(ND) ||
isa<FunctionTemplateDecl>(ND)) &&
"Expected variable, function or function template.");
// Diagnose marking after use as it may lead to incorrect diagnosis and
// codegen.
if (LangOpts.OpenMP >= 50 &&
(ND->isUsed(/*CheckUsedAttr=*/false) || ND->isReferenced()))
Diag(Loc, diag::warn_omp_declare_target_after_first_use);
Optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
if (DevTy.hasValue() && *DevTy != DT) {
Diag(Loc, diag::err_omp_device_type_mismatch)
<< OMPDeclareTargetDeclAttr::ConvertDevTypeTyToStr(DT)
<< OMPDeclareTargetDeclAttr::ConvertDevTypeTyToStr(*DevTy);
Optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
if (!Res) {
// First marking of this declaration: build the implicit attribute and
// notify the AST mutation listener, if one is installed.
auto *A = OMPDeclareTargetDeclAttr::CreateImplicit(Context, MT, DT,
SourceRange(Loc, Loc));
if (ASTMutationListener *ML = Context.getASTMutationListener())
ML->DeclarationMarkedOpenMPDeclareTarget(ND, A);
checkDeclIsAllowedInOpenMPTarget(nullptr, ND, Loc);
} else if (*Res != MT) {
Diag(Loc, diag::err_omp_declare_target_to_and_link) << ND;
/// Diagnoses the use (at \p SL, range \p SR) of variable \p D inside a declare
/// target context. Non-variable or null declarations are not handled.
///
/// For OpenMP >= 5.0, a variable with global storage used inside a lambda,
/// block or captured region is expected to have the MT_To map type; otherwise
/// note_var_explicitly_captured_here is emitted. Separately,
/// warn_omp_not_in_target_context / note_used_here are emitted for the
/// variable.
// NOTE(review): the bodies of the guards 'if (!D || ...)' and
// 'if (MapTy.hasValue())', both 'MapTy' initializers, and all closing braces
// are missing in this copy; the exact control flow cannot be read from here.
static void checkDeclInTargetContext(SourceLocation SL, SourceRange SR,
Sema &SemaRef, Decl *D) {
if (!D || !isa<VarDecl>(D))
auto *VD = cast<VarDecl>(D);
Optional<OMPDeclareTargetDeclAttr::MapTypeTy> MapTy =
if (SemaRef.LangOpts.OpenMP >= 50 &&
(SemaRef.getCurLambda(/*IgnoreNonLambdaCapturingScope=*/true) ||
SemaRef.getCurBlock() || SemaRef.getCurCapturedRegion()) &&
VD->hasGlobalStorage()) {
llvm::Optional<OMPDeclareTargetDeclAttr::MapTypeTy> MapTy =
if (!MapTy || *MapTy != OMPDeclareTargetDeclAttr::MT_To) {
// OpenMP 5.0, 2.12.7 declare target Directive, Restrictions
// If a lambda declaration and definition appears between a
// declare target directive and the matching end declare target
// directive, all variables that are captured by the lambda
// expression must also appear in a to clause.
SemaRef.Diag(SL, diag::note_var_explicitly_captured_here)
<< VD << 0 << SR;
if (MapTy.hasValue())
SemaRef.Diag(VD->getLocation(), diag::warn_omp_not_in_target_context);
SemaRef.Diag(SL, diag::note_used_here) << SR;
/// Returns true when \p VD is already a declare target declaration or when
/// checkTypeMappable accepts its type.
// NOTE(review): the checkTypeMappable call is truncated after
// 'VD->getType(),' in this copy; the remaining argument(s) and the closing
// brace are absent.
static bool checkValueDeclInTarget(SourceLocation SL, SourceRange SR,
Sema &SemaRef, DSAStackTy *Stack,
ValueDecl *VD) {
return OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD) ||
checkTypeMappable(SL, SR, SemaRef, Stack, VD->getType(),
/// Checks whether declaration \p D (optionally referenced through expression
/// \p E, or named at \p IdLoc) may appear in an OpenMP declare target context.
///
/// Visible checks: threadprivate variables are rejected
/// (err_omp_threadprivate_in_target); functions whose recorded map type is
/// MT_Link are rejected when \p IdLoc is valid
/// (err_omp_function_in_link_clause); value declarations are passed to
/// checkValueDeclInTarget. When there is no expression and the declaration is
/// not yet a declare target declaration, an implicit MT_To / DT_Any attribute
/// is created for variables, functions and function templates.
// NOTE(review): many guard bodies (presumably early returns), the initializer
// of 'Res', part of the 'isStaticLocal' condition and all closing braces are
// missing in this copy. In particular the final 'if (!E)' is immediately
// followed by a statement dereferencing E, so the guard body must have been
// lost; confirm against upstream.
void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D,
SourceLocation IdLoc) {
if (!D || D->isInvalidDecl())
SourceRange SR = E ? E->getSourceRange() : D->getSourceRange();
SourceLocation SL = E ? E->getBeginLoc() : D->getLocation();
if (auto *VD = dyn_cast<VarDecl>(D)) {
// Only global variables can be marked as declare target.
if (!VD->isFileVarDecl() && !VD->isStaticLocal() &&
// 2.10.6: threadprivate variable cannot appear in a declare target
// directive.
if (DSAStack->isThreadPrivate(VD)) {
Diag(SL, diag::err_omp_threadprivate_in_target);
reportOriginalDsa(*this, DSAStack, VD, DSAStack->getTopDSA(VD, false));
// Function templates are checked through their templated declaration.
if (const auto *FTD = dyn_cast<FunctionTemplateDecl>(D))
D = FTD->getTemplatedDecl();
if (auto *FD = dyn_cast<FunctionDecl>(D)) {
llvm::Optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
if (IdLoc.isValid() && Res && *Res == OMPDeclareTargetDeclAttr::MT_Link) {
Diag(IdLoc, diag::err_omp_function_in_link_clause);
Diag(FD->getLocation(), diag::note_defined_here) << FD;
if (auto *VD = dyn_cast<ValueDecl>(D)) {
// Problem if any with var declared with incomplete type will be reported
// as normal, so no need to check it here.
if ((E || !VD->getType()->isIncompleteType()) &&
!checkValueDeclInTarget(SL, SR, *this, DSAStack, VD))
if (!E && !OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) {
// Checking declaration inside declare target region.
if (isa<VarDecl>(D) || isa<FunctionDecl>(D) ||
isa<FunctionTemplateDecl>(D)) {
auto *A = OMPDeclareTargetDeclAttr::CreateImplicit(
Context, OMPDeclareTargetDeclAttr::MT_To,
OMPDeclareTargetDeclAttr::DT_Any, SourceRange(IdLoc, IdLoc));
if (ASTMutationListener *ML = Context.getASTMutationListener())
ML->DeclarationMarkedOpenMPDeclareTarget(D, A);
if (!E)
checkDeclInTargetContext(E->getExprLoc(), E->getSourceRange(), *this, D);
/// Builds an OpenMP 'to' clause from the list items in \p VarList.
///
/// The items are validated by checkMappableExpressionList using the mapper
/// identified by \p MapperIdScopeSpec / \p MapperId and the pre-resolved
/// \p UnresolvedMappers; \p Locs supplies the clause source locations.
///
/// \returns the new OMPToClause, or nullptr when no list item survived
/// validation.
OMPClause *Sema::ActOnOpenMPToClause(ArrayRef<Expr *> VarList,
                                     CXXScopeSpec &MapperIdScopeSpec,
                                     DeclarationNameInfo &MapperId,
                                     const OMPVarListLocTy &Locs,
                                     ArrayRef<Expr *> UnresolvedMappers) {
  MappableVarListInfo MVLI(VarList);
  checkMappableExpressionList(*this, DSAStack, OMPC_to, MVLI, Locs.StartLoc,
                              MapperIdScopeSpec, MapperId, UnresolvedMappers);
  // Nothing valid to attach to the clause.
  if (MVLI.ProcessedVarList.empty())
    return nullptr;

  return OMPToClause::Create(
      Context, Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations,
      MVLI.VarComponents, MVLI.UDMapperList,
      MapperIdScopeSpec.getWithLocInContext(Context), MapperId);
}
/// Builds an OpenMP 'from' clause from the list items in \p VarList.
///
/// The items are validated by checkMappableExpressionList using the mapper
/// identified by \p MapperIdScopeSpec / \p MapperId and the pre-resolved
/// \p UnresolvedMappers; \p Locs supplies the clause source locations.
///
/// \returns the new OMPFromClause, or nullptr when no list item survived
/// validation.
OMPClause *Sema::ActOnOpenMPFromClause(ArrayRef<Expr *> VarList,
                                       CXXScopeSpec &MapperIdScopeSpec,
                                       DeclarationNameInfo &MapperId,
                                       const OMPVarListLocTy &Locs,
                                       ArrayRef<Expr *> UnresolvedMappers) {
  MappableVarListInfo MVLI(VarList);
  checkMappableExpressionList(*this, DSAStack, OMPC_from, MVLI, Locs.StartLoc,
                              MapperIdScopeSpec, MapperId, UnresolvedMappers);
  // Nothing valid to attach to the clause.
  if (MVLI.ProcessedVarList.empty())
    return nullptr;

  return OMPFromClause::Create(
      Context, Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations,
      MVLI.VarComponents, MVLI.UDMapperList,
      MapperIdScopeSpec.getWithLocInContext(Context), MapperId);
}
/// Builds an OpenMP 'use_device_ptr' clause from \p VarList.
///
/// For each list item the visible code: resolves the referenced declaration
/// via getPrivateItem, requires its non-reference type to be a pointer
/// (err_omp_usedeviceptr_not_a_pointer otherwise), builds a private copy and
/// a '.devptr.temp' variable used to initialize it, captures non-variable
/// items, marks the item firstprivate on the DSA stack, and records a single
/// mappable component. Returns nullptr when no item was processed.
// NOTE(review): guard bodies (after 'if (Res.second)', 'if (!D)',
// the pointer-type check and 'if (VDPrivate->isInvalidDecl())'), the
// statements that fill PrivateCopies / Inits, the receiver of the
// MappableComponent construction and all closing braces are missing in this
// copy; confirm against upstream.
OMPClause *Sema::ActOnOpenMPUseDevicePtrClause(ArrayRef<Expr *> VarList,
const OMPVarListLocTy &Locs) {
MappableVarListInfo MVLI(VarList);
SmallVector<Expr *, 8> PrivateCopies;
SmallVector<Expr *, 8> Inits;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP use_device_ptr clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
QualType Type = D->getType();
Type = Type.getNonReferenceType().getUnqualifiedType();
auto *VD = dyn_cast<VarDecl>(D);
// Item should be a pointer or reference to pointer.
if (!Type->isPointerType()) {
Diag(ELoc, diag::err_omp_usedeviceptr_not_a_pointer)
<< 0 << RefExpr->getSourceRange();
// Build the private variable and the expression that refers to it.
auto VDPrivate =
buildVarDecl(*this, ELoc, Type, D->getName(),
D->hasAttrs() ? &D->getAttrs() : nullptr,
VD ? cast<DeclRefExpr>(SimpleRefExpr) : nullptr);
if (VDPrivate->isInvalidDecl())
DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr(
*this, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc);
// Add temporary variable to initialize the private copy of the pointer.
VarDecl *VDInit =
buildVarDecl(*this, RefExpr->getExprLoc(), Type, ".devptr.temp");
DeclRefExpr *VDInitRefExpr = buildDeclRefExpr(
*this, VDInit, RefExpr->getType(), RefExpr->getExprLoc());
// If required, build a capture to implement the privatization initialized
// with the current list item value.
DeclRefExpr *Ref = nullptr;
if (!VD)
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
MVLI.ProcessedVarList.push_back(VD ? RefExpr->IgnoreParens() : Ref);
// We need to add a data sharing attribute for this variable to make sure it
// is correctly captured. A variable that shows up in a use_device_ptr has
// similar properties of a first private variable.
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_firstprivate, Ref);
// Create a mappable component for the list item. List items in this clause
// only need a component.
MVLI.VarComponents.resize(MVLI.VarComponents.size() + 1);
OMPClauseMappableExprCommon::MappableComponent(SimpleRefExpr, D));
if (MVLI.ProcessedVarList.empty())
return nullptr;
return OMPUseDevicePtrClause::Create(
Context, Locs, MVLI.ProcessedVarList, PrivateCopies, Inits,
MVLI.VarBaseDeclarations, MVLI.VarComponents);
/// Builds an OpenMP 'use_device_addr' clause from \p VarList.
///
/// For each list item the visible code resolves the declaration via
/// getPrivateItem, captures non-variable items, marks the item firstprivate
/// on the DSA stack and records one mappable component (after a
/// function/array/lvalue conversion for array-section-like items). Returns
/// nullptr when no item was processed.
// NOTE(review): the getPrivateItem call, the array-section condition and the
// final Create call are truncated, several guard bodies are absent, and
// closing braces are missing in this copy; confirm against upstream.
OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef<Expr *> VarList,
const OMPVarListLocTy &Locs) {
MappableVarListInfo MVLI(VarList);
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP use_device_addr clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange,
if (Res.second) {
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
auto *VD = dyn_cast<VarDecl>(D);
// If required, build a capture to implement the privatization initialized
// with the current list item value.
DeclRefExpr *Ref = nullptr;
if (!VD)
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true);
MVLI.ProcessedVarList.push_back(VD ? RefExpr->IgnoreParens() : Ref);
// We need to add a data sharing attribute for this variable to make sure it
// is correctly captured. A variable that shows up in a use_device_addr has
// similar properties of a first private variable.
DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_firstprivate, Ref);
// Create a mappable component for the list item. List items in this clause
// only need a component.
Expr *Component = SimpleRefExpr;
if (VD && (isa<OMPArraySectionExpr>(RefExpr->IgnoreParenImpCasts()) ||
Component = DefaultFunctionArrayLvalueConversion(SimpleRefExpr).get();
OMPClauseMappableExprCommon::MappableComponent(Component, D));
if (MVLI.ProcessedVarList.empty())
return nullptr;
return OMPUseDeviceAddrClause::Create(Context, Locs, MVLI.ProcessedVarList,
/// Builds an OpenMP 'is_device_ptr' clause from \p VarList.
///
/// For each list item the visible code: resolves the declaration via
/// getPrivateItem; requires a pointer or array type (or a reference to one),
/// else err_omp_argument_type_isdeviceptr; rejects items that already have a
/// private data-sharing attribute; rejects items that already appear in a
/// mappable component list of the current region
/// (err_omp_map_shared_storage); and records a mappable component for the
/// item. Returns nullptr when no item was processed.
// NOTE(review): guard bodies, the lambda introducer passed to
// checkMappableExprComponentListsForDecl, the receivers of the calls taking
// '(D, MC, ...)' and the component construction, the push to
// ProcessedVarList, the tail of the Create call and all closing braces are
// missing in this copy; confirm against upstream.
OMPClause *Sema::ActOnOpenMPIsDevicePtrClause(ArrayRef<Expr *> VarList,
const OMPVarListLocTy &Locs) {
MappableVarListInfo MVLI(VarList);
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP is_device_ptr clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
QualType Type = D->getType();
// item should be a pointer or array or reference to pointer or array
if (!Type.getNonReferenceType()->isPointerType() &&
!Type.getNonReferenceType()->isArrayType()) {
Diag(ELoc, diag::err_omp_argument_type_isdeviceptr)
<< 0 << RefExpr->getSourceRange();
// Check if the declaration in the clause does not show up in any data
// sharing attribute.
DSAStackTy::DSAVarData DVar = DSAStack->getTopDSA(D, /*FromParent=*/false);
if (isOpenMPPrivate(DVar.CKind)) {
Diag(ELoc, diag::err_omp_variable_in_given_clause_and_dsa)
<< getOpenMPClauseName(DVar.CKind)
<< getOpenMPClauseName(OMPC_is_device_ptr)
<< getOpenMPDirectiveName(DSAStack->getCurrentDirective());
reportOriginalDsa(*this, DSAStack, D, DVar);
// The callback stores the first conflicting expression so it can be named in
// the note below.
const Expr *ConflictExpr;
if (DSAStack->checkMappableExprComponentListsForDecl(
D, /*CurrentRegionOnly=*/true,
OMPClauseMappableExprCommon::MappableExprComponentListRef R,
OpenMPClauseKind) -> bool {
ConflictExpr = R.front().getAssociatedExpression();
return true;
})) {
Diag(ELoc, diag::err_omp_map_shared_storage) << RefExpr->getSourceRange();
Diag(ConflictExpr->getExprLoc(), diag::note_used_here)
<< ConflictExpr->getSourceRange();
// Store the components in the stack so that they can be used to check
// against other clauses later on.
OMPClauseMappableExprCommon::MappableComponent MC(SimpleRefExpr, D);
D, MC, /*WhereFoundClauseKind=*/OMPC_is_device_ptr);
// Record the expression we've just processed.
// Create a mappable component for the list item. List items in this clause
// only need a component. We use a null declaration to signal fields in
// 'this'.
assert((isa<DeclRefExpr>(SimpleRefExpr) ||
isa<CXXThisExpr>(cast<MemberExpr>(SimpleRefExpr)->getBase())) &&
"Unexpected device pointer expression!");
isa<DeclRefExpr>(SimpleRefExpr) ? D : nullptr);
MVLI.VarComponents.resize(MVLI.VarComponents.size() + 1);
if (MVLI.ProcessedVarList.empty())
return nullptr;
return OMPIsDevicePtrClause::Create(Context, Locs, MVLI.ProcessedVarList,
/// Builds an OpenMP 'allocate' clause for \p VarList with the optional
/// \p Allocator expression.
///
/// When an allocator is given it is converted to an rvalue and implicitly
/// converted (nullptr is returned if omp_allocator_handle_t cannot be found
/// or a conversion fails). Without an allocator, a device compilation emits
/// err_expected_allocator_expression under the visible condition. Each list
/// item is resolved via getPrivateItem; non-variable items are captured
/// outside dependent contexts. Returns nullptr when no item was collected.
// NOTE(review): the PerformImplicitConversion argument list, the second half
// of the 'LangOpts.OpenMPIsDevice &&' condition, several guard bodies, the
// body of 'if (Allocator)' near the end and all closing braces are missing
// in this copy. The assertion text says "private clause" although this
// handles 'allocate'; it is a runtime string and is left unchanged.
OMPClause *Sema::ActOnOpenMPAllocateClause(
Expr *Allocator, ArrayRef<Expr *> VarList, SourceLocation StartLoc,
SourceLocation ColonLoc, SourceLocation LParenLoc, SourceLocation EndLoc) {
if (Allocator) {
// OpenMP [2.11.4 allocate Clause, Description]
// allocator is an expression of omp_allocator_handle_t type.
if (!findOMPAllocatorHandleT(*this, Allocator->getExprLoc(), DSAStack))
return nullptr;
ExprResult AllocatorRes = DefaultLvalueConversion(Allocator);
if (AllocatorRes.isInvalid())
return nullptr;
AllocatorRes = PerformImplicitConversion(AllocatorRes.get(),
if (AllocatorRes.isInvalid())
return nullptr;
Allocator = AllocatorRes.get();
} else {
// OpenMP 5.0, 2.11.4 allocate Clause, Restrictions.
// allocate clauses that appear on a target construct or on constructs in a
// target region must specify an allocator expression unless a requires
// directive with the dynamic_allocators clause is present in the same
// compilation unit.
if (LangOpts.OpenMPIsDevice &&
targetDiag(StartLoc, diag::err_expected_allocator_expression);
// Analyze and build list of variables.
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP private clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second) {
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
auto *VD = dyn_cast<VarDecl>(D);
// Non-variable items get a capture, except in dependent contexts where the
// original expression is kept.
DeclRefExpr *Ref = nullptr;
if (!VD && !CurContext->isDependentContext())
Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false);
Vars.push_back((VD || CurContext->isDependentContext())
? RefExpr->IgnoreParens()
: Ref);
if (Vars.empty())
return nullptr;
if (Allocator)
return OMPAllocateClause::Create(Context, StartLoc, LParenLoc, Allocator,
ColonLoc, EndLoc, Vars);
/// Builds an OpenMP 'nontemporal' clause from \p VarList.
///
/// Each list item is resolved via getPrivateItem and registered with
/// DSAStack->addUniqueNontemporal; when that returns a previous reference,
/// err_omp_used_in_clause_twice and note_omp_explicit_dsa are emitted.
/// Returns nullptr when no item was collected.
// NOTE(review): the statements that append to 'Vars', the guard bodies after
// 'if (Res.second)' and 'if (!D)', the tail of the Create call and all
// closing braces are missing in this copy; confirm against upstream.
OMPClause *Sema::ActOnOpenMPNontemporalClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP nontemporal clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
if (Res.second)
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
// OpenMP 5.0, simd Construct, Restrictions.
// A list-item cannot appear in more than one nontemporal clause.
if (const Expr *PrevRef =
DSAStack->addUniqueNontemporal(D, SimpleRefExpr)) {
Diag(ELoc, diag::err_omp_used_in_clause_twice)
<< 0 << getOpenMPClauseName(OMPC_nontemporal) << ERange;
Diag(PrevRef->getExprLoc(), diag::note_omp_explicit_dsa)
<< getOpenMPClauseName(OMPC_nontemporal);
if (Vars.empty())
return nullptr;
return OMPNontemporalClause::Create(Context, StartLoc, LParenLoc, EndLoc,
/// Builds an OpenMP 'inclusive' clause from \p VarList.
///
/// Each list item is resolved via getPrivateItem and its data-sharing
/// attribute is fetched from the parent region; items that are not in a
/// reduction clause with the inscan modifier are diagnosed with
/// err_omp_inclusive_exclusive_not_reduction. Returns nullptr when no item
/// was collected.
// NOTE(review): the getPrivateItem call is truncated, the statements that
// append to 'Vars', several guard bodies (including that of the
// getParentDirective() check) and all closing braces are missing in this
// copy. The assertion text names the nontemporal clause although this
// handles 'inclusive'; it is a runtime string and is left unchanged.
OMPClause *Sema::ActOnOpenMPInclusiveClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP nontemporal clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange,
if (Res.second)
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
const DSAStackTy::DSAVarData DVar =
DSAStack->getTopDSA(D, /*FromParent=*/true);
// OpenMP 5.0, 2.9.6, scan Directive, Restrictions.
// A list item that appears in the inclusive or exclusive clause must appear
// in a reduction clause with the inscan modifier on the enclosing
// worksharing-loop, worksharing-loop SIMD, or simd construct.
if (DVar.CKind != OMPC_reduction ||
DVar.Modifier != OMPC_REDUCTION_inscan)
Diag(ELoc, diag::err_omp_inclusive_exclusive_not_reduction)
<< RefExpr->getSourceRange();
if (DSAStack->getParentDirective() != OMPD_unknown)
if (Vars.empty())
return nullptr;
return OMPInclusiveClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars);
/// Builds an OpenMP 'exclusive' clause from \p VarList.
///
/// Each list item is resolved via getPrivateItem. When a parent directive
/// exists, the item's data-sharing attribute is fetched from the parent
/// region; items without a parent directive, or not in a reduction clause
/// with the inscan modifier, are diagnosed with
/// err_omp_inclusive_exclusive_not_reduction. Returns nullptr when no item
/// was collected.
// NOTE(review): the getPrivateItem call is truncated, the statements that
// append to 'Vars', the body of the trailing 'else' branch, several guard
// bodies and all closing braces are missing in this copy. The assertion text
// names the nontemporal clause although this handles 'exclusive'; it is a
// runtime string and is left unchanged.
OMPClause *Sema::ActOnOpenMPExclusiveClause(ArrayRef<Expr *> VarList,
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : VarList) {
assert(RefExpr && "NULL expr in OpenMP nontemporal clause.");
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange,
if (Res.second)
// It will be analyzed later.
ValueDecl *D = Res.first;
if (!D)
// Only query the parent region when there is one.
OpenMPDirectiveKind ParentDirective = DSAStack->getParentDirective();
DSAStackTy::DSAVarData DVar;
if (ParentDirective != OMPD_unknown)
DVar = DSAStack->getTopDSA(D, /*FromParent=*/true);
// OpenMP 5.0, 2.9.6, scan Directive, Restrictions.
// A list item that appears in the inclusive or exclusive clause must appear
// in a reduction clause with the inscan modifier on the enclosing
// worksharing-loop, worksharing-loop SIMD, or simd construct.
if (ParentDirective == OMPD_unknown || DVar.CKind != OMPC_reduction ||
DVar.Modifier != OMPC_REDUCTION_inscan) {
Diag(ELoc, diag::err_omp_inclusive_exclusive_not_reduction)
<< RefExpr->getSourceRange();
} else {
if (Vars.empty())
return nullptr;
return OMPExclusiveClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars);
/// Tries to find omp_alloctrait_t type.
///
/// Returns true immediately when \p Stack already holds a non-null
/// omp_alloctrait_t type. Otherwise looks the type name up in the current
/// scope at \p Loc; emits err_omp_implied_type_not_found and returns false
/// when the lookup yields no type.
// NOTE(review): the statement that stores the found type on the stack (if
// any) and the closing braces are not present in this copy.
static bool findOMPAlloctraitT(Sema &S, SourceLocation Loc, DSAStackTy *Stack) {
QualType OMPAlloctraitT = Stack->getOMPAlloctraitT();
if (!OMPAlloctraitT.isNull())
return true;
IdentifierInfo &II = S.PP.getIdentifierTable().get("omp_alloctrait_t");
ParsedType PT = S.getTypeName(II, Loc, S.getCurScope());
if (!PT.getAsOpaquePtr() || PT.get().isNull()) {
S.Diag(Loc, diag::err_omp_implied_type_not_found) << "omp_alloctrait_t";
return false;
return true;
/// Builds an OpenMP 'uses_allocators' clause from the parsed entries in
/// \p Data.
///
/// Returns nullptr when omp_allocator_handle_t cannot be found, or when any
/// entry carries allocator traits and omp_alloctrait_t cannot be found. For
/// each entry the visible code validates the allocator expression (a
/// DeclRefExpr of a type compatible with omp_allocator_handle_t; a non-const
/// lvalue unless it is a predefined allocator), enforces the rules on traits
/// for predefined vs. user-defined allocators, validates that the traits
/// expression is a constant array of omp_alloctrait_t, and appends the
/// result to the data passed to OMPUsesAllocatorsClause::Create.
// NOTE(review): this copy is heavily truncated: the first argument of
// llvm::any_of, the predefined-allocator name lookup, parts of the type
// compatibility conditions, several Diag calls, guard bodies, the call whose
// arguments are the UsesAllocatorsDeclKind values, the tail of the Create
// call and all closing braces are missing. Confirm against upstream before
// relying on the control flow as printed.
OMPClause *Sema::ActOnOpenMPUsesAllocatorClause(
SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc,
ArrayRef<UsesAllocatorsData> Data) {
// OpenMP [2.12.5, target Construct]
// allocator is an identifier of omp_allocator_handle_t type.
if (!findOMPAllocatorHandleT(*this, StartLoc, DSAStack))
return nullptr;
// OpenMP [2.12.5, target Construct]
// allocator-traits-array is an identifier of const omp_alloctrait_t * type.
if (llvm::any_of(
[](const UsesAllocatorsData &D) { return D.AllocatorTraits; }) &&
!findOMPAlloctraitT(*this, StartLoc, DSAStack))
return nullptr;
// Collect the declarations of the predefined allocators, one per allocator
// kind below OMPUserDefinedMemAlloc.
llvm::SmallSet<CanonicalDeclPtr<Decl>, 4> PredefinedAllocators;
for (int I = 0; I < OMPAllocateDeclAttr::OMPUserDefinedMemAlloc; ++I) {
auto AllocatorKind = static_cast<OMPAllocateDeclAttr::AllocatorTypeTy>(I);
StringRef Allocator =
DeclarationName AllocatorName = &Context.Idents.get(Allocator);
TUScope, AllocatorName, StartLoc, Sema::LookupAnyName));
SmallVector<OMPUsesAllocatorsClause::Data, 4> NewData;
for (const UsesAllocatorsData &D : Data) {
Expr *AllocatorExpr = nullptr;
// Check allocator expression.
if (D.Allocator->isTypeDependent()) {
AllocatorExpr = D.Allocator;
} else {
// Traits were specified - need to assign new allocator to the specified
// allocator, so it must be an lvalue.
AllocatorExpr = D.Allocator->IgnoreParenImpCasts();
auto *DRE = dyn_cast<DeclRefExpr>(AllocatorExpr);
bool IsPredefinedAllocator = false;
if (DRE)
IsPredefinedAllocator = PredefinedAllocators.count(DRE->getDecl());
if (!DRE ||
AllocatorExpr->getType(), DSAStack->getOMPAllocatorHandleT()) ||
/*CompareUnqualified=*/true)) ||
(!IsPredefinedAllocator &&
(AllocatorExpr->getType().isConstant(Context) ||
!AllocatorExpr->isLValue()))) {
Diag(D.Allocator->getExprLoc(), diag::err_omp_var_expected)
<< "omp_allocator_handle_t" << (DRE ? 1 : 0)
<< AllocatorExpr->getType() << D.Allocator->getSourceRange();
// OpenMP [2.12.5, target Construct]
// Predefined allocators appearing in a uses_allocators clause cannot have
// traits specified.
if (IsPredefinedAllocator && D.AllocatorTraits) {
<< D.AllocatorTraits->getSourceRange();
Diag(D.Allocator->getExprLoc(), diag::note_omp_predefined_allocator)
<< cast<NamedDecl>(DRE->getDecl())->getName()
<< D.Allocator->getSourceRange();
// OpenMP [2.12.5, target Construct]
// Non-predefined allocators appearing in a uses_allocators clause must
// have traits specified.
if (!IsPredefinedAllocator && !D.AllocatorTraits) {
// No allocator traits - just convert it to rvalue.
if (!D.AllocatorTraits)
AllocatorExpr = DefaultLvalueConversion(AllocatorExpr).get();
? DSAStackTy::UsesAllocatorsDeclKind::PredefinedAllocator
: DSAStackTy::UsesAllocatorsDeclKind::UserDefinedAllocator);
Expr *AllocatorTraitsExpr = nullptr;
if (D.AllocatorTraits) {
if (D.AllocatorTraits->isTypeDependent()) {
AllocatorTraitsExpr = D.AllocatorTraits;
} else {
// OpenMP [2.12.5, target Construct]
// Arrays that contain allocator traits that appear in a uses_allocators
// clause must be constant arrays, have constant values and be defined
// in the same scope as the construct in which the clause appears.
AllocatorTraitsExpr = D.AllocatorTraits->IgnoreParenImpCasts();
// Check that traits expr is a constant array.
QualType TraitTy;
if (const ArrayType *Ty =
if (const auto *ConstArrayTy = dyn_cast<ConstantArrayType>(Ty))
TraitTy = ConstArrayTy->getElementType();
if (TraitTy.isNull() ||
DSAStack->getOMPAlloctraitT()) ||
Context.typesAreCompatible(TraitTy, DSAStack->getOMPAlloctraitT(),
/*CompareUnqualified=*/true))) {
<< AllocatorTraitsExpr->getType();
// Do not map by default allocator traits if it is a standalone
// variable.
if (auto *DRE = dyn_cast<DeclRefExpr>(AllocatorTraitsExpr))
// Record the validated entry together with its parenthesis locations.
OMPUsesAllocatorsClause::Data &NewD = NewData.emplace_back();
NewD.Allocator = AllocatorExpr;
NewD.AllocatorTraits = AllocatorTraitsExpr;
NewD.LParenLoc = D.LParenLoc;
NewD.RParenLoc = D.RParenLoc;
return OMPUsesAllocatorsClause::Create(Context, StartLoc, LParenLoc, EndLoc,
/// Builds an OpenMP 'affinity' clause with the optional \p Modifier and the
/// locator expressions in \p Locators.
///
/// Dependent locators are deferred. Other locators must be lvalues, and
/// taking their address must succeed unless they are array sections or array
/// shaping expressions; violations are diagnosed with
/// err_omp_expected_addressable_lvalue_or_array_item. The address-of attempt
/// runs under a TentativeAnalysisScope.
// NOTE(review): the statements that append to 'Vars', the guard bodies
// (presumably 'continue') and all closing braces are missing in this copy.
// The assertion text says "shared clause" although this handles 'affinity';
// it is a runtime string and is left unchanged.
OMPClause *Sema::ActOnOpenMPAffinityClause(
SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ColonLoc,
SourceLocation EndLoc, Expr *Modifier, ArrayRef<Expr *> Locators) {
SmallVector<Expr *, 8> Vars;
for (Expr *RefExpr : Locators) {
assert(RefExpr && "NULL expr in OpenMP shared clause.");
if (isa<DependentScopeDeclRefExpr>(RefExpr) || RefExpr->isTypeDependent()) {
// It will be analyzed later.
SourceLocation ELoc = RefExpr->getExprLoc();
Expr *SimpleExpr = RefExpr->IgnoreParenImpCasts();
if (!SimpleExpr->isLValue()) {
Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item)
<< 1 << 0 << RefExpr->getSourceRange();
ExprResult Res;
Sema::TentativeAnalysisScope Trap(*this);
Res = CreateBuiltinUnaryOp(ELoc, UO_AddrOf, SimpleExpr);
if (!Res.isUsable() && !isa<OMPArraySectionExpr>(SimpleExpr) &&
!isa<OMPArrayShapingExpr>(SimpleExpr)) {
Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item)
<< 1 << 0 << RefExpr->getSourceRange();
return OMPAffinityClause::Create(Context, StartLoc, LParenLoc, ColonLoc,
EndLoc, Modifier, Vars);
diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h
index 72d826b8bd17..7c439176f3a4 100644
--- a/lld/COFF/Config.h
+++ b/lld/COFF/Config.h
@@ -1,247 +1,248 @@
//===- Config.h -------------------------------------------------*- C++ -*-===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Object/COFF.h"
#include "llvm/Support/CachePruning.h"
#include <cstdint>
#include <map>
#include <set>
#include <string>
namespace lld {
namespace coff {
using llvm::COFF::WindowsSubsystem;
using llvm::StringRef;
class DefinedAbsolute;
class DefinedRelative;
class StringChunk;
class Symbol;
class InputFile;
// Short aliases.
static const auto AMD64 = llvm::COFF::IMAGE_FILE_MACHINE_AMD64;
static const auto ARM64 = llvm::COFF::IMAGE_FILE_MACHINE_ARM64;
static const auto ARMNT = llvm::COFF::IMAGE_FILE_MACHINE_ARMNT;
static const auto I386 = llvm::COFF::IMAGE_FILE_MACHINE_I386;
// Represents an /export option.
struct Export {
StringRef name; // N in /export:N or /export:E=N
StringRef extName; // E in /export:E=N
Symbol *sym = nullptr;
uint16_t ordinal = 0;
bool noname = false;
bool data = false;
bool isPrivate = false;
bool constant = false;
// If an export is a form of /, that means
// that foo should be exported as an alias to bar in the DLL.
// forwardTo is set to "" part. Usually empty.
StringRef forwardTo;
StringChunk *forwardChunk = nullptr;
// True if this /export option was in .drectves section.
bool directives = false;
StringRef symbolName;
StringRef exportName; // Name in DLL
bool operator==(const Export &e) {
return (name == && extName == e.extName &&
ordinal == e.ordinal && noname == e.noname &&
data == && isPrivate == e.isPrivate);
// Kinds of debug information. The enumerators are distinct bits so that
// several of them can be OR-ed together into an unsigned mask.
enum class DebugType {
  None = 0x0,
  CV = 0x1,    /// CodeView
  PData = 0x2, /// Procedure Data
  Fixup = 0x4, /// Relocation Table
};
// Level of control-flow-guard protection selected with /guard:cf.
enum class GuardCFLevel {
  Off,       // Initial value of the guardCF setting.
  NoLongJmp, // Emit gfids but no longjmp tables
  Full,      // Enable all protections.
};
// Global configuration.
// Holds the option values used by the linker; most members carry an in-class
// default, a few (wordsize, tailMerge, hadExplicitExports,
// thinLTOEmitImportsFiles, thinLTOIndexOnly) have no initializer here.
// NOTE(review): the closing "};" of this struct is not present in this copy,
// and members may have been lost in extraction; confirm against upstream.
struct Configuration {
enum ManifestKind { SideBySide, Embed, No };
// True for the two 64-bit machine types aliased above (AMD64, ARM64).
bool is64() { return machine == AMD64 || machine == ARM64; }
llvm::COFF::MachineTypes machine = IMAGE_FILE_MACHINE_UNKNOWN;
size_t wordsize;
bool verbose = false;
WindowsSubsystem subsystem = llvm::COFF::IMAGE_SUBSYSTEM_UNKNOWN;
Symbol *entry = nullptr;
bool noEntry = false;
std::string outputFile;
std::string importName;
bool demangle = true;
bool doGC = true;
bool doICF = true;
bool tailMerge;
bool relocatable = true;
bool forceMultiple = false;
bool forceMultipleRes = false;
bool forceUnresolved = false;
bool debug = false;
bool debugDwarf = false;
bool debugGHashes = false;
bool debugSymtab = false;
bool driver = false;
bool driverUponly = false;
bool driverWdm = false;
bool showTiming = false;
bool showSummary = false;
// Bit mask built from DebugType enumerators.
unsigned debugTypes = static_cast<unsigned>(DebugType::None);
std::vector<std::string> natvisFiles;
llvm::StringMap<std::string> namedStreams;
llvm::SmallString<128> pdbAltPath;
llvm::SmallString<128> pdbPath;
llvm::SmallString<128> pdbSourcePath;
std::vector<llvm::StringRef> argv;
// Symbols in this set are considered as live by the garbage collector.
std::vector<Symbol *> gcroot;
std::set<std::string> noDefaultLibs;
bool noDefaultLibAll = false;
// True if we are creating a DLL.
bool dll = false;
StringRef implib;
std::vector<Export> exports;
bool hadExplicitExports;
std::set<std::string> delayLoads;
std::map<std::string, int> dllOrder;
Symbol *delayLoadHelper = nullptr;
bool saveTemps = false;
// /guard:cf
GuardCFLevel guardCF = GuardCFLevel::Off;
// Used for SafeSEH.
bool safeSEH = false;
Symbol *sehTable = nullptr;
Symbol *sehCount = nullptr;
+ bool noSEH = false;
// Used for /opt:lldlto=N
unsigned ltoo = 2;
// Used for /opt:lldltojobs=N
std::string thinLTOJobs;
// Used for /opt:lldltopartitions=N
unsigned ltoPartitions = 1;
// Used for /opt:lldltocache=path
StringRef ltoCache;
// Used for /opt:lldltocachepolicy=policy
llvm::CachePruningPolicy ltoCachePolicy;
// Used for /merge:from=to (e.g. /merge:.rdata=.text)
std::map<StringRef, StringRef> merge;
// Used for /section=.name,{DEKPRSW} to set section attributes.
std::map<StringRef, uint32_t> section;
// Options for manifest files.
ManifestKind manifest = No;
int manifestID = 1;
StringRef manifestDependency;
bool manifestUAC = true;
std::vector<std::string> manifestInput;
StringRef manifestLevel = "'asInvoker'";
StringRef manifestUIAccess = "'false'";
StringRef manifestFile;
// Used for /aligncomm.
std::map<std::string, int> alignComm;
// Used for /failifmismatch.
std::map<StringRef, std::pair<StringRef, InputFile *>> mustMatch;
// Used for /alternatename.
std::map<StringRef, StringRef> alternateNames;
// Used for /order.
llvm::StringMap<int> order;
// Used for /lldmap.
std::string lldmapFile;
// Used for /map.
std::string mapFile;
// Used for /thinlto-index-only:
llvm::StringRef thinLTOIndexOnlyArg;
// Used for /thinlto-object-prefix-replace:
std::pair<llvm::StringRef, llvm::StringRef> thinLTOPrefixReplace;
// Used for /thinlto-object-suffix-replace:
std::pair<llvm::StringRef, llvm::StringRef> thinLTOObjectSuffixReplace;
// Used for /lto-obj-path:
llvm::StringRef ltoObjPath;
uint64_t align = 4096;
// -1 converts to the maximum uint64_t value.
// NOTE(review): presumably a "not set" sentinel; confirm at the use sites.
uint64_t imageBase = -1;
uint64_t fileAlign = 512;
uint64_t stackReserve = 1024 * 1024;
uint64_t stackCommit = 4096;
uint64_t heapReserve = 1024 * 1024;
uint64_t heapCommit = 4096;
uint32_t majorImageVersion = 0;
uint32_t minorImageVersion = 0;
uint32_t majorOSVersion = 6;
uint32_t minorOSVersion = 0;
uint32_t timestamp = 0;
uint32_t functionPadMin = 0;
bool dynamicBase = true;
bool allowBind = true;
bool cetCompat = false;
bool nxCompat = true;
bool allowIsolation = true;
bool terminalServerAware = true;
bool largeAddressAware = false;
bool highEntropyVA = false;
bool appContainer = false;
bool mingw = false;
bool warnMissingOrderSymbol = true;
bool warnLocallyDefinedImported = true;
bool warnDebugInfoUnusable = true;
bool warnLongSectionNames = true;
bool incremental = true;
bool integrityCheck = false;
bool killAt = false;
bool repro = false;
bool swaprunCD = false;
bool swaprunNet = false;
bool thinLTOEmitImportsFiles;
bool thinLTOIndexOnly;
bool autoImport = false;
bool pseudoRelocs = false;
extern Configuration *config;
} // namespace coff
} // namespace lld
diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 7372505bb616..9ceccef86779 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -1,2052 +1,2053 @@
//===- Driver.cpp ---------------------------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "Driver.h"
#include "Config.h"
#include "DebugTypes.h"
#include "ICF.h"
#include "InputFiles.h"
#include "MarkLive.h"
#include "MinGW.h"
#include "SymbolTable.h"
#include "Symbols.h"
#include "Writer.h"
#include "lld/Common/Args.h"
#include "lld/Common/Driver.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Filesystem.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Timer.h"
#include "lld/Common/Version.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/BinaryFormat/Magic.h"
#include "llvm/LTO/LTO.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Object/COFFImportFile.h"
#include "llvm/Object/COFFModuleDefinition.h"
#include "llvm/Object/WindowsMachineFlag.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/TarWriter.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
#include <algorithm>
#include <future>
#include <memory>
using namespace llvm;
using namespace llvm::object;
using namespace llvm::COFF;
using llvm::sys::Process;
namespace lld {
namespace coff {
static Timer inputFileTimer("Input File Reading", Timer::root());
Configuration *config;
LinkerDriver *driver;
// Entry point of the COFF linker. Points lld's stdout/stderr at the supplied
// streams, configures the shared error handler (log name derived from
// args[0], the "too many errors" message, and whether it may exit early), and
// allocates the global Configuration, SymbolTable and LinkerDriver.
// Returns true when errorCount() is zero.
// NOTE(review): this extract shows no call that actually runs the driver and
// no closing brace; lines appear to have been dropped -- confirm against the
// upstream file.
bool link(ArrayRef<const char *> args, bool canExitEarly, raw_ostream &stdoutOS,
raw_ostream &stderrOS) {
lld::stdoutOS = &stdoutOS;
lld::stderrOS = &stderrOS;
errorHandler().logName = args::getFilenameWithoutExe(args[0]);
errorHandler().errorLimitExceededMsg =
"too many errors emitted, stopping now"
" (use /errorlimit:0 to see all errors)";
errorHandler().exitEarly = canExitEarly;
config = make<Configuration>();
symtab = make<SymbolTable>();
driver = make<LinkerDriver>();
// Call exit() if we can to avoid calling destructors.
// The status passed to exitLld() is 1 if any error was counted, else 0.
if (canExitEarly)
exitLld(errorCount() ? 1 : 0);
// Zero MergeChunk's static instance table before returning (presumably so a
// later in-process link starts from a clean state -- TODO confirm).
memset(MergeChunk::instances, 0, sizeof(MergeChunk::instances));
return !errorCount();
// Parse options of the form "old;new".
// Fetches the argument returned by args.getLastArg(id) and splits its value
// at ';'. Returns a pair of empty strings when the option is absent. When the
// part after ';' is empty an error is reported, but the split result is still
// returned to the caller.
// NOTE(review): the closing brace is missing from this extract.
static std::pair<StringRef, StringRef> getOldNewOptions(opt::InputArgList &args,
unsigned id) {
auto *arg = args.getLastArg(id);
if (!arg)
return {"", ""};
StringRef s = arg->getValue();
std::pair<StringRef, StringRef> ret = s.split(';');
if (ret.second.empty())
error(arg->getSpelling() + " expects 'old;new' format, but got " + s);
return ret;
// Drop directory components and replace extension with
// ".exe", ".dll" or ".sys".
// The extension is ".dll" when config->dll is set, otherwise ".sys" when
// config->driver is set, otherwise ".exe". The result is the stem of `path`
// with that extension appended.
// NOTE(review): the closing brace is missing from this extract.
static std::string getOutputPath(StringRef path) {
StringRef ext = ".exe";
if (config->dll)
ext = ".dll";
else if (config->driver)
ext = ".sys";
return (sys::path::stem(path) + ext).str();
// Returns true if S matches /crtend.?\.o$/.
// Implemented without a regex: require the ".o" suffix, strip it, then accept
// either a name ending in "crtend" or one ending in "crtend" followed by
// exactly one more character.
// NOTE(review): the closing brace is missing from this extract.
static bool isCrtend(StringRef s) {
if (!s.endswith(".o"))
return false;
s = s.drop_back(2);
if (s.endswith("crtend"))
return true;
// The empty() check guards drop_back() when the name was exactly ".o".
return !s.empty() && s.drop_back().endswith("crtend");
// ErrorOr is not default constructible, so it cannot be used as the type
// parameter of a future.
// FIXME: We could open the file in createFutureForFile and avoid needing to
// return an error here, but for the moment that would cost us a file descriptor
// (a limited resource on Windows) for the duration that the future is pending.
// Layout as produced by createFutureForFile(): `first` is the mapped buffer
// (nullptr on failure) and `second` is the error code (default-constructed on
// success).
using MBErrPair = std::pair<std::unique_ptr<MemoryBuffer>, std::error_code>;
// Create a std::future that opens and maps a file using the best strategy for
// the host platform.
// The future yields {buffer, std::error_code()} on success, or
// {nullptr, error} when MemoryBuffer::getFile fails.
// NOTE(review): both `auto strategy` declarations appear below with only the
// `#if` visible; the `#else`/`#endif` lines and the closing of the lambda and
// function seem to have been lost from this extract -- confirm upstream.
static std::future<MBErrPair> createFutureForFile(std::string path) {
#if _WIN32
// On Windows, file I/O is relatively slow so it is best to do this
// asynchronously.
auto strategy = std::launch::async;
auto strategy = std::launch::deferred;
// `path` is captured by copy ([=]), so the task owns its own std::string.
return std::async(strategy, [=]() {
auto mbOrErr = MemoryBuffer::getFile(path,
/*FileSize*/ -1,
/*RequiresNullTerminator*/ false);
if (!mbOrErr)
return MBErrPair{nullptr, mbOrErr.getError()};
return MBErrPair{std::move(*mbOrErr), std::error_code()};
// Symbol names are mangled by prepending "_" on x86.
// For any other machine type the name is returned unchanged. The machine
// type must already be known (asserted below).
// NOTE(review): the I386 return statement is garbled in this extract (the
// call wrapping `"_" + sym` is missing) and the closing brace is absent.
static StringRef mangle(StringRef sym) {
assert(config->machine != IMAGE_FILE_MACHINE_UNKNOWN);
if (config->machine == I386)
return"_" + sym);
return sym;
// Returns true when symtab->findMangle() finds a symbol for the
// machine-mangled form of `sym` and that symbol is not an Undefined.
// NOTE(review): the closing brace is missing from this extract.
static bool findUnderscoreMangle(StringRef sym) {
Symbol *s = symtab->findMangle(mangle(sym));
return s && !isa<Undefined>(s);
// Takes ownership of `mb` by moving it into a make<>()-allocated unique_ptr
// and returns a non-owning MemoryBufferRef to the same data.
// NOTE(review): the statement controlled by `if (driver->tar)` is missing
// from this extract (as written, the `if` would guard the return); the
// closing brace is absent too. Confirm against upstream.
MemoryBufferRef LinkerDriver::takeBuffer(std::unique_ptr<MemoryBuffer> mb) {
MemoryBufferRef mbref = *mb;
make<std::unique_ptr<MemoryBuffer>>(std::move(mb)); // take ownership
if (driver->tar)
return mbref;
// Takes ownership of an input buffer and dispatches on its detected file
// magic: resources, archives, bitcode, COFF objects, import libraries, PDBs,
// /GL objects and PE executables each have a case label below.
//   wholeArchive - for archives, parse the archive and add every member via
//                  addArchiveBuffer() under the name "<whole-archive>".
//   lazy         - selects a different path for bitcode and COFF objects.
// NOTE(review): many case bodies, `break`/`return` statements and closing
// braces are missing from this extract, so the exact per-case behaviour
// (including which branch `lazy` selects) cannot be read from this text.
void LinkerDriver::addBuffer(std::unique_ptr<MemoryBuffer> mb,
bool wholeArchive, bool lazy) {
// Read the identifier before `mb` is moved from.
StringRef filename = mb->getBufferIdentifier();
MemoryBufferRef mbref = takeBuffer(std::move(mb));
// File type is detected by contents, not by file extension.
switch (identify_magic(mbref.getBuffer())) {
case file_magic::windows_resource:
case file_magic::archive:
if (wholeArchive) {
std::unique_ptr<Archive> file =
CHECK(Archive::create(mbref), filename + ": failed to parse archive");
Archive *archive = file.get();
make<std::unique_ptr<Archive>>(std::move(file)); // take ownership
// memberIndex is passed as addArchiveBuffer()'s offsetInArchive argument.
int memberIndex = 0;
for (MemoryBufferRef m : getArchiveMembers(archive))
addArchiveBuffer(m, "<whole-archive>", filename, memberIndex++);
case file_magic::bitcode:
if (lazy)
symtab->addFile(make<BitcodeFile>(mbref, "", 0));
case file_magic::coff_object:
case file_magic::coff_import_library:
if (lazy)
case file_magic::pdb:
case file_magic::coff_cl_gl_object:
error(filename + ": is not a native COFF file. Recompile without /GL");
case file_magic::pecoff_executable:
if (filename.endswith_lower(".dll")) {
error(filename + ": bad file type. Did you specify a DLL instead of an "
"import library?");
error(mbref.getBufferIdentifier() + ": unknown file type");
// Queues a task that waits on a future for the file at `path` and, on
// success, hands the buffer to addBuffer() with the given flags. On failure
// it builds a "could not open" message and consults optTable.findNearest()
// to suggest an option name the path may be a typo for.
// NOTE(review): the argument of make_shared<...>( on the next line, the
// branch taken when no near option exists, and the closing braces are missing
// from this extract.
void LinkerDriver::enqueuePath(StringRef path, bool wholeArchive, bool lazy) {
auto future = std::make_shared<std::future<MBErrPair>>(
// Own a copy of the path; the lambda below captures it by value.
std::string pathStr = std::string(path);
enqueueTask([=]() {
auto mbOrErr = future->get();
if (mbOrErr.second) {
std::string msg =
"could not open '" + pathStr + "': " + mbOrErr.second.message();
// Check if the filename is a typo for an option flag. OptTable thinks
// that all args that are not known options and that start with / are
// filenames, but e.g. `/nodefaultlibs` is more likely a typo for
// the option `/nodefaultlib` than a reference to a file in the root
// directory.
std::string nearest;
if (optTable.findNearest(pathStr, nearest) > 1)
error(msg + "; did you mean '" + nearest + "'");
} else
driver->addBuffer(std::move(mbOrErr.first), wholeArchive, lazy);
// Adds one archive member to the link. The member's magic selects the file
// class: ImportFile for import libraries, ObjFile for COFF objects,
// BitcodeFile for bitcode; anything else is reported as an unknown type.
//   symName         - only used in the "Loaded ... for ..." log line.
//   parentName      - stored on the created file as parentName.
//   offsetInArchive - forwarded to the BitcodeFile constructor.
// NOTE(review): the statements that register the created file, the early
// returns and the closing braces are missing from this extract.
void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName,
StringRef parentName,
uint64_t offsetInArchive) {
file_magic magic = identify_magic(mb.getBuffer());
if (magic == file_magic::coff_import_library) {
InputFile *imp = make<ImportFile>(mb);
imp->parentName = parentName;
InputFile *obj;
if (magic == file_magic::coff_object) {
obj = make<ObjFile>(mb);
} else if (magic == file_magic::bitcode) {
obj = make<BitcodeFile>(mb, parentName, offsetInArchive);
} else {
error("unknown file type: " + mb.getBufferIdentifier());
obj->parentName = parentName;
log("Loaded " + toString(obj) + " for " + symName);
// Queues the archive member `c`, which defines symbol `sym`, for loading.
// When the parent archive is not thin, the member's buffer is obtained
// directly from the archive and a task calling addArchiveBuffer() is queued.
// Otherwise the code below resolves a child file name and loads it through a
// future, as enqueuePath() does.
// Failures to obtain the buffer are fatal (see reportBufferError).
// NOTE(review): several argument lists, the end of the lambda, the early
// return after the non-thin branch and all closing braces are missing from
// this extract; the thin-archive description above is inferred from the
// fragments that remain -- confirm upstream.
void LinkerDriver::enqueueArchiveMember(const Archive::Child &c,
const Archive::Symbol &sym,
StringRef parentName) {
auto reportBufferError = [=](Error &&e, StringRef childName) {
fatal("could not get the buffer for the member defining symbol " +
toCOFFString(sym) + ": " + parentName + "(" + childName + "): " +
if (!c.getParent()->isThin()) {
uint64_t offsetInArchive = c.getChildOffset();
Expected<MemoryBufferRef> mbOrErr = c.getMemoryBufferRef();
if (!mbOrErr)
reportBufferError(mbOrErr.takeError(), check(c.getFullName()));
MemoryBufferRef mb = mbOrErr.get();
enqueueTask([=]() {
driver->addArchiveBuffer(mb, toCOFFString(sym), parentName,
std::string childName = CHECK(
"could not get the filename for the member defining symbol " +
auto future = std::make_shared<std::future<MBErrPair>>(
enqueueTask([=]() {
auto mbOrErr = future->get();
if (mbOrErr.second)
reportBufferError(errorCodeToError(mbOrErr.second), childName);
// Pass empty string as archive name so that the original filename is
// used as the buffer identifier.
toCOFFString(sym), "", /*OffsetInArchive=*/0);
// Returns true when `sym` looks already decorated: it starts with "@" or
// "?", contains "@@", or -- outside MinGW mode only -- contains a single '@'
// anywhere.
// NOTE(review): the closing brace is missing from this extract.
static bool isDecorated(StringRef sym) {
return sym.startswith("@") || sym.contains("@@") || sym.startswith("?") ||
(!config->mingw && sym.contains('@'));
// Parses the .drectve section contents of `file` and applies the embedded
// linker options. The function returns void: libraries named by /defaultlib
// are resolved with findLib() and queued with enqueuePath() rather than
// returned.
// Exports are deduplicated through directivesExports before being parsed;
// on 32-bit x86 MinGW, undecorated export names get a leading "_".
// NOTE(review): this extract has lost many lines -- the early return for an
// empty section, the `continue` after the dedup check, most case bodies,
// every `break`, the `default:` label and all closing braces -- and two
// statements in the I386/MinGW branch are garbled. Treat the per-option
// behaviour as unverified.
void LinkerDriver::parseDirectives(InputFile *file) {
StringRef s = file->getDirectives();
if (s.empty())
log("Directives: " + toString(file) + ": " + s);
ArgParser parser;
// .drectve is always tokenized using Windows shell rules.
// /EXPORT: option can appear too many times, processing in fastpath.
ParsedDirectives directives = parser.parseDirectives(s);
for (StringRef e : directives.exports) {
// If a common header file contains dllexported function
// declarations, many object files may end up with having the
// same /EXPORT options. In order to save cost of parsing them,
// we dedup them first.
if (!directivesExports.insert(e).second)
Export exp = parseExport(e);
if (config->machine == I386 && config->mingw) {
if (!isDecorated( ="_" +;
if (!exp.extName.empty() && !isDecorated(exp.extName))
exp.extName ="_" + exp.extName);
exp.directives = true;
// Handle /include: in bulk.
for (StringRef inc : directives.includes)
for (auto *arg : directives.args) {
switch (arg->getOption().getID()) {
case OPT_aligncomm:
case OPT_alternatename:
case OPT_defaultlib:
if (Optional<StringRef> path = findLib(arg->getValue()))
enqueuePath(*path, false, false);
case OPT_entry:
config->entry = addUndefined(mangle(arg->getValue()));
case OPT_failifmismatch:
checkFailIfMismatch(arg->getValue(), file);
case OPT_incl:
case OPT_merge:
case OPT_nodefaultlib:
case OPT_section:
case OPT_subsystem:
parseSubsystem(arg->getValue(), &config->subsystem,
&config->majorOSVersion, &config->minorOSVersion);
// Only add flags here that link.exe accepts in
// `#pragma comment(linker, "/flag")`-generated sections.
case OPT_editandcontinue:
case OPT_guardsym:
case OPT_throwingnew:
error(arg->getSpelling() + " is not allowed in .drectve");
// Find file from search paths. You can omit ".obj", this function takes
// care of that. Note that the returned path is not guaranteed to exist.
// A name containing '/' or '\' is returned unchanged without searching.
// Otherwise each directory in searchPaths is tried in order, and `filename`
// itself is returned when nothing is found.
// NOTE(review): the return statements inside the loop and the line that
// appends ".obj" when the name has no extension are missing from this
// extract, as are the closing braces.
StringRef LinkerDriver::doFindFile(StringRef filename) {
bool hasPathSep = (filename.find_first_of("/\\") != StringRef::npos);
if (hasPathSep)
return filename;
bool hasExt = filename.contains('.');
for (StringRef dir : searchPaths) {
SmallString<128> path = dir;
sys::path::append(path, filename);
if (sys::fs::exists(path.str()))
if (!hasExt) {
if (sys::fs::exists(path.str()))
return filename;
// Returns the filesystem UniqueID of `path`, or None when
// sys::fs::getUniqueID() reports a failure (a truthy result).
// NOTE(review): the closing brace is missing from this extract.
static Optional<sys::fs::UniqueID> getUniqueID(StringRef path) {
sys::fs::UniqueID ret;
if (sys::fs::getUniqueID(path, ret))
return None;
return ret;
// Resolves a file path. This never returns the same path
// (in that case, it returns None).
// "Same" is decided by filesystem UniqueID, recorded in visitedFiles; a path
// whose UniqueID cannot be obtained is not deduplicated.
// NOTE(review): the statement controlled by the ".lib" check and the closing
// braces are missing from this extract.
Optional<StringRef> LinkerDriver::findFile(StringRef filename) {
StringRef path = doFindFile(filename);
if (Optional<sys::fs::UniqueID> id = getUniqueID(path)) {
// insert().second is false when the ID was already present.
bool seen = !visitedFiles.insert(*id).second;
if (seen)
return None;
if (path.endswith_lower(".lib"))
return path;
// MinGW specific. If an embedded directive specified to link to
// foo.lib, but it isn't found, try libfoo.a instead.
// Names containing a path separator are returned unchanged. Otherwise the
// extension is replaced with ".a", "lib" is prepended, and the result is
// looked up with doFindFile().
// NOTE(review): the initialiser of `libName` is garbled in this extract (the
// call wrapping `"lib" + s.str()` is missing); the closing brace is absent.
StringRef LinkerDriver::doFindLibMinGW(StringRef filename) {
if (filename.contains('/') || filename.contains('\\'))
return filename;
SmallString<128> s = filename;
sys::path::replace_extension(s, ".a");
StringRef libName ="lib" + s.str());
return doFindFile(libName);
// Find library file from search path.
// doFindFile() returning its input unchanged is treated as "not found",
// which triggers the MinGW-style fallback when config->mingw is set.
// NOTE(review): the assignment that appends ".lib" is garbled in this
// extract; the closing brace is absent.
StringRef LinkerDriver::doFindLib(StringRef filename) {
// Add ".lib" to Filename if that has no file extension.
bool hasExt = filename.contains('.');
if (!hasExt)
filename = + ".lib");
StringRef ret = doFindFile(filename);
// For MinGW, if the find above didn't turn up anything, try
// looking for a MinGW formatted library name.
if (config->mingw && ret == filename)
return doFindLibMinGW(filename);
return ret;
// Resolves a library path. /nodefaultlib options are taken into
// consideration. This never returns the same path (in that case,
// it returns None).
// Checks, in order: config->noDefaultLibAll; whether the lower-cased name
// was already requested (visitedLibs); whether the resolved lower-cased path
// is listed in config->noDefaultLibs; and whether the file's UniqueID was
// already seen (visitedFiles). Any hit yields None.
// NOTE(review): the closing brace is missing from this extract.
Optional<StringRef> LinkerDriver::findLib(StringRef filename) {
if (config->noDefaultLibAll)
return None;
if (!visitedLibs.insert(filename.lower()).second)
return None;
StringRef path = doFindLib(filename);
if (config->noDefaultLibs.count(path.lower()))
return None;
if (Optional<sys::fs::UniqueID> id = getUniqueID(path))
if (!visitedFiles.insert(*id).second)
return None;
return path;
// Parses LIB environment which contains a list of search paths.
// The value is consumed one ';'-separated entry at a time.
// NOTE(review): the early return when LIB is unset, the statement that
// records each `path`, and the closing braces are missing from this extract;
// the initialiser of `env` is garbled.
void LinkerDriver::addLibSearchPaths() {
Optional<std::string> envOpt = Process::GetEnv("LIB");
if (!envOpt.hasValue())
StringRef env =*envOpt);
while (!env.empty()) {
StringRef path;
std::tie(path, env) = env.split(';');
// Adds `name` to the symbol table as an undefined symbol, marks it as a GC
// root if it was not one already, and returns the symbol.
// NOTE(review): the `if` block appears to have lost at least one statement
// and its closing brace in this extract.
Symbol *LinkerDriver::addUndefined(StringRef name) {
Symbol *b = symtab->addUndefined(name);
if (!b->isGCRoot) {
b->isGCRoot = true;
return b;
// If `s` is still undefined and a similar mangled symbol exists in the
// symbol table, makes `s` a weak alias to it and returns the mangled name.
// Returns "" when `s` is not an Undefined or no mangled match is found.
// NOTE(review): the closing brace is missing from this extract.
StringRef LinkerDriver::mangleMaybe(Symbol *s) {
// If the plain symbol name has already been resolved, do nothing.
Undefined *unmangled = dyn_cast<Undefined>(s);
if (!unmangled)
return "";
// Otherwise, see if a similar, mangled symbol exists in the symbol table.
Symbol *mangled = symtab->findMangle(unmangled->getName());
if (!mangled)
return "";
// If we find a similar mangled symbol, make this an alias to it and return
// its name.
log(unmangled->getName() + " aliased to " + mangled->getName());
unmangled->weakAlias = symtab->addUndefined(mangled->getName());
return mangled->getName();
// Windows specific -- find default entry point name.
// There are four different entry point functions for Windows executables,
// each of which corresponds to a user-defined "main" function. This function
// infers an entry point from a user-defined "main" function.
// In MinGW mode the choice depends only on the subsystem. Otherwise the wide
// variant (wWinMain / wmain) selects the "w" CRT startup routine only when
// the narrow variant is absent; when both exist a warning is issued.
// NOTE(review): the closing braces are missing from this extract, so the
// nesting of the GUI and console branches is inferred from statement order.
StringRef LinkerDriver::findDefaultEntry() {
assert(config->subsystem != IMAGE_SUBSYSTEM_UNKNOWN &&
"must handle /subsystem before calling this");
if (config->mingw)
return mangle(config->subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI
? "WinMainCRTStartup"
: "mainCRTStartup");
if (config->subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI) {
if (findUnderscoreMangle("wWinMain")) {
if (!findUnderscoreMangle("WinMain"))
return mangle("wWinMainCRTStartup");
warn("found both wWinMain and WinMain; using latter");
return mangle("WinMainCRTStartup");
if (findUnderscoreMangle("wmain")) {
if (!findUnderscoreMangle("main"))
return mangle("wmainCRTStartup");
warn("found both wmain and main; using latter");
return mangle("mainCRTStartup");
// Infers the PE subsystem from the output kind and from which of main /
// wmain / WinMain / wWinMain are defined. When both a console-style and a
// GUI-style entry function exist, a warning says the console subsystem is
// used.
// NOTE(review): every return statement and closing brace is missing from
// this extract, so the value returned on each path (DLL, MinGW, console,
// GUI, none found) cannot be read from this text.
WindowsSubsystem LinkerDriver::inferSubsystem() {
if (config->dll)
if (config->mingw)
// Note that link.exe infers the subsystem from the presence of these
// functions even if /entry: or /nodefaultlib are passed which causes them
// to not be called.
bool haveMain = findUnderscoreMangle("main");
bool haveWMain = findUnderscoreMangle("wmain");
bool haveWinMain = findUnderscoreMangle("WinMain");
bool haveWWinMain = findUnderscoreMangle("wWinMain");
if (haveMain || haveWMain) {
if (haveWinMain || haveWWinMain) {
warn(std::string("found ") + (haveMain ? "main" : "wmain") + " and " +
(haveWinMain ? "WinMain" : "wWinMain") +
"; defaulting to /subsystem:console");
if (haveWinMain || haveWWinMain)
// Returns the default image base for the current configuration:
//   64-bit: 0x180000000 for a DLL, 0x140000000 otherwise;
//   else:   0x10000000  for a DLL, 0x400000    otherwise.
// NOTE(review): the closing brace is missing from this extract.
static uint64_t getDefaultImageBase() {
if (config->is64())
return config->dll ? 0x180000000 : 0x140000000;
return config->dll ? 0x10000000 : 0x400000;
// Builds the text of a response file that reproduces this link: one line per
// retained argument, then a "/libpath:" line per search path and one line
// per input file, with paths rewritten through relativeToRoot() and quoted.
// NOTE(review): `break` statements, the `default:` label and closing braces
// are missing from this extract, so which of the listed options are dropped,
// which are rewritten to a bare file name (the getSpelling()/filename()
// line) and which are echoed verbatim via toString() cannot be determined
// from this text.
static std::string createResponseFile(const opt::InputArgList &args,
ArrayRef<StringRef> filePaths,
ArrayRef<StringRef> searchPaths) {
SmallString<0> data;
raw_svector_ostream os(data);
for (auto *arg : args) {
switch (arg->getOption().getID()) {
case OPT_linkrepro:
case OPT_reproduce:
case OPT_defaultlib:
case OPT_libpath:
case OPT_manifest:
case OPT_manifest_colon:
case OPT_manifestdependency:
case OPT_manifestfile:
case OPT_manifestinput:
case OPT_manifestuac:
case OPT_implib:
case OPT_pdb:
case OPT_pdbstripped:
case OPT_out:
os << arg->getSpelling() << sys::path::filename(arg->getValue()) << "\n";
os << toString(*arg) << "\n";
for (StringRef path : searchPaths) {
std::string relPath = relativeToRoot(path);
os << "/libpath:" << quote(relPath) << "\n";
for (StringRef path : filePaths)
os << quote(relativeToRoot(path)) << "\n";
return std::string(data.str());
// Debug-information mode requested through /debug[:kind], as classified by
// parseDebugKind(). Enumerator order (and therefore value) is unchanged.
enum class DebugKind {
  Unknown,  // Unrecognised /debug: value; parseDebugKind() reports an error.
  None,     // No /debug option given, or /debug:none.
  Full,     // Bare /debug, or /debug:full.
  FastLink, // /debug:fastlink; parseDebugKind() warns and substitutes Full.
  GHash,    // /debug:ghash (LLD extension).
  Dwarf,    // /debug:dwarf (LLD extension).
  Symtab    // /debug:symtab (LLD extension).
};
// Classifies the last /debug or /debug:<kind> option.
//   - no such option         -> DebugKind::None
//   - option without a value -> DebugKind::Full
//   - "fastlink"             -> warning, then DebugKind::Full
//   - unrecognised value     -> error, then DebugKind::None
// Value matching is case-insensitive (CaseLower).
// NOTE(review): the terminating `.Default(...)` of the StringSwitch and the
// closing braces are missing from this extract.
static DebugKind parseDebugKind(const opt::InputArgList &args) {
auto *a = args.getLastArg(OPT_debug, OPT_debug_opt);
if (!a)
return DebugKind::None;
if (a->getNumValues() == 0)
return DebugKind::Full;
DebugKind debug = StringSwitch<DebugKind>(a->getValue())
.CaseLower("none", DebugKind::None)
.CaseLower("full", DebugKind::Full)
.CaseLower("fastlink", DebugKind::FastLink)
// LLD extensions
.CaseLower("ghash", DebugKind::GHash)
.CaseLower("dwarf", DebugKind::Dwarf)
.CaseLower("symtab", DebugKind::Symtab)
if (debug == DebugKind::FastLink) {
warn("/debug:fastlink unsupported; using /debug:full");
return DebugKind::Full;
if (debug == DebugKind::Unknown) {
error("/debug: unknown option: " + Twine(a->getValue()));
return DebugKind::None;
return debug;
// Returns a bitmask of DebugType values.
// With /debugtype: the comma-separated, lower-cased entries "cv", "pdata"
// and "fixup" are OR-ed together; an entry mapping to 0 produces a warning.
// Without /debugtype: CV is always set, PData is added for /driver and Fixup
// for /profile.
// NOTE(review): the expression being split, the StringSwitch `.Default(...)`,
// the `continue` after the warning and the closing braces are missing from
// this extract.
static unsigned parseDebugTypes(const opt::InputArgList &args) {
unsigned debugTypes = static_cast<unsigned>(DebugType::None);
if (auto *a = args.getLastArg(OPT_debugtype)) {
SmallVector<StringRef, 3> types;
.split(types, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
for (StringRef type : types) {
unsigned v = StringSwitch<unsigned>(type.lower())
.Case("cv", static_cast<unsigned>(DebugType::CV))
.Case("pdata", static_cast<unsigned>(DebugType::PData))
.Case("fixup", static_cast<unsigned>(DebugType::Fixup))
if (v == 0) {
warn("/debugtype: unknown option '" + type + "'");
debugTypes |= v;
return debugTypes;
// Default debug types
debugTypes = static_cast<unsigned>(DebugType::CV);
if (args.hasArg(OPT_driver))
debugTypes |= static_cast<unsigned>(DebugType::PData);
if (args.hasArg(OPT_profile))
debugTypes |= static_cast<unsigned>(DebugType::Fixup);
return debugTypes;
// Returns the map-file path selected by a pair of options: `os` is the bare
// flag and `osFile` the variant carrying an explicit file name.
//   - neither present -> ""
//   - osFile is last  -> its value
//   - os is last      -> config->outputFile with everything from the last
//                        '.' onward replaced by ".map"
// NOTE(review): the closing brace is missing from this extract.
static std::string getMapFile(const opt::InputArgList &args,
opt::OptSpecifier os, opt::OptSpecifier osFile) {
auto *arg = args.getLastArg(os, osFile);
if (!arg)
return "";
if (arg->getOption().getID() == osFile.getID())
return arg->getValue();
assert(arg->getOption().getID() == os.getID());
StringRef outFile = config->outputFile;
return (outFile.substr(0, outFile.rfind('.')) + ".map").str();
// Returns the import-library path: config->implib when it is non-empty,
// otherwise config->outputFile with its extension replaced by ".lib".
// NOTE(review): the closing brace is missing from this extract.
static std::string getImplibPath() {
if (!config->implib.empty())
return std::string(config->implib);
SmallString<128> out = StringRef(config->outputFile);
sys::path::replace_extension(out, ".lib");
return std::string(out.str());
// The import name is calculated as follows:
// | LIBRARY w/ ext | LIBRARY w/o ext | no LIBRARY
// -----+----------------+---------------------+------------------
// LINK | {value} | {value}.{.dll/.exe} | {output name}
// LIB | {value} | {value}.dll | {output name}.dll
// `asLib` selects the LIB row of the table above.
// NOTE(review): the statements that assign `out` in both branches and the
// call that appends the extension are missing from this extract, as are the
// closing braces; the code below does not by itself implement the table.
static std::string getImportName(bool asLib) {
SmallString<128> out;
if (config->importName.empty()) {
if (asLib)
sys::path::replace_extension(out, ".dll");
} else {
if (!sys::path::has_extension(out))
(config->dll || asLib) ? ".dll" : ".exe");
return std::string(out.str());
// Writes the import library for config->exports to getImplibPath().
// Each Export is converted to a COFFShortExport first. When
// config->incremental is false the library is written directly. Otherwise an
// existing library is replaced only if its contents changed: the new library
// is written to a unique temporary file, compared with the old buffer, and
// renamed over the old file when they differ.
// NOTE(review): several lines are missing or garbled in this extract -- two
// COFFShortExport field assignments, the push into `exports`, trailing
// arguments of writeImportLibrary(), the body of handleError, the early
// returns, the temp-file cleanup in the final `else`, and all closing
// braces. Confirm details upstream.
static void createImportLibrary(bool asLib) {
std::vector<COFFShortExport> exports;
for (Export &e1 : config->exports) {
COFFShortExport e2;
e2.Name = std::string(;
e2.SymbolName = std::string(e1.symbolName);
e2.ExtName = std::string(e1.extName);
e2.Ordinal = e1.ordinal;
e2.Noname = e1.noname;
e2.Data =;
e2.Private = e1.isPrivate;
e2.Constant = e1.constant;
auto handleError = [](Error &&e) {
[](ErrorInfoBase &eib) { error(eib.message()); });
std::string libName = getImportName(asLib);
std::string path = getImplibPath();
if (!config->incremental) {
handleError(writeImportLibrary(libName, path, exports, config->machine,
// If the import library already exists, replace it only if the contents
// have changed.
ErrorOr<std::unique_ptr<MemoryBuffer>> oldBuf = MemoryBuffer::getFile(
path, /*FileSize*/ -1, /*RequiresNullTerminator*/ false);
if (!oldBuf) {
handleError(writeImportLibrary(libName, path, exports, config->machine,
SmallString<128> tmpName;
if (std::error_code ec =
sys::fs::createUniqueFile(path + ".tmp-%%%%%%%%.lib", tmpName))
fatal("cannot create temporary file for import library " + path + ": " +
if (Error e = writeImportLibrary(libName, tmpName, exports, config->machine,
config->mingw)) {
std::unique_ptr<MemoryBuffer> newBuf = check(MemoryBuffer::getFile(
tmpName, /*FileSize*/ -1, /*RequiresNullTerminator*/ false));
if ((*oldBuf)->getBuffer() != newBuf->getBuffer()) {
handleError(errorCodeToError(sys::fs::rename(tmpName, path)));
} else {
// Reads the module-definition (.def) file at `path` and merges it into the
// global configuration. Each numeric field (image base, stack/heap
// reserve/commit, image and OS versions) overrides the current setting only
// when it is non-zero in the file. The output file name is taken from the
// file only when none has been set yet. Every export in the file is
// converted to an Export.
// NOTE(review): several assignments are garbled in this extract (the
// outputFile/importName values, e2.name, e2.forwardTo, e2.extName, e2.data),
// and the statement that stores `e2` plus all closing braces are missing.
static void parseModuleDefs(StringRef path) {
std::unique_ptr<MemoryBuffer> mb = CHECK(
MemoryBuffer::getFile(path, -1, false, true), "could not open " + path);
COFFModuleDefinition m = check(parseCOFFModuleDefinition(
mb->getMemBufferRef(), config->machine, config->mingw));
if (config->outputFile.empty())
config->outputFile = std::string(;
config->importName = std::string(;
if (m.ImageBase)
config->imageBase = m.ImageBase;
if (m.StackReserve)
config->stackReserve = m.StackReserve;
if (m.StackCommit)
config->stackCommit = m.StackCommit;
if (m.HeapReserve)
config->heapReserve = m.HeapReserve;
if (m.HeapCommit)
config->heapCommit = m.HeapCommit;
if (m.MajorImageVersion)
config->majorImageVersion = m.MajorImageVersion;
if (m.MinorImageVersion)
config->minorImageVersion = m.MinorImageVersion;
if (m.MajorOSVersion)
config->majorOSVersion = m.MajorOSVersion;
if (m.MinorOSVersion)
config->minorOSVersion = m.MinorOSVersion;
for (COFFShortExport e1 : m.Exports) {
Export e2;
// In simple cases, only Name is set. Renamed exports are parsed
// and set as "ExtName = Name". If Name has the form "OtherDll.Func",
// it shouldn't be a normal exported function but a forward to another
// DLL instead. This is supported by both MS and GNU linkers.
if (!e1.ExtName.empty() && e1.ExtName != e1.Name &&
StringRef(e1.Name).contains('.')) { =;
e2.forwardTo =;
} =;
e2.extName =;
e2.ordinal = e1.Ordinal;
e2.noname = e1.Noname; = e1.Data;
e2.isPrivate = e1.Private;
e2.constant = e1.Constant;
// Accepts a deferred unit of work; LinkerDriver::run() drains a member named
// taskQueue, which is presumably where `task` is stored.
// NOTE(review): the body and closing brace are missing from this extract --
// confirm against upstream.
void LinkerDriver::enqueueTask(std::function<void()> task) {
// Drains taskQueue under the "Input File Reading" timer. Returns true when
// the queue was non-empty on entry, i.e. when there was work to do.
// NOTE(review): the loop body that pops and executes a task, and the closing
// braces, are missing from this extract.
bool LinkerDriver::run() {
ScopedTimer t(inputFileTimer);
bool didWork = !taskQueue.empty();
while (!taskQueue.empty()) {
return didWork;
// Parse an /order file. If an option is given, the linker places
// COMDAT sections in the same order as their names appear in the
// given file.
// `arg` must be "@" followed by the file path. Each listed symbol is stored
// in config->order with priority INT_MIN + (current map size), so earlier
// lines receive smaller values. On 32-bit x86, undecorated names get a
// leading "_". Symbols not found among the known section symbols produce
// warning LNK4037 unless config->warnMissingOrderSymbol is false.
// NOTE(review): the early return after the "malformed" error, the statement
// that fills `set`, any statement following the missing-symbol warning, and
// the closing braces are missing from this extract.
static void parseOrderFile(StringRef arg) {
// For some reason, the MSVC linker requires a filename to be
// preceded by "@".
if (!arg.startswith("@")) {
error("malformed /order option: '@' missing");
// Get a list of all comdat sections for error checking.
DenseSet<StringRef> set;
for (Chunk *c : symtab->getChunks())
if (auto *sec = dyn_cast<SectionChunk>(c))
if (sec->sym)
// Open a file.
StringRef path = arg.substr(1);
std::unique_ptr<MemoryBuffer> mb = CHECK(
MemoryBuffer::getFile(path, -1, false, true), "could not open " + path);
// Parse a file. An order file contains one symbol per line.
// All symbols that were not present in a given order file are
// considered to have the lowest priority 0 and are placed at
// end of an output section.
// Note: the loop variable below shadows the function parameter `arg`.
for (StringRef arg : args::getLines(mb->getMemBufferRef())) {
std::string s(arg);
if (config->machine == I386 && !isDecorated(s))
s = "_" + s;
if (set.count(s) == 0) {
if (config->warnMissingOrderSymbol)
warn("/order:" + arg + ": missing symbol: " + s + " [LNK4037]");
config->order[s] = INT_MIN + config->order.size();
// Sets keepUnique on the SectionChunk that defines `s`. Does nothing when
// `s` is null, is not a Defined symbol, or its chunk is null or not a
// SectionChunk (both casts tolerate null).
// NOTE(review): the closing brace is missing from this extract.
static void markAddrsig(Symbol *s) {
if (auto *d = dyn_cast_or_null<Defined>(s))
if (SectionChunk *c = dyn_cast_or_null<SectionChunk>(d->getChunk()))
c->keepUnique = true;
// Marks the sections whose address is significant.
// Exported symbols are handled first. Then, per object file: if it has an
// addrsig section, that section is decoded as a sequence of ULEB128 symbol
// indices (a malformed or out-of-range entry is fatal); otherwise all of the
// file's symbols are treated as address-significant.
// NOTE(review): the markAddrsig() calls themselves, the wrapper around
// getSectionContents(), and the closing braces are missing from this
// extract; the description follows the surviving comments and control flow.
static void findKeepUniqueSections() {
// Exported symbols could be address-significant in other executables or DSOs,
// so we conservatively mark them as address-significant.
for (Export &r : config->exports)
// Visit the address-significance table in each object file and mark each
// referenced symbol as address-significant.
for (ObjFile *obj : ObjFile::instances) {
ArrayRef<Symbol *> syms = obj->getSymbols();
if (obj->addrsigSec) {
ArrayRef<uint8_t> contents;
obj->getCOFFObj()->getSectionContents(obj->addrsigSec, contents));
const uint8_t *cur = contents.begin();
while (cur != contents.end()) {
unsigned size;
const char *err;
uint64_t symIndex = decodeULEB128(cur, &size, contents.end(), &err);
if (err)
fatal(toString(obj) + ": could not decode addrsig section: " + err);
if (symIndex >= syms.size())
fatal(toString(obj) + ": invalid symbol index in addrsig section");
// Advance past the bytes consumed by this ULEB128 value.
cur += size;
} else {
// If an object file does not have an address-significance table,
// conservatively mark all of its symbols as address-significant.
for (Symbol *s : syms)
// link.exe replaces each %foo% in altPath with the contents of environment
// variable foo, and adds the two magic env vars _PDB (expands to the basename
// of pdb's output path) and _EXT (expands to the extension of the output
// binary).
// lld only supports %_PDB% and %_EXT% and warns on references to all other env
// vars.
// The expanded string is stored in config->pdbAltPath. Variable names are
// matched case-insensitively. Paths are interpreted with Windows path rules.
// NOTE(review): the statements that append to `buf` in the literal-tail,
// %_PDB%, %_EXT% and unknown-variable cases, the `break` out of the loop, and
// the closing braces are missing from this extract.
static void parsePDBAltPath(StringRef altPath) {
SmallString<128> buf;
StringRef pdbBasename =
sys::path::filename(config->pdbPath, sys::path::Style::windows);
StringRef binaryExtension =
sys::path::extension(config->outputFile, sys::path::Style::windows);
if (!binaryExtension.empty())
binaryExtension = binaryExtension.substr(1); // %_EXT% does not include '.'.
// Invariant:
// +--------- cursor ('a...' might be the empty string).
// | +----- firstMark
// | | +- secondMark
// v v v
// a...%...%...
size_t cursor = 0;
while (cursor < altPath.size()) {
size_t firstMark, secondMark;
if ((firstMark = altPath.find('%', cursor)) == StringRef::npos ||
(secondMark = altPath.find('%', firstMark + 1)) == StringRef::npos) {
// Didn't find another full fragment, treat rest of string as literal.
// Found a full fragment. Append text in front of first %, and interpret
// text between first and second % as variable name.
buf.append(altPath.substr(cursor, firstMark - cursor));
// `var` includes both surrounding '%' characters.
StringRef var = altPath.substr(firstMark, secondMark - firstMark + 1);
if (var.equals_lower("%_pdb%"))
else if (var.equals_lower("%_ext%"))
else {
warn("only %_PDB% and %_EXT% supported in /pdbaltpath:, keeping " +
var + " as literal");
cursor = secondMark + 1;
config->pdbAltPath = buf;
/// Convert resource files and potentially merge input resource object
/// trees into one resource tree.
/// Call after ObjFile::Instances is complete.
/// Outside MinGW mode it is an error to have more than one resource object
/// file, or one resource object file together with .res inputs. When there
/// are no .res inputs and at most one resource object, that object's
/// resource section is kept as is; otherwise the inputs are converted into a
/// single new ObjFile via convertResToCOFF().
/// NOTE(review): the statement collecting resource objects, the tail of the
/// error message, the early returns, the loop body that keeps the existing
/// section, the registration of the new ObjFile and all closing braces are
/// missing from this extract.
void LinkerDriver::convertResources() {
std::vector<ObjFile *> resourceObjFiles;
for (ObjFile *f : ObjFile::instances) {
if (f->isResourceObjFile())
if (!config->mingw &&
(resourceObjFiles.size() > 1 ||
(resourceObjFiles.size() == 1 && !resources.empty()))) {
error((!resources.empty() ? "internal .obj file created from .res files"
: toString(resourceObjFiles[1])) +
": more than one resource obj file not allowed, already got " +
if (resources.empty() && resourceObjFiles.size() <= 1) {
// No resources to convert, and max one resource object file in
// the input. Keep that preconverted resource section as is.
for (ObjFile *f : resourceObjFiles)
ObjFile *f = make<ObjFile>(convertResToCOFF(resources, resourceObjFiles));
// In MinGW, if no symbols are chosen to be exported, then all symbols are
// automatically exported by default. This behavior can be forced by the
// -export-all-symbols option, so that it happens even when exports are
// explicitly specified. The automatic behavior can be disabled using the
// -exclude-all-symbols option, so that lld-link behaves like link.exe rather
// than MinGW in the case that nothing is explicitly exported.
// Only applies when linking a DLL. Each symbol accepted by
// AutoExporter::shouldExport() becomes an Export; symbols whose chunk is not
// executable get special handling (see the garbled line near the end).
// NOTE(review): every early `return`, the handling of /wholearchive files,
// the assignments to the export's name and data fields (garbled), the push
// into config->exports and all closing braces are missing from this extract.
void LinkerDriver::maybeExportMinGWSymbols(const opt::InputArgList &args) {
if (!config->dll)
if (!args.hasArg(OPT_export_all_symbols)) {
if (!config->exports.empty())
if (args.hasArg(OPT_exclude_all_symbols))
AutoExporter exporter;
for (auto *arg : args.filtered(OPT_wholearchive_file))
if (Optional<StringRef> path = doFindFile(arg->getValue()))
symtab->forEachSymbol([&](Symbol *s) {
auto *def = dyn_cast<Defined>(s);
if (!exporter.shouldExport(def))
Export e; = def->getName();
e.sym = def;
if (Chunk *c = def->getChunk())
if (!(c->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE)) = true;
// lld has a feature to create a tar file containing all input files as well as
// all command line options, so that other people can run lld again with exactly
// the same inputs. This feature is accessible via /linkrepro and /reproduce.
// /linkrepro and /reproduce are very similar, but /linkrepro takes a directory
// name while /reproduce takes a full path. We have /linkrepro for compatibility
// with Microsoft link.exe.
// Returns the tar path to write, or None when neither option is present.
// /reproduce takes precedence over /linkrepro; for /linkrepro the file name
// "repro.tar" is appended to the given directory.
// NOTE(review): the closing braces are missing from this extract.
Optional<std::string> getReproduceFile(const opt::InputArgList &args) {
if (auto *arg = args.getLastArg(OPT_reproduce))
return std::string(arg->getValue());
if (auto *arg = args.getLastArg(OPT_linkrepro)) {
SmallString<64> path = StringRef(arg->getValue());
sys::path::append(path, "repro.tar");
return std::string(path);
return None;
void LinkerDriver::link(ArrayRef<const char *> argsArr) {
ScopedTimer rootTimer(Timer::root());
// Needed for LTO.
// If the first command line argument is "/lib", link.exe acts like lib.exe.
// We call our own implementation of lib.exe that understands bitcode files.
if (argsArr.size() > 1 && StringRef(argsArr[1]).equals_lower("/lib")) {
if (llvm::libDriverMain(argsArr.slice(1)) != 0)
fatal("lib failed");
// Parse command line options.
ArgParser parser;
opt::InputArgList args = parser.parse(argsArr);
// Parse and evaluate -mllvm options.
std::vector<const char *> v;
v.push_back("lld-link (LLVM option parsing)");
for (auto *arg : args.filtered(OPT_mllvm))
// Handle /errorlimit early, because error() depends on it.
if (auto *arg = args.getLastArg(OPT_errorlimit)) {
int n = 20;
StringRef s = arg->getValue();
if (s.getAsInteger(10, n))
error(arg->getSpelling() + " number expected, but got " + s);
errorHandler().errorLimit = n;
// Handle /help
if (args.hasArg(OPT_help)) {
// /threads: takes a positive integer and provides the default value for
// /opt:lldltojobs=.
if (auto *arg = args.getLastArg(OPT_threads)) {
StringRef v(arg->getValue());
unsigned threads = 0;
if (!llvm::to_integer(v, threads, 0) || threads == 0)
error(arg->getSpelling() + ": expected a positive integer, but got '" +
arg->getValue() + "'");
parallel::strategy = hardware_concurrency(threads);
config->thinLTOJobs = v.str();
if (args.hasArg(OPT_show_timing))
config->showTiming = true;
config->showSummary = args.hasArg(OPT_summary);
// Handle --version, which is an lld extension. This option is a bit odd
// because it doesn't start with "/", but we deliberately chose "--" to
// avoid conflict with /version and for compatibility with clang-cl.
if (args.hasArg(OPT_dash_dash_version)) {
lld::outs() << getLLDVersion() << "\n";
// Handle /lldmingw early, since it can potentially affect how other
// options are handled.
config->mingw = args.hasArg(OPT_lldmingw);
// Handle /linkrepro and /reproduce.
if (Optional<std::string> path = getReproduceFile(args)) {
Expected<std::unique_ptr<TarWriter>> errOrWriter =
TarWriter::create(*path, sys::path::stem(*path));
if (errOrWriter) {
tar = std::move(*errOrWriter);
} else {
error("/linkrepro: failed to open " + *path + ": " +
if (!args.hasArg(OPT_INPUT, OPT_wholearchive_file)) {
if (args.hasArg(OPT_deffile))
config->noEntry = true;
fatal("no input files");
// Construct search path list.
for (auto *arg : args.filtered(OPT_libpath))
if (!args.hasArg(OPT_lldignoreenv))
// Handle /ignore
for (auto *arg : args.filtered(OPT_ignore)) {
SmallVector<StringRef, 8> vec;
StringRef(arg->getValue()).split(vec, ',');
for (StringRef s : vec) {
if (s == "4037")
config->warnMissingOrderSymbol = false;
else if (s == "4099")
config->warnDebugInfoUnusable = false;
else if (s == "4217")
config->warnLocallyDefinedImported = false;
else if (s == "longsections")
config->warnLongSectionNames = false;
// Other warning numbers are ignored.
// Handle /out
if (auto *arg = args.getLastArg(OPT_out))
config->outputFile = arg->getValue();
// Handle /verbose
if (args.hasArg(OPT_verbose))
config->verbose = true;
errorHandler().verbose = config->verbose;
// Handle /force or /force:unresolved
if (args.hasArg(OPT_force, OPT_force_unresolved))
config->forceUnresolved = true;
// Handle /force or /force:multiple
if (args.hasArg(OPT_force, OPT_force_multiple))
config->forceMultiple = true;
// Handle /force or /force:multipleres
if (args.hasArg(OPT_force, OPT_force_multipleres))
config->forceMultipleRes = true;
// Handle /debug
DebugKind debug = parseDebugKind(args);
if (debug == DebugKind::Full || debug == DebugKind::Dwarf ||
debug == DebugKind::GHash) {
config->debug = true;
config->incremental = true;
// Handle /demangle
config->demangle = args.hasFlag(OPT_demangle, OPT_demangle_no);
// Handle /debugtype
config->debugTypes = parseDebugTypes(args);
// Handle /driver[:uponly|:wdm].
config->driverUponly = args.hasArg(OPT_driver_uponly) ||
args.hasArg(OPT_driver_uponly_wdm) ||
config->driverWdm = args.hasArg(OPT_driver_wdm) ||
args.hasArg(OPT_driver_uponly_wdm) ||
config->driver =
config->driverUponly || config->driverWdm || args.hasArg(OPT_driver);
// Handle /pdb
bool shouldCreatePDB =
(debug == DebugKind::Full || debug == DebugKind::GHash);
if (shouldCreatePDB) {
if (auto *arg = args.getLastArg(OPT_pdb))
config->pdbPath = arg->getValue();
if (auto *arg = args.getLastArg(OPT_pdbaltpath))
config->pdbAltPath = arg->getValue();
if (args.hasArg(OPT_natvis))
config->natvisFiles = args.getAllArgValues(OPT_natvis);
if (args.hasArg(OPT_pdbstream)) {
for (const StringRef value : args.getAllArgValues(OPT_pdbstream)) {
const std::pair<StringRef, StringRef> nameFile = value.split("=");
const StringRef name = nameFile.first;
const std::string file = nameFile.second.str();
config->namedStreams[name] = file;
if (auto *arg = args.getLastArg(OPT_pdb_source_path))
config->pdbSourcePath = arg->getValue();
// Handle /pdbstripped
if (args.hasArg(OPT_pdbstripped))
warn("ignoring /pdbstripped flag, it is not yet supported");
// Handle /noentry
if (args.hasArg(OPT_noentry)) {
if (args.hasArg(OPT_dll))
config->noEntry = true;
error("/noentry must be specified with /dll");
// Handle /dll
if (args.hasArg(OPT_dll)) {
config->dll = true;
config->manifestID = 2;
// Handle /dynamicbase and /fixed. We can't use hasFlag for /dynamicbase
// because we need to explicitly check whether that option or its inverse was
// present in the argument list in order to handle /fixed.
auto *dynamicBaseArg = args.getLastArg(OPT_dynamicbase, OPT_dynamicbase_no);
if (dynamicBaseArg &&
dynamicBaseArg->getOption().getID() == OPT_dynamicbase_no)
config->dynamicBase = false;
// MSDN claims "/FIXED:NO is the default setting for a DLL, and /FIXED is the
// default setting for any other project type.", but link.exe defaults to
// /FIXED:NO for exe outputs as well. Match behavior, not docs.
bool fixed = args.hasFlag(OPT_fixed, OPT_fixed_no, false);
if (fixed) {
if (dynamicBaseArg &&
dynamicBaseArg->getOption().getID() == OPT_dynamicbase) {
error("/fixed must not be specified with /dynamicbase");
} else {
config->relocatable = false;
config->dynamicBase = false;
// Handle /appcontainer
config->appContainer =
args.hasFlag(OPT_appcontainer, OPT_appcontainer_no, false);
// Handle /machine
if (auto *arg = args.getLastArg(OPT_machine)) {
config->machine = getMachineType(arg->getValue());
if (config->machine == IMAGE_FILE_MACHINE_UNKNOWN)
fatal(Twine("unknown /machine argument: ") + arg->getValue());
// Handle /nodefaultlib:<filename>
for (auto *arg : args.filtered(OPT_nodefaultlib))
// Handle /nodefaultlib
if (args.hasArg(OPT_nodefaultlib_all))
config->noDefaultLibAll = true;
// Handle /base
if (auto *arg = args.getLastArg(OPT_base))
parseNumbers(arg->getValue(), &config->imageBase);
// Handle /filealign
if (auto *arg = args.getLastArg(OPT_filealign)) {
parseNumbers(arg->getValue(), &config->fileAlign);
if (!isPowerOf2_64(config->fileAlign))
error("/filealign: not a power of two: " + Twine(config->fileAlign));
// Handle /stack
if (auto *arg = args.getLastArg(OPT_stack))
parseNumbers(arg->getValue(), &config->stackReserve, &config->stackCommit);
// Handle /guard:cf
if (auto *arg = args.getLastArg(OPT_guard))
// Handle /heap
if (auto *arg = args.getLastArg(OPT_heap))
parseNumbers(arg->getValue(), &config->heapReserve, &config->heapCommit);
// Handle /version
if (auto *arg = args.getLastArg(OPT_version))
parseVersion(arg->getValue(), &config->majorImageVersion,
// Handle /subsystem
if (auto *arg = args.getLastArg(OPT_subsystem))
parseSubsystem(arg->getValue(), &config->subsystem, &config->majorOSVersion,
// Handle /timestamp
if (llvm::opt::Arg *arg = args.getLastArg(OPT_timestamp, OPT_repro)) {
if (arg->getOption().getID() == OPT_repro) {
config->timestamp = 0;
config->repro = true;
} else {
config->repro = false;
StringRef value(arg->getValue());
if (value.getAsInteger(0, config->timestamp))
fatal(Twine("invalid timestamp: ") + value +
". Expected 32-bit integer");
} else {
config->repro = false;
config->timestamp = time(nullptr);
// Handle /alternatename
for (auto *arg : args.filtered(OPT_alternatename))
// Handle /include
for (auto *arg : args.filtered(OPT_incl))
// Handle /implib
if (auto *arg = args.getLastArg(OPT_implib))
config->implib = arg->getValue();
// Handle /opt.
bool doGC = debug == DebugKind::None || args.hasArg(OPT_profile);
unsigned icfLevel =
args.hasArg(OPT_profile) ? 0 : 1; // 0: off, 1: limited, 2: on
unsigned tailMerge = 1;
for (auto *arg : args.filtered(OPT_opt)) {
std::string str = StringRef(arg->getValue()).lower();
SmallVector<StringRef, 1> vec;
StringRef(str).split(vec, ',');
for (StringRef s : vec) {
if (s == "ref") {
doGC = true;
} else if (s == "noref") {
doGC = false;
} else if (s == "icf" || s.startswith("icf=")) {
icfLevel = 2;
} else if (s == "noicf") {
icfLevel = 0;
} else if (s == "lldtailmerge") {
tailMerge = 2;
} else if (s == "nolldtailmerge") {
tailMerge = 0;
} else if (s.startswith("lldlto=")) {
StringRef optLevel = s.substr(7);
if (optLevel.getAsInteger(10, config->ltoo) || config->ltoo > 3)
error("/opt:lldlto: invalid optimization level: " + optLevel);
} else if (s.startswith("lldltojobs=")) {
StringRef jobs = s.substr(11);
if (!get_threadpool_strategy(jobs))
error("/opt:lldltojobs: invalid job count: " + jobs);
config->thinLTOJobs = jobs.str();
} else if (s.startswith("lldltopartitions=")) {
StringRef n = s.substr(17);
if (n.getAsInteger(10, config->ltoPartitions) ||
config->ltoPartitions == 0)
error("/opt:lldltopartitions: invalid partition count: " + n);
} else if (s != "lbr" && s != "nolbr")
error("/opt: unknown option: " + s);
// Limited ICF is enabled if GC is enabled and ICF was never mentioned
// explicitly.
// FIXME: LLD only implements "limited" ICF, i.e. it only merges identical
// code. If the user passes /OPT:ICF explicitly, LLD should merge identical
// comdat readonly data.
if (icfLevel == 1 && !doGC)
icfLevel = 0;
config->doGC = doGC;
config->doICF = icfLevel > 0;
config->tailMerge = (tailMerge == 1 && config->doICF) || tailMerge == 2;
// Handle /lldsavetemps
if (args.hasArg(OPT_lldsavetemps))
config->saveTemps = true;
// Handle /kill-at
if (args.hasArg(OPT_kill_at))
config->killAt = true;
// Handle /lldltocache
if (auto *arg = args.getLastArg(OPT_lldltocache))
config->ltoCache = arg->getValue();
// Handle /lldltocachepolicy
if (auto *arg = args.getLastArg(OPT_lldltocachepolicy))
config->ltoCachePolicy = CHECK(
Twine("/lldltocachepolicy: invalid cache policy: ") + arg->getValue());
// Handle /failifmismatch
for (auto *arg : args.filtered(OPT_failifmismatch))
checkFailIfMismatch(arg->getValue(), nullptr);
// Handle /merge
for (auto *arg : args.filtered(OPT_merge))
// Add default section merging rules after user rules. User rules take
// precedence, but we will emit a warning if there is a conflict.
if (config->mingw) {
// Handle /section
for (auto *arg : args.filtered(OPT_section))
// Handle /align
if (auto *arg = args.getLastArg(OPT_align)) {
parseNumbers(arg->getValue(), &config->align);
if (!isPowerOf2_64(config->align))
error("/align: not a power of two: " + StringRef(arg->getValue()));
if (!args.hasArg(OPT_driver))
warn("/align specified without /driver; image may not run");
// Handle /aligncomm
for (auto *arg : args.filtered(OPT_aligncomm))
// Handle /manifestdependency. This enables /manifest unless /manifest:no is
// also passed.
if (auto *arg = args.getLastArg(OPT_manifestdependency)) {
config->manifestDependency = arg->getValue();
config->manifest = Configuration::SideBySide;
// Handle /manifest and /manifest:
if (auto *arg = args.getLastArg(OPT_manifest, OPT_manifest_colon)) {
if (arg->getOption().getID() == OPT_manifest)
config->manifest = Configuration::SideBySide;
// Handle /manifestuac
if (auto *arg = args.getLastArg(OPT_manifestuac))
// Handle /manifestfile
if (auto *arg = args.getLastArg(OPT_manifestfile))
config->manifestFile = arg->getValue();
// Handle /manifestinput
for (auto *arg : args.filtered(OPT_manifestinput))
if (!config->manifestInput.empty() &&
config->manifest != Configuration::Embed) {
fatal("/manifestinput: requires /manifest:embed");
config->thinLTOEmitImportsFiles = args.hasArg(OPT_thinlto_emit_imports_files);
config->thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) ||
config->thinLTOIndexOnlyArg =
config->thinLTOPrefixReplace =
getOldNewOptions(args, OPT_thinlto_prefix_replace);
config->thinLTOObjectSuffixReplace =
getOldNewOptions(args, OPT_thinlto_object_suffix_replace);
config->ltoObjPath = args.getLastArgValue(OPT_lto_obj_path);
// Handle miscellaneous boolean flags.
config->allowBind = args.hasFlag(OPT_allowbind, OPT_allowbind_no, true);
config->allowIsolation =
args.hasFlag(OPT_allowisolation, OPT_allowisolation_no, true);
config->incremental =
args.hasFlag(OPT_incremental, OPT_incremental_no,
!config->doGC && !config->doICF && !args.hasArg(OPT_order) &&
config->integrityCheck =
args.hasFlag(OPT_integritycheck, OPT_integritycheck_no, false);
config->cetCompat = args.hasFlag(OPT_cetcompat, OPT_cetcompat_no, false);
config->nxCompat = args.hasFlag(OPT_nxcompat, OPT_nxcompat_no, true);
for (auto *arg : args.filtered(OPT_swaprun))
config->terminalServerAware =
!config->dll && args.hasFlag(OPT_tsaware, OPT_tsaware_no, true);
config->debugDwarf = debug == DebugKind::Dwarf;
config->debugGHashes = debug == DebugKind::GHash;
config->debugSymtab = debug == DebugKind::Symtab;
config->autoImport =
args.hasFlag(OPT_auto_import, OPT_auto_import_no, config->mingw);
config->pseudoRelocs = args.hasFlag(
OPT_runtime_pseudo_reloc, OPT_runtime_pseudo_reloc_no, config->mingw);
// Don't warn about long section names, such as .debug_info, for mingw or when
// -debug:dwarf is requested.
if (config->mingw || config->debugDwarf)
config->warnLongSectionNames = false;
config->lldmapFile = getMapFile(args, OPT_lldmap, OPT_lldmap_file);
config->mapFile = getMapFile(args, OPT_map, OPT_map_file);
if (config->lldmapFile != "" && config->lldmapFile == config->mapFile) {
warn("/lldmap and /map have the same output file '" + config->mapFile +
"'.\n>>> ignoring /lldmap");
if (config->incremental && args.hasArg(OPT_profile)) {
warn("ignoring '/incremental' due to '/profile' specification");
config->incremental = false;
if (config->incremental && args.hasArg(OPT_order)) {
warn("ignoring '/incremental' due to '/order' specification");
config->incremental = false;
if (config->incremental && config->doGC) {
warn("ignoring '/incremental' because REF is enabled; use '/opt:noref' to "
config->incremental = false;
if (config->incremental && config->doICF) {
warn("ignoring '/incremental' because ICF is enabled; use '/opt:noicf' to "
config->incremental = false;
if (errorCount())
std::set<sys::fs::UniqueID> wholeArchives;
for (auto *arg : args.filtered(OPT_wholearchive_file))
if (Optional<StringRef> path = doFindFile(arg->getValue()))
if (Optional<sys::fs::UniqueID> id = getUniqueID(*path))
// A predicate returning true if a given path is an argument for
// /wholearchive:, or /wholearchive is enabled globally.
// This function is a bit tricky because "foo.obj /wholearchive:././foo.obj"
// needs to be handled as "/wholearchive:foo.obj foo.obj".
auto isWholeArchive = [&](StringRef path) -> bool {
if (args.hasArg(OPT_wholearchive_flag))
return true;
if (Optional<sys::fs::UniqueID> id = getUniqueID(path))
return wholeArchives.count(*id);
return false;
// Create a list of input files. These can be given as OPT_INPUT options
// and OPT_wholearchive_file options, and we also need to track OPT_start_lib
// and OPT_end_lib.
bool inLib = false;
for (auto *arg : args) {
switch (arg->getOption().getID()) {
case OPT_end_lib:
if (!inLib)
error("stray " + arg->getSpelling());
inLib = false;
case OPT_start_lib:
if (inLib)
error("nested " + arg->getSpelling());
inLib = true;
case OPT_wholearchive_file:
if (Optional<StringRef> path = findFile(arg->getValue()))
enqueuePath(*path, true, inLib);
if (Optional<StringRef> path = findFile(arg->getValue()))
enqueuePath(*path, isWholeArchive(*path), inLib);
// Ignore other options.
// Process files specified as /defaultlib. These should be enqueued after
// other files, which is why they are in a separate loop.
for (auto *arg : args.filtered(OPT_defaultlib))
if (Optional<StringRef> path = findLib(arg->getValue()))
enqueuePath(*path, false, false);
// Windows specific -- Create a resource file containing a manifest file.
if (config->manifest == Configuration::Embed)
addBuffer(createManifestRes(), false, false);
// Read all input files given via the command line.
if (errorCount())
// We should have inferred a machine type by now from the input files, but if
// not we assume x64.
if (config->machine == IMAGE_FILE_MACHINE_UNKNOWN) {
warn("/machine is not specified. x64 is assumed");
config->machine = AMD64;
config->wordsize = config->is64() ? 8 : 4;
// Handle /safeseh, x86 only, on by default, except for mingw.
- if (config->machine == I386 &&
- args.hasFlag(OPT_safeseh, OPT_safeseh_no, !config->mingw))
- config->safeSEH = true;
+ if (config->machine == I386) {
+ config->safeSEH = args.hasFlag(OPT_safeseh, OPT_safeseh_no, !config->mingw);
+ config->noSEH = args.hasArg(OPT_noseh);
+ }
// Handle /functionpadmin
for (auto *arg : args.filtered(OPT_functionpadmin, OPT_functionpadmin_opt))
parseFunctionPadMin(arg, config->machine);
if (tar)
createResponseFile(args, filePaths,
// Handle /largeaddressaware
config->largeAddressAware = args.hasFlag(
OPT_largeaddressaware, OPT_largeaddressaware_no, config->is64());
// Handle /highentropyva
config->highEntropyVA =
config->is64() &&
args.hasFlag(OPT_highentropyva, OPT_highentropyva_no, true);
if (!config->dynamicBase &&
(config->machine == ARMNT || config->machine == ARM64))
error("/dynamicbase:no is not compatible with " +
// Handle /export
for (auto *arg : args.filtered(OPT_export)) {
Export e = parseExport(arg->getValue());
if (config->machine == I386) {
if (!isDecorated( ="_" +;
if (!e.extName.empty() && !isDecorated(e.extName))
e.extName ="_" + e.extName);
// Handle /def
if (auto *arg = args.getLastArg(OPT_deffile)) {
// parseModuleDefs mutates Config object.
// Handle generation of import library from a def file.
if (!args.hasArg(OPT_INPUT, OPT_wholearchive_file)) {
// Windows specific -- if no /subsystem is given, we need to infer
// that from entry point name. Must happen before /entry handling,
// and after the early return when just writing an import library.
if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN) {
config->subsystem = inferSubsystem();
if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN)
fatal("subsystem must be defined");
// Handle /entry and /dll
if (auto *arg = args.getLastArg(OPT_entry)) {
config->entry = addUndefined(mangle(arg->getValue()));
} else if (!config->entry && !config->noEntry) {
if (args.hasArg(OPT_dll)) {
StringRef s = (config->machine == I386) ? "__DllMainCRTStartup@12"
: "_DllMainCRTStartup";
config->entry = addUndefined(s);
} else if (config->driverWdm) {
// /driver:wdm implies /entry:_NtProcessStartup
config->entry = addUndefined(mangle("_NtProcessStartup"));
} else {
// Windows specific -- If entry point name is not given, we need to
// infer that from user-defined entry name.
StringRef s = findDefaultEntry();
if (s.empty())
fatal("entry point must be defined");
config->entry = addUndefined(s);
log("Entry name inferred: " + s);
// Handle /delayload
for (auto *arg : args.filtered(OPT_delayload)) {
if (config->machine == I386) {
config->delayLoadHelper = addUndefined("___delayLoadHelper2@8");
} else {
config->delayLoadHelper = addUndefined("__delayLoadHelper2");
// Set default image name if neither /out or /def set it.
if (config->outputFile.empty()) {
config->outputFile = getOutputPath(
(*args.filtered(OPT_INPUT, OPT_wholearchive_file).begin())->getValue());
// Fail early if an output file is not writable.
if (auto e = tryCreateFile(config->outputFile)) {
error("cannot open output file " + config->outputFile + ": " + e.message());
if (shouldCreatePDB) {
// Put the PDB next to the image if no /pdb flag was passed.
if (config->pdbPath.empty()) {
config->pdbPath = config->outputFile;
sys::path::replace_extension(config->pdbPath, ".pdb");
// The embedded PDB path should be the absolute path to the PDB if no
// /pdbaltpath flag was passed.
if (config->pdbAltPath.empty()) {
config->pdbAltPath = config->pdbPath;
// It's important to make the path absolute and remove dots. This path
// will eventually be written into the PE header, and certain Microsoft
// tools won't work correctly if these assumptions are not held.
} else {
// Don't do this earlier, so that Config->OutputFile is ready.
// Set default image base if /base is not given.
if (config->imageBase == uint64_t(-1))
config->imageBase = getDefaultImageBase();
symtab->addSynthetic(mangle("__ImageBase"), nullptr);
if (config->machine == I386) {
symtab->addAbsolute("___safe_se_handler_table", 0);
symtab->addAbsolute("___safe_se_handler_count", 0);
symtab->addAbsolute(mangle("__guard_fids_count"), 0);
symtab->addAbsolute(mangle("__guard_fids_table"), 0);
symtab->addAbsolute(mangle("__guard_flags"), 0);
symtab->addAbsolute(mangle("__guard_iat_count"), 0);
symtab->addAbsolute(mangle("__guard_iat_table"), 0);
symtab->addAbsolute(mangle("__guard_longjmp_count"), 0);
symtab->addAbsolute(mangle("__guard_longjmp_table"), 0);
// Needed for MSVC 2017 15.5 CRT.
symtab->addAbsolute(mangle("__enclave_config"), 0);
if (config->pseudoRelocs) {
symtab->addAbsolute(mangle("__RUNTIME_PSEUDO_RELOC_LIST__"), 0);
symtab->addAbsolute(mangle("__RUNTIME_PSEUDO_RELOC_LIST_END__"), 0);
if (config->mingw) {
symtab->addAbsolute(mangle("__CTOR_LIST__"), 0);
symtab->addAbsolute(mangle("__DTOR_LIST__"), 0);
// This code may add new undefined symbols to the link, which may enqueue more
// symbol resolution tasks, so we need to continue executing tasks until we
// converge.
do {
// Windows specific -- if entry point is not found,
// search for its mangled names.
if (config->entry)
// Windows specific -- Make sure we resolve all dllexported symbols.
for (Export &e : config->exports) {
if (!e.forwardTo.empty())
e.sym = addUndefined(;
if (!e.directives)
e.symbolName = mangleMaybe(e.sym);
// Add weak aliases. Weak aliases are a mechanism to give remaining
// undefined symbols a final chance to be resolved successfully.
for (auto pair : config->alternateNames) {
StringRef from = pair.first;
StringRef to = pair.second;
Symbol *sym = symtab->find(from);
if (!sym)
if (auto *u = dyn_cast<Undefined>(sym))
if (!u->weakAlias)
u->weakAlias = symtab->addUndefined(to);
// If any inputs are bitcode files, the LTO code generator may create
// references to library functions that are not explicit in the bitcode
// file's symbol table. If any of those library functions are defined in a
// bitcode file in an archive member, we need to arrange to use LTO to
// compile those archive members by adding them to the link beforehand.
if (!BitcodeFile::instances.empty())
for (auto *s : lto::LTO::getRuntimeLibcallSymbols())
// Windows specific -- if __load_config_used can be resolved, resolve it.
if (symtab->findUnderscore("_load_config_used"))
} while (run());
if (args.hasArg(OPT_include_optional)) {
// Handle /includeoptional
for (auto *arg : args.filtered(OPT_include_optional))
if (dyn_cast_or_null<LazyArchive>(symtab->find(arg->getValue())))
while (run());
if (config->autoImport) {
// MinGW specific.
// Load any further object files that might be needed for doing automatic
// imports.
// For cases with no automatically imported symbols, this iterates once
// over the symbol table and doesn't do anything.
// For the normal case with a few automatically imported symbols, this
// should only need to be run once, since each new object file imported
// is an import library and wouldn't add any new undefined references,
// but there's nothing stopping the __imp_ symbols from coming from a
// normal object file as well (although that won't be used for the
// actual autoimport later on). If this pass adds new undefined references,
// we won't iterate further to resolve them.
// At this point, we should not have any symbols that cannot be resolved.
// If we are going to do codegen for link-time optimization, check for
// unresolvable symbols first, so we don't spend time generating code that
// will fail to link anyway.
if (!BitcodeFile::instances.empty() && !config->forceUnresolved)
if (errorCount())
// Do LTO by compiling bitcode input files to a set of native COFF files then
// link those files (unless -thinlto-index-only was given, in which case we
// resolve symbols and write indices, but don't generate native code or link).
// If -thinlto-index-only is given, we should create only "index
// files" and not object files. Index file creation is already done
// in addCombinedLTOObject, so we are done if that's the case.
if (config->thinLTOIndexOnly)
// If we generated native object files from bitcode files, this resolves
// references to the symbols we use from them.
// Resolve remaining undefined symbols and warn about imported locals.
if (errorCount())
config->hadExplicitExports = !config->exports.empty();
if (config->mingw) {
// In MinGW, all symbols are automatically exported if no symbols
// are chosen to be exported.
// Make sure the crtend.o object is the last object file. This object
// file can contain terminating section chunks that need to be placed
// last. GNU ld processes files and static libraries explicitly in the
// order provided on the command line, while lld will pull in needed
// files from static libraries only after the last object file on the
// command line.
for (auto i = ObjFile::instances.begin(), e = ObjFile::instances.end();
i != e; i++) {
ObjFile *file = *i;
if (isCrtend(file->getName())) {
// Windows specific -- when we are creating a .dll file, we also
// need to create a .lib file. In MinGW mode, we only do that when the
// -implib option is given explicitly, for compatibility with GNU ld.
if (!config->exports.empty() || config->dll) {
if (!config->mingw || !config->implib.empty())
// Handle /output-def (MinGW specific).
if (auto *arg = args.getLastArg(OPT_output_def))
// Set extra alignment for .comm symbols
for (auto pair : config->alignComm) {
StringRef name = pair.first;
uint32_t alignment = pair.second;
Symbol *sym = symtab->find(name);
if (!sym) {
warn("/aligncomm symbol " + name + " not found");
// If the symbol isn't common, it must have been replaced with a regular
// symbol, which will carry its own alignment.
auto *dc = dyn_cast<DefinedCommon>(sym);
if (!dc)
CommonChunk *c = dc->getChunk();
c->setAlignment(std::max(c->getAlignment(), alignment));
// Windows specific -- Create a side-by-side manifest file.
if (config->manifest == Configuration::SideBySide)
// Handle /order. We want to do this at this moment because we
// need a complete list of comdat sections to warn on nonexistent
// functions.
if (auto *arg = args.getLastArg(OPT_order))
// Identify unreferenced COMDAT sections.
if (config->doGC)
// Needs to happen after the last call to addFile().
// Identify identical COMDAT sections to merge them.
if (config->doICF) {
// Write the result.
// Stop early so we can print the results.
if (config->showTiming)
} // namespace coff
} // namespace lld
diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp
index 0adc2b91bd99..4346b3a2ffa7 100644
--- a/lld/COFF/InputFiles.cpp
+++ b/lld/COFF/InputFiles.cpp
@@ -1,1064 +1,1064 @@
//===- InputFiles.cpp -----------------------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "InputFiles.h"
#include "Chunks.h"
#include "Config.h"
#include "DebugTypes.h"
#include "Driver.h"
#include "SymbolTable.h"
#include "Symbols.h"
#include "lld/Common/DWARF.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "llvm-c/lto.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
#include "llvm/LTO/LTO.h"
#include "llvm/Object/Binary.h"
#include "llvm/Object/COFF.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Target/TargetOptions.h"
#include <cstring>
#include <system_error>
#include <utility>
using namespace llvm;
using namespace llvm::COFF;
using namespace llvm::codeview;
using namespace llvm::object;
using namespace llvm::support::endian;
using namespace lld;
using namespace lld::coff;
using llvm::Triple;
using llvm::support::ulittle32_t;
// Returns the last element of a path, which is supposed to be a filename.
// The path is always split using Windows path rules
// (sys::path::Style::windows).
static StringRef getBasename(StringRef path) {
return sys::path::filename(path, sys::path::Style::windows);
// Returns a human-readable name for an input file:
//   - "<internal>" when file is null;
//   - the file's own name when it has no parent name or is an import file;
//   - otherwise the parent's basename, then "(", then the file's basename
//     (parent first, e.g. "bar.lib(foo.obj").
// NOTE(review): the tail of the final return expression is not visible here;
// presumably it appends ")" and converts the result to std::string.
std::string lld::toString(const coff::InputFile *file) {
if (!file)
return "<internal>";
// Files without a parent name, and import files, use their own name as-is.
if (file->parentName.empty() || file->kind() == coff::InputFile::ImportKind)
return std::string(file->getName());
return (getBasename(file->parentName) + "(" + getBasename(file->getName()) +
// Out-of-line definitions of the static `instances` containers belonging to
// ObjFile, PDBInputFile, ImportFile and BitcodeFile. PDBInputFile uses a map
// keyed by std::string; the others are plain vectors of pointers.
std::vector<ObjFile *> ObjFile::instances;
std::map<std::string, PDBInputFile *> PDBInputFile::instances;
std::vector<ImportFile *> ImportFile::instances;
std::vector<BitcodeFile *> BitcodeFile::instances;
/// Checks that Source is compatible with being a weak alias to Target.
/// If Source is Undefined and has no weak alias set, makes it a weak
/// alias to Target.
///
/// \param symtab  symbol table used to report a duplicate on conflict.
/// \param f       input file passed along to the duplicate report.
/// \param source  symbol that should alias \p target; nothing happens unless
///                it is an Undefined symbol.
/// \param target  symbol that \p source should be aliased to.
static void checkAndSetWeakAlias(SymbolTable *symtab, InputFile *f,
Symbol *source, Symbol *target) {
if (auto *u = dyn_cast<Undefined>(source)) {
// Conflict case: source already has a weak alias and it is a different
// symbol than target.
if (u->weakAlias && u->weakAlias != target) {
// Weak aliases as produced by GCC are named in the form
// .weak.<weaksymbol>.<othersymbol>, where <othersymbol> is the name
// of another symbol emitted near the weak symbol.
// Just use the definition from the first object file that defined
// this weak symbol.
// NOTE(review): the statement controlled by the mingw check below is not
// visible here; per the comment above it presumably leaves the existing
// alias in place instead of reporting a duplicate -- confirm.
if (config->mingw)
symtab->reportDuplicate(source, f);
u->weakAlias = target;
// Returns true if name is "@feat.00" or the empty string.
// NOTE(review): the empty literal may stand in for a second "@..." name that
// was lost from this copy of the file -- confirm against the original.
static bool ignoredSymbolName(StringRef name) {
return name == "@feat.00" || name == "";
// Constructs an archive input file over buffer m, tagging it as ArchiveKind.
ArchiveFile::ArchiveFile(MemoryBufferRef m) : InputFile(ArchiveKind, m) {}
// Opens the underlying buffer as an archive and registers every entry of the
// archive's symbol table with the symbol table via addLazyArchive.
void ArchiveFile::parse() {
// Parse a MemoryBufferRef as an archive file.
file = CHECK(Archive::create(mb), this);
// Read the symbol table to construct Lazy objects.
for (const Archive::Symbol &sym : file->symbols())
symtab->addLazyArchive(this, sym);
// Hands the archive member that defines sym to the driver through
// enqueueArchiveMember, together with this archive's name. Nothing is
// returned. Members are tracked in `seen` by their child offset.
void ArchiveFile::addMember(const Archive::Symbol &sym) {
const Archive::Child &c =
"could not get the member for symbol " + toCOFFString(sym));
// The insert result's `.second` is false when this child offset was already
// recorded, i.e. the member has been requested before.
// NOTE(review): the statement guarded by this check is not visible here;
// presumably it returns early so each member is enqueued only once.
if (!seen.insert(c.getChildOffset()).second)
driver->enqueueArchiveMember(c, sym, getName());
// Walks the children of an archive, obtaining a MemoryBufferRef for each, and
// returns the vector v. Reports a fatal error (prefixed with the archive's
// file name) if iterating the children fails.
// NOTE(review): the statement that appends each mbref to v is not visible
// here -- confirm against the complete file.
std::vector<MemoryBufferRef> lld::coff::getArchiveMembers(Archive *file) {
std::vector<MemoryBufferRef> v;
// children() reports iteration failures through err, which is inspected
// after the loop.
Error err = Error::success();
for (const Archive::Child &c : file->children(err)) {
MemoryBufferRef mbref =
file->getFileName() +
": could not get the buffer for a child of the archive");
if (err)
fatal(file->getFileName() +
": Archive::children failed: " + toString(std::move(err)));
return v;
// Materializes this lazy object as a real input file: a BitcodeFile when the
// buffer is bitcode, an ObjFile otherwise. The collected `symbols` are moved
// into the new file, and mb is reset to an empty buffer at the end.
// NOTE(review): several statements are not visible here (what the empty-buffer
// check guards, the `else` between the two make<> calls, and whatever is done
// with `file`). Presumably the empty-buffer check makes repeated calls no-ops.
void LazyObjFile::fetch() {
if (mb.getBuffer().empty())
InputFile *file;
if (isBitcode(mb))
file = make<BitcodeFile>(mb, "", 0, std::move(symbols));
file = make<ObjFile>(mb, std::move(symbols));
// Clear the buffer reference; the emptiness test at the top keys off this.
mb = {};
// Scans the symbols of the buffer and registers them with the symbol table
// via addLazyObject. Bitcode buffers are read through lto::InputFile; anything
// else is read as a native COFF object.
// NOTE(review): some control-flow statements (the end of the bitcode branch
// and the skips inside the COFF loop) are not visible in this copy.
void LazyObjFile::parse() {
if (isBitcode(this->mb)) {
// Bitcode file.
std::unique_ptr<lto::InputFile> obj =
CHECK(lto::InputFile::create(this->mb), this);
for (const lto::InputFile::Symbol &sym : obj->symbols()) {
// Only symbols that are not undefined are registered as lazy.
if (!sym.isUndefined())
symtab->addLazyObject(this, sym.getName());
// Native object file.
std::unique_ptr<Binary> coffObjPtr = CHECK(createBinary(mb), this);
COFFObjectFile *coffObj = cast<COFFObjectFile>(coffObjPtr.get());
uint32_t numSymbols = coffObj->getNumberOfSymbols();
for (uint32_t i = 0; i < numSymbols; ++i) {
COFFSymbolRef coffSym = check(coffObj->getSymbol(i));
if (coffSym.isUndefined() || !coffSym.isExternal() ||
StringRef name = check(coffObj->getSymbolName(coffSym));
if (coffSym.isAbsolute() && ignoredSymbolName(name))
symtab->addLazyObject(this, name);
// Advance past the auxiliary records that belong to this symbol.
i += coffSym.getNumberOfAuxSymbols();
// Parses this file's memory buffer as a COFF object. Reports a fatal error
// ("<file> is not a COFF file") when the created binary is not a
// COFFObjectFile.
// NOTE(review): the body of the success branch and the calls that read the
// section and symbol tables are not visible in this copy.
void ObjFile::parse() {
// Parse a memory buffer as a COFF file.
std::unique_ptr<Binary> bin = CHECK(createBinary(mb), this);
if (auto *obj = dyn_cast<COFFObjectFile>(bin.get())) {
} else {
fatal(toString(this) + " is not a COFF file");
// Read section and symbol tables.
// Returns the section header for section number i as reported by
// coffObj->getSection. If that lookup fails, a fatal error is reported that
// includes the section number and the underlying error text.
const coff_section *ObjFile::getSection(uint32_t i) {
auto sec = coffObj->getSection(i);
if (!sec)
fatal("getSection failed: #" + Twine(i) + ": " + toString(sec.takeError()));
return *sec;
// We set SectionChunk pointers in the SparseChunks vector to this value
// temporarily to mark comdat sections as having an unknown resolution. As we
// walk the object file's symbol table, once we visit either a leader symbol or
// an associative section definition together with the parent comdat's leader,
// we set the pointer to either nullptr (to mark the section as discarded) or a
// valid SectionChunk for that section.
// The marker is the integer 1 reinterpreted as a pointer, which makes it
// distinct from nullptr; it is only meant to be compared against.
static SectionChunk *const pendingComdat = reinterpret_cast<SectionChunk *>(1);
// Populates sparseChunks with one slot per section of the object file.
// COMDAT sections (IMAGE_SCN_LNK_COMDAT set) are marked pendingComdat; other
// sections are read with readSection using no aux definition and an empty
// leader name.
// NOTE(review): the `else` separating the two assignments in the loop is not
// visible in this copy.
void ObjFile::initializeChunks() {
uint32_t numSections = coffObj->getNumberOfSections();
// One extra slot so the vector can be indexed directly by section number;
// the loop below starts at 1 and never fills index 0.
sparseChunks.resize(numSections + 1);
for (uint32_t i = 1; i < numSections + 1; ++i) {
const coff_section *sec = getSection(i);
if (sec->Characteristics & IMAGE_SCN_LNK_COMDAT)
sparseChunks[i] = pendingComdat;
sparseChunks[i] = readSection(i, nullptr, "");
// Creates the SectionChunk for one section of this object file, or returns
// nullptr when the section does not become a chunk.
//
// sectionNumber: number of the section to read (passed to getSection).
// def:           optional aux section definition; when non-null its CheckSum
//                is copied into the new chunk.
// leaderName:    name of the comdat leader symbol (callers in this file pass
//                "" when there is none); only consulted by the string
//                tail-merging test on ".rdata" sections.
//
// nullptr is returned for ".drectve" (its contents are stored in
// `directives`), for ".llvm_addrsig" (the header is stored in `addrsigSec`),
// for ".debug_*" sections when config->debug is off, and for sections flagged
// IMAGE_SCN_LNK_REMOVE.
// NOTE(review): several statements are not visible in this copy -- the `else`
// before the getSectionName fatal() and the tail of that call, the operand of
// the cast on the `directives` line, and the bodies of the name-based
// if/else-if chain near the end.
SectionChunk *ObjFile::readSection(uint32_t sectionNumber,
const coff_aux_section_definition *def,
StringRef leaderName) {
const coff_section *sec = getSection(sectionNumber);
StringRef name;
if (Expected<StringRef> e = coffObj->getSectionName(sec))
name = *e;
fatal("getSectionName failed: #" + Twine(sectionNumber) + ": " +
if (name == ".drectve") {
ArrayRef<uint8_t> data;
cantFail(coffObj->getSectionContents(sec, data));
// NOTE(review): the cast below has no operand in this copy; presumably it
// is the pointer to the bytes of `data` -- confirm.
directives = StringRef((const char *), data.size());
return nullptr;
if (name == ".llvm_addrsig") {
addrsigSec = sec;
return nullptr;
// Object files may have DWARF debug info or MS CodeView debug info
// (or both).
// DWARF sections don't need any special handling from the perspective
// of the linker; they are just a data section containing relocations.
// We can just link them to complete debug info.
// CodeView needs linker support. We need to interpret debug info,
// and then write it to a separate .pdb file.
// Ignore DWARF debug info unless /debug is given.
if (!config->debug && name.startswith(".debug_"))
return nullptr;
if (sec->Characteristics & llvm::COFF::IMAGE_SCN_LNK_REMOVE)
return nullptr;
auto *c = make<SectionChunk>(this, sec);
if (def)
c->checksum = def->CheckSum;
// CodeView sections are stored to a different vector because they are not
// linked in the regular manner.
if (c->isCodeView())
else if (name == ".gfids$y")
else if (name == ".gljmp$y")
else if (name == ".sxdata")
else if (config->tailMerge && sec->NumberOfRelocations == 0 &&
name == ".rdata" && leaderName.startswith("??_C@"))
// COFF sections that look like string literal sections (i.e. no
// relocations, in .rdata, leader symbol name matches the MSVC name mangling
// for string literals) are subject to string tail merging.
else if (name == ".rsrc" || name.startswith(".rsrc$"))
return c;
// Appends every entry of resourceChunks to the end of chunks.
void ObjFile::includeResourceChunks() {
chunks.insert(chunks.end(), resourceChunks.begin(), resourceChunks.end());
// Convenience overload: takes the parent section index from
// def->getNumber(sym.isBigObj()) and forwards to the three-argument overload.
void ObjFile::readAssociativeDefinition(
COFFSymbolRef sym, const coff_aux_section_definition *def) {
readAssociativeDefinition(sym, def, def->getNumber(sym.isBigObj()));
// Resolves an associative comdat section against its parent section.
//
// sym:         symbol whose section (sym.getSectionNumber()) is being read.
// def:         aux section definition forwarded to readSection.
// parentIndex: index into sparseChunks of the section this one is associated
//              with.
//
// If the parent has a chunk, this section is read with readSection and stored
// in sparseChunks; if the parent is null, this section's slot is set to
// nullptr as well. A parent that is still pendingComdat is the invalid case
// the `diag` lambda reports.
// NOTE(review): the closing of the lambda, the statements inside the
// pendingComdat branch and the body of `if (c)` are not visible in this copy.
void ObjFile::readAssociativeDefinition(COFFSymbolRef sym,
const coff_aux_section_definition *def,
uint32_t parentIndex) {
SectionChunk *parent = sparseChunks[parentIndex];
int32_t sectionNumber = sym.getSectionNumber();
// Builds and emits the "invalid reference" error naming both this symbol's
// section and the parent section; the parent name is left empty if it cannot
// be read.
auto diag = [&]() {
StringRef name = check(coffObj->getSymbolName(sym));
StringRef parentName;
const coff_section *parentSec = getSection(parentIndex);
if (Expected<StringRef> e = coffObj->getSectionName(parentSec))
parentName = *e;
error(toString(this) + ": associative comdat " + name + " (sec " +
Twine(sectionNumber) + ") has invalid reference to section " +
parentName + " (sec " + Twine(parentIndex) + ")");
if (parent == pendingComdat) {
// This can happen if an associative comdat refers to another associative
// comdat that appears after it (invalid per COFF spec) or to a section
// without any symbols.
// Check whether the parent is prevailing. If it is, so are we, and we read
// the section; otherwise mark it as discarded.
if (parent) {
SectionChunk *c = readSection(sectionNumber, def, "");
sparseChunks[sectionNumber] = c;
if (c) {
} else {
sparseChunks[sectionNumber] = nullptr;
// Records, for a prevailing comdat symbol whose chunk is executable, a mapping
// from a name to the symbol's section number in `prevailingSectionMap`.
// NOTE(review): this block is a diff hunk; lines starting with "- " are the
// removed implementation (keyed by symbol name) and lines starting with "+ "
// are the added one (keyed by the part of the section name after the first
// '$'). Closing braces are not visible in this excerpt.
void ObjFile::recordPrevailingSymbolForMingw(
COFFSymbolRef sym, DenseMap<StringRef, uint32_t> &prevailingSectionMap) {
// For comdat symbols in executable sections, where this is the copy
// of the section chunk we actually include instead of discarding it,
// add the symbol to a map to allow using it for implicitly
// associating .[px]data$<func> sections to it.
+ // Use the suffix from the .text$<func> instead of the leader symbol
+ // name, for cases where the names differ (i386 mangling/decorations,
+ // cases where the leader is a weak symbol named .weak.func.default*).
int32_t sectionNumber = sym.getSectionNumber();
SectionChunk *sc = sparseChunks[sectionNumber];
if (sc && sc->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE) {
- StringRef name;
- name = check(coffObj->getSymbolName(sym));
- if (getMachineType() == I386)
- name.consume_front("_");
+ StringRef name = sc->getSectionName().split('$').second;
prevailingSectionMap[name] = sectionNumber;
// For MinGW, symbols named .pdata$<func>, .xdata$<func> or .eh_frame$<func>
// are treated as implicitly associative to <func>: if <func> has an entry in
// prevailingSectionMap, the section is read as associated with it.
void ObjFile::maybeAssociateSEHForMingw(
    COFFSymbolRef sym, const coff_aux_section_definition *def,
    const DenseMap<StringRef, uint32_t> &prevailingSectionMap) {
  StringRef funcName = check(coffObj->getSymbolName(sym));

  // consume_front() strips the matched prefix, leaving only <func>.
  const bool hasUnwindPrefix = funcName.consume_front(".pdata$") ||
                               funcName.consume_front(".xdata$") ||
                               funcName.consume_front(".eh_frame$");
  if (!hasUnwindPrefix)
    return;

  auto parentIt = prevailingSectionMap.find(funcName);
  if (parentIt == prevailingSectionMap.end())
    return;
  readAssociativeDefinition(sym, def, parentIt->second);
}
// Creates the Symbol for a regular (section-defined) COFF symbol.
// External symbols go through the symbol table: addRegular() when the
// section chunk is present, otherwise addUndefined() (or nullptr for MinGW
// ".weak.*" names). Non-external symbols with a chunk become an unnamed
// DefinedRegular; without a chunk the result is nullptr.
// NOTE(review): the addRegular() call below is cut off after `sc,` and the
// closing braces are not visible in this excerpt - confirm against the full
// file.
Symbol *ObjFile::createRegular(COFFSymbolRef sym) {
SectionChunk *sc = sparseChunks[sym.getSectionNumber()];
if (sym.isExternal()) {
StringRef name = check(coffObj->getSymbolName(sym));
if (sc)
return symtab->addRegular(this, name, sym.getGeneric(), sc,
// For MinGW symbols named .weak.* that point to a discarded section,
// don't create an Undefined symbol. If nothing ever refers to the symbol,
// everything should be fine. If something actually refers to the symbol
// (e.g. the undefined weak alias), linking will fail due to undefined
// references at the end.
if (config->mingw && name.startswith(".weak."))
return nullptr;
return symtab->addUndefined(name, this, false);
if (sc)
return make<DefinedRegular>(this, /*Name*/ "", /*IsCOMDAT*/ false,
/*IsExternal*/ false, sym.getGeneric(), sc);
return nullptr;
// Walks the COFF symbol table and fills `symbols`:
//  - undefined and weak-external symbols are created via createUndefined()
//    (weak externals are also queued in weakAliases with their TagIndex);
//  - other symbols go through createDefined();
//  - symbols listed in pendingIndexes are handled in a second loop;
//  - finally each queued weak alias is bound with checkAndSetWeakAlias().
// NOTE(review): this excerpt has lost its closing braces and several
// statements (e.g. nothing visible ever adds to pendingIndexes, and the
// statement after the final comment is missing) - confirm against the full
// file.
void ObjFile::initializeSymbols() {
uint32_t numSymbols = coffObj->getNumberOfSymbols();
SmallVector<std::pair<Symbol *, uint32_t>, 8> weakAliases;
std::vector<uint32_t> pendingIndexes;
DenseMap<StringRef, uint32_t> prevailingSectionMap;
// One slot per section plus one extra; indexed by section number in
// createDefined().
std::vector<const coff_aux_section_definition *> comdatDefs(
coffObj->getNumberOfSections() + 1);
for (uint32_t i = 0; i < numSymbols; ++i) {
COFFSymbolRef coffSym = check(coffObj->getSymbol(i));
// Passed to createDefined() as its `prevailing` out-parameter.
bool prevailingComdat;
if (coffSym.isUndefined()) {
symbols[i] = createUndefined(coffSym);
} else if (coffSym.isWeakExternal()) {
symbols[i] = createUndefined(coffSym);
uint32_t tagIndex = coffSym.getAux<coff_aux_weak_external>()->TagIndex;
weakAliases.emplace_back(symbols[i], tagIndex);
} else if (Optional<Symbol *> optSym =
createDefined(coffSym, comdatDefs, prevailingComdat)) {
symbols[i] = *optSym;
if (config->mingw && prevailingComdat)
recordPrevailingSymbolForMingw(coffSym, prevailingSectionMap);
} else {
// createDefined() returns None if a symbol belongs to a section that
// was pending at the point when the symbol was read. This can happen in
// two cases:
// 1) section definition symbol for a comdat leader;
// 2) symbol belongs to a comdat section associated with another section.
// In both of these cases, we can expect the section to be resolved by
// the time we finish visiting the remaining symbols in the symbol
// table. So we postpone the handling of this symbol until that time.
// Skip over the auxiliary records that follow this symbol.
i += coffSym.getNumberOfAuxSymbols();
for (uint32_t i : pendingIndexes) {
COFFSymbolRef sym = check(coffObj->getSymbol(i));
if (const coff_aux_section_definition *def = sym.getSectionDefinition()) {
// NOTE(review): the condition that the `else if` below pairs with is not
// visible here.
readAssociativeDefinition(sym, def);
else if (config->mingw)
maybeAssociateSEHForMingw(sym, def, prevailingSectionMap);
if (sparseChunks[sym.getSectionNumber()] == pendingComdat) {
StringRef name = check(coffObj->getSymbolName(sym));
log("comdat section " + name +
" without leader and unassociated, discarding");
symbols[i] = createRegular(sym);
for (auto &kv : weakAliases) {
Symbol *sym = kv.first;
uint32_t idx = kv.second;
checkAndSetWeakAlias(symtab, this, sym, symbols[idx]);
// Free the memory used by sparseChunks now that symbol loading is finished.
// Registers `sym` in the symbol table as an undefined symbol, passing along
// whether it is a weak external.
Symbol *ObjFile::createUndefined(COFFSymbolRef sym) {
  StringRef symName = check(coffObj->getSymbolName(sym));
  const bool isWeak = sym.isWeakExternal();
  return symtab->addUndefined(symName, this, isWeak);
}
// Reconciles the comdat selection type of `sym` with an already existing
// comdat `leader`. May rewrite `selection`, may set `prevailing` to true
// (when a larger comdat replaces the leader), and reports duplicates through
// symtab->reportDuplicate().
// NOTE(review): this excerpt has lost its closing braces, the `case` labels
// of the switch, the declaration of `leaderSelection`, and several short
// statements (e.g. the body of `if (prevailing)`); a few multi-line
// expressions are also cut off - confirm against the full file before
// relying on the statement order shown here.
void ObjFile::handleComdatSelection(COFFSymbolRef sym, COMDATType &selection,
bool &prevailing, DefinedRegular *leader) {
if (prevailing)
// There's already an existing comdat for this symbol: `Leader`.
// Use the comdat's selection field to determine if the new
// symbol in `Sym` should be discarded, produce a duplicate symbol
// error, etc.
SectionChunk *leaderChunk = nullptr;
if (leader->data) {
leaderChunk = leader->getChunk();
leaderSelection = leaderChunk->selection;
} else {
// FIXME: comdats from LTO files don't know their selection; treat them
// as "any".
selection = leaderSelection;
if ((selection == IMAGE_COMDAT_SELECT_ANY &&
leaderSelection == IMAGE_COMDAT_SELECT_LARGEST) ||
leaderSelection == IMAGE_COMDAT_SELECT_ANY)) {
// cl.exe picks "any" for vftables when building with /GR- and
// "largest" when building with /GR. To be able to link object files
// compiled with each flag, "any" and "largest" are merged as "largest".
leaderSelection = selection = IMAGE_COMDAT_SELECT_LARGEST;
// GCCs __declspec(selectany) doesn't actually pick "any" but "same size as".
// Clang on the other hand picks "any". To be able to link two object files
// with a __declspec(selectany) declaration, one compiled with gcc and the
// other with clang, we merge them as proper "same size as"
if (config->mingw && ((selection == IMAGE_COMDAT_SELECT_ANY &&
leaderSelection == IMAGE_COMDAT_SELECT_SAME_SIZE) ||
leaderSelection == IMAGE_COMDAT_SELECT_ANY))) {
leaderSelection = selection = IMAGE_COMDAT_SELECT_SAME_SIZE;
// Other than that, comdat selections must match. This is a bit more
// strict than link.exe which allows merging "any" and "largest" if "any"
// is the first symbol the linker sees, and it allows merging "largest"
// with everything (!) if "largest" is the first symbol the linker sees.
// Making this symmetric independent of which selection is seen first
// seems better though.
// (This behavior matches ModuleLinker::getComdatResult().)
if (selection != leaderSelection) {
log(("conflicting comdat type for " + toString(*leader) + ": " +
Twine((int)leaderSelection) + " in " + toString(leader->getFile()) +
" and " + Twine((int)selection) + " in " + toString(this))
symtab->reportDuplicate(leader, this);
// NOTE(review): the case labels for the branches below are not visible.
switch (selection) {
symtab->reportDuplicate(leader, this);
// Nothing to do.
// Sizes must match, otherwise the symbol is reported as a duplicate.
if (leaderChunk->getSize() != getSection(sym)->SizeOfRawData)
symtab->reportDuplicate(leader, this);
// Temporary chunk used only to compare contents with the leader's chunk.
SectionChunk newChunk(this, getSection(sym));
// link.exe only compares section contents here and doesn't complain
// if the two comdat sections have e.g. different alignment.
// Match that.
if (leaderChunk->getContents() != newChunk.getContents())
symtab->reportDuplicate(leader, this, &newChunk, sym.getValue());
// createDefined() is never called for IMAGE_COMDAT_SELECT_ASSOCIATIVE.
// (This means lld-link doesn't produce duplicate symbol errors for
// associative comdats while link.exe does, but associative comdats
// are never extern in practice.)
llvm_unreachable("createDefined not called for associative comdats");
if (leaderChunk->getSize() < getSection(sym)->SizeOfRawData) {
// Replace the existing comdat symbol with the new one.
StringRef name = check(coffObj->getSymbolName(sym));
// FIXME: This is incorrect: With /opt:noref, the previous sections
// make it into the final executable as well. Correct handling would
// be to undo reading of the whole old section that's being replaced,
// or doing one pass that determines what the final largest comdat
// is for all IMAGE_COMDAT_SELECT_LARGEST comdats and then reading
// only the largest one.
replaceSymbol<DefinedRegular>(leader, this, name, /*IsCOMDAT*/ true,
/*IsExternal*/ true, sym.getGeneric(),
prevailing = true;
llvm_unreachable("should have been rejected earlier");
// Creates the Symbol for a defined COFF symbol.
//   sym        - the symbol table entry being processed.
//   comdatDefs - per-section slot remembering the auxiliary section
//                definition of a pending comdat until its leader is seen.
//   prevailing - out-parameter; reset to false on entry and set when this
//                file's comdat copy is the one that is kept.
// Returns None when the symbol's section is still pending (see the comdat
// notes below), nullptr for symbols that produce no Symbol, and the created
// Symbol otherwise.
// NOTE(review): this excerpt has lost its closing braces, and the addCommon()
// call is cut off after `sym.getGeneric(),` - confirm against the full file.
Optional<Symbol *> ObjFile::createDefined(
COFFSymbolRef sym,
std::vector<const coff_aux_section_definition *> &comdatDefs,
bool &prevailing) {
prevailing = false;
// Looks the name up on demand, so paths that never need it skip the lookup.
auto getName = [&]() { return check(coffObj->getSymbolName(sym)); };
if (sym.isCommon()) {
auto *c = make<CommonChunk>(sym);
return symtab->addCommon(this, getName(), sym.getValue(), sym.getGeneric(),
if (sym.isAbsolute()) {
StringRef name = getName();
// The value of the "@feat.00" symbol is kept as this file's feature flags.
if (name == "@feat.00")
feat00Flags = sym.getValue();
// Skip special symbols.
if (ignoredSymbolName(name))
return nullptr;
if (sym.isExternal())
return symtab->addAbsolute(name, sym);
return make<DefinedAbsolute>(name, sym);
int32_t sectionNumber = sym.getSectionNumber();
if (sectionNumber == llvm::COFF::IMAGE_SYM_DEBUG)
return nullptr;
if (llvm::COFF::isReservedSectionNumber(sectionNumber))
fatal(toString(this) + ": " + getName() +
" should not refer to special section " + Twine(sectionNumber));
// Guards the sparseChunks[sectionNumber] accesses below.
if ((uint32_t)sectionNumber >= sparseChunks.size())
fatal(toString(this) + ": " + getName() +
" should not refer to non-existent section " + Twine(sectionNumber));
// Comdat handling.
// A comdat symbol consists of two symbol table entries.
// The first symbol entry has the name of the section (e.g. .text), fixed
// values for the other fields, and one auxiliary record.
// The second symbol entry has the name of the comdat symbol, called the
// "comdat leader".
// When this function is called for the first symbol entry of a comdat,
// it sets comdatDefs and returns None, and when it's called for the second
// symbol entry it reads comdatDefs and then sets it back to nullptr.
// Handle comdat leader.
if (const coff_aux_section_definition *def = comdatDefs[sectionNumber]) {
comdatDefs[sectionNumber] = nullptr;
DefinedRegular *leader;
if (sym.isExternal()) {
std::tie(leader, prevailing) =
symtab->addComdat(this, getName(), sym.getGeneric());
} else {
// Non-external leaders get an unnamed symbol and always prevail.
leader = make<DefinedRegular>(this, /*Name*/ "", /*IsCOMDAT*/ false,
/*IsExternal*/ false, sym.getGeneric());
prevailing = true;
if (def->Selection < (int)IMAGE_COMDAT_SELECT_NODUPLICATES ||
// Intentionally ends at IMAGE_COMDAT_SELECT_LARGEST: link.exe
// doesn't understand IMAGE_COMDAT_SELECT_NEWEST either.
def->Selection > (int)IMAGE_COMDAT_SELECT_LARGEST) {
fatal("unknown comdat type " + std::to_string((int)def->Selection) +
" for " + getName() + " in " + toString(this));
COMDATType selection = (COMDATType)def->Selection;
if (leader->isCOMDAT)
handleComdatSelection(sym, selection, prevailing, leader);
if (prevailing) {
// Read the section and link the chunk and the leader to each other.
SectionChunk *c = readSection(sectionNumber, def, getName());
sparseChunks[sectionNumber] = c;
c->sym = cast<DefinedRegular>(leader);
c->selection = selection;
cast<DefinedRegular>(leader)->data = &c->repl;
} else {
sparseChunks[sectionNumber] = nullptr;
return leader;
// Prepare to handle the comdat leader symbol by setting the section's
// ComdatDefs pointer if we encounter a non-associative comdat.
if (sparseChunks[sectionNumber] == pendingComdat) {
if (const coff_aux_section_definition *def = sym.getSectionDefinition()) {
comdatDefs[sectionNumber] = def;
return None;
return createRegular(sym);
// Returns the machine type reported by the underlying COFF object.
// NOTE(review): the return for the case where coffObj is null and the
// closing brace are not visible in this excerpt - confirm against the full
// file.
MachineTypes ObjFile::getMachineType() {
if (coffObj)
return static_cast<MachineTypes>(coffObj->getMachine());
// Looks up the debug chunk named `secName` among debugChunks and returns the
// result of its consumeDebugMagic(); returns an empty array when no such
// chunk exists.
ArrayRef<uint8_t> ObjFile::getDebugSection(StringRef secName) {
  SectionChunk *chunk = SectionChunk::findByName(debugChunks, secName);
  if (!chunk)
    return {};
  return chunk->consumeDebugMagic();
}
// OBJ files systematically store critical information in a .debug$S stream,
// even if the TU was compiled with no debug info. At least two records are
// always there. S_OBJNAME stores a 32-bit signature, which is loaded into the
// PCHSignature member. S_COMPILE3 stores compile-time cmd-line flags. This is
// currently used to initialize the hotPatchable member.
// NOTE(review): this excerpt has lost its closing braces and several
// statements (bodies of the early-exit `if`s, the deserialization call that
// initializes `cs`, and the tail of the ObjNameSym call) - confirm against
// the full file.
void ObjFile::initializeFlags() {
ArrayRef<uint8_t> data = getDebugSection(".debug$S");
if (data.empty())
DebugSubsectionArray subsections;
BinaryStreamReader reader(data, support::little);
ExitOnError exitOnErr;
exitOnErr(reader.readArray(subsections, data.size()));
for (const DebugSubsectionRecord &ss : subsections) {
// Only Symbols subsections are of interest here.
if (ss.kind() != DebugSubsectionKind::Symbols)
unsigned offset = 0;
// Only parse the first two records. We are only looking for S_OBJNAME
// and S_COMPILE3, and they usually appear at the beginning of the
// stream.
for (unsigned i = 0; i < 2; ++i) {
Expected<CVSymbol> sym = readSymbolFromStream(ss.getRecordData(), offset);
if (!sym) {
if (sym->kind() == SymbolKind::S_COMPILE3) {
auto cs =
hotPatchable =
(cs.Flags & CompileSym3Flags::HotPatch) != CompileSym3Flags::None;
if (sym->kind() == SymbolKind::S_OBJNAME) {
auto objName = cantFail(SymbolDeserializer::deserializeAs<ObjNameSym>(
pchSignature = objName.Signature;
// Advance to the next record in the subsection.
offset += sym->length();
// Depending on the compilation flags, OBJs can refer to external files,
// necessary to merge this OBJ into the final PDB. We currently support two
// types of external files: Precomp/PCH OBJs, when compiling with /Yc and /Yu.
// And PDB type servers, when compiling with /Zi. This function extracts these
// dependencies and makes them available as a TpiSource interface (see
// DebugTypes.h). Both cases only happen with cl.exe: clang-cl produces regular
// output even with /Yc and /Yu and with /Zi.
// NOTE(review): this excerpt has lost its closing braces, the bodies of the
// early-exit `if`s, the `else` that presumably separates the .debug$P and
// .debug$T lookups, and the deserialization calls inside cantFail(...) -
// confirm against the full file.
void ObjFile::initializeDependencies() {
if (!config->debug)
bool isPCH = false;
// A non-empty .debug$P section marks this object as a PCH object.
ArrayRef<uint8_t> data = getDebugSection(".debug$P");
if (!data.empty())
isPCH = true;
data = getDebugSection(".debug$T");
if (data.empty())
// Get the first type record. It will indicate if this object uses a type
// server (/Zi) or a PCH file (/Yu).
CVTypeArray types;
BinaryStreamReader reader(data, support::little);
cantFail(reader.readArray(types, reader.getLength()));
CVTypeArray::Iterator firstType = types.begin();
if (firstType == types.end())
// Remember the .debug$T or .debug$P section.
debugTypes = data;
// This object file is a PCH file that others will depend on.
if (isPCH) {
debugTypesObj = makePrecompSource(this);
// This object file was compiled with /Zi. Enqueue the PDB dependency.
if (firstType->kind() == LF_TYPESERVER2) {
TypeServer2Record ts = cantFail(
debugTypesObj = makeUseTypeServerSource(this, ts);
PDBInputFile::enqueue(ts.getName(), this);
// This object was compiled with /Yu. It uses types from another object file
// with a matching signature.
if (firstType->kind() == LF_PRECOMP) {
PrecompRecord precomp = cantFail(
debugTypesObj = makeUsePrecompSource(this, precomp);
// This is a plain old object file.
debugTypesObj = makeTpiSource(this);
// Make a PDB path assuming the PDB is in the same folder as the OBJ.
// The folder is taken from the file's parentName when that is non-empty,
// otherwise from the file's own name.
// NOTE(review): the start of the call that consumes the
// sys::path::filename(...) line below is not visible in this excerpt, nor is
// the closing brace - confirm against the full file.
static std::string getPdbBaseName(ObjFile *file, StringRef tSPath) {
StringRef localPath =
!file->parentName.empty() ? file->parentName : file->getName();
SmallString<128> path = sys::path::parent_path(localPath);
// Currently, type server PDBs are only created by MSVC cl, which only runs
// on Windows, so we can assume type server paths are Windows style.
sys::path::filename(tSPath, sys::path::Style::windows));
return std::string(path.str());
// The casing of the PDB path stamped in the OBJ can differ from the actual path
// on disk. With this, we ensure to always use lowercase as a key for the
// PDBInputFile::instances map, at least on Windows.
// When _WIN32 is defined the path is lower-cased; otherwise it is returned
// unchanged.
// NOTE(review): the closing `#endif` and closing brace are not visible in
// this excerpt - confirm against the full file.
static std::string normalizePdbPath(StringRef path) {
#if defined(_WIN32)
return path.lower();
#else // LINUX
return std::string(path);
// Returns the normalized path of the PDB if it can be found on disk, or None.
// The path recorded in the object is tried first; if it does not exist, a
// path built by getPdbBaseName() from the dependent file is tried instead.
static Optional<std::string> findPdbPath(StringRef pdbPath,
                                         ObjFile *dependentFile) {
  // Ensure the file exists before anything else. In some cases, if the path
  // points to a removable device, Driver::enqueuePath() would fail with an
  // error (EAGAIN, "resource unavailable try again") which we want to skip
  // silently.
  if (!llvm::sys::fs::exists(pdbPath)) {
    std::string fallback = getPdbBaseName(dependentFile, pdbPath);
    if (!llvm::sys::fs::exists(fallback))
      return None;
    return normalizePdbPath(fallback);
  }
  return normalizePdbPath(pdbPath);
}
// Constructs a PDB input file over buffer `m`, tagging the InputFile base
// with PDBKind.
PDBInputFile::PDBInputFile(MemoryBufferRef m) : InputFile(PDBKind, m) {}
// Defaulted destructor, defined out of line here.
PDBInputFile::~PDBInputFile() = default;
// Resolves `path` to an on-disk PDB path via findPdbPath() and returns the
// entry registered for it in PDBInputFile::instances. Returns nullptr when
// the path cannot be resolved or nothing is registered under it.
PDBInputFile *PDBInputFile::findFromRecordPath(StringRef path,
                                               ObjFile *fromFile) {
  auto resolved = findPdbPath(path.str(), fromFile);
  if (resolved) {
    auto entry = PDBInputFile::instances.find(*resolved);
    if (entry != PDBInputFile::instances.end())
      return entry->second;
  }
  return nullptr;
}
// Resolves `path` to an on-disk PDB path and reserves a nullptr entry for it
// in PDBInputFile::instances; if an entry already exists the PDB is already
// scheduled and nothing more is done.
// NOTE(review): the body of `if (!p)`, whatever follows the "already
// scheduled" check, and the closing brace are not visible in this excerpt -
// confirm against the full file.
void PDBInputFile::enqueue(StringRef path, ObjFile *fromFile) {
auto p = findPdbPath(path.str(), fromFile);
if (!p)
// emplace() reports through it.second whether a new entry was inserted.
auto it = PDBInputFile::instances.emplace(*p, nullptr);
if (!it.second)
return; // already scheduled for load
// Registers this file in PDBInputFile::instances under its buffer
// identifier, opens a PDB session over the buffer, and creates the
// type-server source for it.
// NOTE(review): the statement that initializes `loadErr` is cut off (only
// its last line is visible), and the body of `if (!expectedInfo)` and the
// closing braces are missing from this excerpt - confirm against the full
// file.
void PDBInputFile::parse() {
PDBInputFile::instances[mb.getBufferIdentifier().str()] = this;
std::unique_ptr<pdb::IPDBSession> thisSession;
MemoryBuffer::getMemBuffer(mb, false), thisSession));
if (*loadErr)
return; // fail silently at this point - the error will be handled later,
// when merging the debug type stream
// Transfer ownership of the session, downcast to NativeSession.
session.reset(static_cast<pdb::NativeSession *>(thisSession.release()));
pdb::PDBFile &pdbFile = session->getPDBFile();
auto expectedInfo = pdbFile.getPDBInfoStream();
// All PDB Files should have an Info stream.
if (!expectedInfo) {
debugTypesObj = makeTypeServerSource(this);
// Used only for DWARF debug info, which is not common (except in MinGW
// environments). This returns an optional pair of file name and line
// number for where the variable was defined.
// The DWARF cache is created on first use.
// NOTE(review): the body of `if (config->machine == I386)` is missing and
// the first argument of the final std::make_pair(...) is garbled in this
// excerpt; closing braces are missing too - confirm against the full file.
Optional<std::pair<StringRef, uint32_t>>
ObjFile::getVariableLocation(StringRef var) {
if (!dwarf) {
dwarf = make<DWARFCache>(DWARFContext::create(*getCOFFObj()));
if (!dwarf)
return None;
if (config->machine == I386)
Optional<std::pair<std::string, unsigned>> ret = dwarf->getVariableLoc(var);
if (!ret)
return None;
return std::make_pair(>first), ret->second);
// Used only for DWARF debug info, which is not common (except in MinGW
// environments). Returns the line info that the DWARF cache reports for
// `offset` within section `sectionIndex`; the cache is created on first use.
Optional<DILineInfo> ObjFile::getDILineInfo(uint32_t offset,
                                            uint32_t sectionIndex) {
  // Lazily build the DWARF cache from the underlying COFF object.
  if (!dwarf)
    dwarf = make<DWARFCache>(DWARFContext::create(*getCOFFObj()));
  if (!dwarf)
    return None;
  return dwarf->getDILineInfo(offset, sectionIndex);
}
// Drops the first character of `s` if that character occurs in `chars`;
// otherwise returns `s` unchanged. At most one character is removed.
static StringRef ltrim1(StringRef s, const char *chars) {
  if (s.empty() || !strchr(chars, s[0]))
    return s;
  return s.substr(1);
}
// Parses a short import library member: validates the header size, reads the
// symbol and DLL names that follow the header, derives the external name
// from the header's name type, and registers the __imp_ symbol (plus a data
// alias or a thunk depending on the import type).
// NOTE(review): this excerpt is garbled - the initializers of `name` and
// `impName` are cut off, the `case` labels of the switch and the body of
// `if (!impSym)` are missing, and closing braces are not visible. Confirm
// against the full file.
void ImportFile::parse() {
const char *buf = mb.getBufferStart();
const auto *hdr = reinterpret_cast<const coff_import_header *>(buf);
// Check if the total size is valid.
if (mb.getBufferSize() != sizeof(*hdr) + hdr->SizeOfData)
fatal("broken import library");
// Read names and create an __imp_ symbol.
StringRef name = + sizeof(*hdr)));
StringRef impName ="__imp_" + name);
// The DLL name starts right after the symbol name and its terminating byte.
const char *nameStart = buf + sizeof(coff_import_header) + name.size() + 1;
dllName = std::string(StringRef(nameStart));
StringRef extName;
switch (hdr->getNameType()) {
extName = "";
extName = name;
extName = ltrim1(name, "?@_");
extName = ltrim1(name, "?@_");
// Cut the name at the first '@'.
extName = extName.substr(0, extName.find('@'));
this->hdr = hdr;
externalName = extName;
impSym = symtab->addImportData(impName, this);
// If this was a duplicate, we logged an error but may continue;
// in this case, impSym is nullptr.
if (!impSym)
if (hdr->getType() == llvm::COFF::IMPORT_CONST)
static_cast<void>(symtab->addImportData(name, this));
// If type is function, we need to create a thunk which jump to an
// address pointed by the __imp_ symbol. (This allows you to call
// DLL functions just like regular non-DLL functions.)
if (hdr->getType() == llvm::COFF::IMPORT_CODE)
thunkSym = symtab->addImportThunk(
name, cast_or_null<DefinedImportData>(impSym), hdr->Machine);
// Convenience constructor: delegates to the four-argument constructor with
// an empty symbol list.
BitcodeFile::BitcodeFile(MemoryBufferRef mb, StringRef archiveName,
uint64_t offsetInArchive)
: BitcodeFile(mb, archiveName, offsetInArchive, {}) {}
// Main constructor: takes ownership of `symbols` and creates the
// lto::InputFile for the buffer under a name derived from the buffer
// identifier and the archive it came from.
// NOTE(review): the MemoryBufferRef name expression below is garbled (its
// condition and tail are missing) and closing braces are not visible in this
// excerpt - confirm against the full file.
BitcodeFile::BitcodeFile(MemoryBufferRef mb, StringRef archiveName,
uint64_t offsetInArchive,
std::vector<Symbol *> &&symbols)
: InputFile(BitcodeKind, mb), symbols(std::move(symbols)) {
std::string path = mb.getBufferIdentifier().str();
if (config->thinLTOIndexOnly)
path = replaceThinLTOSuffix(mb.getBufferIdentifier());
// ThinLTO assumes that all MemoryBufferRefs given to it have a unique
// name. If two archives define two members with the same name, this
// causes a collision which result in only one of the objects being taken
// into consideration at LTO time (which very likely causes undefined
// symbols later in the link stage). So we append file offset to make
// filename unique.
MemoryBufferRef mbref(
mb.getBuffer(), ? path
: archiveName + sys::path::filename(path) +
obj = check(lto::InputFile::create(mbref));
// Defaulted destructor, defined out of line here.
BitcodeFile::~BitcodeFile() = default;
// Registers the symbols of a bitcode (LTO) input with the symbol table:
// one comdat entry per comdat-table slot, then one Symbol per object symbol
// (undefined, common, weak external with fallback alias, comdat member, or
// regular). Finally copies the COFF linker options into `directives`.
// NOTE(review): this excerpt is garbled - the addComdat() argument, the
// initializer of `symName` and the addUndefined() call for the weak alias
// are cut off; the `else` before the second comdat-branch addUndefined(),
// the body of `if (objSym.isUsed())` and the closing braces are missing.
// Confirm against the full file.
void BitcodeFile::parse() {
// Per comdat-table slot: the pair returned by symtab->addComdat().
std::vector<std::pair<Symbol *, bool>> comdat(obj->getComdatTable().size());
for (size_t i = 0; i != obj->getComdatTable().size(); ++i)
// FIXME: lto::InputFile doesn't keep enough data to do correct comdat
// selection handling.
comdat[i] = symtab->addComdat(this,>getComdatTable()[i]));
for (const lto::InputFile::Symbol &objSym : obj->symbols()) {
StringRef symName =;
int comdatIndex = objSym.getComdatIndex();
Symbol *sym;
if (objSym.isUndefined()) {
sym = symtab->addUndefined(symName, this, false);
} else if (objSym.isCommon()) {
sym = symtab->addCommon(this, symName, objSym.getCommonSize());
} else if (objSym.isWeak() && objSym.isIndirect()) {
// Weak external.
sym = symtab->addUndefined(symName, this, true);
std::string fallback = std::string(objSym.getCOFFWeakExternalFallback());
Symbol *alias = symtab->addUndefined(;
checkAndSetWeakAlias(symtab, this, sym, alias);
} else if (comdatIndex != -1) {
// The symbol named like the comdat itself reuses the comdat's symbol.
if (symName == obj->getComdatTable()[comdatIndex])
sym = comdat[comdatIndex].first;
else if (comdat[comdatIndex].second)
sym = symtab->addRegular(this, symName);
sym = symtab->addUndefined(symName, this, false);
} else {
sym = symtab->addRegular(this, symName);
if (objSym.isUsed())
directives = obj->getCOFFLinkerOpts();
// Maps the architecture of the bitcode module's target triple to a COFF
// machine type: x86_64 -> AMD64, x86 -> I386, arm -> ARMNT,
// aarch64 -> ARM64.
// NOTE(review): no default case or closing braces are visible in this
// excerpt, so the result for other architectures cannot be determined here -
// confirm against the full file.
MachineTypes BitcodeFile::getMachineType() {
switch (Triple(obj->getTargetTriple()).getArch()) {
case Triple::x86_64:
return AMD64;
case Triple::x86:
return I386;
case Triple::arm:
return ARMNT;
case Triple::aarch64:
return ARM64;
// If `path` ends with the configured ThinLTO object suffix
// (thinLTOObjectSuffixReplace.first), returns the path with that suffix
// replaced by thinLTOObjectSuffixReplace.second; otherwise returns a copy of
// `path` unchanged.
std::string lld::coff::replaceThinLTOSuffix(StringRef path) {
  StringRef oldSuffix = config->thinLTOObjectSuffixReplace.first;
  StringRef newSuffix = config->thinLTOObjectSuffixReplace.second;

  // consume_back() strips the suffix from `path` only when it matches.
  if (!path.consume_back(oldSuffix))
    return std::string(path);
  return (path + newSuffix).str();
}
diff --git a/lld/COFF/MinGW.cpp b/lld/COFF/MinGW.cpp
index bded985f04d0..e24cdca6ee34 100644
--- a/lld/COFF/MinGW.cpp
+++ b/lld/COFF/MinGW.cpp
@@ -1,166 +1,175 @@
//===- MinGW.cpp ----------------------------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "MinGW.h"
#include "SymbolTable.h"
#include "lld/Common/ErrorHandler.h"
#include "llvm/Object/COFF.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::COFF;
using namespace lld;
using namespace lld::coff;
// Populates the exclusion lists consulted by shouldExport(): library names
// (excludeLibs), object file names (excludeObjects), symbol prefixes and
// suffixes, and exact symbol names (excludeSymbols, chosen per machine type).
// NOTE(review): this block is a diff hunk - lines starting with "+ " are
// additions (clang_rt profile libraries and __prof*_ symbol prefixes). Most
// pre-existing list entries and all closing braces are not visible in this
// excerpt; confirm against the full file.
AutoExporter::AutoExporter() {
excludeLibs = {
+ "libclang_rt.profile",
+ "libclang_rt.profile-aarch64",
+ "libclang_rt.profile-arm",
+ "libclang_rt.profile-i386",
+ "libclang_rt.profile-x86_64",
excludeObjects = {
"crt0.o", "crt1.o", "crt1u.o", "crt2.o", "crt2u.o", "dllcrt1.o",
"dllcrt2.o", "gcrt0.o", "gcrt1.o", "gcrt2.o", "crtbegin.o", "crtend.o",
excludeSymbolPrefixes = {
// Import symbols
// Extra import symbols from GNU import libraries
// C++ symbols
// Artificial symbols such as .refptr
+ // profile generate symbols
+ "__profc_",
+ "__profd_",
+ "__profvp_",
excludeSymbolSuffixes = {
if (config->machine == I386) {
excludeSymbols = {
// These are the MinGW names that differ from the standard
// ones (lacking an extra underscore).
} else {
excludeSymbols = {
// These are the MinGW names that differ from the standard
// ones (lacking an extra underscore).
// Derives a library name from `path`: the file-name component with
// everything from the last '.' onward removed.
// NOTE(review): no visible statement uses `libName` after it is computed;
// the consuming statement and the closing brace appear to be missing from
// this excerpt - confirm against the full file.
void AutoExporter::addWholeArchive(StringRef path) {
StringRef libName = sys::path::filename(path);
// Drop the file extension, to match the processing below.
libName = libName.substr(0, libName.rfind('.'));
// Decides whether `sym` should be auto-exported. Returns false for null,
// dead or chunk-less symbols, for kinds other than DefinedRegular and
// DefinedCommon, for names matched by the exclusion lists, for names that
// have an "__imp_" counterpart in the symbol table, and for symbols without
// a file. Otherwise the answer depends on whether the containing library
// (or, if there is none, the object file) is on an exclusion list.
bool AutoExporter::shouldExport(Defined *sym) const {
  if (!sym || !sym->isLive() || !sym->getChunk())
    return false;

  // Only allow the symbol kinds that make sense to export; in particular,
  // disallow import symbols.
  const bool exportableKind =
      isa<DefinedRegular>(sym) || isa<DefinedCommon>(sym);
  if (!exportableKind)
    return false;

  StringRef symName = sym->getName();
  if (excludeSymbols.count(symName))
    return false;

  for (StringRef prefix : excludeSymbolPrefixes.keys()) {
    if (symName.startswith(prefix))
      return false;
  }
  for (StringRef suffix : excludeSymbolSuffixes.keys()) {
    if (symName.endswith(suffix))
      return false;
  }

  // If a corresponding __imp_ symbol is present in the symbol table, don't
  // export this one.
  if (symtab->find(("__imp_" + symName).str()))
    return false;

  // Check that file is non-null before dereferencing it, symbols not
  // originating in regular object files probably shouldn't be exported.
  auto *file = sym->getFile();
  if (!file)
    return false;

  // Library name without its file extension; empty when the symbol's file
  // has no parent archive name.
  StringRef libName = sys::path::filename(file->parentName);
  libName = libName.substr(0, libName.rfind('.'));
  if (libName.empty()) {
    StringRef fileName = sys::path::filename(file->getName());
    return !excludeObjects.count(fileName);
  }
  return !excludeLibs.count(libName);
}
// Writes a module-definition file named `name`: an "EXPORTS" header followed
// by one line per configured export with its name and "@ordinal". Exports
// whose symbol is a Defined with a chunk that lacks IMAGE_SCN_MEM_EXECUTE
// are additionally tagged " DATA". Calls fatal() if the file cannot be
// opened.
void lld::coff::writeDefFile(StringRef name) {
  std::error_code ec;
  raw_fd_ostream os(name, ec, sys::fs::OF_None);
  if (ec)
    fatal("cannot open " + name + ": " + ec.message());

  os << "EXPORTS\n";
  for (Export &e : config->exports) {
    os << " " << e.exportName << " "
       << "@" << e.ordinal;

    // Anything that is not backed by an executable section counts as data.
    auto *def = dyn_cast_or_null<Defined>(e.sym);
    auto *chunk = def ? def->getChunk() : nullptr;
    if (chunk && !(chunk->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE))
      os << " DATA";

    os << "\n";
  }
}
diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td
index 212879e1d60b..087d53b5d2dd 100644
--- a/lld/COFF/Options.td
+++ b/lld/COFF/Options.td
@@ -1,265 +1,266 @@
include "llvm/Option/"
// link.exe accepts options starting with either a dash or a slash.
// All helper classes below accept the prefixes "/", "-", "/?" and "-?".
// NOTE(review): the closing braces of the two multiclasses are not visible
// in this excerpt - confirm against the full file.
// Flag that takes no arguments.
class F<string name> : Flag<["/", "-", "/?", "-?"], name>;
// Flag that takes one argument after ":".
// `help` becomes the option's help text.
class P<string name, string help> :
Joined<["/", "-", "/?", "-?"], name#":">, HelpText<help>;
// Boolean flag which can be suffixed by ":no". Using it unsuffixed turns the
// flag on and using it suffixed by ":no" turns it off.
// Defines two records: the plain flag (help_on) and a "_no" variant spelled
// name:no (help_off).
multiclass B<string name, string help_on, string help_off> {
def "" : F<name>, HelpText<help_on>;
def _no : F<name#":no">, HelpText<help_off>;
// Same as B<> above, but without help texts, for private undocumented
// options.
multiclass B_priv<string name> {
def "" : F<name>;
def _no : F<name#":no">;
// Options that take a value after ':' (class P), plus the double-dash
// color-diagnostics options and a few argument-less flags (class F).
def align : P<"align", "Section alignment">;
def aligncomm : P<"aligncomm", "Set common symbol alignment">;
def alternatename : P<"alternatename", "Define weak alias">;
def base : P<"base", "Base address of the program">;
// The color options use a "--" prefix rather than the link.exe-style ones.
def color_diagnostics: Flag<["--"], "color-diagnostics">,
HelpText<"Use colors in diagnostics">;
def color_diagnostics_eq: Joined<["--"], "color-diagnostics=">,
HelpText<"Use colors in diagnostics; one of 'always', 'never', 'auto'">;
def defaultlib : P<"defaultlib", "Add the library to the list of input files">;
def delayload : P<"delayload", "Delay loaded DLL name">;
def entry : P<"entry", "Name of entry point symbol">;
def errorlimit : P<"errorlimit",
"Maximum number of errors to emit before stopping (0 = no limit)">;
def export : P<"export", "Export a function">;
// No help text because /failifmismatch is not intended to be used by the user.
def failifmismatch : P<"failifmismatch", "">;
def filealign : P<"filealign", "Section alignment in the output file">;
// functionpadmin exists both as a bare flag and as a value-taking option.
def functionpadmin : F<"functionpadmin">;
def functionpadmin_opt : P<"functionpadmin",
"Prepares an image for hotpatching">;
def guard : P<"guard", "Control flow guard">;
def heap : P<"heap", "Size of the heap">;
def ignore : P<"ignore", "Specify warning codes to ignore">;
def implib : P<"implib", "Import library name">;
def lib : F<"lib">,
HelpText<"Act like lib.exe; must be first argument if present">;
def libpath : P<"libpath", "Additional library search path">;
def linkrepro : P<"linkrepro",
"Dump linker invocation and input files for debugging">;
def lldignoreenv : F<"lldignoreenv">,
HelpText<"Ignore environment variables like %LIB%">;
def lldltocache : P<"lldltocache",
"Path to ThinLTO cached object file directory">;
def lldltocachepolicy : P<"lldltocachepolicy",
"Pruning policy for the ThinLTO cache">;
def lldsavetemps : F<"lldsavetemps">,
HelpText<"Save temporary files instead of deleting them">;
def machine : P<"machine", "Specify target platform">;
def merge : P<"merge", "Combine sections">;
def mllvm : P<"mllvm", "Options to pass to LLVM">;
def nodefaultlib : P<"nodefaultlib", "Remove a default library">;
def opt : P<"opt", "Control optimizations">;
def order : P<"order", "Put functions in order">;
def out : P<"out", "Path to file to write output">;
def natvis : P<"natvis", "Path to natvis file to embed in the PDB">;
def no_color_diagnostics: F<"no-color-diagnostics">,
HelpText<"Do not use colors in diagnostics">;
def pdb : P<"pdb", "PDB file path">;
def pdbstripped : P<"pdbstripped", "Stripped PDB file path">;
def pdbaltpath : P<"pdbaltpath", "PDB file path to embed in the image">;
def pdbstream : Joined<["/", "-", "/?", "-?"], "pdbstream:">,
HelpText<"Embed the contents of <file> in the PDB as named stream <name>">;
def section : P<"section", "Specify section attributes">;
def stack : P<"stack", "Size of the stack">;
def stub : P<"stub", "Specify DOS stub file">;
def subsystem : P<"subsystem", "Specify subsystem">;
def timestamp : P<"timestamp", "Specify the PE header timestamp">;
def version : P<"version", "Specify a version number in the PE header">;
def wholearchive_file : P<"wholearchive",
"Include all object files from this library">;
// Manifest-related options and options whose record name differs from the
// command-line spelling.
// NOTE(review): several defs in this excerpt look truncated - `disallowlib`
// ends with a trailing comma and no help text, and `manifest_colon`,
// `manifestdependency` and `manifestinput` show only the help string inside
// P<...> with the option-name argument missing. Confirm against the full
// file.
def disallowlib : Joined<["/", "-", "/?", "-?"], "disallowlib:">,
def manifest : F<"manifest">, HelpText<"Create .manifest file">;
def manifest_colon : P<
"NO disables manifest output; EMBED[,ID=#] embeds manifest as resource in the image">;
def manifestuac : P<"manifestuac", "User access control">;
def manifestfile : P<"manifestfile", "Manifest output path, with /manifest">;
def manifestdependency : P<
"Attributes for <dependency> element in manifest file; implies /manifest">;
def manifestinput : P<
"Additional manifest inputs; only valid with /manifest:embed">;
// We cannot use class P because the record name "incl" is different
// from its command line option name. We do this because "include" is
// a reserved keyword in tablegen.
def incl : Joined<["/", "-", "/?", "-?"], "include:">,
HelpText<"Force symbol to be added to symbol table as undefined one">;
// "def" is also a keyword.
def deffile : Joined<["/", "-", "/?", "-?"], "def:">,
HelpText<"Use module-definition file">;
def debug : F<"debug">, HelpText<"Embed a symbol table in the image">;
def debug_opt : P<"debug", "Embed a symbol table in the image with option">;
def debugtype : P<"debugtype", "Debug Info Options">;
def dll : F<"dll">, HelpText<"Create a DLL">;
def driver : F<"driver">, HelpText<"Generate a Windows NT Kernel Mode Driver">;
def driver_wdm : F<"driver:wdm">,
HelpText<"Set IMAGE_FILE_UP_SYSTEM_ONLY bit in PE header">;
def driver_uponly : F<"driver:uponly">,
def driver_wdm_uponly : F<"driver:wdm,uponly">;
def driver_uponly_wdm : F<"driver:uponly,wdm">;
def nodefaultlib_all : F<"nodefaultlib">,
HelpText<"Remove all default libraries">;
def noentry : F<"noentry">,
HelpText<"Don't add reference to DllMainCRTStartup; only valid with /dll">;
def profile : F<"profile">;
def repro : F<"Brepro">,
HelpText<"Use a hash of the executable as the PE header timestamp">;
def reproduce : P<"reproduce",
"Dump linker invocation and input files for debugging">;
def swaprun : P<"swaprun",
"Comma-separated list of 'cd' or 'net'">;
def swaprun_cd : F<"swaprun:cd">, Alias<swaprun>, AliasArgs<["cd"]>,
HelpText<"Make loader run output binary from swap instead of from CD">;
def swaprun_net : F<"swaprun:net">, Alias<swaprun>, AliasArgs<["net"]>,
HelpText<"Make loader run output binary from swap instead of from network">;
def verbose : F<"verbose">;
def wholearchive_flag : F<"wholearchive">,
HelpText<"Include all object files from all libraries">;
def force : F<"force">,
HelpText<"Allow undefined and multiply defined symbols">;
def force_unresolved : F<"force:unresolved">,
HelpText<"Allow undefined symbols when creating executables">;
def force_multiple : F<"force:multiple">,
HelpText<"Allow multiply defined symbols when creating executables">;
def force_multipleres : F<"force:multipleres">,
HelpText<"Allow multiply defined resources when creating executables">;
defm WX : B<"WX", "Treat warnings as errors", "Don't treat warnings as errors">;
defm allowbind : B<"allowbind", "Enable DLL binding (default)",
"Disable DLL binding">;
defm allowisolation : B<"allowisolation", "Enable DLL isolation (default)",
"Disable DLL isolation">;
defm appcontainer : B<"appcontainer",
"Image can only be run in an app container",
"Image can run outside an app container (default)">;
defm cetcompat : B<"cetcompat", "Mark executable image as compatible with Control-flow Enforcement Technology (CET) Shadow Stack",
"Don't mark executable image as compatible with Control-flow Enforcement Technology (CET) Shadow Stack (default)">;
defm dynamicbase : B<"dynamicbase", "Enable ASLR (default unless /fixed)",
"Disable ASLR (default when /fixed)">;
defm fixed : B<"fixed", "Disable base relocations",
"Enable base relocations (default)">;
defm highentropyva : B<"highentropyva",
"Enable 64-bit ASLR (default on 64-bit)",
"Disable 64-bit ASLR">;
defm incremental : B<"incremental",
"Keep original import library if contents are unchanged",
"Overwrite import library even if contents are unchanged">;
defm integritycheck : B<"integritycheck",
"Set FORCE_INTEGRITY bit in PE header",
"No effect (default)">;
defm largeaddressaware : B<"largeaddressaware",
"Enable large addresses (default on 64-bit)",
"Disable large addresses (default on 32-bit)">;
// Boolean option pair for data execution prevention. As with the other B<>
// records here, the first string is the help text of the enabling form and the
// second one the help text of the disabling form.
defm nxcompat : B<"nxcompat", "Enable data execution prevention (default)",
"Disable data execution prevention">;
defm safeseh : B<"safeseh",
"Produce an image with Safe Exception Handler (only for x86)",
"Don't produce an image with Safe Exception Handler">;
defm tsaware : B<"tsaware",
"Create Terminal Server aware executable (default)",
"Create non-Terminal Server aware executable">;
// Plain "help" flag.
def help : F<"help">;
// /?? and -?? must be before /? and -? to not confuse lib/Options.
// help_q has an empty option name, so it consists of its prefixes only; it is
// declared as an alias of help.
def help_q : Flag<["/??", "-??", "/?", "-?"], "">, Alias<help>;
// LLD extensions
defm auto_import : B_priv<"auto-import">;
defm runtime_pseudo_reloc : B_priv<"runtime-pseudo-reloc">;
def end_lib : F<"end-lib">,
HelpText<"Ends group of objects treated as if they were in a library">;
def exclude_all_symbols : F<"exclude-all-symbols">;
def export_all_symbols : F<"export-all-symbols">;
defm demangle : B<"demangle",
"Demangle symbols in output (default)",
"Do not demangle symbols in output">;
def include_optional : Joined<["/", "-", "/?", "-?"], "includeoptional:">,
HelpText<"Add symbol as undefined, but allow it to remain undefined">;
def kill_at : F<"kill-at">;
def lldmingw : F<"lldmingw">;
+def noseh : F<"noseh">;
def output_def : Joined<["/", "-", "/?", "-?"], "output-def:">;
def pdb_source_path : P<"pdbsourcepath",
"Base path used to make relative source file path absolute in PDB">;
def rsp_quoting : Joined<["--"], "rsp-quoting=">,
HelpText<"Quoting style for response files, 'windows' (default) or 'posix'">;
def start_lib : F<"start-lib">,
HelpText<"Starts group of objects treated as if they were in a library">;
def thinlto_emit_imports_files :
HelpText<"Emit .imports files with -thinlto-index-only">;
def thinlto_index_only :
HelpText<"Instead of linking, emit ThinLTO index files">;
def thinlto_index_only_arg : P<
"-thinlto-index-only and also write native module names to file">;
def thinlto_object_suffix_replace : P<
"'old;new' replace old suffix with new suffix in ThinLTO index">;
def thinlto_prefix_replace: P<
"'old;new' replace old prefix with new prefix in ThinLTO outputs">;
def lto_obj_path : P<
"output native object for merged LTO unit to this path">;
def dash_dash_version : Flag<["--"], "version">,
HelpText<"Print version information">;
def threads
: P<"threads", "Number of threads. '1' disables multi-threading. By "
"default all available hardware threads are used">;
// Flags for debugging
def lldmap : F<"lldmap">;
def lldmap_file : Joined<["/", "-", "/?", "-?"], "lldmap:">;
def map : F<"map">;
def map_file : Joined<["/", "-", "/?", "-?"], "map:">;
def show_timing : F<"time">;
def summary : F<"summary">;
// The flags below do nothing. They are defined only for link.exe compatibility.
// QF declares a Joined option whose spelling is name followed by ":", using
// the same prefix list as the other Joined options in this file.
class QF<string name> : Joined<["/", "-", "/?", "-?"], name#":">;
// Flags without an argument.
def ignoreidl : F<"ignoreidl">;
def nologo : F<"nologo">;
def throwingnew : F<"throwingnew">;
def editandcontinue : F<"editandcontinue">;
def fastfail : F<"fastfail">;
// Options of the form "name:<value>".
def delay : QF<"delay">;
def errorreport : QF<"errorreport">;
def idlout : QF<"idlout">;
def maxilksize : QF<"maxilksize">;
def tlbid : QF<"tlbid">;
def tlbout : QF<"tlbout">;
def verbose_all : QF<"verbose">;
def guardsym : QF<"guardsym">;
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 3bcc1777f7ac..082de5b8c1d6 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -1,1992 +1,1992 @@
//===- Writer.cpp ---------------------------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "Writer.h"
#include "Config.h"
#include "DLL.h"
#include "InputFiles.h"
#include "LLDMapFile.h"
#include "MapFile.h"
#include "PDB.h"
#include "SymbolTable.h"
#include "Symbols.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "lld/Common/Timer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/RandomNumberGenerator.h"
#include "llvm/Support/xxhash.h"
#include <algorithm>
#include <cstdio>
#include <map>
#include <memory>
#include <utility>
using namespace llvm;
using namespace llvm::COFF;
using namespace llvm::object;
using namespace llvm::support;
using namespace llvm::support::endian;
using namespace lld;
using namespace lld::coff;
/* To re-generate DOSProgram:
$ cat > /tmp/DOSProgram.asm
org 0
; Copy cs to ds.
push cs
pop ds
; Point ds:dx at the $-terminated string.
mov dx, str
; Int 21/AH=09h: Write string to standard output.
mov ah, 0x9
int 0x21
; Int 21/AH=4Ch: Exit with return code (in AL).
mov ax, 0x4C01
int 0x21
db 'This program cannot be run in DOS mode.$'
align 8, db 0
$ nasm -fbin /tmp/DOSProgram.asm -o /tmp/DOSProgram.bin
$ xxd -i /tmp/DOSProgram.bin
static unsigned char dosProgram[] = {
0x0e, 0x1f, 0xba, 0x0e, 0x00, 0xb4, 0x09, 0xcd, 0x21, 0xb8, 0x01, 0x4c,
0xcd, 0x21, 0x54, 0x68, 0x69, 0x73, 0x20, 0x70, 0x72, 0x6f, 0x67, 0x72,
0x61, 0x6d, 0x20, 0x63, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x20, 0x62, 0x65,
0x20, 0x72, 0x75, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x44, 0x4f, 0x53, 0x20,
0x6d, 0x6f, 0x64, 0x65, 0x2e, 0x24, 0x00, 0x00
// The DOS program is padded ("align 8, db 0" in the listing above), so its
// size is expected to be a multiple of 8.
static_assert(sizeof(dosProgram) % 8 == 0,
"DOSProgram size must be multiple of 8");
// Size in bytes of the DOS header plus the DOS program.
static const int dosStubSize = sizeof(dos_header) + sizeof(dosProgram);
static_assert(dosStubSize % 8 == 0, "DOSStub size must be multiple of 8");
// Number of data directory entries. NOTE(review): presumably the count written
// into the PE optional header; the use site is not visible here.
static const int numberOfDataDirectory = 16;
// Global vector of all output sections. After output sections are finalized,
// this can be indexed by Chunk::getOutputSection.
static std::vector<OutputSection *> outputSections;
// Returns the output section this chunk belongs to, or nullptr when osidx is
// 0. osidx is 1-based: a value of k refers to outputSections[k - 1].
OutputSection *Chunk::getOutputSection() const {
return osidx == 0 ? nullptr : outputSections[osidx - 1];
namespace {
// Chunk that holds the debug directory: one debug_directory entry per record
// in `records`, plus one extra entry when writeRepro is set.
class DebugDirectoryChunk : public NonSectionChunk {
// r is stored by reference (see `records` below), so the vector passed in
// must outlive this chunk.
DebugDirectoryChunk(const std::vector<std::pair<COFF::DebugType, Chunk *>> &r,
bool writeRepro)
: records(r), writeRepro(writeRepro) {}
// Size in bytes: (number of records + 1 if writeRepro) directory entries.
size_t getSize() const override {
return (records.size() + int(writeRepro)) * sizeof(debug_directory);
// Fills the directory entries into the buffer b.
void writeTo(uint8_t *b) const override {
auto *d = reinterpret_cast<debug_directory *>(b);
for (const std::pair<COFF::DebugType, Chunk *>& record : records) {
Chunk *c = record.second;
OutputSection *os = c->getOutputSection();
// File offset of the chunk: file offset of its section plus the chunk's
// offset inside that section.
uint64_t offs = os->getFileOff() + (c->getRVA() - os->getRVA());
fillEntry(d, record.first, c->getSize(), c->getRVA(), offs);
if (writeRepro) {
// FIXME: The COFF spec allows either a 0-sized entry to just say
// "the timestamp field is really a hash", or a 4-byte size field
// followed by that many bytes containing a longer hash (with the
// lowest 4 bytes usually being the timestamp in little-endian order).
// Consider storing the full 8 bytes computed by xxHash64 here.
fillEntry(d, COFF::IMAGE_DEBUG_TYPE_REPRO, 0, 0, 0);
// Stores timeDateStamp through every pointer held in timeDateStamps.
void setTimeDateStamp(uint32_t timeDateStamp) {
for (support::ulittle32_t *tds : timeDateStamps)
*tds = timeDateStamp;
// Initializes one directory entry: the type, size, RVA and file offset come
// from the arguments; the remaining fields shown here are zeroed.
void fillEntry(debug_directory *d, COFF::DebugType debugType, size_t size,
uint64_t rva, uint64_t offs) const {
d->Characteristics = 0;
d->TimeDateStamp = 0;
d->MajorVersion = 0;
d->MinorVersion = 0;
d->Type = debugType;
d->SizeOfData = size;
d->AddressOfRawData = rva;
d->PointerToRawData = offs;
// Pointers to timestamp fields to patch later; mutable so that const member
// functions can modify the list.
mutable std::vector<support::ulittle32_t *> timeDateStamps;
// Reference to the caller's record list (not a copy).
const std::vector<std::pair<COFF::DebugType, Chunk *>> &records;
// Whether the additional IMAGE_DEBUG_TYPE_REPRO entry is emitted.
bool writeRepro;
// Chunk that holds a codeview::DebugInfo record followed by the
// NUL-terminated string config->pdbAltPath.
class CVDebugRecordChunk : public NonSectionChunk {
// Fixed-size record, the path bytes, and one byte for the terminator.
size_t getSize() const override {
return sizeof(codeview::DebugInfo) + config->pdbAltPath.size() + 1;
void writeTo(uint8_t *b) const override {
// Save off the DebugInfo entry to backfill the file signature (build id)
// in Writer::writeBuildId
buildId = reinterpret_cast<codeview::DebugInfo *>(b);
// variable sized field (PDB Path)
char *p = reinterpret_cast<char *>(b + sizeof(*buildId));
// The copy is skipped for an empty path; the terminator is written in
// either case.
if (!config->pdbAltPath.empty())
memcpy(p, config->, config->pdbAltPath.size());
p[config->pdbAltPath.size()] = '\0';
// Set by writeTo, which is const, hence mutable. Stays nullptr until
// writeTo has run.
mutable codeview::DebugInfo *buildId = nullptr;
// Four-byte chunk whose content is the given characteristics value, written
// as a 32-bit little-endian integer.
class ExtendedDllCharacteristicsChunk : public NonSectionChunk {
ExtendedDllCharacteristicsChunk(uint32_t c) : characteristics(c) {}
// The chunk is always exactly one 32-bit word.
size_t getSize() const override { return 4; }
void writeTo(uint8_t *buf) const override { write32le(buf, characteristics); }
uint32_t characteristics = 0;
// PartialSection represents a group of chunks that contribute to an
// OutputSection. Collating a collection of PartialSections of same name and
// characteristics constitutes the OutputSection.
// PartialSectionKey is the (name, characteristics) pair used as the key of
// Writer::partialSections.
class PartialSectionKey {
StringRef name;
unsigned characteristics;
// Strict ordering for use as a std::map key. c is treated as a three-way
// comparison result: 1 means "greater", 0 means "equal" (the tie is then
// broken by characteristics), anything else means "less".
// NOTE(review): the expression that initializes c is not visible here;
// presumably it compares name with other.name.
bool operator<(const PartialSectionKey &other) const {
int c =;
if (c == 1)
return false;
if (c == 0)
return characteristics < other.characteristics;
return true;
// The writer writes a SymbolTable result to a file.
// Every scalar and pointer member has an in-class initializer, so a freshly
// constructed Writer holds no indeterminate values.
class Writer {
// Binds `buffer` to the output buffer owned by the error handler.
Writer() : buffer(errorHandler().outputBuffer) {}
// Main entry point; see Writer::run.
void run();
void createSections();
void createMiscChunks();
void createImportTables();
void appendImportThunks();
void locateImportTables();
void createExportTable();
void mergeSections();
void removeUnusedSections();
void assignAddresses();
void finalizeAddresses();
void removeEmptySections();
void assignOutputSectionIndices();
void createSymbolAndStringTable();
void openFile(StringRef outputPath);
template <typename PEHeaderTy> void writeHeader();
void createSEHTable();
void createRuntimePseudoRelocs();
void insertCtorDtorSymbols();
void createGuardCFTables();
void markSymbolsForRVATable(ObjFile *file,
ArrayRef<SectionChunk *> symIdxChunks,
SymbolRVASet &tableSymbols);
void maybeAddRVATable(SymbolRVASet tableSymbols, StringRef tableSym,
StringRef countSym);
void setSectionPermissions();
void writeSections();
void writeBuildId();
void sortExceptionTable();
void sortCRTSectionChunks(std::vector<Chunk *> &chunks);
void addSyntheticIdata();
void fixPartialSectionChars(StringRef name, uint32_t chars);
bool fixGnuImportChunks();
PartialSection *createPartialSection(StringRef name, uint32_t outChars);
PartialSection *findPartialSection(StringRef name, uint32_t outChars);
llvm::Optional<coff_symbol16> createSymbol(Defined *d);
size_t addEntryToStringTable(StringRef str);
OutputSection *findSection(StringRef name);
void addBaserels();
void addBaserelBlocks(std::vector<Baserel> &v);
uint32_t getSizeOfInitializedData();
// Reference to the error handler's output buffer (see the constructor).
std::unique_ptr<FileOutputBuffer> &buffer;
// Partial sections keyed by (name, characteristics).
std::map<PartialSectionKey, PartialSection *> partialSections;
std::vector<char> strtab;
std::vector<llvm::object::coff_symbol16> outputSymtab;
IdataContents idata;
// First chunk and accumulated size of the import directory list and of the
// IAT; filled in by locateImportTables.
Chunk *importTableStart = nullptr;
uint64_t importTableSize = 0;
Chunk *edataStart = nullptr;
Chunk *edataEnd = nullptr;
Chunk *iatStart = nullptr;
uint64_t iatSize = 0;
DelayLoadContents delayIdata;
EdataContents edata;
bool setNoSEHCharacteristic = false;
DebugDirectoryChunk *debugDirectory = nullptr;
std::vector<std::pair<COFF::DebugType, Chunk *>> debugRecords;
CVDebugRecordChunk *buildId = nullptr;
ArrayRef<uint8_t> sectionTable;
uint64_t fileSize = 0;
uint32_t pointerToSymbolTable = 0;
uint64_t sizeOfImage = 0;
uint64_t sizeOfHeaders = 0;
// Builtin output sections; assigned in createSections.
OutputSection *textSec = nullptr;
OutputSection *rdataSec = nullptr;
OutputSection *buildidSec = nullptr;
OutputSection *dataSec = nullptr;
OutputSection *pdataSec = nullptr;
OutputSection *idataSec = nullptr;
OutputSection *edataSec = nullptr;
OutputSection *didatSec = nullptr;
OutputSection *rsrcSec = nullptr;
OutputSection *relocSec = nullptr;
OutputSection *ctorsSec = nullptr;
OutputSection *dtorsSec = nullptr;
// The first and last .pdata sections in the output file.
// We need to keep track of the location of .pdata in whichever section it
// gets merged into so that we can sort its contents and emit a correct data
// directory entry for the exception table. This is also the case for some
// other sections (such as .edata) but because the contents of those sections
// are entirely linker-generated we can keep track of their locations using
// the chunks that the linker creates. All .pdata chunks come from input
// files, so we need to keep track of them separately.
Chunk *firstPdata = nullptr;
Chunk *lastPdata = nullptr;
} // anonymous namespace
// Timers registered under Timer::root(). Writer::run opens a ScopedTimer on
// each of them.
static Timer codeLayoutTimer("Code Layout", Timer::root());
static Timer diskCommitTimer("Commit Output File", Timer::root());
// Public entry point: constructs a temporary Writer and runs it.
void lld::coff::writeResult() { Writer().run(); }
void OutputSection::addChunk(Chunk *c) {
// Inserts c in front of all chunks already held by this section.
void OutputSection::insertChunkAtStart(Chunk *c) {
chunks.insert(chunks.begin(), c);
// Replaces the permission bits of the section header: every bit in permMask is
// cleared first, then c is ORed in. c itself is not masked, so any bit set in
// c ends up set in Characteristics.
void OutputSection::setPermissions(uint32_t c) {
header.Characteristics &= ~permMask;
header.Characteristics |= c;
// Appends the chunks and the contributing partial sections of `other` to the
// end of this section's own lists. The visible statements only read from
// `other`.
void OutputSection::merge(OutputSection *other) {
chunks.insert(chunks.end(), other->chunks.begin(), other->chunks.end());
contribSections.insert(contribSections.end(), other->contribSections.begin(),
// Write the section header to a given buffer.
// buf is reinterpreted as a coff_section and overwritten with a copy of
// `header`; the Name field is then filled in separately.
void OutputSection::writeHeaderTo(uint8_t *buf) {
auto *hdr = reinterpret_cast<coff_section *>(buf);
*hdr = header;
if (stringTableOff) {
// If name is too long, write offset into the string table as a name.
// NOTE(review): sprintf also writes a terminating NUL, so "/" plus the
// decimal offset must fit into hdr->Name including that byte. Confirm the
// size of Name against the largest possible stringTableOff. Also,
// stringTableOff is formatted with "%d"; confirm its type is signed int.
sprintf(hdr->Name, "/%d", stringTableOff);
} else {
// A name longer than COFF::NameSize is only tolerated when not linking
// with debug info or when the section is not discardable.
assert(!config->debug || name.size() <= COFF::NameSize ||
(hdr->Characteristics & IMAGE_SCN_MEM_DISCARDABLE) == 0);
std::min(name.size(), (size_t)COFF::NameSize));
void OutputSection::addContributingPartialSection(PartialSection *sec) {
// Check whether the target address S is in range from a relocation
// of type relType at address P.
// margin is added to the distance between the two addresses before the range
// check, so a larger margin makes the check stricter. Only ARMNT and ARM64
// are handled; any other machine reaches llvm_unreachable.
static bool isInRange(uint16_t relType, uint64_t s, uint64_t p, int margin) {
if (config->machine == ARMNT) {
// On ARMNT the distance is measured from p + 4.
int64_t diff = AbsoluteDifference(s, p + 4) + margin;
// The limit depends on the relocation type: a signed 21-bit or 25-bit
// range; remaining types are always in range.
switch (relType) {
return isInt<21>(diff);
return isInt<25>(diff);
return true;
} else if (config->machine == ARM64) {
// On ARM64 the distance is measured from p itself.
int64_t diff = AbsoluteDifference(s, p) + margin;
// Signed 28-bit, 21-bit or 16-bit range depending on the relocation type;
// remaining types are always in range.
switch (relType) {
return isInt<28>(diff);
return isInt<21>(diff);
return isInt<16>(diff);
return true;
} else {
llvm_unreachable("Unexpected architecture");
// Return the last thunk for the given target if it is in range,
// or create a new one.
// The bool of the returned pair is true when a new thunk was created and
// false when the cached one is reused. lastThunks is keyed by the RVA of the
// target and is updated to point at a newly created thunk.
static std::pair<Defined *, bool>
getThunk(DenseMap<uint64_t, Defined *> &lastThunks, Defined *target, uint64_t p,
uint16_t type, int margin) {
// operator[] default-inserts a null entry for a target seen the first time.
Defined *&lastThunk = lastThunks[target->getRVA()];
if (lastThunk && isInRange(type, lastThunk->getRVA(), p, margin))
return {lastThunk, false};
// No usable thunk yet: create the architecture-specific thunk chunk.
Chunk *c;
switch (config->machine) {
case ARMNT:
c = make<RangeExtensionThunkARM>(target);
case ARM64:
c = make<RangeExtensionThunkARM64>(target);
llvm_unreachable("Unexpected architecture");
// Wrap the chunk in an unnamed synthetic symbol and remember it as the most
// recent thunk for this target.
Defined *d = make<DefinedSynthetic>("", c);
lastThunk = d;
return {d, true};
// This checks all relocations, and for any relocation which isn't in range
// it adds a thunk after the section chunk that contains the relocation.
// If the latest thunk for the specific target is in range, that is used
// instead of creating a new thunk. All range checks are done with the
// specified margin, to make sure that relocations that originally are in
// range, but only barely, also get thunks - in case other added thunks make
// the target go out of range.
// After adding thunks, we verify that all relocations are in range (with
// no extra margin requirements). If this failed, we restart (throwing away
// the previously created thunks) and retry with a wider margin.
// Returns true when at least one new thunk chunk was inserted into os.
static bool createThunks(OutputSection *os, int margin) {
bool addressesChanged = false;
// Most recent thunk per target RVA; see getThunk.
DenseMap<uint64_t, Defined *> lastThunks;
// Symbol table index of each thunk within each object file that uses it.
DenseMap<std::pair<ObjFile *, Defined *>, uint32_t> thunkSymtabIndices;
// Total size of the thunks inserted into this section so far.
size_t thunksSize = 0;
// Recheck Chunks.size() each iteration, since we can insert more
// elements into it.
for (size_t i = 0; i != os->chunks.size(); ++i) {
SectionChunk *sc = dyn_cast_or_null<SectionChunk>(os->chunks[i]);
if (!sc)
// New thunks are inserted directly after the current chunk.
size_t thunkInsertionSpot = i + 1;
// Try to get a good enough estimate of where new thunks will be placed.
// Offset this by the size of the new thunks added so far, to make the
// estimate slightly better.
size_t thunkInsertionRVA = sc->getRVA() + sc->getSize() + thunksSize;
ObjFile *file = sc->file;
// (relocation index, replacement symbol table index) pairs, in ascending
// order of relocation index because they are appended while walking the
// relocations front to back.
std::vector<std::pair<uint32_t, uint32_t>> relocReplacements;
ArrayRef<coff_relocation> originalRelocs =
for (size_t j = 0, e = originalRelocs.size(); j < e; ++j) {
const coff_relocation &rel = originalRelocs[j];
Symbol *relocTarget = file->getSymbol(rel.SymbolTableIndex);
// The estimate of the source address P should be pretty accurate,
// but we don't know whether the target Symbol address should be
// offset by thunksSize or not (or by some of thunksSize but not all of
// it), giving us some uncertainty once we have added one thunk.
uint64_t p = sc->getRVA() + rel.VirtualAddress + thunksSize;
Defined *sym = dyn_cast_or_null<Defined>(relocTarget);
if (!sym)
uint64_t s = sym->getRVA();
if (isInRange(rel.Type, s, p, margin))
// If the target isn't in range, hook it up to an existing or new
// thunk.
Defined *thunk;
bool wasNew;
std::tie(thunk, wasNew) = getThunk(lastThunks, sym, p, rel.Type, margin);
if (wasNew) {
Chunk *thunkChunk = thunk->getChunk();
thunkInsertionRVA); // Estimate of where it will be located.
os->chunks.insert(os->chunks.begin() + thunkInsertionSpot, thunkChunk);
thunksSize += thunkChunk->getSize();
thunkInsertionRVA += thunkChunk->getSize();
addressesChanged = true;
// To redirect the relocation, add a symbol to the parent object file's
// symbol table, and replace the relocation symbol table index with the
// new index.
// ~0U is only a placeholder; it is overwritten right below when the entry
// was newly inserted.
auto insertion = thunkSymtabIndices.insert({{file, thunk}, ~0U});
uint32_t &thunkSymbolIndex = insertion.first->second;
if (insertion.second)
thunkSymbolIndex = file->addRangeThunkSymbol(thunk);
relocReplacements.push_back({j, thunkSymbolIndex});
// Get a writable copy of this section's relocations so they can be
// modified. If the relocations point into the object file, allocate new
// memory. Otherwise, this must be previously allocated memory that can be
// modified in place.
ArrayRef<coff_relocation> curRelocs = sc->getRelocs();
MutableArrayRef<coff_relocation> newRelocs;
if ( == {
newRelocs = makeMutableArrayRef(
} else {
newRelocs = makeMutableArrayRef(
const_cast<coff_relocation *>(, curRelocs.size());
// Copy each relocation, but replace the symbol table indices which need
// thunks.
auto nextReplacement = relocReplacements.begin();
auto endReplacement = relocReplacements.end();
for (size_t i = 0, e = originalRelocs.size(); i != e; ++i) {
newRelocs[i] = originalRelocs[i];
if (nextReplacement != endReplacement && nextReplacement->first == i) {
newRelocs[i].SymbolTableIndex = nextReplacement->second;
return addressesChanged;
// Verify that all relocations are in range, with no extra margin requirements.
// Returns false as soon as one relocation against a Defined symbol fails the
// isInRange check with margin 0, and true when none does.
// The chunk list is taken by const reference: the function only reads it, so
// there is no need to copy the whole vector on every call.
static bool verifyRanges(const std::vector<Chunk *> &chunks) {
for (Chunk *c : chunks) {
SectionChunk *sc = dyn_cast_or_null<SectionChunk>(c);
if (!sc)
ArrayRef<coff_relocation> relocs = sc->getRelocs();
for (size_t j = 0, e = relocs.size(); j < e; ++j) {
const coff_relocation &rel = relocs[j];
Symbol *relocTarget = sc->file->getSymbol(rel.SymbolTableIndex);
Defined *sym = dyn_cast_or_null<Defined>(relocTarget);
if (!sym)
// p is the address of the relocation, s the address of its target.
uint64_t p = sc->getRVA() + rel.VirtualAddress;
uint64_t s = sym->getRVA();
if (!isInRange(rel.Type, s, p, 0))
return false;
return true;
// Assign addresses and add thunks if necessary.
// The thunk logic below applies to ARMNT and ARM64 only. It loops: verify all
// ranges, and if some relocation is out of range, insert thunks and try again.
void Writer::finalizeAddresses() {
if (config->machine != ARMNT && config->machine != ARM64)
// Remember the original chunk lists so that a failed pass can be undone, and
// count the chunks so the number of added thunks can be reported.
size_t origNumChunks = 0;
for (OutputSection *sec : outputSections) {
sec->origChunks = sec->chunks;
origNumChunks += sec->chunks.size();
int pass = 0;
// Initial safety margin; doubled before every retry.
int margin = 1024 * 100;
while (true) {
// First check whether we need thunks at all, or if the previous pass of
// adding them turned out ok.
bool rangesOk = true;
size_t numChunks = 0;
for (OutputSection *sec : outputSections) {
if (!verifyRanges(sec->chunks)) {
rangesOk = false;
numChunks += sec->chunks.size();
if (rangesOk) {
// Only log when thunks were actually added in an earlier pass.
if (pass > 0)
log("Added " + Twine(numChunks - origNumChunks) + " thunks with " +
"margin " + Twine(margin) + " in " + Twine(pass) + " passes");
// Give up after ten passes.
if (pass >= 10)
fatal("adding thunks hasn't converged after " + Twine(pass) + " passes");
if (pass > 0) {
// If the previous pass didn't work out, reset everything back to the
// original conditions before retrying with a wider margin. This should
// ideally never happen under real circumstances.
for (OutputSection *sec : outputSections)
sec->chunks = sec->origChunks;
margin *= 2;
// Try adding thunks everywhere where it is needed, with a margin
// to avoid things going out of range due to the added thunks.
bool addressesChanged = false;
for (OutputSection *sec : outputSections)
addressesChanged |= createThunks(sec, margin);
// If the verification above thought we needed thunks, we should have
// added some.
// Recalculate the layout for the whole image (and verify the ranges at
// the start of the next round).
// The main function of the writer.
// Of the steps visible here: the layout work runs under codeLayoutTimer, an
// image larger than UINT32_MAX bytes is a fatal error, a PDB is created when
// requested, and the output buffer is committed under diskCommitTimer.
void Writer::run() {
ScopedTimer t1(codeLayoutTimer);
if (fileSize > UINT32_MAX)
fatal("image size (" + Twine(fileSize) + ") " +
"exceeds maximum allowable size (" + Twine(UINT32_MAX) + ")");
if (config->is64()) {
} else {
// A PDB is written only when a PDB path is configured and debug is enabled.
if (!config->pdbPath.empty() && config->debug) {
createPDB(symtab, outputSections, sectionTable, buildId->buildId);
if (errorCount())
ScopedTimer t2(diskCommitTimer);
// commit() returns an error value on failure; it is turned into a fatal
// diagnostic.
if (auto e = buffer->commit())
fatal("failed to write the output file: " + toString(std::move(e)));
// Maps an input section name to the name of its output section: everything
// from the first '$' on is dropped, and the remainder is cut at the first '.'
// found at index 1 or later (so a leading '.' is kept).
static StringRef getOutputSectionName(StringRef name) {
StringRef s = name.split('$').first;
// Treat a later period as a separator for MinGW, for sections like
// ".ctors.01234".
return s.substr(0, s.find('.', 1));
// For /order.
// Stable-sorts chunks by ascending priority, so chunks with equal priority
// keep their relative order.
static void sortBySectionOrder(std::vector<Chunk *> &chunks) {
// Priority of a chunk: the value config->order holds for the name of the
// chunk's symbol when the chunk is a SectionChunk with a symbol, otherwise 0.
auto getPriority = [](const Chunk *c) {
if (auto *sec = dyn_cast<SectionChunk>(c))
if (sec->sym)
return config->order.lookup(sec->sym->getName());
return 0;
llvm::stable_sort(chunks, [=](const Chunk *a, const Chunk *b) {
return getPriority(a) < getPriority(b);
// Change the characteristics of existing PartialSections that belong to the
// section Name to Chars.
void Writer::fixPartialSectionChars(StringRef name, uint32_t chars) {
// `it` is a by-value copy of the map entry; only the pointer in it.second is
// used.
for (auto it : partialSections) {
PartialSection *pSec = it.second;
StringRef curName = pSec->name;
// A partial section belongs to `name` when its name is exactly `name` or
// `name` followed by a '$' suffix.
if (!curName.consume_front(name) ||
(!curName.empty() && !curName.startswith("$")))
// Nothing to do when the characteristics already match.
if (pSec->characteristics == chars)
// Append this section's chunks to the partial section of the same name
// that has the requested characteristics.
PartialSection *destSec = createPartialSection(pSec->name, chars);
destSec->chunks.insert(destSec->chunks.end(), pSec->chunks.begin(),
// Sort concrete section chunks from GNU import libraries.
// GNU binutils doesn't use short import files, but instead produces import
// libraries that consist of object files, with section chunks for the .idata$*
// sections. These are linked just as regular static libraries. Each import
// library consists of one header object, one object file for every imported
// symbol, and one trailer object. In order for the .idata tables/lists to
// be formed correctly, the section chunks within each .idata$* section need
// to be grouped by library, and sorted alphabetically within each library
// (which makes sure the header comes first and the trailer last).
// Returns true when at least one partial section whose name starts with
// ".idata" contains chunks.
bool Writer::fixGnuImportChunks() {
// Make sure all .idata$* section chunks are mapped as RDATA in order to
// be sorted into the same sections as our own synthesized .idata chunks.
fixPartialSectionChars(".idata", rdata);
bool hasIdata = false;
// Sort all .idata$* chunks, grouping chunks from the same library,
// with alphabetical ordering of the object files within a library.
for (auto it : partialSections) {
PartialSection *pSec = it.second;
if (!pSec->name.startswith(".idata"))
if (!pSec->chunks.empty())
hasIdata = true;
llvm::stable_sort(pSec->chunks, [&](Chunk *s, Chunk *t) {
SectionChunk *sc1 = dyn_cast_or_null<SectionChunk>(s);
SectionChunk *sc2 = dyn_cast_or_null<SectionChunk>(t);
if (!sc1 || !sc2) {
// if SC1, order them ascending. If SC2 or both null,
// S is not less than T.
return sc1 != nullptr;
// Make a string with "libraryname/objectfile" for sorting, achieving
// both grouping by library and sorting of objects within a library,
// at once.
std::string key1 =
(sc1->file->parentName + "/" + sc1->file->getName()).str();
std::string key2 =
(sc2->file->parentName + "/" + sc2->file->getName()).str();
return key1 < key2;
return hasIdata;
// Add generated idata chunks, for imported symbols and DLLs, and a
// terminator in .idata$2.
void Writer::addSyntheticIdata() {
// Add the .idata content in the right section groups, to allow
// chunks from other linked in object files to be grouped together.
// See Microsoft PE/COFF spec 5.4 for details.
// `add` appends the chunks in v to the partial section named n, creating
// that partial section through createPartialSection.
auto add = [&](StringRef n, std::vector<Chunk *> &v) {
PartialSection *pSec = createPartialSection(n, rdata);
pSec->chunks.insert(pSec->chunks.end(), v.begin(), v.end());
// The loader assumes a specific order of data.
// Add each type in the correct order.
add(".idata$2", idata.dirs);
add(".idata$4", idata.lookups);
add(".idata$5", idata.addresses);
// .idata$6 is only added when there are hint chunks.
if (!idata.hints.empty())
add(".idata$6", idata.hints);
add(".idata$7", idata.dllNames);
// Locate the first Chunk and size of the import directory list and the
// IAT.
// The import directory list is taken from partial section .idata$2 and the
// IAT from .idata$5. Sizes are accumulated with +=, so they add onto whatever
// importTableSize / iatSize already hold.
void Writer::locateImportTables() {
if (PartialSection *importDirs = findPartialSection(".idata$2", rdata)) {
// The start pointer is left untouched when the section has no chunks.
if (!importDirs->chunks.empty())
importTableStart = importDirs->chunks.front();
for (Chunk *c : importDirs->chunks)
importTableSize += c->getSize();
if (PartialSection *importAddresses = findPartialSection(".idata$5", rdata)) {
if (!importAddresses->chunks.empty())
iatStart = importAddresses->chunks.front();
for (Chunk *c : importAddresses->chunks)
iatSize += c->getSize();
// Return whether a SectionChunk's suffix (the dollar and any trailing
// suffix) should be removed and sorted into the main suffixless
// PartialSection.
// This is true only for MinGW links, for COMDAT section chunks, and for names
// that start with one of the standard prefixes listed at the end.
static bool shouldStripSectionSuffix(SectionChunk *sc, StringRef name) {
// On MinGW, comdat groups are formed by putting the comdat group name
// after the '$' in the section name. For .eh_frame$<symbol>, that must
// still be sorted before the .eh_frame trailer from crtend.o, thus just
// strip the section name trailer. For other sections, such as
// .tls$$<symbol> (where non-comdat .tls symbols are otherwise stored in
// ".tls$"), they must be strictly sorted after .tls. And for the
// hypothetical case of comdat .CRT$XCU, we definitely need to keep the
// suffix for sorting. Thus, to play it safe, only strip the suffix for
// the standard sections.
if (!config->mingw)
return false;
// sc may be null (the chunk is not a SectionChunk); such chunks keep their
// suffix.
if (!sc || !sc->isCOMDAT())
return false;
return name.startswith(".text$") || name.startswith(".data$") ||
name.startswith(".rdata$") || name.startswith(".pdata$") ||
name.startswith(".xdata$") || name.startswith(".eh_frame$");
// Create output section objects and add them to OutputSections.
// Visible steps: create the builtin sections, bin the input chunks into
// partial sections, normalize the characteristics of .rsrc/.edata/.idata,
// turn the partial sections into output sections, and order the result.
void Writer::createSections() {
// First, create the builtin sections.
// Short aliases for the section characteristic flags used below.
const uint32_t data = IMAGE_SCN_CNT_INITIALIZED_DATA;
const uint32_t code = IMAGE_SCN_CNT_CODE;
const uint32_t discardable = IMAGE_SCN_MEM_DISCARDABLE;
const uint32_t r = IMAGE_SCN_MEM_READ;
const uint32_t w = IMAGE_SCN_MEM_WRITE;
const uint32_t x = IMAGE_SCN_MEM_EXECUTE;
// Output sections keyed by (name, characteristics), so that createSection
// returns the same object when called again with the same pair.
SmallDenseMap<std::pair<StringRef, uint32_t>, OutputSection *> sections;
auto createSection = [&](StringRef name, uint32_t outChars) {
OutputSection *&sec = sections[{name, outChars}];
if (!sec) {
sec = make<OutputSection>(name, outChars);
return sec;
// Try to match the section order used by link.exe.
textSec = createSection(".text", code | r | x);
createSection(".bss", bss | r | w);
rdataSec = createSection(".rdata", data | r);
buildidSec = createSection(".buildid", data | r);
dataSec = createSection(".data", data | r | w);
pdataSec = createSection(".pdata", data | r);
idataSec = createSection(".idata", data | r);
edataSec = createSection(".edata", data | r);
didatSec = createSection(".didat", data | r);
rsrcSec = createSection(".rsrc", data | r);
relocSec = createSection(".reloc", data | discardable | r);
ctorsSec = createSection(".ctors", data | r | w);
dtorsSec = createSection(".dtors", data | r | w);
// Then bin chunks by name and output characteristics.
for (Chunk *c : symtab->getChunks()) {
auto *sc = dyn_cast<SectionChunk>(c);
// Section chunks that are not live get special handling here.
if (sc && !sc->live) {
if (config->verbose)
StringRef name = c->getSectionName();
// For some MinGW COMDAT sections the "$suffix" part is dropped; see
// shouldStripSectionSuffix.
if (shouldStripSectionSuffix(sc, name))
name = name.split('$').first;
PartialSection *pSec = createPartialSection(name,
// Force .rsrc and .edata partial sections to DATA | R.
fixPartialSectionChars(".rsrc", data | r);
fixPartialSectionChars(".edata", data | r);
// Even in non MinGW cases, we might need to link against GNU import
// libraries.
bool hasIdata = fixGnuImportChunks();
if (!idata.empty())
hasIdata = true;
if (hasIdata)
// Process an /order option.
if (!config->order.empty())
for (auto it : partialSections)
if (hasIdata)
// Then create an OutputSection for each section.
// '$' and all following characters in input section names are
// discarded when determining output section. So, .text$foo
// contributes to .text, for example. See PE/COFF spec 3.2.
for (auto it : partialSections) {
PartialSection *pSec = it.second;
StringRef name = getOutputSectionName(pSec->name);
uint32_t outChars = pSec->characteristics;
if (name == ".CRT") {
// In link.exe, there is a special case for the I386 target where .CRT
// sections are treated as if they have output characteristics DATA | R if
// their characteristics are DATA | R | W. This implements the same
// special case for all architectures.
outChars = data | r;
log("Processing section " + pSec->name + " -> " + name);
OutputSection *sec = createSection(name, outChars);
for (Chunk *c : pSec->chunks)
// Finally, move some output sections to the end.
// sectionOrder ranks a section: 0 for regular sections, 1 for .rsrc and 2 for
// discardable sections. Sections are ordered by ascending rank.
auto sectionOrder = [&](const OutputSection *s) {
// Move DISCARDABLE (or non-memory-mapped) sections to the end of file
// because the loader cannot handle holes. Stripping can remove other
// discardable ones than .reloc, which is first of them (created early).
if (s->header.Characteristics & IMAGE_SCN_MEM_DISCARDABLE)
return 2;
// .rsrc should come at the end of the non-discardable sections because its
// size may change by the Win32 UpdateResources() function, causing
// subsequent sections to move (see
if (s == rsrcSec)
return 1;
return 0;
[&](const OutputSection *s, const OutputSection *t) {
return sectionOrder(s) < sectionOrder(t);
// Creates linker-synthesized chunks. The visible code walks the MergeChunk
// instances and the locally-dllimported symbol chunks, builds the debug
// directory (plus a CodeView build-id record when debugging is enabled), and
// then branches on the safeSEH, guardCF, autoImport and mingw options.
// NOTE(review): most loop and branch bodies are missing from this listing;
// verify against the complete file.
void Writer::createMiscChunks() {
// Entries in MergeChunk::instances may be null, hence the check.
for (MergeChunk *p : MergeChunk::instances) {
if (p) {
// Create thunks for locally-dllimported symbols.
if (!symtab->localImportChunks.empty()) {
for (Chunk *c : symtab->localImportChunks)
// Create Debug Information Chunks
// Debug info goes to .buildid for MinGW and to .rdata otherwise.
OutputSection *debugInfoSec = config->mingw ? buildidSec : rdataSec;
if (config->debug || config->repro || config->cetCompat) {
debugDirectory = make<DebugDirectoryChunk>(debugRecords, config->repro);
if (config->debug) {
// Make a CVDebugRecordChunk even when /DEBUG:CV is not specified. We
// output a PDB no matter what, and this chunk provides the only means of
// allowing a debugger to match a PDB and an executable. So we need it even
// if we're ultimately not going to write CodeView data to the PDB.
buildId = make<CVDebugRecordChunk>();
debugRecords.push_back({COFF::IMAGE_DEBUG_TYPE_CODEVIEW, buildId});
if (config->cetCompat) {
ExtendedDllCharacteristicsChunk *extendedDllChars =
if (debugRecords.size() > 0) {
for (std::pair<COFF::DebugType, Chunk *> r : debugRecords)
// Create SEH table. x86-only.
if (config->safeSEH)
// Create /guard:cf tables if requested.
if (config->guardCF != GuardCFLevel::Off)
if (config->autoImport)
if (config->mingw)
// Create .idata section for the DLL-imported symbol table.
// The format of this section is inherently Windows-specific.
// IdataContents class abstracted away the details for us,
// so we just let it create chunks and add them to the section.
// NOTE(review): parts of this function (e.g. the statement after the
// liveness check, the delay-load / else bodies, closing braces) are missing
// from this listing.
void Writer::createImportTables() {
// Initialize DLLOrder so that import entries are ordered in
// the same order as in the command line. (That affects DLL
// initialization order, and this ordering is MSVC-compatible.)
for (ImportFile *file : ImportFile::instances) {
if (!file->live)
// DLL names are lowercased before being used as keys in dllOrder and
// delayLoads.
std::string dll = StringRef(file->dllName).lower();
// Record an order index only for DLLs not already present in dllOrder.
if (config->dllOrder.count(dll) == 0)
config->dllOrder[dll] = config->dllOrder.size();
// The import symbol, when present, must still be a DefinedImportData.
if (file->impSym && !isa<DefinedImportData>(file->impSym))
fatal(toString(*file->impSym) + " was replaced");
DefinedImportData *impSym = cast_or_null<DefinedImportData>(file->impSym);
if (config->delayLoads.count(StringRef(file->dllName).lower())) {
// Delay-loading is refused for imports that have no thunk symbol.
if (!file->thunkSym)
fatal("cannot delay-load " + toString(file) +
" due to import of data: " + toString(*impSym));
} else {
// Processes the import thunk of every ImportFile and, when there are
// delay-load imports, the chunks produced by delayIdata.
// NOTE(review): the early return, the `continue`-style statements and every
// loop body are missing from this listing; verify against the complete file.
void Writer::appendImportThunks() {
if (ImportFile::instances.empty())
for (ImportFile *file : ImportFile::instances) {
if (!file->live)
if (!file->thunkSym)
// The thunk symbol must still be a DefinedImportThunk.
if (!isa<DefinedImportThunk>(file->thunkSym))
fatal(toString(*file->thunkSym) + " was replaced");
DefinedImportThunk *thunk = cast<DefinedImportThunk>(file->thunkSym);
if (file->thunkLive)
if (!delayIdata.empty()) {
// cast<> asserts that the configured delay-load helper is a Defined symbol.
Defined *helper = cast<Defined>(config->delayLoadHelper);
// delayIdata exposes three chunk lists: general, data and code chunks.
for (Chunk *c : delayIdata.getChunks())
for (Chunk *c : delayIdata.getDataChunks())
for (Chunk *c : delayIdata.getCodeChunks())
// Sets up the export table. If .edata already has chunks from input object
// files, those are used as-is (with a warning when exports were also given
// explicitly); otherwise, when exports are configured, chunks from `edata`
// are used. Finally records the first and last .edata chunks in
// edataStart / edataEnd.
// NOTE(review): the loop body and closing braces are missing from this
// listing.
void Writer::createExportTable() {
if (!edataSec->chunks.empty()) {
// Allow using a custom built export table from input object files, instead
// of having the linker synthesize the tables.
if (config->hadExplicitExports)
warn("literal .edata sections override exports");
} else if (!config->exports.empty()) {
for (Chunk *c : edata.chunks)
// Remember the extent of the export data.
if (!edataSec->chunks.empty()) {
edataStart = edataSec->chunks.front();
edataEnd = edataSec->chunks.back();
// Drops output sections that have no chunks at all, except .reloc.
// NOTE(review): the statement that consumes the std::remove_if result
// (presumably an erase on outputSections) is cut off in this listing.
void Writer::removeUnusedSections() {
// Remove sections that we can be sure won't get content, to avoid
// allocating space for their section headers.
auto isUnused = [this](OutputSection *s) {
if (s == relocSec)
return false; // This section is populated later.
// MergeChunks have zero size at this point, as their size is finalized
// later. Only remove sections that have no Chunks at all.
return s->chunks.empty();
std::remove_if(outputSections.begin(), outputSections.end(), isUnused),
// The Windows loader doesn't seem to like empty sections,
// so we remove them if any.
// A section counts as empty when its virtual size is zero.
// NOTE(review): the statement that consumes the std::remove_if result is cut
// off in this listing.
void Writer::removeEmptySections() {
auto isEmpty = [](OutputSection *s) { return s->getVirtualSize() == 0; };
std::remove_if(outputSections.begin(), outputSections.end(), isEmpty),
// Numbers the output sections and visits their chunks, then visits the live
// section chunks held inside MergeChunks.
// NOTE(review): the loop bodies (and any increment of `idx`) are missing from
// this listing; verify against the complete file.
void Writer::assignOutputSectionIndices() {
// Assign final output section indices, and assign each chunk to its output
// section.
// Section indices start at 1.
uint32_t idx = 1;
for (OutputSection *os : outputSections) {
os->sectionIndex = idx;
for (Chunk *c : os->chunks)
// Merge chunks are containers of chunks, so assign those an output section
// too.
for (MergeChunk *mc : MergeChunk::instances)
if (mc)
for (SectionChunk *sc : mc->sections)
if (sc && sc->live)
// Appends the characters of `str` to strtab and returns the offset at which
// the entry starts, measured from the beginning of the on-disk string table
// (i.e. including its leading 4-byte size field).
// Only names longer than COFF::NameSize are expected here (asserted).
// NOTE(review): no NUL terminator is appended in the visible code; a line may
// be missing from this listing (as is the closing brace) - confirm.
size_t Writer::addEntryToStringTable(StringRef str) {
assert(str.size() > COFF::NameSize);
size_t offsetOfEntry = strtab.size() + 4; // +4 for the size field
strtab.insert(strtab.end(), str.begin(), str.end());
return offsetOfEntry;
// Builds a COFF symbol table record for `def`.
// Returns None when the symbol is not written to the symbol table: synthetic
// symbols, symbols without a chunk or whose chunk has no output section, and
// runtime pseudo relocation symbols.
// NOTE(review): `break` statements, closing braces, the source argument of
// the memcpy and the body of the final else branch are missing from this
// listing; verify against the complete file.
Optional<coff_symbol16> Writer::createSymbol(Defined *def) {
coff_symbol16 sym;
switch (def->kind()) {
case Symbol::DefinedAbsoluteKind:
// Absolute symbols carry their value directly and belong to no section.
sym.Value = def->getRVA();
sym.SectionNumber = IMAGE_SYM_ABSOLUTE;
case Symbol::DefinedSyntheticKind:
// Relative symbols are unrepresentable in a COFF symbol table.
return None;
default: {
// Don't write symbols that won't be written to the output to the symbol
// table.
Chunk *c = def->getChunk();
if (!c)
return None;
OutputSection *os = c->getOutputSection();
if (!os)
return None;
// The value is stored relative to the start of the containing section.
sym.Value = def->getRVA() - os->getRVA();
sym.SectionNumber = os->sectionIndex;
// Symbols that are runtime pseudo relocations don't point to the actual
// symbol data itself (as they are imported), but points to the IAT entry
// instead. Avoid emitting them to the symbol table, as they can confuse
// debuggers.
if (def->isRuntimePseudoReloc)
return None;
StringRef name = def->getName();
// Names longer than COFF::NameSize are referenced by string table offset;
// shorter ones are stored inline, zero-padded.
if (name.size() > COFF::NameSize) {
sym.Name.Offset.Zeroes = 0;
sym.Name.Offset.Offset = addEntryToStringTable(name);
} else {
memset(sym.Name.ShortName, 0, COFF::NameSize);
memcpy(sym.Name.ShortName,, name.size());
// Symbols that came from a COFF object keep their original type and storage
// class.
if (auto *d = dyn_cast<DefinedCOFF>(def)) {
COFFSymbolRef ref = d->getCOFFSymbol();
sym.Type = ref.getType();
sym.StorageClass = ref.getStorageClass();
} else {
sym.NumberOfAuxSymbols = 0;
return sym;
// Handles long section names, optionally collects symbols for the output
// symbol table, and reserves file space for the symbol table plus string
// table after the current end of the file.
// NOTE(review): `continue`/`return` statements, the tail of the warning and
// the loop bodies are missing from this listing; verify against the complete
// file.
void Writer::createSymbolAndStringTable() {
// PE/COFF images are limited to 8 byte section names. Longer names can be
// supported by writing a non-standard string table, but this string table is
// not mapped at runtime and the long names will therefore be inaccessible.
// link.exe always truncates section names to 8 bytes, whereas binutils always
// preserves long section names via the string table. LLD adopts a hybrid
// solution where discardable sections have long names preserved and
// non-discardable sections have their names truncated, to ensure that any
// section which is mapped at runtime also has its name mapped at runtime.
for (OutputSection *sec : outputSections) {
if (sec->name.size() <= COFF::NameSize)
if ((sec->header.Characteristics & IMAGE_SCN_MEM_DISCARDABLE) == 0)
if (config->warnLongSectionNames) {
warn("section name " + sec->name +
" is longer than 8 characters and will use a non-standard string "
// Symbols are only collected when DWARF debug info or a symbol table was
// requested.
if (config->debugDwarf || config->debugSymtab) {
for (ObjFile *file : ObjFile::instances) {
for (Symbol *b : file->getSymbols()) {
auto *d = dyn_cast_or_null<Defined>(b);
// writtenToSymtab guards against handling the same symbol twice.
if (!d || d->writtenToSymtab)
d->writtenToSymtab = true;
if (Optional<coff_symbol16> sym = createSymbol(d))
if (outputSymtab.empty() && strtab.empty())
// We position the symbol table to be adjacent to the end of the last section.
uint64_t fileOff = fileSize;
pointerToSymbolTable = fileOff;
fileOff += outputSymtab.size() * sizeof(coff_symbol16);
// 4 bytes for the string table's size field, then its contents.
fileOff += 4 + strtab.size();
fileSize = alignTo(fileOff, config->fileAlign);
// Records the extent of .pdata (firstPdata / lastPdata) and then processes
// the entries of config->merge, following chains of merges to their final
// target and reporting cycles as fatal errors.
// NOTE(review): `continue`/`break` statements, the actual merge call and the
// closing braces are missing from this listing; verify against the complete
// file.
void Writer::mergeSections() {
if (!pdataSec->chunks.empty()) {
firstPdata = pdataSec->chunks.front();
lastPdata = pdataSec->chunks.back();
for (auto &p : config->merge) {
StringRef toName = p.second;
if (p.first == toName)
// Follow the chain toName -> merge[toName] -> ...; seeing a name twice means
// the chain loops.
StringSet<> names;
while (1) {
if (!names.insert(toName).second)
fatal("/merge: cycle found for section '" + p.first + "'");
auto i = config->merge.find(toName);
if (i == config->merge.end())
toName = i->second;
OutputSection *from = findSection(p.first);
OutputSection *to = findSection(toName);
if (!from)
// When the destination section does not exist, the source section is
// renamed to the destination name.
if (!to) {
from->name = toName;
// Visits all sections to assign incremental, non-overlapping RVAs and
// file offsets.
// Also computes sizeOfHeaders, fileSize and sizeOfImage.
// NOTE(review): some statements (e.g. the body of the relocSec check, the
// body of the final MergeChunk loop, closing braces) are missing from this
// listing; verify against the complete file.
void Writer::assignAddresses() {
// Header size: DOS stub, PE magic, COFF header, data directories, section
// table and the 32- or 64-bit optional header, rounded up to fileAlign.
sizeOfHeaders = dosStubSize + sizeof(PEMagic) + sizeof(coff_file_header) +
sizeof(data_directory) * numberOfDataDirectory +
sizeof(coff_section) * outputSections.size();
sizeOfHeaders +=
config->is64() ? sizeof(pe32plus_header) : sizeof(pe32_header);
sizeOfHeaders = alignTo(sizeOfHeaders, config->fileAlign);
fileSize = sizeOfHeaders;
// The first page is kept unmapped.
uint64_t rva = alignTo(sizeOfHeaders, config->align);
for (OutputSection *sec : outputSections) {
if (sec == relocSec)
uint64_t rawSize = 0, virtualSize = 0;
sec->header.VirtualAddress = rva;
// If /FUNCTIONPADMIN is used, functions are padded in order to create a
// hotpatchable image.
const bool isCodeSection =
(sec->header.Characteristics & IMAGE_SCN_CNT_CODE) &&
(sec->header.Characteristics & IMAGE_SCN_MEM_READ) &&
(sec->header.Characteristics & IMAGE_SCN_MEM_EXECUTE);
uint32_t padding = isCodeSection ? config->functionPadMin : 0;
for (Chunk *c : sec->chunks) {
// Padding is inserted before hot-patchable chunks, ahead of alignment.
if (padding && c->isHotPatchable())
virtualSize += padding;
virtualSize = alignTo(virtualSize, c->getAlignment());
c->setRVA(rva + virtualSize);
virtualSize += c->getSize();
// Only chunks with data extend the raw (on-disk) size.
if (c->hasData)
rawSize = alignTo(virtualSize, config->fileAlign);
if (virtualSize > UINT32_MAX)
error("section larger than 4 GiB: " + sec->name);
sec->header.VirtualSize = virtualSize;
sec->header.SizeOfRawData = rawSize;
if (rawSize != 0)
sec->header.PointerToRawData = fileSize;
// Advance the RVA by the section-aligned size and the file offset by the
// file-aligned size.
rva += alignTo(virtualSize, config->align);
fileSize += alignTo(rawSize, config->fileAlign);
sizeOfImage = alignTo(rva, config->align);
// Assign addresses to sections in MergeChunks.
for (MergeChunk *mc : MergeChunk::instances)
if (mc)
// Writes the file headers into the output buffer in order: DOS header, DOS
// program, PE magic, COFF file header, PE optional header (PEHeaderTy), data
// directories and section table. If there are symbols or string table
// entries, also writes the symbol table and string table at
// pointerToSymbolTable.
// NOTE(review): this listing omits many lines (closing braces, the bodies of
// the DLL-characteristic checks, parts of some assignments) and contains two
// diff marker lines ("-"/"+"); verify against the complete file.
template <typename PEHeaderTy> void Writer::writeHeader() {
// Write DOS header. For backwards compatibility, the first part of a PE/COFF
// executable consists of an MS-DOS MZ executable. If the executable is run
// under DOS, that program gets run (usually to just print an error message).
// When run under Windows, the loader looks at AddressOfNewExeHeader and uses
// the PE header instead.
// `buf` is a cursor that advances past each structure as it is written.
uint8_t *buf = buffer->getBufferStart();
auto *dos = reinterpret_cast<dos_header *>(buf);
buf += sizeof(dos_header);
dos->Magic[0] = 'M';
dos->Magic[1] = 'Z';
dos->UsedBytesInTheLastPage = dosStubSize % 512;
dos->FileSizeInPages = divideCeil(dosStubSize, 512);
dos->HeaderSizeInParagraphs = sizeof(dos_header) / 16;
dos->AddressOfRelocationTable = sizeof(dos_header);
dos->AddressOfNewExeHeader = dosStubSize;
// Write DOS program.
memcpy(buf, dosProgram, sizeof(dosProgram));
buf += sizeof(dosProgram);
// Write PE magic
memcpy(buf, PEMagic, sizeof(PEMagic));
buf += sizeof(PEMagic);
// Write COFF header
auto *coff = reinterpret_cast<coff_file_header *>(buf);
buf += sizeof(*coff);
coff->Machine = config->machine;
coff->NumberOfSections = outputSections.size();
// File characteristics are accumulated from the configuration flags below.
coff->Characteristics = IMAGE_FILE_EXECUTABLE_IMAGE;
if (config->largeAddressAware)
coff->Characteristics |= IMAGE_FILE_LARGE_ADDRESS_AWARE;
if (!config->is64())
coff->Characteristics |= IMAGE_FILE_32BIT_MACHINE;
if (config->dll)
coff->Characteristics |= IMAGE_FILE_DLL;
if (config->driverUponly)
coff->Characteristics |= IMAGE_FILE_UP_SYSTEM_ONLY;
if (!config->relocatable)
coff->Characteristics |= IMAGE_FILE_RELOCS_STRIPPED;
if (config->swaprunCD)
coff->Characteristics |= IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP;
if (config->swaprunNet)
coff->Characteristics |= IMAGE_FILE_NET_RUN_FROM_SWAP;
coff->SizeOfOptionalHeader =
sizeof(PEHeaderTy) + sizeof(data_directory) * numberOfDataDirectory;
// Write PE header
auto *pe = reinterpret_cast<PEHeaderTy *>(buf);
buf += sizeof(*pe);
pe->Magic = config->is64() ? PE32Header::PE32_PLUS : PE32Header::PE32;
// If {Major,Minor}LinkerVersion is left at 0.0, then for some
// reason signing the resulting PE file with Authenticode produces a
// signature that fails to validate on Windows 7 (but is OK on 10).
// Set it to 14.0, which is what VS2015 outputs, and which avoids
// that problem.
pe->MajorLinkerVersion = 14;
pe->MinorLinkerVersion = 0;
pe->ImageBase = config->imageBase;
pe->SectionAlignment = config->align;
pe->FileAlignment = config->fileAlign;
pe->MajorImageVersion = config->majorImageVersion;
pe->MinorImageVersion = config->minorImageVersion;
pe->MajorOperatingSystemVersion = config->majorOSVersion;
pe->MinorOperatingSystemVersion = config->minorOSVersion;
// The subsystem version fields are filled from the same OS version settings.
pe->MajorSubsystemVersion = config->majorOSVersion;
pe->MinorSubsystemVersion = config->minorOSVersion;
pe->Subsystem = config->subsystem;
pe->SizeOfImage = sizeOfImage;
pe->SizeOfHeaders = sizeOfHeaders;
if (!config->noEntry) {
Defined *entry = cast<Defined>(config->entry);
pe->AddressOfEntryPoint = entry->getRVA();
// Pointer to thumb code must have the LSB set, so adjust it.
if (config->machine == ARMNT)
pe->AddressOfEntryPoint |= 1;
pe->SizeOfStackReserve = config->stackReserve;
pe->SizeOfStackCommit = config->stackCommit;
pe->SizeOfHeapReserve = config->heapReserve;
pe->SizeOfHeapCommit = config->heapCommit;
// NOTE(review): each check below presumably sets a DLL characteristics flag;
// the statements themselves are not visible in this listing.
if (config->appContainer)
if (config->driverWdm)
if (config->dynamicBase)
if (config->highEntropyVA)
if (!config->allowBind)
if (config->nxCompat)
if (!config->allowIsolation)
if (config->guardCF != GuardCFLevel::Off)
if (config->integrityCheck)
- if (setNoSEHCharacteristic)
+ if (setNoSEHCharacteristic || config->noSEH)
if (config->terminalServerAware)
pe->NumberOfRvaAndSize = numberOfDataDirectory;
if (textSec->getVirtualSize()) {
pe->BaseOfCode = textSec->getRVA();
pe->SizeOfCode = textSec->getRawSize();
pe->SizeOfInitializedData = getSizeOfInitializedData();
// Write data directory
// Each entry is filled in only when the corresponding table exists.
auto *dir = reinterpret_cast<data_directory *>(buf);
buf += sizeof(*dir) * numberOfDataDirectory;
if (edataStart) {
dir[EXPORT_TABLE].RelativeVirtualAddress = edataStart->getRVA();
dir[EXPORT_TABLE].Size =
edataEnd->getRVA() + edataEnd->getSize() - edataStart->getRVA();
if (importTableStart) {
dir[IMPORT_TABLE].RelativeVirtualAddress = importTableStart->getRVA();
dir[IMPORT_TABLE].Size = importTableSize;
if (iatStart) {
dir[IAT].RelativeVirtualAddress = iatStart->getRVA();
dir[IAT].Size = iatSize;
if (rsrcSec->getVirtualSize()) {
dir[RESOURCE_TABLE].RelativeVirtualAddress = rsrcSec->getRVA();
dir[RESOURCE_TABLE].Size = rsrcSec->getVirtualSize();
if (firstPdata) {
dir[EXCEPTION_TABLE].RelativeVirtualAddress = firstPdata->getRVA();
// NOTE(review): the left-hand side of the next expression (presumably the
// EXCEPTION_TABLE size assignment) is missing from this listing.
lastPdata->getRVA() + lastPdata->getSize() - firstPdata->getRVA();
if (relocSec->getVirtualSize()) {
dir[BASE_RELOCATION_TABLE].RelativeVirtualAddress = relocSec->getRVA();
dir[BASE_RELOCATION_TABLE].Size = relocSec->getVirtualSize();
// The TLS directory is located through the _tls_used symbol; its size
// depends on whether the image is 64-bit.
if (Symbol *sym = symtab->findUnderscore("_tls_used")) {
if (Defined *b = dyn_cast<Defined>(sym)) {
dir[TLS_TABLE].RelativeVirtualAddress = b->getRVA();
dir[TLS_TABLE].Size = config->is64()
? sizeof(object::coff_tls_directory64)
: sizeof(object::coff_tls_directory32);
if (debugDirectory) {
dir[DEBUG_DIRECTORY].RelativeVirtualAddress = debugDirectory->getRVA();
dir[DEBUG_DIRECTORY].Size = debugDirectory->getSize();
// The load config directory is located through _load_config_used. Its size
// is read from the first 4 bytes at the symbol's address, after checking
// that those bytes and the declared size lie within the chunk.
if (Symbol *sym = symtab->findUnderscore("_load_config_used")) {
if (auto *b = dyn_cast<DefinedRegular>(sym)) {
SectionChunk *sc = b->getChunk();
assert(b->getRVA() >= sc->getRVA());
uint64_t offsetInChunk = b->getRVA() - sc->getRVA();
if (!sc->hasData || offsetInChunk + 4 > sc->getSize())
fatal("_load_config_used is malformed");
ArrayRef<uint8_t> secContents = sc->getContents();
uint32_t loadConfigSize =
*reinterpret_cast<const ulittle32_t *>(&secContents[offsetInChunk]);
if (offsetInChunk + loadConfigSize > sc->getSize())
fatal("_load_config_used is too large");
dir[LOAD_CONFIG_TABLE].RelativeVirtualAddress = b->getRVA();
dir[LOAD_CONFIG_TABLE].Size = loadConfigSize;
if (!delayIdata.empty()) {
// NOTE(review): the right-hand side of the next assignment is missing from
// this listing.
dir[DELAY_IMPORT_DESCRIPTOR].RelativeVirtualAddress =
dir[DELAY_IMPORT_DESCRIPTOR].Size = delayIdata.getDirSize();
// Write section table
for (OutputSection *sec : outputSections) {
buf += sizeof(coff_section);
// Remember where the section table lives inside the output buffer.
sectionTable = ArrayRef<uint8_t>(
buf - outputSections.size() * sizeof(coff_section), buf);
if (outputSymtab.empty() && strtab.empty())
coff->PointerToSymbolTable = pointerToSymbolTable;
uint32_t numberOfSymbols = outputSymtab.size();
coff->NumberOfSymbols = numberOfSymbols;
auto *symbolTable = reinterpret_cast<coff_symbol16 *>(
buffer->getBufferStart() + coff->PointerToSymbolTable);
for (size_t i = 0; i != numberOfSymbols; ++i)
symbolTable[i] = outputSymtab[i];
// Create the string table, it follows immediately after the symbol table.
// The first 4 bytes is length including itself.
buf = reinterpret_cast<uint8_t *>(&symbolTable[numberOfSymbols]);
write32le(buf, strtab.size() + 4);
if (!strtab.empty())
memcpy(buf + 4,, strtab.size());
// Creates the output buffer for `path`, sized to fileSize and flagged as
// executable. Failure is routed through CHECK with a "failed to open"
// message.
void Writer::openFile(StringRef path) {
buffer = CHECK(
FileOutputBuffer::create(path, fileSize, FileOutputBuffer::F_executable),
"failed to open " + path);
// Collects the SEH handlers listed in each object file's .sxdata chunks and
// passes them to maybeAddRVATable under "__safe_se_handler_table".
// Reports an error for object files that are not safe-SEH compatible.
// NOTE(review): closing braces and the tail of the maybeAddRVATable call are
// missing from this listing.
void Writer::createSEHTable() {
SymbolRVASet handlers;
for (ObjFile *file : ObjFile::instances) {
if (!file->hasSafeSEH())
error("/safeseh: " + file->getName() + " is not compatible with SEH");
markSymbolsForRVATable(file, file->getSXDataChunks(), handlers);
// Set the "no SEH" characteristic if there really were no handlers, or if
// there is no load config object to point to the table of handlers.
setNoSEHCharacteristic =
handlers.empty() || !symtab->findUnderscore("_load_config_used");
maybeAddRVATable(std::move(handlers), "__safe_se_handler_table",
// Add a symbol to an RVA set. Two symbols may have the same RVA, but an RVA set
// cannot contain duplicates. Therefore, the set is uniqued by Chunk and the
// symbol's offset into that Chunk.
// A symbol without a chunk is stored with a null chunk and its RVA as the
// offset.
static void addSymbolToRVASet(SymbolRVASet &rvaSet, Defined *s) {
Chunk *c = s->getChunk();
if (auto *sc = dyn_cast<SectionChunk>(c))
c = sc->repl; // Look through ICF replacement.
uint32_t off = s->getRVA() - (c ? c->getRVA() : 0);
rvaSet.insert({c, off});
// Given a symbol, add it to the GFIDs table if it is a live, defined, function
// symbol in an executable section.
// Import thunks are added unconditionally; the data-like, lazy and undefined
// kinds are described as ignored by the comments on their cases.
// NOTE(review): the `return`/`break` statements and closing braces are
// missing from this listing, so the cases appear to fall through here; verify
// against the complete file.
static void maybeAddAddressTakenFunction(SymbolRVASet &addressTakenSyms,
Symbol *s) {
if (!s)
switch (s->kind()) {
case Symbol::DefinedLocalImportKind:
case Symbol::DefinedImportDataKind:
// Defines an __imp_ pointer, so it is data, so it is ignored.
case Symbol::DefinedCommonKind:
// Common is always data, so it is ignored.
case Symbol::DefinedAbsoluteKind:
case Symbol::DefinedSyntheticKind:
// Absolute is never code, synthetic generally isn't and usually isn't
// determinable.
case Symbol::LazyArchiveKind:
case Symbol::LazyObjectKind:
case Symbol::UndefinedKind:
// Undefined symbols resolve to zero, so they don't have an RVA. Lazy
// symbols shouldn't have relocations.
case Symbol::DefinedImportThunkKind:
// Thunks are always code, include them.
addSymbolToRVASet(addressTakenSyms, cast<Defined>(s));
case Symbol::DefinedRegularKind: {
// This is a regular, defined, symbol from a COFF file. Mark the symbol as
// address taken if the symbol type is function and it's in an executable
// section.
auto *d = cast<DefinedRegular>(s);
if (d->getCOFFSymbol().getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION) {
SectionChunk *sc = dyn_cast<SectionChunk>(d->getChunk());
if (sc && sc->live &&
sc->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE)
addSymbolToRVASet(addressTakenSyms, d);
// Visit all relocations from all section contributions of this object file and
// mark the relocation target as address-taken.
// Targets are filtered through maybeAddAddressTakenFunction, so only symbols
// it accepts end up in `usedSymbols`.
// NOTE(review): the `continue` statements and closing braces are missing from
// this listing.
static void markSymbolsWithRelocations(ObjFile *file,
SymbolRVASet &usedSymbols) {
for (Chunk *c : file->getChunks()) {
// We only care about live section chunks. Common chunks and other chunks
// don't generally contain relocations.
SectionChunk *sc = dyn_cast<SectionChunk>(c);
if (!sc || !sc->live)
for (const coff_relocation &reloc : sc->getRelocs()) {
if (config->machine == I386 && reloc.Type == COFF::IMAGE_REL_I386_REL32)
// Ignore relative relocations on x86. On x86_64 they can't be ignored
// since they're also used to compute absolute addresses.
Symbol *ref = sc->file->getSymbol(reloc.SymbolTableIndex);
maybeAddAddressTakenFunction(usedSymbols, ref);
// Create the guard function id table. This is a table of RVAs of all
// address-taken functions. It is sorted and uniqued, just like the safe SEH
// table.
// Also collects longjmp targets and computes the __guard_flags value.
// NOTE(review): closing braces, the tails of the maybeAddRVATable calls, the
// alignment-fix statement and the code that stores the guard flags are
// missing from this listing.
void Writer::createGuardCFTables() {
SymbolRVASet addressTakenSyms;
SymbolRVASet longJmpTargets;
for (ObjFile *file : ObjFile::instances) {
// If the object was compiled with /guard:cf, the address taken symbols
// are in .gfids$y sections, and the longjmp targets are in .gljmp$y
// sections. If the object was not compiled with /guard:cf, we assume there
// were no setjmp targets, and that all code symbols with relocations are
// possibly address-taken.
if (file->hasGuardCF()) {
markSymbolsForRVATable(file, file->getGuardFidChunks(), addressTakenSyms);
markSymbolsForRVATable(file, file->getGuardLJmpChunks(), longJmpTargets);
} else {
markSymbolsWithRelocations(file, addressTakenSyms);
// Mark the image entry as address-taken.
if (config->entry)
maybeAddAddressTakenFunction(addressTakenSyms, config->entry);
// Mark exported symbols in executable sections as address-taken.
for (Export &e : config->exports)
maybeAddAddressTakenFunction(addressTakenSyms, e.sym);
// Ensure sections referenced in the gfid table are 16-byte aligned.
for (const ChunkAndOffset &c : addressTakenSyms)
if (c.inputChunk->getAlignment() < 16)
maybeAddRVATable(std::move(addressTakenSyms), "__guard_fids_table",
// Add the longjmp target table unless the user told us not to.
if (config->guardCF == GuardCFLevel::Full)
maybeAddRVATable(std::move(longJmpTargets), "__guard_longjmp_table",
// Set __guard_flags, which will be used in the load config to indicate that
// /guard:cf was enabled.
uint32_t guardFlags = uint32_t(coff_guard_flags::CFInstrumented) |
// The longjmp-table flag is only set at the Full guard level.
if (config->guardCF == GuardCFLevel::Full)
guardFlags |= uint32_t(coff_guard_flags::HasLongJmpTable);
Symbol *flagSym = symtab->findUnderscore("__guard_flags");
// Take a list of input sections containing symbol table indices and add those
// symbols to an RVA table. The challenge is that symbol RVAs are not known and
// depend on the table size, so we can't directly build a set of integers.
// Each chunk's contents are read as an array of little-endian 32-bit symbol
// table indices; malformed chunks and out-of-range indices produce warnings.
// NOTE(review): the `continue` statements, closing braces and part of the
// reinterpret_cast argument are missing from this listing.
void Writer::markSymbolsForRVATable(ObjFile *file,
ArrayRef<SectionChunk *> symIdxChunks,
SymbolRVASet &tableSymbols) {
for (SectionChunk *c : symIdxChunks) {
// Skip sections discarded by linker GC. This comes up when a .gfids section
// is associated with something like a vtable and the vtable is discarded.
// In this case, the associated gfids section is discarded, and we don't
// mark the virtual member functions as address-taken by the vtable.
if (!c->live)
// Validate that the contents look like symbol table indices.
ArrayRef<uint8_t> data = c->getContents();
// The size must be a whole number of 4-byte indices.
if (data.size() % 4 != 0) {
warn("ignoring " + c->getSectionName() +
" symbol table index section in object " + toString(file));
// Read each symbol table index and check if that symbol was included in the
// final link. If so, add it to the table symbol set.
ArrayRef<ulittle32_t> symIndices(
reinterpret_cast<const ulittle32_t *>(, data.size() / 4);
ArrayRef<Symbol *> objSymbols = file->getSymbols();
for (uint32_t symIndex : symIndices) {
if (symIndex >= objSymbols.size()) {
warn("ignoring invalid symbol table index in section " +
c->getSectionName() + " in object " + toString(file));
// Null entries and symbols that are not live are skipped.
if (Symbol *s = objSymbols[symIndex]) {
if (s->isLive())
addSymbolToRVASet(tableSymbols, cast<Defined>(s));
// Replace the absolute table symbol with a synthetic symbol pointing to
// tableChunk so that we can emit base relocations for it and resolve section
// relative relocations.
// `tableSym` names the table symbol and `countSym` the symbol that receives
// the entry count (table size in bytes divided by 4).
// NOTE(review): the early return for an empty set, the statement that adds
// tableChunk to a section, and the closing brace are missing from this
// listing.
void Writer::maybeAddRVATable(SymbolRVASet tableSymbols, StringRef tableSym,
StringRef countSym) {
if (tableSymbols.empty())
RVATableChunk *tableChunk = make<RVATableChunk>(std::move(tableSymbols));
Symbol *t = symtab->findUnderscore(tableSym);
Symbol *c = symtab->findUnderscore(countSym);
replaceSymbol<DefinedSynthetic>(t, t->getName(), tableChunk);
cast<DefinedAbsolute>(c)->setVA(tableChunk->getSize() / 4);
// MinGW specific. Gather all relocations that are imported from a DLL even
// though the code didn't expect it to, produce the table that the runtime
// uses for fixing them up, and provide the synthetic symbols that the
// runtime uses for finding the table.
// The table is exposed through __RUNTIME_PSEUDO_RELOC_LIST__ and
// __RUNTIME_PSEUDO_RELOC_LIST_END__.
// NOTE(review): the statement that collects relocations into `rels`, the
// `continue`/`return` statements, part of the error message expression and
// the statements that add the chunks to a section are missing from this
// listing.
void Writer::createRuntimePseudoRelocs() {
std::vector<RuntimePseudoReloc> rels;
for (Chunk *c : symtab->getChunks()) {
auto *sc = dyn_cast<SectionChunk>(c);
// Only live section chunks are considered.
if (!sc || !sc->live)
if (!config->pseudoRelocs) {
// Not writing any pseudo relocs; if some were needed, error out and
// indicate what required them.
for (const RuntimePseudoReloc &rpr : rels)
error("automatic dllimport of " + rpr.sym->getName() + " in " +
toString(>file) + " requires pseudo relocations");
if (!rels.empty())
log("Writing " + Twine(rels.size()) + " runtime pseudo relocations");
PseudoRelocTableChunk *table = make<PseudoRelocTableChunk>(rels);
// An empty chunk marks the end of the list.
EmptyChunk *endOfList = make<EmptyChunk>();
Symbol *headSym = symtab->findUnderscore("__RUNTIME_PSEUDO_RELOC_LIST__");
Symbol *endSym = symtab->findUnderscore("__RUNTIME_PSEUDO_RELOC_LIST_END__");
replaceSymbol<DefinedSynthetic>(headSym, headSym->getName(), table);
replaceSymbol<DefinedSynthetic>(endSym, endSym->getName(), endOfList);
// MinGW specific.
// The MinGW .ctors and .dtors lists have sentinels at each end;
// a (uintptr_t)-1 at the start and a (uintptr_t)0 at the end.
// There's a symbol pointing to the start sentinel pointer, __CTOR_LIST__
// and __DTOR_LIST__ respectively.
// NOTE(review): the statements that insert the sentinel chunks into the
// sections and the tails of the replaceSymbol calls are missing from this
// listing.
void Writer::insertCtorDtorSymbols() {
// Head sentinels hold -1, end sentinels hold 0.
AbsolutePointerChunk *ctorListHead = make<AbsolutePointerChunk>(-1);
AbsolutePointerChunk *ctorListEnd = make<AbsolutePointerChunk>(0);
AbsolutePointerChunk *dtorListHead = make<AbsolutePointerChunk>(-1);
AbsolutePointerChunk *dtorListEnd = make<AbsolutePointerChunk>(0);
Symbol *ctorListSym = symtab->findUnderscore("__CTOR_LIST__");
Symbol *dtorListSym = symtab->findUnderscore("__DTOR_LIST__");
replaceSymbol<DefinedSynthetic>(ctorListSym, ctorListSym->getName(),
replaceSymbol<DefinedSynthetic>(dtorListSym, dtorListSym->getName(),
// Handles /section options to allow users to overwrite
// section attributes.
// Each config->section entry maps a section name to a permission value, and
// every output section with that name is matched.
// NOTE(review): the statement that applies `perm` to a matching section and
// the closing braces are missing from this listing.
void Writer::setSectionPermissions() {
for (auto &p : config->section) {
StringRef name = p.first;
uint32_t perm = p.second;
for (OutputSection *sec : outputSections)
if (sec->name == name)
// Write section contents to a mmap'ed file.
// NOTE(review): the closing braces of the lambda, loop and function are
// missing from this listing.
void Writer::writeSections() {
// Record the number of sections to apply section index relocations
// against absolute symbols. See applySecIdx in Chunks.cpp.
DefinedAbsolute::numOutputSections = outputSections.size();
uint8_t *buf = buffer->getBufferStart();
for (OutputSection *sec : outputSections) {
uint8_t *secBuf = buf + sec->getFileOff();
// Fill gaps between functions in .text with INT3 instructions
// instead of leaving as NUL bytes (which can be interpreted as
// ADD instructions).
if (sec->header.Characteristics & IMAGE_SCN_CNT_CODE)
memset(secBuf, 0xCC, sec->getRawSize());
// Each chunk is written at its offset within the section (chunk RVA minus
// section RVA); chunks are processed via parallelForEach.
parallelForEach(sec->chunks, [&](Chunk *c) {
c->writeTo(secBuf + c->getRVA() - sec->getRVA());
// Fills in the timestamp of the COFF file header and, for MinGW debug builds
// without a PDB path, a synthetic build id derived from a hash of the output.
// With config->repro the timestamp is replaced by the low 32 bits of that
// hash.
// NOTE(review): the end of the StringRef construction, the statement under
// `if (debugDirectory)` and the closing braces are missing from this listing.
void Writer::writeBuildId() {
// There are two important parts to the build ID.
// 1) If building with debug info, the COFF debug directory contains a
// timestamp as well as a Guid and Age of the PDB.
// 2) In all cases, the PE COFF file header also contains a timestamp.
// For reproducibility, instead of a timestamp we want to use a hash of the
// PE contents.
if (config->debug) {
assert(buildId && "BuildId is not set!");
// BuildId->BuildId was filled in when the PDB was written.
// At this point the only fields in the COFF file which remain unset are the
// "timestamp" in the COFF file header, and the ones in the coff debug
// directory. Now we can hash the file and write that hash to the various
// timestamp fields in the file.
StringRef outputFileData(
reinterpret_cast<const char *>(buffer->getBufferStart()),
uint32_t timestamp = config->timestamp;
uint64_t hash = 0;
bool generateSyntheticBuildId =
config->mingw && config->debug && config->pdbPath.empty();
// The hash is only computed when something will consume it.
if (config->repro || generateSyntheticBuildId)
hash = xxHash64(outputFileData);
if (config->repro)
timestamp = static_cast<uint32_t>(hash);
if (generateSyntheticBuildId) {
// For MinGW builds without a PDB file, we still generate a build id
// to allow associating a crash dump to the executable.
buildId->buildId->PDB70.CVSignature = OMF::Signature::PDB70;
buildId->buildId->PDB70.Age = 1;
memcpy(buildId->buildId->PDB70.Signature, &hash, 8);
// xxhash only gives us 8 bytes, so put some fixed data in the other half.
memcpy(&buildId->buildId->PDB70.Signature[8], "LLD PDB.", 8);
if (debugDirectory)
// The COFF file header sits right after the DOS stub and the PE magic.
uint8_t *buf = buffer->getBufferStart();
buf += dosStubSize + sizeof(PEMagic);
object::coff_file_header *coffHeader =
reinterpret_cast<coff_file_header *>(buf);
coffHeader->TimeDateStamp = timestamp;
// Sort .pdata section contents according to PE/COFF spec 5.5.
// Entries are ordered by their `begin` field. The entry layout is three
// 32-bit fields on AMD64 and two on ARMNT / ARM64; other machines only get a
// warning.
// NOTE(review): the early return, the end of the bufAddr expression, the sort
// calls that take the (range, comparator) arguments shown below, and the
// `return`s / closing braces are missing from this listing.
void Writer::sortExceptionTable() {
if (!firstPdata)
// We assume .pdata contains function table entries only.
// bufAddr maps a chunk to its location inside the output buffer.
auto bufAddr = [&](Chunk *c) {
OutputSection *os = c->getOutputSection();
return buffer->getBufferStart() + os->getFileOff() + c->getRVA() -
uint8_t *begin = bufAddr(firstPdata);
uint8_t *end = bufAddr(lastPdata) + lastPdata->getSize();
if (config->machine == AMD64) {
struct Entry { ulittle32_t begin, end, unwind; };
// The byte range must hold a whole number of entries.
if ((end - begin) % sizeof(Entry) != 0) {
fatal("unexpected .pdata size: " + Twine(end - begin) +
" is not a multiple of " + Twine(sizeof(Entry)));
MutableArrayRef<Entry>((Entry *)begin, (Entry *)end),
[](const Entry &a, const Entry &b) { return a.begin < b.begin; });
if (config->machine == ARMNT || config->machine == ARM64) {
struct Entry { ulittle32_t begin, unwind; };
if ((end - begin) % sizeof(Entry) != 0) {
fatal("unexpected .pdata size: " + Twine(end - begin) +
" is not a multiple of " + Twine(sizeof(Entry)));
MutableArrayRef<Entry>((Entry *)begin, (Entry *)end),
[](const Entry &a, const Entry &b) { return a.begin < b.begin; });
lld::errs() << "warning: don't know how to handle .pdata.\n";
// The CRT section contains, among other things, the array of function
// pointers that initialize every global variable that is not trivially
// constructed. The CRT calls them one after the other prior to invoking
// main().
// As per C++ spec, 3.6.2/2.3,
// "Variables with ordered initialization defined within a single
// translation unit shall be initialized in the order of their definitions
// in the translation unit"
// It is therefore critical to sort the chunks containing the function
// pointers in the order that they are listed in the object file (top to
// bottom), otherwise global objects might not be initialized in the
// correct order.
// NOTE(review): the closing braces of the lambda, loop and function are
// missing from this listing.
void Writer::sortCRTSectionChunks(std::vector<Chunk *> &chunks) {
// `a` orders before `b` only when both come from the same buffer identifier
// and `a` has the lower section number; chunks from different files compare
// as unordered, and stable_sort keeps their existing relative order.
auto sectionChunkOrder = [](const Chunk *a, const Chunk *b) {
auto sa = dyn_cast<SectionChunk>(a);
auto sb = dyn_cast<SectionChunk>(b);
assert(sa && sb && "Non-section chunks in CRT section!");
StringRef sAObj = sa->file->mb.getBufferIdentifier();
StringRef sBObj = sb->file->mb.getBufferIdentifier();
return sAObj == sBObj && sa->getSectionNumber() < sb->getSectionNumber();
llvm::stable_sort(chunks, sectionChunkOrder);
// In verbose mode, log the resulting order.
if (config->verbose) {
for (auto &c : chunks) {
auto sc = dyn_cast<SectionChunk>(c);
log(" " + sc->file->mb.getBufferIdentifier().str() +
", SectionID: " + Twine(sc->getSectionNumber()));
// Returns the first entry of outputSections whose name equals `name`, or
// nullptr when no output section has that name.
OutputSection *Writer::findSection(StringRef name) {
for (OutputSection *sec : outputSections)
if (sec->name == name)
return sec;
return nullptr;
// Returns the sum of getRawSize() over every output section whose header has
// the IMAGE_SCN_CNT_INITIALIZED_DATA characteristic set. The total is
// accumulated in a uint32_t.
uint32_t Writer::getSizeOfInitializedData() {
uint32_t res = 0;
for (OutputSection *s : outputSections)
if (s->header.Characteristics & IMAGE_SCN_CNT_INITIALIZED_DATA)
res += s->getRawSize();
return res;
// Add base relocations to .reloc section.
//
// Iterates over the chunks of every output section, gathering base relocation
// locations into a local vector `v`, and acts on `v` only when it is
// non-empty.
//
// NOTE(review): the statements guarded by the `if`s below, and the body of the
// inner chunk loop, appear to be missing from this copy. Presumably the
// function returns early when config->relocatable is false and skips sections
// marked IMAGE_SCN_MEM_DISCARDABLE -- confirm against the upstream file.
void Writer::addBaserels() {
if (!config->relocatable)
std::vector<Baserel> v;
for (OutputSection *sec : outputSections) {
if (sec->header.Characteristics & IMAGE_SCN_MEM_DISCARDABLE)
// Collect all locations for base relocations.
for (Chunk *c : sec->chunks)
// Add the addresses to .reloc section.
if (!v.empty())
// Add addresses to .reloc section. Note that addresses are grouped by page.
//
// `mask` clears the low (pageSize - 1) bits of an RVA, leaving its page base.
// `i` marks the start of the current same-page run in `v` and `j` is the scan
// position; each BaserelChunk is built from the page base and the element
// range [&v[i], &v[0] + j).
//
// `v[0]` is read unconditionally, so `v` must not be empty.
//
// NOTE(review): the statements guarded by `if (p == page)` and `if (i == j)`
// appear to be missing from this copy, so the addChunk calls are presumably
// not conditional in the way the text below suggests -- confirm against the
// upstream file.
void Writer::addBaserelBlocks(std::vector<Baserel> &v) {
const uint32_t mask = ~uint32_t(pageSize - 1);
uint32_t page = v[0].rva & mask;
size_t i = 0, j = 1;
for (size_t e = v.size(); j < e; ++j) {
uint32_t p = v[j].rva & mask;
if (p == page)
relocSec->addChunk(make<BaserelChunk>(page, &v[i], &v[0] + j));
// Start a new run at the first element of the next page.
i = j;
page = p;
if (i == j)
relocSec->addChunk(make<BaserelChunk>(page, &v[i], &v[0] + j));
// Returns the PartialSection registered in partialSections under the key
// (name, outChars), creating and registering a new one if none exists yet.
PartialSection *Writer::createPartialSection(StringRef name,
uint32_t outChars) {
// Bind a reference to the map slot so a newly made section can be stored
// through it below.
PartialSection *&pSec = partialSections[{name, outChars}];
if (pSec)
return pSec;
pSec = make<PartialSection>(name, outChars);
return pSec;
// Looks up the key (name, outChars) in partialSections without inserting.
// Returns the stored PartialSection pointer, or nullptr if the key is absent.
PartialSection *Writer::findPartialSection(StringRef name, uint32_t outChars) {
auto it = partialSections.find({name, outChars});
if (it != partialSections.end())
return it->second;
return nullptr;
diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index c3a11b199675..d6580430daf7 100644
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -1,448 +1,454 @@
//===-- llvm/CodeGen/TargetFrameLowering.h ----------------------*- C++ -*-===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Interface to describe the layout of a stack frame on the target machine.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include <vector>
namespace llvm {
class BitVector;
class CalleeSavedInfo;
class MachineFunction;
class RegScavenger;
namespace TargetStackID {
enum Value {
Default = 0,
SGPRSpill = 1,
SVEVector = 2,
NoAlloc = 255
/// Information about stack frame layout on the target. It holds the direction
/// of stack growth, the known stack alignment on entry to each function, and
/// the offset to the locals area.
/// The offset to the local area is the offset from the stack pointer on
/// function entry to the first location where function data (local variables,
/// spill locations) can be stored.
class TargetFrameLowering {
enum StackDirection {
StackGrowsUp, // Adding to the stack increases the stack address
StackGrowsDown // Adding to the stack decreases the stack address
// Maps a callee saved register to a stack slot with a fixed offset.
struct SpillSlot {
unsigned Reg;
int Offset; // Offset relative to stack pointer on function entry.
struct DwarfFrameBase {
// The frame base may be either a register (the default), the CFA,
// or a WebAssembly-specific location description.
enum FrameBaseKind { Register, CFA, WasmFrameBase } Kind;
struct WasmFrameBase {
unsigned Kind; // Wasm local, global, or value stack
unsigned Index;
union {
unsigned Reg;
struct WasmFrameBase WasmLoc;
} Location;
StackDirection StackDir;
Align StackAlignment;
Align TransientStackAlignment;
int LocalAreaOffset;
bool StackRealignable;
/// Construct a frame-lowering description from the target's stack parameters.
/// \param D direction of stack growth; stored in StackDir.
/// \param StackAl stored in StackAlignment.
/// \param LAO stored in LocalAreaOffset.
/// \param TransAl stored in TransientStackAlignment; defaults to Align(1).
/// \param StackReal stored in StackRealignable; defaults to true.
TargetFrameLowering(StackDirection D, Align StackAl, int LAO,
Align TransAl = Align(1), bool StackReal = true)
: StackDir(D), StackAlignment(StackAl), TransientStackAlignment(TransAl),
LocalAreaOffset(LAO), StackRealignable(StackReal) {}
virtual ~TargetFrameLowering();
// These methods return information that describes the abstract stack layout
// of the target machine.
/// getStackGrowthDirection - Return the direction the stack grows
StackDirection getStackGrowthDirection() const { return StackDir; }
/// getStackAlignment - This method returns the number of bytes to which the
/// stack pointer must be aligned on entry to a function. Typically, this
/// is the largest alignment for any data object in the target.
unsigned getStackAlignment() const { return StackAlignment.value(); }
/// getStackAlign - This method returns the alignment, as an Align, to which
/// the stack pointer must be aligned on entry to a function. Typically, this
/// is the largest alignment for any data object in the target.
Align getStackAlign() const { return StackAlignment; }
/// alignSPAdjust - This method aligns the stack adjustment to the correct
/// alignment.
int alignSPAdjust(int SPAdj) const {
if (SPAdj < 0) {
SPAdj = -alignTo(-SPAdj, StackAlignment);
} else {
SPAdj = alignTo(SPAdj, StackAlignment);
return SPAdj;
/// getTransientStackAlignment - This method returns the number of bytes to
/// which the stack pointer must be aligned at all times, even between
/// calls.
LLVM_ATTRIBUTE_DEPRECATED(unsigned getTransientStackAlignment() const,
"Use getTransientStackAlign instead") {
return TransientStackAlignment.value();
/// getTransientStackAlign - This method returns the alignment, as an Align,
/// to which the stack pointer must be aligned at all times, even between
/// calls.
Align getTransientStackAlign() const { return TransientStackAlignment; }
/// isStackRealignable - This method returns whether the stack can be
/// realigned.
bool isStackRealignable() const {
return StackRealignable;
/// Return the skew that has to be applied to stack alignment under
/// certain conditions (e.g. stack was adjusted before function \p MF
/// was called).
virtual unsigned getStackAlignmentSkew(const MachineFunction &MF) const;
+ /// This method returns whether or not it is safe for an object with the
+ /// given stack id to be bundled into the local area.
+ virtual bool isStackIdSafeForLocalArea(unsigned StackId) const {
+ return true;
+ }
/// getOffsetOfLocalArea - This method returns the offset of the local area
/// from the stack pointer on entrance to a function.
int getOffsetOfLocalArea() const { return LocalAreaOffset; }
/// isFPCloseToIncomingSP - Return true if the frame pointer is close to
/// the incoming stack pointer, false if it is close to the post-prologue
/// stack pointer.
virtual bool isFPCloseToIncomingSP() const { return true; }
/// assignCalleeSavedSpillSlots - Allows target to override spill slot
/// assignment logic. If implemented, assignCalleeSavedSpillSlots() should
/// assign frame slots to all CSI entries and return true. If this method
/// returns false, spill slots will be assigned using generic implementation.
/// assignCalleeSavedSpillSlots() may add, delete or rearrange elements of
/// CSI.
virtual bool
assignCalleeSavedSpillSlots(MachineFunction &MF,
const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const {
return false;
/// getCalleeSavedSpillSlots - This method returns a pointer to an array of
/// pairs, that contains an entry for each callee saved register that must be
/// spilled to a particular stack location if it is spilled.
/// Each entry in this array contains a <register,offset> pair, indicating the
/// fixed offset from the incoming stack pointer that each register should be
/// spilled at. If a register is not listed here, the code generator is
/// allowed to spill it anywhere it chooses.
virtual const SpillSlot *
getCalleeSavedSpillSlots(unsigned &NumEntries) const {
NumEntries = 0;
return nullptr;
/// targetHandlesStackFrameRounding - Returns true if the target is
/// responsible for rounding up the stack frame (probably at emitPrologue
/// time).
virtual bool targetHandlesStackFrameRounding() const {
return false;
/// Returns true if the target will correctly handle shrink wrapping.
virtual bool enableShrinkWrapping(const MachineFunction &MF) const {
return false;
/// Returns true if the stack slot holes in the fixed and callee-save stack
/// area should be used when allocating other stack locations to reduce stack
/// size.
virtual bool enableStackSlotScavenging(const MachineFunction &MF) const {
return false;
/// Returns true if the target can safely skip saving callee-saved registers
/// for noreturn nounwind functions.
virtual bool enableCalleeSaveSkip(const MachineFunction &MF) const;
/// emitPrologue/emitEpilogue - These methods insert prologue and epilogue
/// code into the function.
virtual void emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const = 0;
virtual void emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const = 0;
/// With basic block sections, emit callee saved frame moves for basic blocks
/// that are in a different section.
virtual void
emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const {}
/// Overload of emitCalleeSavedFrameMoves that additionally takes the debug
/// location \p DL and an \p IsPrologue flag. The default implementation does
/// nothing.
virtual void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,
bool IsPrologue) const {}
/// Replace a StackProbe stub (if any) with the actual probe code inline
virtual void inlineStackProbe(MachineFunction &MF,
MachineBasicBlock &PrologueMBB) const {}
/// Adjust the prologue to have the function use segmented stacks. This works
/// by adding a check even before the "normal" function prologue.
virtual void adjustForSegmentedStacks(MachineFunction &MF,
MachineBasicBlock &PrologueMBB) const {}
/// Adjust the prologue to add Erlang Run-Time System (ERTS) specific code in
/// the assembly prologue to explicitly handle the stack.
virtual void adjustForHiPEPrologue(MachineFunction &MF,
MachineBasicBlock &PrologueMBB) const {}
/// spillCalleeSavedRegisters - Issues instruction(s) to spill all callee
/// saved registers and returns true if it isn't possible / profitable to do
/// so by issuing a series of store instructions via
/// storeRegToStackSlot(). Returns false otherwise.
virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const {
return false;
/// restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee
/// saved registers and returns true if it isn't possible / profitable to do
/// so by issuing a series of load instructions via loadRegFromStackSlot().
/// If it returns true, and any of the registers in CSI is not restored,
/// it sets the corresponding Restored flag in CSI to false.
/// Returns false otherwise.
virtual bool
restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
MutableArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const {
return false;
/// Return true if the target wants to keep the frame pointer regardless of
/// the function attribute "frame-pointer".
virtual bool keepFramePointer(const MachineFunction &MF) const {
return false;
/// hasFP - Return true if the specified function should have a dedicated
/// frame pointer register. For most targets this is true only if the function
/// has variable sized allocas or if frame pointer elimination is disabled.
virtual bool hasFP(const MachineFunction &MF) const = 0;
/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
/// not required, we reserve argument space for call sites in the function
/// immediately on entry to the current function. This eliminates the need for
/// add/sub sp brackets around call sites. Returns true if the call frame is
/// included as part of the stack frame.
virtual bool hasReservedCallFrame(const MachineFunction &MF) const {
return !hasFP(MF);
/// canSimplifyCallFramePseudos - When possible, it's best to simplify the
/// call frame pseudo ops before doing frame index elimination. This is
/// possible only when frame index references between the pseudos won't
/// need adjusting for the call frame adjustments. Normally, that's true
/// if the function has a reserved call frame or a frame pointer. Some
/// targets (Thumb2, for example) may have more complicated criteria,
/// however, and can override this behavior.
virtual bool canSimplifyCallFramePseudos(const MachineFunction &MF) const {
return hasReservedCallFrame(MF) || hasFP(MF);
// needsFrameIndexResolution - Do we need to perform FI resolution for
// this function. Normally, this is required only when the function
// has any stack objects. However, targets may want to override this.
virtual bool needsFrameIndexResolution(const MachineFunction &MF) const;
/// getFrameIndexReference - This method should return the base register
/// and offset used to reference a frame index location. The offset is
/// returned directly, and the base register is returned via FrameReg.
virtual int getFrameIndexReference(const MachineFunction &MF, int FI,
Register &FrameReg) const;
/// Same as \c getFrameIndexReference, except that the stack pointer (as
/// opposed to the frame pointer) will be the preferred value for \p
/// FrameReg. This is generally used for emitting statepoint or EH tables that
/// use offsets from RSP. If \p IgnoreSPUpdates is true, the returned
/// offset is only guaranteed to be valid with respect to the value of SP at
/// the end of the prologue.
virtual int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
Register &FrameReg,
bool IgnoreSPUpdates) const {
// Always safe to dispatch to getFrameIndexReference.
return getFrameIndexReference(MF, FI, FrameReg);
/// getNonLocalFrameIndexReference - This method returns the offset used to
/// reference a frame index location. The offset can be from either FP/BP/SP
/// based on which base register is returned by llvm.localaddress.
virtual int getNonLocalFrameIndexReference(const MachineFunction &MF,
int FI) const {
// By default, dispatch to getFrameIndexReference. Interested targets can
// override this.
Register FrameReg;
return getFrameIndexReference(MF, FI, FrameReg);
/// Returns the callee-saved registers as computed by determineCalleeSaves
/// in the BitVector \p SavedRegs.
virtual void getCalleeSaves(const MachineFunction &MF,
BitVector &SavedRegs) const;
/// This method determines which of the registers reported by
/// TargetRegisterInfo::getCalleeSavedRegs() should actually get saved.
/// The default implementation populates the \p SavedRegs bitset with
/// all registers which are modified in the function, targets may override
/// this function to save additional registers.
/// This method also sets up the register scavenger ensuring there is a free
/// register or a frameindex available.
/// This method should not be called by any passes outside of PEI, because
/// it may change state passed in by \p MF and \p RS. The preferred
/// interface outside PEI is getCalleeSaves.
virtual void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const;
/// processFunctionBeforeFrameFinalized - This method is called immediately
/// before the specified function's frame layout (MF.getFrameInfo()) is
/// finalized. Once the frame is finalized, MO_FrameIndex operands are
/// replaced with direct constants. This method is optional.
virtual void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS = nullptr) const {
/// processFunctionBeforeFrameIndicesReplaced - This method is called
/// immediately before MO_FrameIndex operands are eliminated, but after the
/// frame is finalized. This method is optional.
virtual void
processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
RegScavenger *RS = nullptr) const {}
/// The default implementation reports a fatal error stating that WinEH is not
/// implemented for this target.
virtual unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const {
report_fatal_error("WinEH not implemented for this target");
/// This method is called during prolog/epilog code insertion to eliminate
/// call frame setup and destroy pseudo instructions (but only if the Target
/// is using them). It is responsible for eliminating these instructions,
/// replacing them with concrete instructions. This method need only be
/// implemented if using call frame setup/destroy pseudo instructions.
/// Returns an iterator pointing to the instruction after the replaced one.
virtual MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const {
llvm_unreachable("Call Frame Pseudo Instructions do not exist on this "
/// Order the symbols in the local stack frame.
/// The list of objects that we want to order is in \p objectsToAllocate as
/// indices into the MachineFrameInfo. The array can be reordered in any way
/// upon return. The contents of the array, however, may not be modified (i.e.
/// only their order may be changed).
/// By default, just maintain the original order.
virtual void
orderFrameObjects(const MachineFunction &MF,
SmallVectorImpl<int> &objectsToAllocate) const {
/// Check whether or not the given \p MBB can be used as a prologue
/// for the target.
/// The prologue will be inserted first in this basic block.
/// This method is used by the shrink-wrapping pass to decide if
/// \p MBB will be correctly handled by the target.
/// As soon as the target enables shrink-wrapping without overriding
/// this method, we assume that each basic block is a valid
/// prologue.
virtual bool canUseAsPrologue(const MachineBasicBlock &MBB) const {
return true;
/// Check whether or not the given \p MBB can be used as an epilogue
/// for the target.
/// The epilogue will be inserted before the first terminator of that block.
/// This method is used by the shrink-wrapping pass to decide if
/// \p MBB will be correctly handled by the target.
/// As soon as the target enables shrink-wrapping without overriding
/// this method, we assume that each basic block is a valid
/// epilogue.
virtual bool canUseAsEpilogue(const MachineBasicBlock &MBB) const {
return true;
/// Returns the StackID that scalable vectors should be associated with.
virtual TargetStackID::Value getStackIDForScalableVectors() const {
return TargetStackID::Default;
/// Default implementation: yields true for TargetStackID::Default and
/// TargetStackID::NoAlloc.
/// NOTE(review): the `return false;` directly after the switch header
/// presumably belongs to a `default:` label that is missing from this copy --
/// confirm against the upstream header.
virtual bool isSupportedStackID(TargetStackID::Value ID) const {
switch (ID) {
return false;
case TargetStackID::Default:
case TargetStackID::NoAlloc:
return true;
/// Check if given function is safe for not having callee saved registers.
/// This is used when interprocedural register allocation is enabled.
static bool isSafeForNoCSROpt(const Function &F);
/// Check if the no-CSR optimisation is profitable for the given function.
virtual bool isProfitableForNoCSROpt(const Function &F) const {
return true;
/// Return initial CFA offset value i.e. the one valid at the beginning of the
/// function (before any stack operations).
virtual int getInitialCFAOffset(const MachineFunction &MF) const;
/// Return initial CFA register value i.e. the one valid at the beginning of
/// the function (before any stack operations).
virtual Register getInitialCFARegister(const MachineFunction &MF) const;
/// Return the frame base information to be encoded in the DWARF subprogram
/// debug info.
virtual DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const;
} // End llvm namespace
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 74664098ce1d..33f122728d2a 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1,2118 +1,2121 @@
//===- BasicAliasAnalysis.cpp - Stateless Alias Analysis Impl -------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file defines the primary stateless implementation of the
// Alias Analysis interface that implements identities (two different
// globals cannot alias, etc), but does no stateful analysis.
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/PhiValues.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <utility>
#define DEBUG_TYPE "basicaa"
using namespace llvm;
/// Enable analysis of recursive PHI nodes.
static cl::opt<bool> EnableRecPhiAnalysis("basic-aa-recphi", cl::Hidden,
/// By default, even on 32-bit architectures we use 64-bit integers for
/// calculations. This will allow us to more-aggressively decompose indexing
/// expressions calculated using i64 values (e.g., long long in C) which is
/// common enough to worry about.
static cl::opt<bool> ForceAtLeast64Bits("basic-aa-force-at-least-64b",
cl::Hidden, cl::init(true));
static cl::opt<bool> DoubleCalcBits("basic-aa-double-calc-bits",
cl::Hidden, cl::init(false));
/// SearchLimitReached / SearchTimes shows how often the search limit for
/// decomposing GEPs is reached. It will affect the precision
/// of basic alias analysis.
STATISTIC(SearchLimitReached, "Number of times the limit to "
"decompose GEPs is reached");
STATISTIC(SearchTimes, "Number of times a GEP is decomposed");
/// Cutoff after which to stop analysing a set of phi nodes potentially involved
/// in a cycle. Because we are analysing 'through' phi nodes, we need to be
/// careful with value equivalence. We use reachability to make sure a value
/// cannot be involved in a cycle.
const unsigned MaxNumPhiBBsValueReachabilityCheck = 20;
// The max limit of the search depth in DecomposeGEPExpression() and
// GetUnderlyingObject(), both functions need to use the same search
// depth otherwise the algorithm in aliasGEP will assert.
static const unsigned MaxLookupSearchDepth = 6;
/// Decide whether this result must be discarded for \p Fn.
/// \returns true if the AssumptionAnalysis result is invalidated, or if any of
/// the DominatorTree / LoopInfo / PhiValues analyses that this object holds a
/// handle to (DT, LI, PV non-null) is invalidated; false otherwise.
bool BasicAAResult::invalidate(Function &Fn, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &Inv) {
// We don't care if this analysis itself is preserved, it has no state. But
// we need to check that the analyses it depends on have been. Note that we
// may be created without handles to some analyses and in that case don't
// depend on them.
if (Inv.invalidate<AssumptionAnalysis>(Fn, PA) ||
(DT && Inv.invalidate<DominatorTreeAnalysis>(Fn, PA)) ||
(LI && Inv.invalidate<LoopAnalysis>(Fn, PA)) ||
(PV && Inv.invalidate<PhiValuesAnalysis>(Fn, PA)))
return true;
// Otherwise this analysis result remains valid.
return false;
// Useful predicates
/// Returns true if the pointer is to a function-local object that never
/// escapes from the function.
/// \param V the pointer value to classify.
/// \param IsCapturedCache optional cache of earlier answers, keyed by value.
/// When supplied, an entry for \p V is inserted with the value false before
/// any analysis runs; a pre-existing entry is returned as-is, and a fresh one
/// is overwritten with the computed answer on the alloca / noalias-call and
/// byval / noalias-argument paths.
static bool isNonEscapingLocalObject(
const Value *V,
SmallDenseMap<const Value *, bool, 8> *IsCapturedCache = nullptr) {
SmallDenseMap<const Value *, bool, 8>::iterator CacheIt;
if (IsCapturedCache) {
bool Inserted;
std::tie(CacheIt, Inserted) = IsCapturedCache->insert({V, false});
if (!Inserted)
// Found cached result, return it!
return CacheIt->second;
// If this is a local allocation, check to see if it escapes.
if (isa<AllocaInst>(V) || isNoAliasCall(V)) {
// Set StoreCaptures to True so that we can assume in our callers that the
// pointer is not the result of a load instruction. Currently
// PointerMayBeCaptured doesn't have any special analysis for the
// StoreCaptures=false case; if it did, our callers could be refined to be
// more precise.
auto Ret = !PointerMayBeCaptured(V, false, /*StoreCaptures=*/true);
if (IsCapturedCache)
CacheIt->second = Ret;
return Ret;
// If this is an argument that corresponds to a byval or noalias argument,
// then it has not escaped before entering the function. Check if it escapes
// inside the function.
if (const Argument *A = dyn_cast<Argument>(V))
if (A->hasByValAttr() || A->hasNoAliasAttr()) {
// Note even if the argument is marked nocapture, we still need to check
// for copies made inside the function. The nocapture attribute only
// specifies that there are no copies made that outlive the function.
auto Ret = !PointerMayBeCaptured(V, false, /*StoreCaptures=*/true);
if (IsCapturedCache)
CacheIt->second = Ret;
return Ret;
// Any other kind of value is not classified as a non-escaping local object;
// a cache entry inserted above keeps its initial value of false.
return false;
/// Returns true if the pointer is one which would have been considered an
/// escape by isNonEscapingLocalObject.
/// Call results, function arguments and load results yield true; every other
/// value yields false.
static bool isEscapeSource(const Value *V) {
if (isa<CallBase>(V))
return true;
if (isa<Argument>(V))
return true;
// The load case works because isNonEscapingLocalObject considers all
// stores to be escapes (it passes true for the StoreCaptures argument
// to PointerMayBeCaptured).
if (isa<LoadInst>(V))
return true;
return false;
/// Returns the size of the object specified by V or UnknownSize if unknown.
/// \param NullIsValidLoc forwarded as ObjectSizeOpts::NullIsUnknownSize.
/// \param RoundToAlign forwarded as ObjectSizeOpts::RoundToAlign; defaults to
/// false.
/// \returns the size produced by the options-taking getObjectSize overload, or
/// MemoryLocation::UnknownSize when that query reports failure.
static uint64_t getObjectSize(const Value *V, const DataLayout &DL,
const TargetLibraryInfo &TLI,
bool NullIsValidLoc,
bool RoundToAlign = false) {
uint64_t Size;
ObjectSizeOpts Opts;
Opts.RoundToAlign = RoundToAlign;
Opts.NullIsUnknownSize = NullIsValidLoc;
if (getObjectSize(V, Size, DL, &TLI, Opts))
return Size;
return MemoryLocation::UnknownSize;
/// Returns true if we can prove that the object specified by V is smaller than
/// Size.
/// Returns false when \p V is not an identified object or when its size cannot
/// be determined.
static bool isObjectSmallerThan(const Value *V, uint64_t Size,
const DataLayout &DL,
const TargetLibraryInfo &TLI,
bool NullIsValidLoc) {
// Note that the meanings of the "object" are slightly different in the
// following contexts:
// c1: llvm::getObjectSize()
// c2: llvm.objectsize() intrinsic
// c3: isObjectSmallerThan()
// c1 and c2 share the same meaning; however, the meaning of "object" in c3
// refers to the "entire object".
// Consider this example:
// char *p = (char*)malloc(100)
// char *q = p+80;
// In the context of c1 and c2, the "object" pointed by q refers to the
// stretch of memory of q[0:19]. So, getObjectSize(q) should return 20.
// However, in the context of c3, the "object" refers to the chunk of memory
// being allocated. So, the "object" has 100 bytes, and q points to the middle
// of the "object". In case q is passed to isObjectSmallerThan() as the 1st
// parameter, before the llvm::getObjectSize() is called to get the size of
// entire object, we should:
// - either rewind the pointer q to the base-address of the object in
// question (in this case rewind to p), or
// - just give up. It is up to caller to make sure the pointer is pointing
// to the base address of the object.
// We go for 2nd option for simplicity.
if (!isIdentifiedObject(V))
return false;
// This function needs to use the aligned object size because we allow
// reads a bit past the end given sufficient alignment.
uint64_t ObjectSize = getObjectSize(V, DL, TLI, NullIsValidLoc,
/*RoundToAlign*/ true);
// An unknown size proves nothing, so it never counts as "smaller".
return ObjectSize != MemoryLocation::UnknownSize && ObjectSize < Size;
/// Return the minimal extent from \p V to the end of the underlying object,
/// assuming the result is used in an aliasing query. E.g., we do use the query
/// location size and the fact that null pointers cannot alias here.
/// \returns the dereferenceable byte count of \p V (treated as 0 when the
/// pointer may be null and \p NullIsValidLoc is set), raised to the value of
/// \p LocSize when \p LocSize is precise and larger.
static uint64_t getMinimalExtentFrom(const Value &V,
const LocationSize &LocSize,
const DataLayout &DL,
bool NullIsValidLoc) {
// If we have dereferenceability information we know a lower bound for the
// extent as accesses for a lower offset would be valid. We need to exclude
// the "or null" part if null is a valid pointer.
bool CanBeNull;
uint64_t DerefBytes = V.getPointerDereferenceableBytes(DL, CanBeNull);
DerefBytes = (CanBeNull && NullIsValidLoc) ? 0 : DerefBytes;
// If queried with a precise location size, we assume that location size to be
// accessed, thus valid.
if (LocSize.isPrecise())
DerefBytes = std::max(DerefBytes, LocSize.getValue());
return DerefBytes;
/// Returns true if we can prove that the object specified by V has size Size.
/// The size is queried without rounding to alignment (RoundToAlign is left at
/// its default of false); an unknown size yields false.
static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
const TargetLibraryInfo &TLI, bool NullIsValidLoc) {
uint64_t ObjectSize = getObjectSize(V, DL, TLI, NullIsValidLoc);
return ObjectSize != MemoryLocation::UnknownSize && ObjectSize == Size;
// GetElementPtr Instruction Decomposition and Analysis
/// Analyzes the specified value as a linear expression: "A*V + B", where A and
/// B are constant integers.
/// Returns the scale and offset values as APInts and return V as a Value*, and
/// return whether we looked through any sign or zero extends. The incoming
/// Value is known to have IntegerType, and it may already be sign or zero
/// extended.
/// Note that this looks through extends, so the high bits may not be
/// represented in the result.
///
/// \param V        the integer-typed value to decompose (asserted below).
/// \param Scale    in/out: the "A" coefficient accumulated so far.
/// \param Offset   in/out: the "B" constant accumulated so far.
/// \param ZExtBits in/out: number of bits of zero extension looked through.
/// \param SExtBits in/out: number of bits of sign extension looked through.
/// \param Depth    current recursion depth; recursion stops at depth 6.
/// \param NSW      in/out: and-ed with the no-signed-wrap flag of every
///                 overflowing binary operator that is looked through.
/// \param NUW      in/out: same as NSW, for the no-unsigned-wrap flag.
/// \returns the value that plays the role of "V" in "A*V + B".
/*static*/ const Value *BasicAAResult::GetLinearExpression(
const Value *V, APInt &Scale, APInt &Offset, unsigned &ZExtBits,
unsigned &SExtBits, const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, DominatorTree *DT, bool &NSW, bool &NUW) {
assert(V->getType()->isIntegerTy() && "Not an integer value");
// Limit our recursion depth. At the limit, V is reported as an opaque
// variable, i.e. 1*V + 0.
if (Depth == 6) {
Scale = 1;
Offset = 0;
return V;
if (const ConstantInt *Const = dyn_cast<ConstantInt>(V)) {
// If it's a constant, just convert it to an offset and remove the variable.
// If we've been called recursively, the Offset bit width will be greater
// than the constant's (the Offset's always as wide as the outermost call),
// so we'll zext here and process any extension in the isa<SExtInst> &
// isa<ZExtInst> cases below.
Offset += Const->getValue().zextOrSelf(Offset.getBitWidth());
assert(Scale == 0 && "Constant values don't have a scale");
return V;
// Binary operators are only decomposed when their right-hand operand is a
// constant integer.
if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) {
if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) {
// If we've been called recursively, then Offset and Scale will be wider
// than the BOp operands. We'll always zext it here as we'll process sign
// extensions below (see the isa<SExtInst> / isa<ZExtInst> cases).
APInt RHS = RHSC->getValue().zextOrSelf(Offset.getBitWidth());
switch (BOp->getOpcode()) {
// NOTE(review): no `default:` label and no `break;` statements are visible
// in this switch in this extract - confirm against the original file.
// We don't understand this instruction, so we can't decompose it any
// further.
Scale = 1;
Offset = 0;
return V;
case Instruction::Or:
// X|C == X+C if all the bits in C are unset in X. Otherwise we can't
// analyze it.
if (!MaskedValueIsZero(BOp->getOperand(0), RHSC->getValue(), DL, 0, AC,
BOp, DT)) {
Scale = 1;
Offset = 0;
return V;
case Instruction::Add:
// (A*X + B) + C  ==>  A*X + (B + C)
V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits,
SExtBits, DL, Depth + 1, AC, DT, NSW, NUW);
Offset += RHS;
case Instruction::Sub:
// (A*X + B) - C  ==>  A*X + (B - C)
V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits,
SExtBits, DL, Depth + 1, AC, DT, NSW, NUW);
Offset -= RHS;
case Instruction::Mul:
// (A*X + B) * C  ==>  (A*C)*X + (B*C)
V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits,
SExtBits, DL, Depth + 1, AC, DT, NSW, NUW);
Offset *= RHS;
Scale *= RHS;
case Instruction::Shl:
V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, ZExtBits,
SExtBits, DL, Depth + 1, AC, DT, NSW, NUW);
// We're trying to linearize an expression of the kind:
// shl i8 -128, 36
// where the shift count exceeds the bitwidth of the type.
// We can't decompose this further (the expression would return
// a poison value).
if (Offset.getBitWidth() < RHS.getLimitedValue() ||
Scale.getBitWidth() < RHS.getLimitedValue()) {
Scale = 1;
Offset = 0;
return V;
// A left shift by C scales both the coefficient and the constant by 2^C.
Offset <<= RHS.getLimitedValue();
Scale <<= RHS.getLimitedValue();
// the semantics of nsw and nuw for left shifts don't match those of
// multiplications, so we won't propagate them.
NSW = NUW = false;
return V;
// Fold this operator's wrap flags into the running NSW/NUW state.
if (isa<OverflowingBinaryOperator>(BOp)) {
NUW &= BOp->hasNoUnsignedWrap();
NSW &= BOp->hasNoSignedWrap();
return V;
// Since GEP indices are sign extended anyway, we don't care about the high
// bits of a sign or zero extended value - just scales and offsets. The
// extensions have to be consistent though.
if (isa<SExtInst>(V) || isa<ZExtInst>(V)) {
Value *CastOp = cast<CastInst>(V)->getOperand(0);
unsigned NewWidth = V->getType()->getPrimitiveSizeInBits();
unsigned SmallWidth = CastOp->getType()->getPrimitiveSizeInBits();
// Remember the extension counts so they can be restored if the decomposition
// of the cast operand has to be abandoned.
unsigned OldZExtBits = ZExtBits, OldSExtBits = SExtBits;
const Value *Result =
GetLinearExpression(CastOp, Scale, Offset, ZExtBits, SExtBits, DL,
Depth + 1, AC, DT, NSW, NUW);
// zext(zext(%x)) == zext(%x), and similarly for sext; we'll handle this
// by just incrementing the number of bits we've extended by.
unsigned ExtendedBy = NewWidth - SmallWidth;
if (isa<SExtInst>(V) && ZExtBits == 0) {
// sext(sext(%x, a), b) == sext(%x, a + b)
if (NSW) {
// We haven't sign-wrapped, so it's valid to decompose sext(%x + c)
// into sext(%x) + sext(c). We'll sext the Offset ourselves:
unsigned OldWidth = Offset.getBitWidth();
Offset = Offset.trunc(SmallWidth).sext(NewWidth).zextOrSelf(OldWidth);
} else {
// We may have signed-wrapped, so don't decompose sext(%x + c) into
// sext(%x) + sext(c)
Scale = 1;
Offset = 0;
Result = CastOp;
ZExtBits = OldZExtBits;
SExtBits = OldSExtBits;
SExtBits += ExtendedBy;
} else {
// sext(zext(%x, a), b) = zext(zext(%x, a), b) = zext(%x, a + b)
if (!NUW) {
// We may have unsigned-wrapped, so don't decompose zext(%x + c) into
// zext(%x) + zext(c)
Scale = 1;
Offset = 0;
Result = CastOp;
ZExtBits = OldZExtBits;
SExtBits = OldSExtBits;
ZExtBits += ExtendedBy;
return Result;
// Anything else is treated as an opaque variable: 1*V + 0.
Scale = 1;
Offset = 0;
return V;
/// Reinterpret the low PointerSize bits of Offset as a signed quantity and
/// sign-extend it back to Offset's full bit width.
///
/// Offsets are computed at a width that may exceed the size of the pointer
/// being indexed (for example 32b pointers while the maximum pointer size is
/// 64b). Negative indices on the narrower pointer rely on two's complement
/// wrap-around, so the offset has to be folded back into PointerSize bits to
/// keep alias information precise.
static APInt adjustToPointerSize(const APInt &Offset, unsigned PointerSize) {
  const unsigned OffsetWidth = Offset.getBitWidth();
  assert(PointerSize <= OffsetWidth && "Invalid PointerSize!");
  // Shift the bits above PointerSize out of the value, then shift back
  // arithmetically so the top surviving bit is replicated into the high bits.
  const unsigned ExcessBits = OffsetWidth - PointerSize;
  APInt Narrowed = Offset << ExcessBits;
  return Narrowed.ashr(ExcessBits);
}
/// Compute the bit width used for pointer offset arithmetic: the data layout's
/// maximum pointer size, raised to 64 when ForceAtLeast64Bits is set and the
/// maximum is smaller than that, and then doubled when DoubleCalcBits is set.
static unsigned getMaxPointerSize(const DataLayout &DL) {
  unsigned Bits = DL.getMaxPointerSizeInBits();
  if (ForceAtLeast64Bits && Bits < 64)
    Bits = 64;
  if (DoubleCalcBits)
    Bits *= 2;
  return Bits;
}
/// If V is a symbolic pointer expression, decompose it into a base pointer
/// with a constant offset and a number of scaled symbolic offsets.
/// The scaled symbolic offsets (represented by pairs of a Value* and a scale
/// in the VarIndices vector) are Value*'s that are known to be scaled by the
/// specified amount, but which may have other unrepresented high bits. As
/// such, the gep cannot necessarily be reconstructed from its decomposed form.
/// When DataLayout is around, this function is capable of analyzing everything
/// that GetUnderlyingObject can look through. To be able to do that
/// GetUnderlyingObject and DecomposeGEPExpression must use the same search
/// depth (MaxLookupSearchDepth). When DataLayout is not around, it just looks
/// through pointer casts.
///
/// \param V          the pointer expression to decompose.
/// \param Decomposed output: receives the base pointer, the constant offsets
///                   (StructOffset / OtherOffset) and the variable indices.
/// \returns true only when the walk ran out of its MaxLookupSearchDepth
///          budget; every other exit visible here returns false.
bool BasicAAResult::DecomposeGEPExpression(const Value *V,
DecomposedGEP &Decomposed, const DataLayout &DL, AssumptionCache *AC,
DominatorTree *DT) {
// Limit recursion depth to limit compile time in crazy cases.
unsigned MaxLookup = MaxLookupSearchDepth;
unsigned MaxPointerSize = getMaxPointerSize(DL);
do {
// See if this is a bitcast or GEP.
const Operator *Op = dyn_cast<Operator>(V);
if (!Op) {
// The only non-operator case we can handle are GlobalAliases.
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
if (!GA->isInterposable()) {
V = GA->getAliasee();
Decomposed.Base = V;
return false;
// Pointer casts do not change the address; step through to the operand.
if (Op->getOpcode() == Instruction::BitCast ||
Op->getOpcode() == Instruction::AddrSpaceCast) {
V = Op->getOperand(0);
const GEPOperator *GEPOp = dyn_cast<GEPOperator>(Op);
if (!GEPOp) {
if (const auto *PHI = dyn_cast<PHINode>(V)) {
// Look through single-arg phi nodes created by LCSSA.
if (PHI->getNumIncomingValues() == 1) {
V = PHI->getIncomingValue(0);
} else if (const auto *Call = dyn_cast<CallBase>(V)) {
// CaptureTracking can know about special capturing properties of some
// intrinsics like, that can't be expressed with
// the attributes, but have properties like returning aliasing pointer.
// Because some analysis may assume that nocaptured pointer is not
// returned from some special intrinsic (because function would have to
// be marked with returns attribute), it is crucial to use this function
// because it should be in sync with CaptureTracking. Not using it may
// cause weird miscompilations where 2 aliasing pointers are assumed to
// noalias.
if (auto *RP = getArgumentAliasingToReturnedPointer(Call, false)) {
V = RP;
// Nothing further to look through: V is the base.
Decomposed.Base = V;
return false;
// Don't attempt to analyze GEPs over unsized objects.
if (!GEPOp->getSourceElementType()->isSized()) {
Decomposed.Base = V;
return false;
// Don't attempt to analyze GEPs if index scale is not a compile-time
// constant.
if (isa<ScalableVectorType>(GEPOp->getSourceElementType())) {
Decomposed.Base = V;
Decomposed.HasCompileTimeConstantScale = false;
return false;
unsigned AS = GEPOp->getPointerAddressSpace();
// Walk the indices of the GEP, accumulating them into BaseOff/VarIndices.
gep_type_iterator GTI = gep_type_begin(GEPOp);
unsigned PointerSize = DL.getPointerSizeInBits(AS);
// Assume all GEP operands are constants until proven otherwise.
bool GepHasConstantOffset = true;
for (User::const_op_iterator I = GEPOp->op_begin() + 1, E = GEPOp->op_end();
I != E; ++I, ++GTI) {
const Value *Index = *I;
// Compute the (potentially symbolic) offset in bytes for this index.
if (StructType *STy = GTI.getStructTypeOrNull()) {
// For a struct, add the member offset.
unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue();
if (FieldNo == 0)
// NOTE(review): the right-hand side of the next statement is not visible in
// this extract.
Decomposed.StructOffset +=
// For an array/pointer, add the element offset, explicitly scaled.
if (const ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) {
if (CIdx->isZero())
// NOTE(review): the next statement is cut off in this extract.
Decomposed.OtherOffset +=
(DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize() *
// From here on the index is not a constant integer.
GepHasConstantOffset = false;
// NOTE(review): the initializer of Scale is cut off in this extract.
APInt Scale(MaxPointerSize,
unsigned ZExtBits = 0, SExtBits = 0;
// If the integer type is smaller than the pointer size, it is implicitly
// sign extended to pointer size.
unsigned Width = Index->getType()->getIntegerBitWidth();
if (PointerSize > Width)
SExtBits += PointerSize - Width;
// Use GetLinearExpression to decompose the index into a C1*V+C2 form.
APInt IndexScale(Width, 0), IndexOffset(Width, 0);
bool NSW = true, NUW = true;
// Keep the undecomposed index so it can be restored if the scaled offset
// turns out to overflow (see below).
const Value *OrigIndex = Index;
Index = GetLinearExpression(Index, IndexScale, IndexOffset, ZExtBits,
SExtBits, DL, 0, AC, DT, NSW, NUW);
// The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale.
// This gives us an aggregate computation of (C1*Scale)*V + C2*Scale.
// It can be the case that, even though C1*V+C2 does not overflow for
// relevant values of V, (C2*Scale) can overflow. In that case, we cannot
// decompose the expression in this way.
// FIXME: C1*Scale and the other operations in the decomposed
// (C1*Scale)*V+C2*Scale can also overflow. We should check for this
// possibility.
// NOTE(review): the second multiplicand of the next statement is not visible
// in this extract.
APInt WideScaledOffset = IndexOffset.sextOrTrunc(MaxPointerSize*2) *
if (WideScaledOffset.getMinSignedBits() > MaxPointerSize) {
// Overflow: fall back to treating the original index as opaque (1*Index + 0)
// and recompute the implicit sign extension.
Index = OrigIndex;
IndexScale = 1;
IndexOffset = 0;
ZExtBits = SExtBits = 0;
if (PointerSize > Width)
SExtBits += PointerSize - Width;
} else {
Decomposed.OtherOffset += IndexOffset.sextOrTrunc(MaxPointerSize) * Scale;
Scale *= IndexScale.sextOrTrunc(MaxPointerSize);
// If we already had an occurrence of this index variable, merge this
// scale into it. For example, we want to handle:
// A[x][x] -> x*16 + x*4 -> x*20
// This also ensures that 'x' only appears in the index list once.
for (unsigned i = 0, e = Decomposed.VarIndices.size(); i != e; ++i) {
if (Decomposed.VarIndices[i].V == Index &&
Decomposed.VarIndices[i].ZExtBits == ZExtBits &&
Decomposed.VarIndices[i].SExtBits == SExtBits) {
Scale += Decomposed.VarIndices[i].Scale;
Decomposed.VarIndices.erase(Decomposed.VarIndices.begin() + i);
// Make sure that we have a scale that makes sense for this target's
// pointer size.
Scale = adjustToPointerSize(Scale, PointerSize);
// A zero scale contributes nothing, so only non-zero scales get an entry.
if (!!Scale) {
VariableGEPIndex Entry = {Index, ZExtBits, SExtBits, Scale};
// Take care of wrap-arounds
if (GepHasConstantOffset) {
Decomposed.StructOffset =
adjustToPointerSize(Decomposed.StructOffset, PointerSize);
Decomposed.OtherOffset =
adjustToPointerSize(Decomposed.OtherOffset, PointerSize);
// Analyze the base pointer next.
V = GEPOp->getOperand(0);
} while (--MaxLookup);
// If the chain of expressions is too deep, just return early.
Decomposed.Base = V;
return true;
/// Returns whether the given pointer value points to memory that is local to
/// the function, with global constants being considered local to all
/// functions.
///
/// The query walks the underlying objects of the location with a worklist.
/// Whenever it meets something it cannot classify, or exceeds its lookup
/// budget, it defers to AAResultBase::pointsToConstantMemory.
///
/// \param OrLocal when true, alloca instructions are also treated as
///                acceptable (see the AllocaInst check below).
bool BasicAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
AAQueryInfo &AAQI, bool OrLocal) {
assert(Visited.empty() && "Visited must be cleared after use!");
// Budget for worklist iterations; also caps the number of phi operands
// that will be inspected.
unsigned MaxLookup = 8;
SmallVector<const Value *, 16> Worklist;
// NOTE(review): no statement seeding or extending Worklist is visible in this
// extract - confirm against the original file.
do {
const Value *V = GetUnderlyingObject(Worklist.pop_back_val(), DL);
// Seeing the same underlying object twice ends the local analysis.
if (!Visited.insert(V).second) {
return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal);
// An alloca instruction defines local memory.
if (OrLocal && isa<AllocaInst>(V))
// A global constant counts as local memory for our purposes.
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) {
// Note: this doesn't require GV to be "ODR" because it isn't legal for a
// global to be marked constant in some modules and non-constant in
// others. GV may even be a declaration, not a definition.
if (!GV->isConstant()) {
return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal);
// If both select values point to local memory, then so does the select.
if (const SelectInst *SI = dyn_cast<SelectInst>(V)) {
// If all values incoming to a phi node point to local memory, then so does
// the phi.
if (const PHINode *PN = dyn_cast<PHINode>(V)) {
// Don't bother inspecting phi nodes with many operands.
if (PN->getNumIncomingValues() > MaxLookup) {
return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal);
for (Value *IncValue : PN->incoming_values())
// Otherwise be conservative.
return AAResultBase::pointsToConstantMemory(Loc, AAQI, OrLocal);
} while (!Worklist.empty() && --MaxLookup);
// True only if every queued value was processed within the budget.
return Worklist.empty();
/// Returns the behavior when calling the given call site.
///
/// Starts from "unknown" and intersects in everything the call site itself
/// promises, then (when the call carries no operand bundles) everything known
/// about the function it calls.
FunctionModRefBehavior BasicAAResult::getModRefBehavior(const CallBase *Call) {
  // A call that touches no memory at all cannot be improved upon.
  if (Call->doesNotAccessMemory())
    return FMRB_DoesNotAccessMemory;

  FunctionModRefBehavior Behavior = FMRB_UnknownModRefBehavior;

  // What kind of access the call site promises: never report anything worse
  // than read-only (or write-only) when the call site says so.
  if (Call->onlyReadsMemory()) {
    Behavior = FMRB_OnlyReadsMemory;
  } else if (Call->doesNotReadMemory()) {
    Behavior = FMRB_OnlyWritesMemory;
  }

  // Which memory the call site promises to confine itself to.
  if (Call->onlyAccessesArgMemory()) {
    Behavior =
        FunctionModRefBehavior(Behavior & FMRB_OnlyAccessesArgumentPointees);
  } else if (Call->onlyAccessesInaccessibleMemory()) {
    Behavior =
        FunctionModRefBehavior(Behavior & FMRB_OnlyAccessesInaccessibleMem);
  } else if (Call->onlyAccessesInaccessibleMemOrArgMem()) {
    Behavior = FunctionModRefBehavior(Behavior &
                                      FMRB_OnlyAccessesInaccessibleOrArgMem);
  }

  // With operand bundles present, aliasing attributes of the called function
  // do not directly apply to the call. This can be made more precise in the
  // future.
  if (Call->hasOperandBundles())
    return Behavior;

  const Function *Callee = Call->getCalledFunction();
  if (!Callee)
    return Behavior;

  return FunctionModRefBehavior(Behavior &
                                getBestAAResults().getModRefBehavior(Callee));
}
/// Returns the behavior when calling the given function. For use when the call
/// site is not known.
///
/// Only the function's own memory attributes are consulted.
FunctionModRefBehavior BasicAAResult::getModRefBehavior(const Function *F) {
  // Nothing can beat "does not access memory".
  if (F->doesNotAccessMemory())
    return FMRB_DoesNotAccessMemory;

  FunctionModRefBehavior Behavior = FMRB_UnknownModRefBehavior;

  // What kind of access the function declares.
  if (F->onlyReadsMemory()) {
    Behavior = FMRB_OnlyReadsMemory;
  } else if (F->doesNotReadMemory()) {
    Behavior = FMRB_OnlyWritesMemory;
  }

  // Which memory the function declares it is confined to.
  if (F->onlyAccessesArgMemory()) {
    Behavior =
        FunctionModRefBehavior(Behavior & FMRB_OnlyAccessesArgumentPointees);
  } else if (F->onlyAccessesInaccessibleMemory()) {
    Behavior =
        FunctionModRefBehavior(Behavior & FMRB_OnlyAccessesInaccessibleMem);
  } else if (F->onlyAccessesInaccessibleMemOrArgMem()) {
    Behavior = FunctionModRefBehavior(Behavior &
                                      FMRB_OnlyAccessesInaccessibleOrArgMem);
  }

  return Behavior;
}
/// Returns true if this is a writeonly (i.e Mod only) parameter.
static bool isWriteOnlyParam(const CallBase *Call, unsigned ArgIdx,
const TargetLibraryInfo &TLI) {
if (Call->paramHasAttr(ArgIdx, Attribute::WriteOnly))
return true;
// We can bound the aliasing properties of memset_pattern16 just as we can
// for memcpy/memset. This is particularly important because the
// LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
// whenever possible.
// FIXME Consider handling this in InferFunctionAttr.cpp together with other
// attributes.
LibFunc F;
if (Call->getCalledFunction() &&
TLI.getLibFunc(*Call->getCalledFunction(), F) &&
F == LibFunc_memset_pattern16 && TLI.has(F))
if (ArgIdx == 0)
return true;
// TODO: memset_pattern4, memset_pattern8
// TODO: _chk variants
// TODO: strcmp, strcpy
return false;
/// Classify how the call may access memory through argument ArgIdx: Mod for
/// write-only parameters, Ref for ReadOnly ones, NoModRef for ReadNone ones,
/// and otherwise whatever AAResultBase reports.
ModRefInfo BasicAAResult::getArgModRefInfo(const CallBase *Call,
                                           unsigned ArgIdx) {
  const auto HasParamAttr = [&](Attribute::AttrKind Kind) {
    return Call->paramHasAttr(ArgIdx, Kind);
  };

  // Checking for known builtin intrinsics and target library functions.
  if (isWriteOnlyParam(Call, ArgIdx, TLI))
    return ModRefInfo::Mod;

  if (HasParamAttr(Attribute::ReadOnly))
    return ModRefInfo::Ref;

  if (HasParamAttr(Attribute::ReadNone))
    return ModRefInfo::NoModRef;

  return AAResultBase::getArgModRefInfo(Call, ArgIdx);
}
static bool isIntrinsicCall(const CallBase *Call, Intrinsic::ID IID) {
const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Call);
return II && II->getIntrinsicID() == IID;
#ifndef NDEBUG
/// Returns the function that contains V when V is an instruction or an
/// argument, and nullptr otherwise (including for an instruction that is not
/// attached to a parent).
static const Function *getParent(const Value *V) {
  if (const auto *I = dyn_cast<Instruction>(V)) {
    const auto *Block = I->getParent();
    return Block ? Block->getParent() : nullptr;
  }

  if (const auto *A = dyn_cast<Argument>(V))
    return A->getParent();

  return nullptr;
}
static bool notDifferentParent(const Value *O1, const Value *O2) {
const Function *F1 = getParent(O1);
const Function *F2 = getParent(O2);
return !F1 || !F2 || F1 == F2;
/// Alias query entry point for a pair of memory locations.
///
/// Answers from AAQI.AliasCache when the pair (in either order) is already
/// cached; otherwise computes the result with aliasCheck.
AliasResult BasicAAResult::alias(const MemoryLocation &LocA,
const MemoryLocation &LocB,
AAQueryInfo &AAQI) {
assert(notDifferentParent(LocA.Ptr, LocB.Ptr) &&
"BasicAliasAnalysis doesn't support interprocedural queries.");
// If we have a directly cached entry for these locations, we have recursed
// through this once, so just return the cached results. Notably, when this
// happens, we don't clear the cache.
auto CacheIt = AAQI.AliasCache.find(AAQueryInfo::LocPair(LocA, LocB));
if (CacheIt != AAQI.AliasCache.end())
return CacheIt->second;
// The cache may hold the pair in the opposite order.
CacheIt = AAQI.AliasCache.find(AAQueryInfo::LocPair(LocB, LocA));
if (CacheIt != AAQI.AliasCache.end())
return CacheIt->second;
AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.AATags, LocB.Ptr,
LocB.Size, LocB.AATags, AAQI);
// NOTE(review): Alias is returned with no visible use in between; a line may
// be missing from this extract - confirm against the original file.
return Alias;
/// Checks to see if the specified callsite can clobber the specified memory
/// object.
/// Since we only look at local properties of this function, we really can't
/// say much about this query. We do, however, use simple "address taken"
/// analysis on local objects.
///
/// The checks below run in order: tail calls versus allocas, stackrestore
/// versus dynamic allocas, non-escaping local objects, malloc/calloc-like
/// calls, memcpy-style intrinsics, then the assume / guard / invariant.start
/// intrinsics. Anything left is delegated to AAResultBase::getModRefInfo.
ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call,
const MemoryLocation &Loc,
AAQueryInfo &AAQI) {
assert(notDifferentParent(Call, Loc.Ptr) &&
"AliasAnalysis query involving multiple functions!");
const Value *Object = GetUnderlyingObject(Loc.Ptr, DL);
// Calls marked 'tail' cannot read or write allocas from the current frame
// because the current frame might be destroyed by the time they run. However,
// a tail call may use an alloca with byval. Calling with byval copies the
// contents of the alloca into argument registers or stack slots, so there is
// no lifetime issue.
if (isa<AllocaInst>(Object))
if (const CallInst *CI = dyn_cast<CallInst>(Call))
// NOTE(review): the rest of this condition is not visible in this extract.
if (CI->isTailCall() &&
return ModRefInfo::NoModRef;
// Stack restore is able to modify unescaped dynamic allocas. Assume it may
// modify them even though the alloca is not escaped.
if (auto *AI = dyn_cast<AllocaInst>(Object))
if (!AI->isStaticAlloca() && isIntrinsicCall(Call, Intrinsic::stackrestore))
return ModRefInfo::Mod;
// If the pointer is to a locally allocated object that does not escape,
// then the call can not mod/ref the pointer unless the call takes the pointer
// as an argument, and itself doesn't capture it.
if (!isa<Constant>(Object) && Call != Object &&
isNonEscapingLocalObject(Object, &AAQI.IsCapturedCache)) {
// Optimistically assume that call doesn't touch Object and check this
// assumption in the following loop.
ModRefInfo Result = ModRefInfo::NoModRef;
bool IsMustAlias = true;
unsigned OperandNo = 0;
for (auto CI = Call->data_operands_begin(), CE = Call->data_operands_end();
CI != CE; ++CI, ++OperandNo) {
// Only look at the no-capture or byval pointer arguments. If this
// pointer were passed to arguments that were neither of these, then it
// couldn't be no-capture.
// NOTE(review): the tail of this condition and the loop's `continue` /
// `break` statements are not visible in this extract.
if (!(*CI)->getType()->isPointerTy() ||
(!Call->doesNotCapture(OperandNo) &&
OperandNo < Call->getNumArgOperands() &&
// Call doesn't access memory through this operand, so we don't care
// if it aliases with Object.
if (Call->doesNotAccessMemory(OperandNo))
// If this is a no-capture pointer argument, see if we can tell that it
// is impossible to alias the pointer we're checking.
AliasResult AR = getBestAAResults().alias(MemoryLocation(*CI),
MemoryLocation(Object), AAQI);
if (AR != MustAlias)
IsMustAlias = false;
// Operand doesn't alias 'Object', continue looking for other aliases
if (AR == NoAlias)
// Operand aliases 'Object', but call doesn't modify it. Strengthen
// initial assumption and keep looking in case if there are more aliases.
if (Call->onlyReadsMemory(OperandNo)) {
Result = setRef(Result);
// Operand aliases 'Object' but call only writes into it.
if (Call->doesNotReadMemory(OperandNo)) {
Result = setMod(Result);
// This operand aliases 'Object' and call reads and writes into it.
// Setting ModRef will not yield an early return below, MustAlias is not
// used further.
Result = ModRefInfo::ModRef;
// No operand aliases, reset Must bit. Add below if at least one aliases
// and all aliases found are MustAlias.
if (isNoModRef(Result))
IsMustAlias = false;
// Early return if we improved mod ref information
if (!isModAndRefSet(Result)) {
if (isNoModRef(Result))
return ModRefInfo::NoModRef;
return IsMustAlias ? setMust(Result) : clearMust(Result);
// If the call is malloc/calloc like, we can assume that it doesn't
// modify any IR visible value. This is only valid because we assume these
// routines do not read values visible in the IR. TODO: Consider special
// casing realloc and strdup routines which access only their arguments as
// well. Or alternatively, replace all of this with inaccessiblememonly once
// that's implemented fully.
if (isMallocOrCallocLikeFn(Call, &TLI)) {
// Be conservative if the accessed pointer may alias the allocation -
// fallback to the generic handling below.
if (getBestAAResults().alias(MemoryLocation(Call), Loc, AAQI) == NoAlias)
return ModRefInfo::NoModRef;
// The semantics of memcpy intrinsics forbid overlap between their respective
// operands, i.e., source and destination of any given memcpy must no-alias.
// If Loc must-aliases either one of these two locations, then it necessarily
// no-aliases the other.
if (auto *Inst = dyn_cast<AnyMemCpyInst>(Call)) {
AliasResult SrcAA, DestAA;
if ((SrcAA = getBestAAResults().alias(MemoryLocation::getForSource(Inst),
Loc, AAQI)) == MustAlias)
// Loc is exactly the memcpy source thus disjoint from memcpy dest.
return ModRefInfo::Ref;
if ((DestAA = getBestAAResults().alias(MemoryLocation::getForDest(Inst),
Loc, AAQI)) == MustAlias)
// The converse case.
return ModRefInfo::Mod;
// It's also possible for Loc to alias both src and dest, or neither.
ModRefInfo rv = ModRefInfo::NoModRef;
if (SrcAA != NoAlias)
rv = setRef(rv);
if (DestAA != NoAlias)
rv = setMod(rv);
return rv;
// While the assume intrinsic is marked as arbitrarily writing so that
// proper control dependencies will be maintained, it never aliases any
// particular memory location.
if (isIntrinsicCall(Call, Intrinsic::assume))
return ModRefInfo::NoModRef;
// Like assumes, guard intrinsics are also marked as arbitrarily writing so
// that proper control dependencies are maintained but they never mod any
// particular memory location.
// *Unlike* assumes, guard intrinsics are modeled as reading memory since the
// heap state at the point the guard is issued needs to be consistent in case
// the guard invokes the "deopt" continuation.
if (isIntrinsicCall(Call, Intrinsic::experimental_guard))
return ModRefInfo::Ref;
// Like assumes, invariant.start intrinsics were also marked as arbitrarily
// writing so that proper control dependencies are maintained but they never
// mod any particular memory location visible to the IR.
// *Unlike* assumes (which are now modeled as NoModRef), invariant.start
// intrinsic is now modeled as reading memory. This prevents hoisting the
// invariant.start intrinsic over stores. Consider:
// *ptr = 40;
// *ptr = 50;
// invariant_start(ptr)
// int val = *ptr;
// print(val);
// This cannot be transformed to:
// *ptr = 40;
// invariant_start(ptr)
// *ptr = 50;
// int val = *ptr;
// print(val);
// The transformation will cause the second store to be ignored (based on
// rules of invariant.start) and print 40, while the first program always
// prints 50.
if (isIntrinsicCall(Call, Intrinsic::invariant_start))
return ModRefInfo::Ref;
// The AAResultBase base class has some smarts, lets use them.
return AAResultBase::getModRefInfo(Call, Loc, AAQI);
/// Mod/ref relation between two call sites. Only the assume and
/// experimental_guard intrinsics are special-cased here; every other pair is
/// forwarded to AAResultBase.
ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call1,
                                        const CallBase *Call2,
                                        AAQueryInfo &AAQI) {
  // While the assume intrinsic is marked as arbitrarily writing so that
  // proper control dependencies will be maintained, it never aliases any
  // particular memory location.
  const bool EitherIsAssume = isIntrinsicCall(Call1, Intrinsic::assume) ||
                              isIntrinsicCall(Call2, Intrinsic::assume);
  if (EitherIsAssume)
    return ModRefInfo::NoModRef;

  // Like assumes, guard intrinsics are also marked as arbitrarily writing so
  // that proper control dependencies are maintained but they never mod any
  // particular memory location.
  //
  // *Unlike* assumes, guard intrinsics are modeled as reading memory since the
  // heap state at the point the guard is issued needs to be consistent in case
  // the guard invokes the "deopt" continuation.

  // NB! This function is *not* commutative, so the guard is handled separately
  // for each of the two positions.
  if (isIntrinsicCall(Call1, Intrinsic::experimental_guard)) {
    // The guard only matters to the other call if that call may write.
    if (isModSet(createModRefInfo(getModRefBehavior(Call2))))
      return ModRefInfo::Ref;
    return ModRefInfo::NoModRef;
  }

  if (isIntrinsicCall(Call2, Intrinsic::experimental_guard)) {
    if (isModSet(createModRefInfo(getModRefBehavior(Call1))))
      return ModRefInfo::Mod;
    return ModRefInfo::NoModRef;
  }

  // The AAResultBase base class has some smarts, lets use them.
  return AAResultBase::getModRefInfo(Call1, Call2, AAQI);
}
/// Provide ad-hoc rules to disambiguate accesses through two GEP operators,
/// both having the exact same pointer operand.
///
/// \param GEP1, GEP2 the two GEPs; asserted to share the same pointer operand
///                   (after stripping casts and invariant groups) and the
///                   same pointer operand type.
/// \param MaybeV1Size, MaybeV2Size sizes of the accesses through each GEP;
///                   an unknown size makes the result MayAlias.
/// \returns NoAlias when the accesses can be shown not to overlap, MayAlias
///          otherwise.
static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1,
LocationSize MaybeV1Size,
const GEPOperator *GEP2,
LocationSize MaybeV2Size,
const DataLayout &DL) {
assert(GEP1->getPointerOperand()->stripPointerCastsAndInvariantGroups() ==
GEP2->getPointerOperand()->stripPointerCastsAndInvariantGroups() &&
GEP1->getPointerOperandType() == GEP2->getPointerOperandType() &&
"Expected GEPs with the same pointer operand");
// Try to determine whether GEP1 and GEP2 index through arrays, into structs,
// such that the struct field accesses provably cannot alias.
// We also need at least two indices (the pointer, and the struct field).
if (GEP1->getNumIndices() != GEP2->getNumIndices() ||
GEP1->getNumIndices() < 2)
return MayAlias;
// If we don't know the size of the accesses through both GEPs, we can't
// determine whether the struct fields accessed can't alias.
if (MaybeV1Size == LocationSize::unknown() ||
MaybeV2Size == LocationSize::unknown())
return MayAlias;
const uint64_t V1Size = MaybeV1Size.getValue();
const uint64_t V2Size = MaybeV2Size.getValue();
// The last index of each GEP, if it is a constant integer (null otherwise).
ConstantInt *C1 =
dyn_cast<ConstantInt>(GEP1->getOperand(GEP1->getNumOperands() - 1));
ConstantInt *C2 =
dyn_cast<ConstantInt>(GEP2->getOperand(GEP2->getNumOperands() - 1));
// If the last (struct) indices are constants and are equal, the other indices
// might also be dynamically equal, so the GEPs can alias.
if (C1 && C2) {
unsigned BitWidth = std::max(C1->getBitWidth(), C2->getBitWidth());
// NOTE(review): the right-hand side of this comparison is not visible in this
// extract.
if (C1->getValue().sextOrSelf(BitWidth) ==
return MayAlias;
// Find the last-indexed type of the GEP, i.e., the type you'd get if
// you stripped the last index.
// On the way, look at each indexed type. If there's something other
// than an array, different indices can lead to different final types.
SmallVector<Value *, 8> IntermediateIndices;
// Insert the first index; we don't need to check the type indexed
// through it as it only drops the pointer indirection.
assert(GEP1->getNumIndices() > 1 && "Not enough GEP indices to examine");
// NOTE(review): the statement that inserts the first index is not visible in
// this extract.
// Insert all the remaining indices but the last one.
// Also, check that they all index through arrays.
for (unsigned i = 1, e = GEP1->getNumIndices() - 1; i != e; ++i) {
if (!isa<ArrayType>(GetElementPtrInst::getIndexedType(
GEP1->getSourceElementType(), IntermediateIndices)))
return MayAlias;
IntermediateIndices.push_back(GEP1->getOperand(i + 1));
// Type reached after applying every index except the last one.
auto *Ty = GetElementPtrInst::getIndexedType(
GEP1->getSourceElementType(), IntermediateIndices);
StructType *LastIndexedStruct = dyn_cast<StructType>(Ty);
if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
// We know that:
// - both GEPs begin indexing from the exact same pointer;
// - the last indices in both GEPs are constants, indexing into a sequential
// type (array or vector);
// - both GEPs only index through arrays prior to that.
// Because array indices greater than the number of elements are valid in
// GEPs, unless we know the intermediate indices are identical between
// GEP1 and GEP2 we cannot guarantee that the last indexed arrays don't
// partially overlap. We also need to check that the loaded size matches
// the element size, otherwise we could still have overlap.
Type *LastElementTy = GetElementPtrInst::getTypeAtIndex(Ty, (uint64_t)0);
// NOTE(review): the initializer of ElementSize is not visible in this
// extract.
const uint64_t ElementSize =
if (V1Size != ElementSize || V2Size != ElementSize)
return MayAlias;
// Every index except the last must be the very same Value in both GEPs.
for (unsigned i = 0, e = GEP1->getNumIndices() - 1; i != e; ++i)
if (GEP1->getOperand(i + 1) != GEP2->getOperand(i + 1))
return MayAlias;
// Now we know that the array/pointer that GEP1 indexes into and
// that GEP2 indexes into must either precisely overlap or be disjoint.
// Because they cannot partially overlap and because fields in an array
// cannot overlap, if we can prove the final indices are different between
// GEP1 and GEP2, we can conclude GEP1 and GEP2 don't alias.
// If the last indices are constants, we've already checked they don't
// equal each other so we can exit early.
if (C1 && C2)
return NoAlias;
Value *GEP1LastIdx = GEP1->getOperand(GEP1->getNumOperands() - 1);
Value *GEP2LastIdx = GEP2->getOperand(GEP2->getNumOperands() - 1);
if (isa<PHINode>(GEP1LastIdx) || isa<PHINode>(GEP2LastIdx)) {
// If one of the indices is a PHI node, be safe and only use
// computeKnownBits so we don't make any assumptions about the
// relationships between the two indices. This is important if we're
// asking about values from different loop iterations. See PR32314.
// TODO: We may be able to change the check so we only do this when
// we definitely looked through a PHINode.
if (GEP1LastIdx != GEP2LastIdx &&
GEP1LastIdx->getType() == GEP2LastIdx->getType()) {
KnownBits Known1 = computeKnownBits(GEP1LastIdx, DL);
KnownBits Known2 = computeKnownBits(GEP2LastIdx, DL);
// A bit known zero in one index and known one in the other proves the two
// indices differ.
// NOTE(review): the second half of this condition is not visible in this
// extract.
if (Known1.Zero.intersects(Known2.One) ||
return NoAlias;
} else if (isKnownNonEqual(GEP1LastIdx, GEP2LastIdx, DL))
return NoAlias;
return MayAlias;
} else if (!LastIndexedStruct || !C1 || !C2) {
return MayAlias;
// getZExtValue() below requires the indices to fit in 64 bits.
if (C1->getValue().getActiveBits() > 64 ||
C2->getValue().getActiveBits() > 64)
return MayAlias;
// We know that:
// - both GEPs begin indexing from the exact same pointer;
// - the last indices in both GEPs are constants, indexing into a struct;
// - said indices are different, hence, the pointed-to fields are different;
// - both GEPs only index through arrays prior to that.
// This lets us determine that the struct that GEP1 indexes into and the
// struct that GEP2 indexes into must either precisely overlap or be
// completely disjoint. Because they cannot partially overlap, indexing into
// different non-overlapping fields of the struct will never alias.
// Therefore, the only remaining thing needed to show that both GEPs can't
// alias is that the fields are not overlapping.
const StructLayout *SL = DL.getStructLayout(LastIndexedStruct);
const uint64_t StructSize = SL->getSizeInBytes();
const uint64_t V1Off = SL->getElementOffset(C1->getZExtValue());
const uint64_t V2Off = SL->getElementOffset(C2->getZExtValue());
// True when the first access lies entirely before the second one and the
// second one either stays inside the struct or, where it runs past the end
// of the struct, its excess does not extend beyond V1Off.
auto EltsDontOverlap = [StructSize](uint64_t V1Off, uint64_t V1Size,
uint64_t V2Off, uint64_t V2Size) {
return V1Off < V2Off && V1Off + V1Size <= V2Off &&
((V2Off + V2Size <= StructSize) ||
(V2Off + V2Size - StructSize <= V1Off));
// Try both orderings of the two accesses.
if (EltsDontOverlap(V1Off, V1Size, V2Off, V2Size) ||
EltsDontOverlap(V2Off, V2Size, V1Off, V1Size))
return NoAlias;
return MayAlias;
// If we have (a) a GEP and (b) a pointer based on an alloca, and the
// beginning of the object the GEP points to would have a negative offset with
// respect to the alloca, that means the GEP can not alias pointer (b).
// Note that the pointer based on the alloca may not be a GEP. For
// example, it may be the alloca itself.
// The same applies if (b) is based on a GlobalVariable. Note that just being
// based on isIdentifiedObject() is not enough - we need an identified object
// that does not permit access to negative offsets. For example, a negative
// offset from a noalias argument or call can be inbounds w.r.t the actual
// underlying object.
// For example, consider:
// struct { int f0, int f1, ...} foo;
// foo alloca;
// foo* random = bar(alloca);
// int *f0 = &alloca.f0
// int *f1 = &random->f1;
// Which is lowered, approximately, to:
// %alloca = alloca
// %random = call* @random(* %alloca)
// %f0 = getelementptr inbounds %struct,* %alloca, i32 0, i32 0
// %f1 = getelementptr inbounds %struct,* %random, i32 0, i32 1
// Assume %f1 and %f0 alias. Then %f1 would point into the object allocated
// by %alloca. Since the %f1 GEP is inbounds, that means %random must also
// point into the same object. But since %f0 points to the beginning of %alloca,
// the highest %f1 can be is (%alloca + 3). This means %random can not be higher
// than (%alloca - 1), and so is not inbounds, a contradiction.
bool BasicAAResult::isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp,
const DecomposedGEP &DecompGEP, const DecomposedGEP &DecompObject,
LocationSize MaybeObjectAccessSize) {
// If the object access size is unknown, or the GEP isn't inbounds, bail.
if (MaybeObjectAccessSize == LocationSize::unknown() || !GEPOp->isInBounds())
return false;
const uint64_t ObjectAccessSize = MaybeObjectAccessSize.getValue();
// We need the object to be an alloca or a globalvariable, and want to know
// the offset of the pointer from the object precisely, so no variable
// indices are allowed.
if (!(isa<AllocaInst>(DecompObject.Base) ||
isa<GlobalVariable>(DecompObject.Base)) ||
return false;
APInt ObjectBaseOffset = DecompObject.StructOffset +
// If the GEP has no variable indices, we know the precise offset
// from the base, then use it. If the GEP has variable indices,
// we can't get exact GEP offset to identify pointer alias. So return
// false in that case.
if (!DecompGEP.VarIndices.empty())
return false;
APInt GEPBaseOffset = DecompGEP.StructOffset;
GEPBaseOffset += DecompGEP.OtherOffset;
return GEPBaseOffset.sge(ObjectBaseOffset + (int64_t)ObjectAccessSize);
/// Provides a bunch of ad-hoc rules to disambiguate a GEP instruction against
/// another pointer.
/// We know that V1 is a GEP, but we don't know anything about V2.
/// UnderlyingV1 is GetUnderlyingObject(GEP1, DL), UnderlyingV2 is the same for
/// V2.
/// Returns NoAlias, PartialAlias or MustAlias when one of the rules below
/// applies, and MayAlias when none of them does.
AliasResult BasicAAResult::aliasGEP(
const GEPOperator *GEP1, LocationSize V1Size, const AAMDNodes &V1AAInfo,
const Value *V2, LocationSize V2Size, const AAMDNodes &V2AAInfo,
const Value *UnderlyingV1, const Value *UnderlyingV2, AAQueryInfo &AAQI) {
// Both decompositions start from zero offsets of the maximum pointer width
// and with the "scale is a compile-time constant" flag set.
DecomposedGEP DecompGEP1, DecompGEP2;
unsigned MaxPointerSize = getMaxPointerSize(DL);
DecompGEP1.StructOffset = DecompGEP1.OtherOffset = APInt(MaxPointerSize, 0);
DecompGEP2.StructOffset = DecompGEP2.OtherOffset = APInt(MaxPointerSize, 0);
DecompGEP1.HasCompileTimeConstantScale =
DecompGEP2.HasCompileTimeConstantScale = true;
bool GEP1MaxLookupReached =
DecomposeGEPExpression(GEP1, DecompGEP1, DL, &AC, DT);
bool GEP2MaxLookupReached =
DecomposeGEPExpression(V2, DecompGEP2, DL, &AC, DT);
// Don't attempt to analyze the decomposed GEP if index scale is not a
// compile-time constant.
if (!DecompGEP1.HasCompileTimeConstantScale ||
// NOTE(review): the second operand of the condition above is missing from
// this listing. Presumably it is the same test on DecompGEP2 - confirm
// against upstream.
return MayAlias;
APInt GEP1BaseOffset = DecompGEP1.StructOffset + DecompGEP1.OtherOffset;
APInt GEP2BaseOffset = DecompGEP2.StructOffset + DecompGEP2.OtherOffset;
assert(DecompGEP1.Base == UnderlyingV1 && DecompGEP2.Base == UnderlyingV2 &&
"DecomposeGEPExpression returned a result different from "
// NOTE(review): the rest of the assert message and its closing ");" are
// missing from this listing.
// If the GEP's offset relative to its base is such that the base would
// fall below the start of the object underlying V2, then the GEP and V2
// cannot alias.
if (!GEP1MaxLookupReached && !GEP2MaxLookupReached &&
isGEPBaseAtNegativeOffset(GEP1, DecompGEP1, DecompGEP2, V2Size))
return NoAlias;
// If we have two gep instructions with must-alias or not-alias'ing base
// pointers, figure out if the indexes to the GEP tell us anything about the
// derived pointer.
if (const GEPOperator *GEP2 = dyn_cast<GEPOperator>(V2)) {
// Check for the GEP base being at a negative offset, this time in the other
// direction.
if (!GEP1MaxLookupReached && !GEP2MaxLookupReached &&
isGEPBaseAtNegativeOffset(GEP2, DecompGEP2, DecompGEP1, V1Size))
return NoAlias;
// Do the base pointers alias?
AliasResult BaseAlias =
aliasCheck(UnderlyingV1, LocationSize::unknown(), AAMDNodes(),
UnderlyingV2, LocationSize::unknown(), AAMDNodes(), AAQI);
// Check for geps of non-aliasing underlying pointers where the offsets are
// identical.
if ((BaseAlias == MayAlias) && V1Size == V2Size) {
// Do the base pointers alias assuming type and size.
AliasResult PreciseBaseAlias = aliasCheck(
UnderlyingV1, V1Size, V1AAInfo, UnderlyingV2, V2Size, V2AAInfo, AAQI);
if (PreciseBaseAlias == NoAlias) {
// See if the computed offset from the common pointer tells us about the
// relation of the resulting pointer.
// If the max search depth is reached the result is undefined
if (GEP2MaxLookupReached || GEP1MaxLookupReached)
return MayAlias;
// Same offsets.
if (GEP1BaseOffset == GEP2BaseOffset &&
DecompGEP1.VarIndices == DecompGEP2.VarIndices)
return NoAlias;
// If we get a No or May, then return it immediately, no amount of analysis
// will improve this situation.
if (BaseAlias != MustAlias) {
assert(BaseAlias == NoAlias || BaseAlias == MayAlias);
return BaseAlias;
// Otherwise, we have a MustAlias. Since the base pointers alias each other
// exactly, see if the computed offset from the common pointer tells us
// about the relation of the resulting pointer.
// If we know the two GEPs are based off of the exact same pointer (and not
// just the same underlying object), see if that tells us anything about
// the resulting pointers.
if (GEP1->getPointerOperand()->stripPointerCastsAndInvariantGroups() ==
GEP2->getPointerOperand()->stripPointerCastsAndInvariantGroups() &&
GEP1->getPointerOperandType() == GEP2->getPointerOperandType()) {
AliasResult R = aliasSameBasePointerGEPs(GEP1, V1Size, GEP2, V2Size, DL);
// If we couldn't find anything interesting, don't abandon just yet.
if (R != MayAlias)
return R;
// If the max search depth is reached, the result is undefined
if (GEP2MaxLookupReached || GEP1MaxLookupReached)
return MayAlias;
// Subtract the GEP2 pointer from the GEP1 pointer to find out their
// symbolic difference.
// From here on GEP1BaseOffset and DecompGEP1.VarIndices describe the
// difference GEP1 - GEP2 rather than GEP1 alone.
GEP1BaseOffset -= GEP2BaseOffset;
GetIndexDifference(DecompGEP1.VarIndices, DecompGEP2.VarIndices);
} else {
// Check to see if these two pointers are related by the getelementptr
// instruction. If one pointer is a GEP with a non-zero index of the other
// pointer, we know they cannot alias.
// If both accesses are unknown size, we can't do anything useful here.
if (V1Size == LocationSize::unknown() && V2Size == LocationSize::unknown())
return MayAlias;
AliasResult R = aliasCheck(UnderlyingV1, LocationSize::unknown(),
AAMDNodes(), V2, LocationSize::unknown(),
V2AAInfo, AAQI, nullptr, UnderlyingV2);
if (R != MustAlias) {
// If V2 may alias GEP base pointer, conservatively returns MayAlias.
// If V2 is known not to alias GEP base pointer, then the two values
// cannot alias per GEP semantics: "Any memory access must be done through
// a pointer value associated with an address range of the memory access,
// otherwise the behavior is undefined.".
assert(R == NoAlias || R == MayAlias);
return R;
// If the max search depth is reached the result is undefined
if (GEP1MaxLookupReached)
return MayAlias;
// In the two GEP Case, if there is no difference in the offsets of the
// computed pointers, the resultant pointers are a must alias. This
// happens when we have two lexically identical GEP's (for example).
// In the other case, if we have getelementptr <ptr>, 0, 0, 0, 0, ... and V2
// must aliases the GEP, the end result is a must alias also.
if (GEP1BaseOffset == 0 && DecompGEP1.VarIndices.empty())
return MustAlias;
// If there is a constant difference between the pointers, but the difference
// is less than the size of the associated memory object, then we know
// that the objects are partially overlapping. If the difference is
// greater, we know they do not overlap.
if (GEP1BaseOffset != 0 && DecompGEP1.VarIndices.empty()) {
if (GEP1BaseOffset.sge(0)) {
if (V2Size != LocationSize::unknown()) {
if (GEP1BaseOffset.ult(V2Size.getValue()))
return PartialAlias;
return NoAlias;
} else {
// We have the situation where:
// +                +
// | BaseOffset     |
// ---------------->|
// |-->V1Size       |-------> V2Size
// GEP1             V2
// We need to know that V2Size is not unknown, otherwise we might have
// stripped a gep with negative index ('gep <ptr>, -1, ...).
if (V1Size != LocationSize::unknown() &&
V2Size != LocationSize::unknown()) {
if ((-GEP1BaseOffset).ult(V1Size.getValue()))
return PartialAlias;
return NoAlias;
if (!DecompGEP1.VarIndices.empty()) {
APInt Modulo(MaxPointerSize, 0);
bool AllPositive = true;
for (unsigned i = 0, e = DecompGEP1.VarIndices.size(); i != e; ++i) {
// Try to distinguish something like &A[i][1] against &A[42][0].
// Grab the least significant bit set in any of the scales. We
// don't need std::abs here (even if the scale's negative) as we'll
// be ^'ing Modulo with itself later.
Modulo |= DecompGEP1.VarIndices[i].Scale;
if (AllPositive) {
// If the Value could change between cycles, then any reasoning about
// the Value this cycle may not hold in the next cycle. We'll just
// give up if we can't determine conditions that hold for every cycle:
const Value *V = DecompGEP1.VarIndices[i].V;
KnownBits Known =
computeKnownBits(V, DL, 0, &AC, dyn_cast<Instruction>(GEP1), DT);
bool SignKnownZero = Known.isNonNegative();
bool SignKnownOne = Known.isNegative();
// Zero-extension widens the variable, and so forces the sign
// bit to zero.
bool IsZExt = DecompGEP1.VarIndices[i].ZExtBits > 0 || isa<ZExtInst>(V);
SignKnownZero |= IsZExt;
SignKnownOne &= !IsZExt;
// If the variable begins with a zero then we know it's
// positive, regardless of whether the value is signed or
// unsigned.
APInt Scale = DecompGEP1.VarIndices[i].Scale;
AllPositive =
(SignKnownZero && Scale.sge(0)) || (SignKnownOne && Scale.slt(0));
// x & (x - 1) clears the lowest set bit of x, so x ^ (x & (x - 1)) keeps only
// that lowest set bit: Modulo becomes the least significant bit that is set
// in any of the scales OR'ed together above.
Modulo = Modulo ^ (Modulo & (Modulo - 1));
// We can compute the difference between the two addresses
// mod Modulo. Check whether that difference guarantees that the
// two locations do not alias.
APInt ModOffset = GEP1BaseOffset & (Modulo - 1);
if (V1Size != LocationSize::unknown() &&
V2Size != LocationSize::unknown() && ModOffset.uge(V2Size.getValue()) &&
(Modulo - ModOffset).uge(V1Size.getValue()))
return NoAlias;
// If we know all the variables are positive, then GEP1 >= GEP1BasePtr.
// If GEP1BasePtr > V2 (GEP1BaseOffset > 0) then we know the pointers
// don't alias if V2Size can fit in the gap between V2 and GEP1BasePtr.
if (AllPositive && GEP1BaseOffset.sgt(0) &&
V2Size != LocationSize::unknown() &&
// NOTE(review): the final operand of the condition above is missing from this
// listing. Per the comment it should compare GEP1BaseOffset against the
// known V2Size - confirm against upstream.
return NoAlias;
if (constantOffsetHeuristic(DecompGEP1.VarIndices, V1Size, V2Size,
GEP1BaseOffset, &AC, DT))
return NoAlias;
// Statically, we can see that the base objects are the same, but the
// pointers have dynamic offsets which we can't resolve. And none of our
// little tricks above worked.
return MayAlias;
/// Combines two alias results into one: identical results are kept, a
/// PartialAlias/MustAlias pair (in either order) yields PartialAlias, and any
/// other combination yields MayAlias.
static AliasResult MergeAliasResults(AliasResult A, AliasResult B) {
// If the results agree, take it.
if (A == B)
return A;
// A mix of PartialAlias and MustAlias is PartialAlias.
if ((A == PartialAlias && B == MustAlias) ||
(B == PartialAlias && A == MustAlias))
return PartialAlias;
// Otherwise, we don't know anything.
return MayAlias;
/// Provides a bunch of ad-hoc rules to disambiguate a Select instruction
/// against another.
/// Checks each arm of the select against V2 (or against the matching arm of
/// V2 when V2 is a select on the same condition) and merges the two results
/// with MergeAliasResults. Returns MayAlias as soon as the first arm does.
// NOTE(review): the return type line that should precede the signature below
// is missing from this listing. The body returns AliasResult values.
BasicAAResult::aliasSelect(const SelectInst *SI, LocationSize SISize,
const AAMDNodes &SIAAInfo, const Value *V2,
LocationSize V2Size, const AAMDNodes &V2AAInfo,
const Value *UnderV2, AAQueryInfo &AAQI) {
// If the values are Selects with the same condition, we can do a more precise
// check: just check for aliases between the values on corresponding arms.
if (const SelectInst *SI2 = dyn_cast<SelectInst>(V2))
if (SI->getCondition() == SI2->getCondition()) {
// True arm against true arm.
AliasResult Alias =
aliasCheck(SI->getTrueValue(), SISize, SIAAInfo, SI2->getTrueValue(),
V2Size, V2AAInfo, AAQI);
if (Alias == MayAlias)
return MayAlias;
// False arm against false arm.
AliasResult ThisAlias =
aliasCheck(SI->getFalseValue(), SISize, SIAAInfo,
SI2->getFalseValue(), V2Size, V2AAInfo, AAQI);
return MergeAliasResults(ThisAlias, Alias);
// If both arms of the Select node NoAlias or MustAlias V2, then returns
// NoAlias / MustAlias. Otherwise, returns MayAlias.
AliasResult Alias = aliasCheck(V2, V2Size, V2AAInfo, SI->getTrueValue(),
SISize, SIAAInfo, AAQI, UnderV2);
if (Alias == MayAlias)
return MayAlias;
AliasResult ThisAlias = aliasCheck(V2, V2Size, V2AAInfo, SI->getFalseValue(),
SISize, SIAAInfo, AAQI, UnderV2);
return MergeAliasResults(ThisAlias, Alias);
/// Provide a bunch of ad-hoc rules to disambiguate a PHI instruction against
/// another.
/// Checks the phi's incoming values against V2 and merges the per-value
/// results with MergeAliasResults, returning MayAlias as soon as any step
/// cannot do better.
// NOTE(review): lines starting with '+' or '-' inside this function are
// unified-diff markers kept from the patch this listing was taken from.
// '-' lines are the removed code and '+' lines are its replacement.
AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize,
const AAMDNodes &PNAAInfo, const Value *V2,
LocationSize V2Size,
const AAMDNodes &V2AAInfo,
const Value *UnderV2, AAQueryInfo &AAQI) {
// Track phi nodes we have visited. We use this information when we determine
// value equivalence.
// If the values are PHIs in the same block, we can do a more precise
// as well as efficient check: just check for aliases between the values
// on corresponding edges.
if (const PHINode *PN2 = dyn_cast<PHINode>(V2))
if (PN2->getParent() == PN->getParent()) {
AAQueryInfo::LocPair Locs(MemoryLocation(PN, PNSize, PNAAInfo),
MemoryLocation(V2, V2Size, V2AAInfo));
// Order the pair by pointer value; aliasCheck builds its cache key the same
// way, so the lookup below finds the entry it inserted.
if (PN > V2)
std::swap(Locs.first, Locs.second);
// Analyse the PHIs' inputs under the assumption that the PHIs are
// NoAlias.
// If the PHIs are May/MustAlias there must be (recursively) an input
// operand from outside the PHIs' cycle that is MayAlias/MustAlias or
// there must be an operation on the PHIs within the PHIs' value cycle
// that causes a MayAlias.
// Pretend the phis do not alias.
AliasResult Alias = NoAlias;
AliasResult OrigAliasResult;
// Limited lifetime iterator invalidated by the aliasCheck call below.
auto CacheIt = AAQI.AliasCache.find(Locs);
assert((CacheIt != AAQI.AliasCache.end()) &&
"There must exist an entry for the phi node");
OrigAliasResult = CacheIt->second;
CacheIt->second = NoAlias;
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
AliasResult ThisAlias =
aliasCheck(PN->getIncomingValue(i), PNSize, PNAAInfo,
// NOTE(review): the argument naming the corresponding incoming value of PN2
// is missing from this listing at this point - confirm against upstream.
V2Size, V2AAInfo, AAQI);
Alias = MergeAliasResults(ThisAlias, Alias);
if (Alias == MayAlias)
// NOTE(review): the statement guarded by the check above is missing from
// this listing (presumably an early exit from the loop).
// Reset if speculation failed.
if (Alias != NoAlias) {
auto Pair =
AAQI.AliasCache.insert(std::make_pair(Locs, OrigAliasResult));
assert(!Pair.second && "Entry must have existed");
Pair.first->second = OrigAliasResult;
return Alias;
SmallVector<Value *, 4> V1Srcs;
+ // For a recursive phi, that recurses through a constant gep, we can perform
+ // aliasing calculations using the other phi operands with an unknown size to
+ // specify that an unknown number of elements after the initial value are
+ // potentially accessed.
bool isRecursive = false;
- if (PV) {
+ auto CheckForRecPhi = [&](Value *PV) {
+ if (!EnableRecPhiAnalysis)
+ return false;
+ if (GEPOperator *PVGEP = dyn_cast<GEPOperator>(PV)) {
+ // Check whether the incoming value is a GEP that advances the pointer
+ // result of this PHI node (e.g. in a loop). If this is the case, we
+ // would recurse and always get a MayAlias. Handle this case specially
+ // below. We need to ensure that the GEP is inbounds and has a single
+ // constant, non-negative index so that we can check for alias with the
+ // initial value and an unknown but positive size.
+ if (PVGEP->getPointerOperand() == PN && PVGEP->isInBounds() &&
+ PVGEP->getNumIndices() == 1 && isa<ConstantInt>(PVGEP->idx_begin()) &&
+ !cast<ConstantInt>(PVGEP->idx_begin())->isNegative()) {
+ isRecursive = true;
+ return true;
+ }
+ }
+ return false;
+ };
+ if (PV) {
// If we have PhiValues then use it to get the underlying phi values.
const PhiValues::ValueSet &PhiValueSet = PV->getValuesForPhi(PN);
// If we have more phi values than the search depth then return MayAlias
// conservatively to avoid compile time explosion. The worst possible case
// is if both sides are PHI nodes. In which case, this is O(m x n) time
// where 'm' and 'n' are the number of PHI sources.
if (PhiValueSet.size() > MaxLookupSearchDepth)
return MayAlias;
// Add the values to V1Srcs
for (Value *PV1 : PhiValueSet) {
- if (EnableRecPhiAnalysis) {
- if (GEPOperator *PV1GEP = dyn_cast<GEPOperator>(PV1)) {
- // Check whether the incoming value is a GEP that advances the pointer
- // result of this PHI node (e.g. in a loop). If this is the case, we
- // would recurse and always get a MayAlias. Handle this case specially
- // below.
- if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 &&
- isa<ConstantInt>(PV1GEP->idx_begin())) {
- isRecursive = true;
- continue;
- }
- }
- }
+ if (CheckForRecPhi(PV1))
+ continue;
// NOTE(review): the statement that appends PV1 to V1Srcs is not visible in
// this listing for this loop.
} else {
// If we don't have PhiInfo then just look at the operands of the phi itself
// FIXME: Remove this once we can guarantee that we have PhiInfo always
SmallPtrSet<Value *, 4> UniqueSrc;
for (Value *PV1 : PN->incoming_values()) {
if (isa<PHINode>(PV1))
// If any of the source itself is a PHI, return MayAlias conservatively
// to avoid compile time explosion. The worst possible case is if both
// sides are PHI nodes. In which case, this is O(m x n) time where 'm'
// and 'n' are the number of PHI sources.
return MayAlias;
- if (EnableRecPhiAnalysis)
- if (GEPOperator *PV1GEP = dyn_cast<GEPOperator>(PV1)) {
- // Check whether the incoming value is a GEP that advances the pointer
- // result of this PHI node (e.g. in a loop). If this is the case, we
- // would recurse and always get a MayAlias. Handle this case specially
- // below.
- if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 &&
- isa<ConstantInt>(PV1GEP->idx_begin())) {
- isRecursive = true;
- continue;
- }
- }
+ if (CheckForRecPhi(PV1))
+ continue;
if (UniqueSrc.insert(PV1).second)
// NOTE(review): the statement guarded by the check above is missing from
// this listing (presumably the append of PV1 to V1Srcs).
// If V1Srcs is empty then that means that the phi has no underlying non-phi
// value. This should only be possible in blocks unreachable from the entry
// block, but return MayAlias just in case.
if (V1Srcs.empty())
return MayAlias;
// If this PHI node is recursive, set the size of the accessed memory to
// unknown to represent all the possible values the GEP could advance the
// pointer to.
if (isRecursive)
PNSize = LocationSize::unknown();
AliasResult Alias = aliasCheck(V2, V2Size, V2AAInfo, V1Srcs[0], PNSize,
PNAAInfo, AAQI, UnderV2);
// Early exit if the check of the first PHI source against V2 is MayAlias.
// Other results are not possible.
if (Alias == MayAlias)
return MayAlias;
// With recursive phis we cannot guarantee that MustAlias/PartialAlias will
// remain valid to all elements and needs to conservatively return MayAlias.
if (isRecursive && Alias != NoAlias)
return MayAlias;
// If all sources of the PHI node NoAlias or MustAlias V2, then returns
// NoAlias / MustAlias. Otherwise, returns MayAlias.
for (unsigned i = 1, e = V1Srcs.size(); i != e; ++i) {
Value *V = V1Srcs[i];
AliasResult ThisAlias =
aliasCheck(V2, V2Size, V2AAInfo, V, PNSize, PNAAInfo, AAQI, UnderV2);
Alias = MergeAliasResults(ThisAlias, Alias);
if (Alias == MayAlias)
// NOTE(review): the statement guarded by the check above is missing from
// this listing (presumably an early exit from the loop).
return Alias;
/// Provides a bunch of ad-hoc rules to disambiguate in common cases, such as
/// array references.
/// O1 and O2 are the underlying objects of V1 and V2. When either is null it
/// is computed here with GetUnderlyingObject. Results other than the final
/// fallback are recorded in AAQI.AliasCache before being returned.
AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
AAMDNodes V1AAInfo, const Value *V2,
LocationSize V2Size, AAMDNodes V2AAInfo,
AAQueryInfo &AAQI, const Value *O1,
const Value *O2) {
// If either of the memory references is empty, it doesn't matter what the
// pointer values are.
if (V1Size.isZero() || V2Size.isZero())
return NoAlias;
// Strip off any casts if they exist.
V1 = V1->stripPointerCastsAndInvariantGroups();
V2 = V2->stripPointerCastsAndInvariantGroups();
// If V1 or V2 is undef, the result is NoAlias because we can always pick a
// value for undef that aliases nothing in the program.
if (isa<UndefValue>(V1) || isa<UndefValue>(V2))
return NoAlias;
// Are we checking for alias of the same value?
// Because we look 'through' phi nodes, we could look at "Value" pointers from
// different iterations. We must therefore make sure that this is not the
// case. The function isValueEqualInPotentialCycles ensures that this cannot
// happen by looking at the visited phi nodes and making sure they cannot
// reach the value.
if (isValueEqualInPotentialCycles(V1, V2))
return MustAlias;
if (!V1->getType()->isPointerTy() || !V2->getType()->isPointerTy())
return NoAlias; // Scalars cannot alias each other
// Figure out what objects these things are pointing to if we can.
if (O1 == nullptr)
O1 = GetUnderlyingObject(V1, DL, MaxLookupSearchDepth);
if (O2 == nullptr)
O2 = GetUnderlyingObject(V2, DL, MaxLookupSearchDepth);
// Null values in the default address space don't point to any object, so they
// don't alias any other pointer.
if (const ConstantPointerNull *CPN = dyn_cast<ConstantPointerNull>(O1))
if (!NullPointerIsDefined(&F, CPN->getType()->getAddressSpace()))
return NoAlias;
if (const ConstantPointerNull *CPN = dyn_cast<ConstantPointerNull>(O2))
if (!NullPointerIsDefined(&F, CPN->getType()->getAddressSpace()))
return NoAlias;
if (O1 != O2) {
// If V1/V2 point to two different objects, we know that we have no alias.
if (isIdentifiedObject(O1) && isIdentifiedObject(O2))
return NoAlias;
// Constant pointers can't alias with non-const isIdentifiedObject objects.
if ((isa<Constant>(O1) && isIdentifiedObject(O2) && !isa<Constant>(O2)) ||
(isa<Constant>(O2) && isIdentifiedObject(O1) && !isa<Constant>(O1)))
return NoAlias;
// Function arguments can't alias with things that are known to be
// unambiguously identified at the function level.
if ((isa<Argument>(O1) && isIdentifiedFunctionLocal(O2)) ||
(isa<Argument>(O2) && isIdentifiedFunctionLocal(O1)))
return NoAlias;
// If one pointer is the result of a call/invoke or load and the other is a
// non-escaping local object within the same function, then we know the
// object couldn't escape to a point where the call could return it.
// Note that if the pointers are in different functions, there are a
// variety of complications. A call with a nocapture argument may still
// temporarily store the nocapture argument's value in a temporary memory
// location if that memory location doesn't escape. Or it may pass a
// nocapture value to other functions as long as they don't capture it.
if (isEscapeSource(O1) &&
isNonEscapingLocalObject(O2, &AAQI.IsCapturedCache))
return NoAlias;
if (isEscapeSource(O2) &&
isNonEscapingLocalObject(O1, &AAQI.IsCapturedCache))
return NoAlias;
// NOTE(review): the closing brace of the "O1 != O2" block is not visible in
// this listing. NullIsValidLocation below is also read by the "O1 == O2"
// check near the end of the function, so its declaration presumably sits at
// function scope - confirm against upstream.
// If the size of one access is larger than the entire object on the other
// side, then we know such behavior is undefined and can assume no alias.
bool NullIsValidLocation = NullPointerIsDefined(&F);
if ((isObjectSmallerThan(
O2, getMinimalExtentFrom(*V1, V1Size, DL, NullIsValidLocation), DL,
TLI, NullIsValidLocation)) ||
// NOTE(review): the opening of the second call (the symmetric
// isObjectSmallerThan test on O1) is missing from this listing at this point.
O1, getMinimalExtentFrom(*V2, V2Size, DL, NullIsValidLocation), DL,
TLI, NullIsValidLocation)))
return NoAlias;
// Check the cache before climbing up use-def chains. This also terminates
// otherwise infinitely recursive queries.
AAQueryInfo::LocPair Locs(MemoryLocation(V1, V1Size, V1AAInfo),
MemoryLocation(V2, V2Size, V2AAInfo));
// Order the pair by pointer value so (V1, V2) and (V2, V1) share a cache key.
if (V1 > V2)
std::swap(Locs.first, Locs.second);
// The entry is seeded with MayAlias. If one already exists, reuse its result.
std::pair<AAQueryInfo::AliasCacheT::iterator, bool> Pair =
AAQI.AliasCache.try_emplace(Locs, MayAlias);
if (!Pair.second)
return Pair.first->second;
// FIXME: This isn't aggressively handling alias(GEP, PHI) for example: if the
// GEP can't simplify, we don't even look at the PHI cases.
// Put the GEP (if only one side is a GEP) in V1 before calling aliasGEP.
if (!isa<GEPOperator>(V1) && isa<GEPOperator>(V2)) {
std::swap(V1, V2);
std::swap(V1Size, V2Size);
std::swap(O1, O2);
std::swap(V1AAInfo, V2AAInfo);
if (const GEPOperator *GV1 = dyn_cast<GEPOperator>(V1)) {
AliasResult Result =
aliasGEP(GV1, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O1, O2, AAQI);
if (Result != MayAlias) {
auto ItInsPair = AAQI.AliasCache.insert(std::make_pair(Locs, Result));
assert(!ItInsPair.second && "Entry must have existed");
ItInsPair.first->second = Result;
return Result;
// Same normalisation for PHI nodes.
if (isa<PHINode>(V2) && !isa<PHINode>(V1)) {
std::swap(V1, V2);
std::swap(O1, O2);
std::swap(V1Size, V2Size);
std::swap(V1AAInfo, V2AAInfo);
if (const PHINode *PN = dyn_cast<PHINode>(V1)) {
AliasResult Result =
aliasPHI(PN, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O2, AAQI);
if (Result != MayAlias) {
Pair = AAQI.AliasCache.try_emplace(Locs, Result);
assert(!Pair.second && "Entry must have existed");
return Pair.first->second = Result;
// Same normalisation for select instructions.
if (isa<SelectInst>(V2) && !isa<SelectInst>(V1)) {
std::swap(V1, V2);
std::swap(O1, O2);
std::swap(V1Size, V2Size);
std::swap(V1AAInfo, V2AAInfo);
if (const SelectInst *S1 = dyn_cast<SelectInst>(V1)) {
AliasResult Result =
aliasSelect(S1, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O2, AAQI);
if (Result != MayAlias) {
Pair = AAQI.AliasCache.try_emplace(Locs, Result);
assert(!Pair.second && "Entry must have existed");
return Pair.first->second = Result;
// If both pointers are pointing into the same object and one of them
// accesses the entire object, then the accesses must overlap in some way.
if (O1 == O2)
if (V1Size.isPrecise() && V2Size.isPrecise() &&
(isObjectSize(O1, V1Size.getValue(), DL, TLI, NullIsValidLocation) ||
isObjectSize(O2, V2Size.getValue(), DL, TLI, NullIsValidLocation))) {
Pair = AAQI.AliasCache.try_emplace(Locs, PartialAlias);
assert(!Pair.second && "Entry must have existed");
return Pair.first->second = PartialAlias;
// Recurse back into the best AA results we have, potentially with refined
// memory locations. We have already ensured that BasicAA has a MayAlias
// cache result for these, so any recursion back into BasicAA won't loop.
AliasResult Result = getBestAAResults().alias(Locs.first, Locs.second, AAQI);
Pair = AAQI.AliasCache.try_emplace(Locs, Result);
assert(!Pair.second && "Entry must have existed");
return Pair.first->second = Result;
/// Check whether two Values can be considered equivalent.
/// In addition to pointer equivalence of \p V1 and \p V2 this checks whether
/// they can not be part of a cycle in the value graph by looking at all
/// visited phi nodes and making sure that the phis cannot reach the value. We
/// have to do this because we are looking through phi nodes (that is, we say
/// noalias(V, phi(VA, VB)) if noalias(V, VA) and noalias(V, VB)).
/// Returns false when the values differ, when too many phi blocks have been
/// visited to check, or when any visited phi block can reach the value.
bool BasicAAResult::isValueEqualInPotentialCycles(const Value *V,
const Value *V2) {
if (V != V2)
return false;
// Only instructions go through the reachability check below; any other
// identical value is accepted here.
const Instruction *Inst = dyn_cast<Instruction>(V);
if (!Inst)
return true;
// No phi blocks recorded: nothing to check the value against.
if (VisitedPhiBBs.empty())
return true;
// Give up (answer "not equal") rather than run too many reachability queries.
if (VisitedPhiBBs.size() > MaxNumPhiBBsValueReachabilityCheck)
return false;
// Make sure that the visited phis cannot reach the Value. This ensures that
// the Values cannot come from different iterations of a potential cycle the
// phi nodes could be involved in.
for (auto *P : VisitedPhiBBs)
if (isPotentiallyReachable(&P->front(), Inst, nullptr, DT, LI))
return false;
return true;
/// Computes the symbolic difference between two de-composed GEPs.
/// Dest and Src are the variable indices from two decomposed GetElementPtr
/// instructions GEP1 and GEP2 which have common base pointers.
/// Dest is updated in place: matching Src entries have their scale subtracted
/// (or the entry erased when the scales are equal), and unmatched Src entries
/// are built with a negated scale.
// NOTE(review): several control-flow statements of this function are missing
// from this listing. Each gap is flagged below. Confirm against upstream
// before relying on the shape shown here.
void BasicAAResult::GetIndexDifference(
SmallVectorImpl<VariableGEPIndex> &Dest,
const SmallVectorImpl<VariableGEPIndex> &Src) {
if (Src.empty())
// NOTE(review): the statement guarded by the check above is missing
// (presumably an early return). As listed it would guard the loop below.
for (unsigned i = 0, e = Src.size(); i != e; ++i) {
const Value *V = Src[i].V;
unsigned ZExtBits = Src[i].ZExtBits, SExtBits = Src[i].SExtBits;
APInt Scale = Src[i].Scale;
// Find V in Dest. This is N^2, but pointer indices almost never have more
// than a few variable indexes.
for (unsigned j = 0, e = Dest.size(); j != e; ++j) {
if (!isValueEqualInPotentialCycles(Dest[j].V, V) ||
Dest[j].ZExtBits != ZExtBits || Dest[j].SExtBits != SExtBits)
// NOTE(review): the statement guarded by the mismatch check above is missing
// (presumably it skips to the next Dest entry).
// If we found it, subtract off Scale V's from the entry in Dest. If it
// goes to zero, remove the entry.
if (Dest[j].Scale != Scale)
Dest[j].Scale -= Scale;
// NOTE(review): per the comment above, the erase below should run only when
// the scales are equal. The "else" is not visible in this listing.
Dest.erase(Dest.begin() + j);
// Zero marks this Src entry as consumed for the check after the inner loop.
Scale = 0;
// If we didn't consume this entry, add it to the end of the Dest list.
if (!!Scale) {
VariableGEPIndex Entry = {V, ZExtBits, SExtBits, -Scale};
// NOTE(review): the statement that appends Entry to Dest is missing from
// this listing.
/// Heuristic for exactly two variable indices whose scales are negations of
/// each other and whose underlying values differ only by a constant offset.
/// Returns true when the minimum byte distance implied by that offset is
/// large enough to fit both access sizes (each widened by |BaseOffset|), and
/// false when any precondition fails or the gap is too small.
// NOTE(review): the body refers to "the example above", but no such example
// is present in this listing.
bool BasicAAResult::constantOffsetHeuristic(
const SmallVectorImpl<VariableGEPIndex> &VarIndices,
LocationSize MaybeV1Size, LocationSize MaybeV2Size, const APInt &BaseOffset,
AssumptionCache *AC, DominatorTree *DT) {
// Requires exactly two variable indices and two known access sizes.
if (VarIndices.size() != 2 || MaybeV1Size == LocationSize::unknown() ||
MaybeV2Size == LocationSize::unknown())
return false;
const uint64_t V1Size = MaybeV1Size.getValue();
const uint64_t V2Size = MaybeV2Size.getValue();
const VariableGEPIndex &Var0 = VarIndices[0], &Var1 = VarIndices[1];
// The two indices must use the same extensions and opposite scales.
if (Var0.ZExtBits != Var1.ZExtBits || Var0.SExtBits != Var1.SExtBits ||
Var0.Scale != -Var1.Scale)
return false;
unsigned Width = Var1.V->getType()->getIntegerBitWidth();
// We'll strip off the Extensions of Var0 and Var1 and do another round
// of GetLinearExpression decomposition. In the example above, if Var0
// is zext(%x + 1) we should get V1 == %x and V1Offset == 1.
APInt V0Scale(Width, 0), V0Offset(Width, 0), V1Scale(Width, 0),
V1Offset(Width, 0);
bool NSW = true, NUW = true;
unsigned V0ZExtBits = 0, V0SExtBits = 0, V1ZExtBits = 0, V1SExtBits = 0;
const Value *V0 = GetLinearExpression(Var0.V, V0Scale, V0Offset, V0ZExtBits,
V0SExtBits, DL, 0, AC, DT, NSW, NUW);
// Reset the wrap flags before the second decomposition.
NSW = true;
NUW = true;
const Value *V1 = GetLinearExpression(Var1.V, V1Scale, V1Offset, V1ZExtBits,
V1SExtBits, DL, 0, AC, DT, NSW, NUW);
if (V0Scale != V1Scale || V0ZExtBits != V1ZExtBits ||
V0SExtBits != V1SExtBits || !isValueEqualInPotentialCycles(V0, V1))
return false;
// We have a hit - Var0 and Var1 only differ by a constant offset!
// If we've been sext'ed then zext'd the maximum difference between Var0 and
// Var1 is possible to calculate, but we're just interested in the absolute
// minimum difference between the two. The minimum distance may occur due to
// wrapping; consider "add i3 %i, 5": if %i == 7 then 7 + 5 mod 8 == 4, and so
// the minimum distance between %i and %i + 5 is 3.
APInt MinDiff = V0Offset - V1Offset, Wrapped = -MinDiff;
MinDiff = APIntOps::umin(MinDiff, Wrapped);
// Convert the element distance to bytes using the magnitude of the scale.
APInt MinDiffBytes =
MinDiff.zextOrTrunc(Var0.Scale.getBitWidth()) * Var0.Scale.abs();
// We can't definitely say whether GEP1 is before or after V2 due to wrapping
// arithmetic (i.e. for some values of GEP1 and V2 GEP1 < V2, and for other
// values GEP1 > V2). We'll therefore only declare NoAlias if both V1Size and
// V2Size can fit in the MinDiffBytes gap.
return MinDiffBytes.uge(V1Size + BaseOffset.abs()) &&
MinDiffBytes.uge(V2Size + BaseOffset.abs());
// BasicAliasAnalysis Pass
// NOTE(review): the definitions in this section are fragments. Closing
// braces, several argument lists and whole bodies are missing from this
// listing. The comments below describe only what is visible.
AnalysisKey BasicAA::Key;
// New pass manager entry point. Builds a BasicAAResult for F starting from
// the module's DataLayout. The remaining constructor arguments are not
// visible here.
BasicAAResult BasicAA::run(Function &F, FunctionAnalysisManager &AM) {
return BasicAAResult(F.getParent()->getDataLayout(),
// Legacy pass manager wrapper (constructor body not visible in this listing).
BasicAAWrapperPass::BasicAAWrapperPass() : FunctionPass(ID) {
char BasicAAWrapperPass::ID = 0;
void BasicAAWrapperPass::anchor() {}
// Registers the wrapper pass under the name "basic-aa".
INITIALIZE_PASS_BEGIN(BasicAAWrapperPass, "basic-aa",
"Basic Alias Analysis (stateless AA impl)", true, true)
INITIALIZE_PASS_END(BasicAAWrapperPass, "basic-aa",
"Basic Alias Analysis (stateless AA impl)", true, true)
// Factory returning a newly allocated wrapper pass.
FunctionPass *llvm::createBasicAAWrapperPass() {
return new BasicAAWrapperPass();
// Fetches the required analyses (assumption cache, target library info,
// dominator tree) and the optional ones (loop info, phi values, passed as
// nullptr when unavailable), then rebuilds Result. Returns false.
bool BasicAAWrapperPass::runOnFunction(Function &F) {
auto &ACT = getAnalysis<AssumptionCacheTracker>();
auto &TLIWP = getAnalysis<TargetLibraryInfoWrapperPass>();
auto &DTWP = getAnalysis<DominatorTreeWrapperPass>();
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *PVWP = getAnalysisIfAvailable<PhiValuesWrapperPass>();
// NOTE(review): DTWP is fetched above but no argument derived from it is
// visible in the constructor call below. A line is presumably missing from
// this listing.
Result.reset(new BasicAAResult(F.getParent()->getDataLayout(), F,
TLIWP.getTLI(F), ACT.getAssumptionCache(F),
LIWP ? &LIWP->getLoopInfo() : nullptr,
PVWP ? &PVWP->getResult() : nullptr));
return false;
// NOTE(review): the body of getAnalysisUsage is missing from this listing.
void BasicAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
// Builds a BasicAAResult for F on behalf of a legacy pass P (argument list
// truncated in this listing).
BasicAAResult llvm::createLegacyPMBasicAAResult(Pass &P, Function &F) {
return BasicAAResult(
F.getParent()->getDataLayout(), F,
diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
index 6c5ef0255a08..204fb556d810 100644
--- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
+++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -1,444 +1,448 @@
//===- LocalStackSlotAllocation.cpp - Pre-allocate locals to stack slots --===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This pass assigns local frame indices to stack slots relative to one another
// and allocates additional base registers to access them when the target
// estimates they are likely to be out of range of stack pointer and frame
// pointer relative addressing.
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <tuple>
using namespace llvm;
#define DEBUG_TYPE "localstackalloc"
STATISTIC(NumAllocations, "Number of frame indices allocated into local block");
STATISTIC(NumBaseRegisters, "Number of virtual frame base registers allocated");
STATISTIC(NumReplacements, "Number of frame indices references replaced");
namespace {
/// One recorded reference from a machine instruction to a frame index.
///
/// Stores the referencing instruction, the local offset of the referenced
/// frame index, the frame index itself, and the order in which the reference
/// was recorded. operator< compares lexicographically on
/// (LocalOffset, FrameIdx, Order).
class FrameRef {
MachineBasicBlock::iterator MI; // Instr referencing the frame
int64_t LocalOffset; // Local offset of the frame idx referenced
int FrameIdx; // The frame index
// Order reference instruction appears in program. Used to ensure
// deterministic order when multiple instructions may reference the same
// location.
unsigned Order;
// Plain member-wise constructor; performs no validation of its arguments.
FrameRef(MachineInstr *I, int64_t Offset, int Idx, unsigned Ord) :
MI(I), LocalOffset(Offset), FrameIdx(Idx), Order(Ord) {}
// Ordering: local offset first, then frame index, then recording order as
// the final tie-breaker.
bool operator<(const FrameRef &RHS) const {
return std::tie(LocalOffset, FrameIdx, Order) <
std::tie(RHS.LocalOffset, RHS.FrameIdx, RHS.Order);
// Read-only accessors for the stored fields.
MachineBasicBlock::iterator getMachineInstr() const { return MI; }
int64_t getLocalOffset() const { return LocalOffset; }
int getFrameIndex() const { return FrameIdx; }
/// MachineFunctionPass that lays out stack objects in a local block
/// (calculateFrameObjectOffsets) and then rewrites frame-index operands to use
/// virtual base registers where the target asks for them
/// (insertFrameReferenceRegisters).
class LocalStackSlotPass: public MachineFunctionPass {
// Local offset per frame index; written by AdjustStackOffset and read by
// insertFrameReferenceRegisters, both indexing it by frame index.
SmallVector<int64_t, 16> LocalOffsets;
/// StackObjSet - A set of stack object indexes
using StackObjSet = SmallSetVector<int, 8>;
// Assigns a local offset to one frame index and advances Offset / MaxAlign.
void AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx, int64_t &Offset,
bool StackGrowsDown, Align &MaxAlign);
// Assigns local offsets to every frame index in UnassignedObjs.
void AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
SmallSet<int, 16> &ProtectedObjs,
MachineFrameInfo &MFI, bool StackGrowsDown,
int64_t &Offset, Align &MaxAlign);
void calculateFrameObjectOffsets(MachineFunction &Fn);
// Returns true when at least one virtual base register was used.
bool insertFrameReferenceRegisters(MachineFunction &Fn);
static char ID; // Pass identification, replacement for typeid
// NOTE(review): the bodies of the constructor and of getAnalysisUsage are not
// visible in this excerpt.
explicit LocalStackSlotPass() : MachineFunctionPass(ID) {
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
} // end anonymous namespace
char LocalStackSlotPass::ID = 0;
char &llvm::LocalStackSlotAllocationID = LocalStackSlotPass::ID;
"Local Stack Slot Allocation", false, false)
/// Pass entry point for one machine function.
///
/// Exits early when the target reports (via requiresVirtualBaseRegisters)
/// that it does not need virtual base registers, or when the function has no
/// frame objects. Otherwise inserts virtual base registers for frame index
/// references. Both visible return statements return true.
bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) {
MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
unsigned LocalObjectCount = MFI.getObjectIndexEnd();
// If the target doesn't want/need this pass, or if there are no locals
// to consider, early exit.
if (!TRI->requiresVirtualBaseRegisters(MF) || LocalObjectCount == 0)
return true;
// NOTE(review): the statements that the next two comments describe are not
// present in this excerpt -- confirm against the upstream file.
// Make sure we have enough space to store the local offsets.
// Lay out the local blob.
// Insert virtual base registers to resolve frame index references.
bool UsedBaseRegs = insertFrameReferenceRegisters(MF);
// Tell MFI whether any base registers were allocated. PEI will only
// want to use the local block allocations from this pass if there were any.
// Otherwise, PEI can do a bit better job of getting the alignment right
// without a hole at the start since it knows the alignment of the stack
// at the start of local allocation, and this pass doesn't.
// NOTE(review): no use of UsedBaseRegs is visible here; the call that reports
// it to MFI appears to be missing from the excerpt.
return true;
/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
///
/// Assigns a local offset to \p FrameIdx and advances the running offset.
///
/// \param MFI            frame info queried for the object's size and
///                       alignment, and told the resulting local offset.
/// \param FrameIdx       frame index being placed.
/// \param Offset         in/out running offset; grows by the object size and
///                       is rounded up to the object's alignment.
/// \param StackGrowsDown selects whether the size is added before or after
///                       the offset is recorded, and whether the recorded
///                       offset is negated.
/// \param MaxAlign       in/out; raised to the object's alignment if larger.
void LocalStackSlotPass::AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
int64_t &Offset, bool StackGrowsDown,
Align &MaxAlign) {
// If the stack grows down, add the object size to find the lowest address.
if (StackGrowsDown)
Offset += MFI.getObjectSize(FrameIdx);
Align Alignment = MFI.getObjectAlign(FrameIdx);
// If the alignment of this object is greater than that of the stack, then
// increase the stack alignment to match.
MaxAlign = std::max(MaxAlign, Alignment);
// Adjust to alignment boundary.
Offset = alignTo(Offset, Alignment);
// The recorded offset is the negated running offset when the stack grows
// down, and the running offset itself otherwise.
int64_t LocalOffset = StackGrowsDown ? -Offset : Offset;
LLVM_DEBUG(dbgs() << "Allocate FI(" << FrameIdx << ") to local offset "
<< LocalOffset << "\n");
// Keep the offset available for base register allocation
LocalOffsets[FrameIdx] = LocalOffset;
// And tell MFI about it for PEI to use later
MFI.mapLocalFrameObject(FrameIdx, LocalOffset);
// When the stack grows up the object starts at the recorded offset, so its
// size is added only after the offset has been recorded.
if (!StackGrowsDown)
Offset += MFI.getObjectSize(FrameIdx);
/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e.,
/// those required to be close to the Stack Protector) to stack offsets.
///
/// Walks \p UnassignedObjs in iteration order and calls AdjustStackOffset for
/// each frame index, threading \p Offset and \p MaxAlign through the calls.
void LocalStackSlotPass::AssignProtectedObjSet(
const StackObjSet &UnassignedObjs, SmallSet<int, 16> &ProtectedObjs,
MachineFrameInfo &MFI, bool StackGrowsDown, int64_t &Offset,
Align &MaxAlign) {
for (StackObjSet::const_iterator I = UnassignedObjs.begin(),
E = UnassignedObjs.end(); I != E; ++I) {
int i = *I;
AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign);
// NOTE(review): no update of ProtectedObjs is visible in this excerpt, although
// the caller later tests ProtectedObjs.count(i); a line may be missing here --
// confirm against the upstream file.
/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
/// abstract stack objects.
///
/// Order of assignment as visible here: the stack protector slot (if the
/// function has one), then the LargeArray, SmallArray and AddrOf sets, then
/// the remaining frame indices. Lines prefixed with '+' are additions made by
/// the enclosing diff: they skip objects whose stack ID the target reports as
/// not safe for the local area.
///
/// NOTE(review): several `if` statements below have lost their bodies in this
/// excerpt, as have the case bodies of the switch -- confirm against the
/// upstream file before relying on the exact control flow.
void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
// Loop over all of the stack objects, assigning sequential addresses...
MachineFrameInfo &MFI = Fn.getFrameInfo();
const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
bool StackGrowsDown =
TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
// Running offset and maximum alignment, both updated by AdjustStackOffset.
int64_t Offset = 0;
Align MaxAlign;
// Make sure that the stack protector comes before the local variables on the
// stack.
SmallSet<int, 16> ProtectedObjs;
if (MFI.hasStackProtectorIndex()) {
int StackProtectorFI = MFI.getStackProtectorIndex();
// We need to make sure we didn't pre-allocate the stack protector when
// doing this.
// If we already have a stack protector, this will re-assign it to a slot
// that is **not** covering the protected objects.
assert(!MFI.isObjectPreAllocated(StackProtectorFI) &&
"Stack protector pre-allocated in LocalStackSlotAllocation");
StackObjSet LargeArrayObjs;
StackObjSet SmallArrayObjs;
StackObjSet AddrOfObjs;
// The stack protector slot itself is placed first.
AdjustStackOffset(MFI, StackProtectorFI, Offset, StackGrowsDown, MaxAlign);
// Assign large stack objects first.
for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
if (MFI.isDeadObjectIndex(i))
if (StackProtectorFI == (int)i)
+ if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i)))
+ continue;
switch (MFI.getObjectSSPLayout(i)) {
case MachineFrameInfo::SSPLK_None:
case MachineFrameInfo::SSPLK_SmallArray:
case MachineFrameInfo::SSPLK_AddrOf:
case MachineFrameInfo::SSPLK_LargeArray:
llvm_unreachable("Unexpected SSPLayoutKind.");
// Place the three protected sets in this fixed order.
AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
Offset, MaxAlign);
AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
Offset, MaxAlign);
AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown,
Offset, MaxAlign);
// Then assign frame offsets to stack objects that are not used to spill
// callee saved registers.
for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
if (MFI.isDeadObjectIndex(i))
if (MFI.getStackProtectorIndex() == (int)i)
if (ProtectedObjs.count(i))
+ if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i)))
+ continue;
AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign);
// Remember how big this blob of stack space is
/// Asks the target whether \p BaseReg can be used by \p MI to reach a local
/// frame object.
///
/// The offset handed to the target is
/// FrameSizeAdjust + LocalFrameOffset - BaseOffset.
///
/// \returns the result of TRI->isFrameOffsetLegal for that offset.
static inline bool
lookupCandidateBaseReg(unsigned BaseReg,
int64_t BaseOffset,
int64_t FrameSizeAdjust,
int64_t LocalFrameOffset,
const MachineInstr &MI,
const TargetRegisterInfo *TRI) {
// Check if the relative offset from where the base register points to the
// target address is in range for the instruction.
int64_t Offset = FrameSizeAdjust + LocalFrameOffset - BaseOffset;
return TRI->isFrameOffsetLegal(&MI, BaseReg, Offset);
/// Rewrites frame-index operands to use virtual base registers.
///
/// Phase 1 collects a FrameRef for each frame-index operand that the visible
/// checks let through (isObjectPreAllocated, then TRI->needsFrameBaseReg).
/// Phase 2 walks the collected references and, for each one, either reuses
/// the current base register (when lookupCandidateBaseReg accepts it) or
/// materializes a new one in the entry block, then calls
/// TRI->resolveFrameIndex on the instruction.
///
/// \returns true if a base register was used for at least one reference.
///
/// NOTE(review): several statements (loop `continue`/`break` bodies, the sort
/// call, part of one condition and part of one call's argument list) are
/// missing from this excerpt -- confirm against the upstream file.
bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
// Scan the function's instructions looking for frame index references.
// For each, ask the target if it wants a virtual base register for it
// based on what we can tell it about where the local will end up in the
// stack frame. If it wants one, re-use a suitable one we've previously
// allocated, or if there isn't one that fits the bill, allocate a new one
// and ask the target to create a defining instruction for it.
bool UsedBaseReg = false;
MachineFrameInfo &MFI = Fn.getFrameInfo();
const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
bool StackGrowsDown =
TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
// Collect all of the instructions in the block that reference
// a frame index. Also store the frame index referenced to ease later
// lookup. (For any insn that has more than one FI reference, we arbitrarily
// choose the first one).
SmallVector<FrameRef, 64> FrameReferenceInsns;
// Monotonic counter stored in each FrameRef as its final ordering key.
unsigned Order = 0;
for (MachineBasicBlock &BB : Fn) {
for (MachineInstr &MI : BB) {
// Debug value, stackmap and patchpoint instructions can't be out of
// range, so they don't need any updates.
if (MI.isDebugInstr() || MI.getOpcode() == TargetOpcode::STATEPOINT ||
MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT)
// For now, allocate the base register(s) within the basic block
// where they're used, and don't try to keep them around outside
// of that. It may be beneficial to try sharing them more broadly
// than that, but the increased register pressure makes that a
// tricky thing to balance. Investigate if re-materializing these
// becomes an issue.
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
// Consider replacing all frame index operands that reference
// an object allocated in the local block.
if (MI.getOperand(i).isFI()) {
// Don't try this with values not in the local block.
if (!MFI.isObjectPreAllocated(MI.getOperand(i).getIndex()))
int Idx = MI.getOperand(i).getIndex();
int64_t LocalOffset = LocalOffsets[Idx];
if (!TRI->needsFrameBaseReg(&MI, LocalOffset))
FrameReferenceInsns.push_back(FrameRef(&MI, LocalOffset, Idx, Order++));
// Sort the frame references by local offset.
// Use frame index as a tie-breaker in case MI's have the same offset.
MachineBasicBlock *Entry = &Fn.front();
// Current base register and the offset it was created for; both are reused
// across iterations of the loop below.
unsigned BaseReg = 0;
int64_t BaseOffset = 0;
// Loop through the frame references and allocate for them as necessary.
for (int ref = 0, e = FrameReferenceInsns.size(); ref < e ; ++ref) {
FrameRef &FR = FrameReferenceInsns[ref];
MachineInstr &MI = *FR.getMachineInstr();
int64_t LocalOffset = FR.getLocalOffset();
int FrameIdx = FR.getFrameIndex();
assert(MFI.isObjectPreAllocated(FrameIdx) &&
"Only pre-allocated locals expected!");
// We need to keep the references to the stack protector slot through frame
// index operands so that it gets resolved by PEI rather than this pass.
// This avoids accesses to the stack protector through virtual base
// registers, and forces PEI to address it using fp/sp/bp.
if (MFI.hasStackProtectorIndex() &&
FrameIdx == MFI.getStackProtectorIndex())
LLVM_DEBUG(dbgs() << "Considering: " << MI);
// Locate the operand of MI that refers to FrameIdx.
unsigned idx = 0;
for (unsigned f = MI.getNumOperands(); idx != f; ++idx) {
if (!MI.getOperand(idx).isFI())
if (FrameIdx == MI.getOperand(idx).getIndex())
assert(idx < MI.getNumOperands() && "Cannot find FI operand");
int64_t Offset = 0;
// The local frame size is added to the offsets only when the stack grows
// down.
int64_t FrameSizeAdjust = StackGrowsDown ? MFI.getLocalFrameSize() : 0;
LLVM_DEBUG(dbgs() << " Replacing FI in: " << MI);
// If we have a suitable base register available, use it; otherwise
// create a new one. Note that any offset encoded in the
// instruction itself will be taken into account by the target,
// so we don't have to adjust for it here when reusing a base
// register.
if (UsedBaseReg &&
lookupCandidateBaseReg(BaseReg, BaseOffset, FrameSizeAdjust,
LocalOffset, MI, TRI)) {
LLVM_DEBUG(dbgs() << " Reusing base register " << BaseReg << "\n");
// We found a register to reuse.
Offset = FrameSizeAdjust + LocalOffset - BaseOffset;
} else {
// No previously defined register was in range, so create a new one.
int64_t InstrOffset = TRI->getFrameIndexInstrOffset(&MI, idx);
// Saved so that BaseOffset can be restored if no register is created.
int64_t PrevBaseOffset = BaseOffset;
BaseOffset = FrameSizeAdjust + LocalOffset + InstrOffset;
// We'd like to avoid creating single-use virtual base registers.
// Because the FrameRefs are in sorted order, and we've already
// processed all FrameRefs before this one, just check whether or not
// the next FrameRef will be able to reuse this new register. If not,
// then don't bother creating it.
if (ref + 1 >= e ||
BaseReg, BaseOffset, FrameSizeAdjust,
FrameReferenceInsns[ref + 1].getLocalOffset(),
*FrameReferenceInsns[ref + 1].getMachineInstr(), TRI)) {
BaseOffset = PrevBaseOffset;
const MachineFunction *MF = MI.getMF();
const TargetRegisterClass *RC = TRI->getPointerRegClass(*MF);
BaseReg = Fn.getRegInfo().createVirtualRegister(RC);
LLVM_DEBUG(dbgs() << " Materializing base register " << BaseReg
<< " at frame local offset "
<< LocalOffset + InstrOffset << "\n");
// Tell the target to insert the instruction to initialize
// the base register.
// MachineBasicBlock::iterator InsertionPt = Entry->begin();
TRI->materializeFrameBaseRegister(Entry, BaseReg, FrameIdx,
// The base register already includes any offset specified
// by the instruction, so account for that so it doesn't get
// applied twice.
Offset = -InstrOffset;
UsedBaseReg = true;
assert(BaseReg != 0 && "Unable to allocate virtual base register!");
// Modify the instruction to use the new base register rather
// than the frame index operand.
TRI->resolveFrameIndex(MI, BaseReg, Offset);
LLVM_DEBUG(dbgs() << "Resolved: " << MI);
return UsedBaseReg;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f14b3dba4f31..ec384d2a7c56 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1,22146 +1,22158 @@
//===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run
// both before and after the DAG is legalized.
// This pass is not a substitute for the LLVM IR instcombine pass. This pass is
// primarily intended to handle simplification opportunities that are implicit
// in the LLVM IR and exposed by the various codegen lowering phases.
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IntervalMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <string>
#include <tuple>
#include <utility>
using namespace llvm;
#define DEBUG_TYPE "dagcombine"
STATISTIC(NodesCombined , "Number of dag nodes combined");
STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
STATISTIC(OpsNarrowed , "Number of load/op/store narrowed");
STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int");
STATISTIC(SlicedLoads, "Number of load sliced");
STATISTIC(NumFPLogicOpsConv, "Number of logic ops converted to fp ops");
// Hidden command-line options controlling the DAG combiner. Each option is
// declared cl::Hidden; defaults are given by cl::init where one is visible.
// NOTE(review): some declarations below are truncated in this excerpt (e.g.
// the tail of StressLoadSlicing and EnableReduceLoadOpStoreWidth, and the
// #endif matching #ifndef NDEBUG) -- confirm against the upstream file.

// -combiner-global-alias-analysis: no cl::init is visible for this option.
static cl::opt<bool>
CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
cl::desc("Enable DAG combiner's use of IR alias analysis"));
// -combiner-use-tbaa: initialised to true.
static cl::opt<bool>
UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
cl::desc("Enable DAG combiner's use of TBAA"));
// The following option is declared only when NDEBUG is not defined.
#ifndef NDEBUG
static cl::opt<std::string>
CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
cl::desc("Only use DAG-combiner alias analysis in this"
" function"));
/// Hidden option to stress test load slicing, i.e., when this option
/// is enabled, load slicing bypasses most of its profitability guards.
static cl::opt<bool>
StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
cl::desc("Bypass the profitability model of load slicing"),
// -combiner-split-load-index: initialised to true.
static cl::opt<bool>
MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
cl::desc("DAG combiner may split indexing from loads"));
// -combiner-store-merging: initialised to true.
static cl::opt<bool>
EnableStoreMerging("combiner-store-merging", cl::Hidden, cl::init(true),
cl::desc("DAG combiner enable merging multiple stores "
"into a wider store"));
// -combiner-tokenfactor-inline-limit: initialised to 2048.
static cl::opt<unsigned> TokenFactorInlineLimit(
"combiner-tokenfactor-inline-limit", cl::Hidden, cl::init(2048),
cl::desc("Limit the number of operands to inline for Token Factors"));
// -combiner-store-merge-dependence-limit: initialised to 10.
static cl::opt<unsigned> StoreMergeDependenceLimit(
"combiner-store-merge-dependence-limit", cl::Hidden, cl::init(10),
cl::desc("Limit the number of times for the same StoreNode and RootNode "
"to bail out in store merging dependence check"));
// NOTE(review): "cominber" in the two help strings below is a typo for
// "combiner"; it is left untouched here because it is runtime text.
// -combiner-reduce-load-op-store-width: initialised to true.
static cl::opt<bool> EnableReduceLoadOpStoreWidth(
"combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
cl::desc("DAG cominber enable reducing the width of load/op/store "
// -combiner-shrink-load-replace-store-with-store: initialised to true.
static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
"combiner-shrink-load-replace-store-with-store", cl::Hidden, cl::init(true),
cl::desc("DAG cominber enable load/<replace bytes>/store with "
"a narrower store"));
namespace {
class DAGCombiner {
SelectionDAG &DAG;
const TargetLowering &TLI;
const SelectionDAGTargetInfo *STI;
CombineLevel Level;
CodeGenOpt::Level OptLevel;
bool LegalDAG = false;
bool LegalOperations = false;
bool LegalTypes = false;
bool ForCodeSize;
bool DisableGenericCombines;
/// Worklist of all of the nodes that need to be simplified.
/// This must behave as a stack -- new nodes to process are pushed onto the
/// back and when processing we pop off of the back.
/// The worklist will not contain duplicates but may contain null entries
/// due to nodes being deleted from the underlying DAG.
SmallVector<SDNode *, 64> Worklist;
/// Mapping from an SDNode to its position on the worklist.
/// This is used to find and remove nodes from the worklist (by nulling
/// them) when they are deleted from the underlying DAG. It relies on
/// stable indices of nodes within the worklist.
DenseMap<SDNode *, unsigned> WorklistMap;
/// This records all nodes we attempted to add to the worklist since we
/// considered a new worklist entry. As we do not add duplicate nodes
/// to the worklist, this is different from the tail of the worklist.
SmallSetVector<SDNode *, 32> PruningList;
/// Set of nodes which have been combined (at least once).
/// This is used to allow us to reliably add any operands of a DAG node
/// which have not yet been combined to the worklist.
SmallPtrSet<SDNode *, 32> CombinedNodes;
/// Map from candidate StoreNode to the pair of RootNode and count.
/// The count is used to track how many times we have seen the StoreNode
/// with the same RootNode bail out in dependence check. If we have seen
/// the bail out for the same pair many times over a limit, we won't
/// consider the StoreNode with the same RootNode as store merging
/// candidate again.
DenseMap<SDNode *, std::pair<SDNode *, unsigned>> StoreRootCountMap;
// AA - Used for DAG load/store alias analysis.
AliasAnalysis *AA;
/// When an instruction is simplified, add all users of the instruction to
/// the work lists because they might get more simplified now.
void AddUsersToWorklist(SDNode *N) {
for (SDNode *Node : N->uses())
/// Convenient shorthand to add a node and all of its users to the worklist.
void AddToWorklistWithUsers(SDNode *N) {
// Prune potentially dangling nodes. This is called after
// any visit to a node, but should also be called during a visit after any
// failed combine which may have created a DAG node.
void clearAddedDanglingWorklistEntries() {
// Check any nodes added to the worklist to see if they are prunable.
while (!PruningList.empty()) {
auto *N = PruningList.pop_back_val();
if (N->use_empty())
SDNode *getNextWorklistEntry() {
// Before we do any work, remove nodes that are not in use.
SDNode *N = nullptr;
// The Worklist holds the SDNodes in order, but it may contain null
// entries.
while (!N && !Worklist.empty()) {
N = Worklist.pop_back_val();
if (N) {
bool GoodWorklistEntry = WorklistMap.erase(N);
assert(GoodWorklistEntry &&
"Found a worklist entry without a corresponding map entry!");
return N;
/// Call the node-specific routine that folds each particular type of node.
SDValue visit(SDNode *N);
DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
: DAG(D), TLI(D.getTargetLoweringInfo()),
Level(BeforeLegalizeTypes), OptLevel(OL), AA(AA) {
ForCodeSize = DAG.shouldOptForSize();
DisableGenericCombines = STI && STI->disableGenericCombines(OptLevel);
MaximumLegalStoreInBits = 0;
// We use the minimum store size here, since that's all we can guarantee
// for the scalable vector types.
for (MVT VT : MVT::all_valuetypes())
if (EVT(VT).isSimple() && VT != MVT::Other &&
TLI.isTypeLegal(EVT(VT)) &&
VT.getSizeInBits().getKnownMinSize() >= MaximumLegalStoreInBits)
MaximumLegalStoreInBits = VT.getSizeInBits().getKnownMinSize();
void ConsiderForPruning(SDNode *N) {
// Mark this for potential pruning.
/// Add to the worklist making sure its instance is at the back (next to be
/// processed.)
void AddToWorklist(SDNode *N) {
assert(N->getOpcode() != ISD::DELETED_NODE &&
"Deleted Node added to Worklist");
// Skip handle nodes as they can't usefully be combined and confuse the
// zero-use deletion strategy.
if (N->getOpcode() == ISD::HANDLENODE)
if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
/// Remove all instances of N from the worklist.
void removeFromWorklist(SDNode *N) {
auto It = WorklistMap.find(N);
if (It == WorklistMap.end())
return; // Not in the worklist.
// Null out the entry rather than erasing it to avoid a linear operation.
Worklist[It->second] = nullptr;
void deleteAndRecombine(SDNode *N);
bool recursivelyDeleteUnusedNodes(SDNode *N);
/// Replaces all uses of the results of one DAG node with new values.
SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
bool AddTo = true);
/// Replaces all uses of the results of one DAG node with new values.
SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) {
return CombineTo(N, &Res, 1, AddTo);
/// Replaces all uses of the results of one DAG node with new values.
SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1,
bool AddTo = true) {
SDValue To[] = { Res0, Res1 };
return CombineTo(N, To, 2, AddTo);
void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
unsigned MaximumLegalStoreInBits;
/// Check the specified integer node value to see if it can be simplified or
/// if things it uses can be simplified by bit propagation.
/// If so, return true.
bool SimplifyDemandedBits(SDValue Op) {
unsigned BitWidth = Op.getScalarValueSizeInBits();
APInt DemandedBits = APInt::getAllOnesValue(BitWidth);
return SimplifyDemandedBits(Op, DemandedBits);
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits) {
TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
KnownBits Known;
if (!TLI.SimplifyDemandedBits(Op, DemandedBits, Known, TLO, 0, false))
return false;
// Revisit the node.
return true;
/// Check the specified vector node value to see if it can be simplified or
/// if things it uses can be simplified as it only uses some of the
/// elements. If so, return true.
bool SimplifyDemandedVectorElts(SDValue Op) {
// TODO: For now just pretend it cannot be simplified.
if (Op.getValueType().isScalableVector())
return false;
unsigned NumElts = Op.getValueType().getVectorNumElements();
APInt DemandedElts = APInt::getAllOnesValue(NumElts);
return SimplifyDemandedVectorElts(Op, DemandedElts);
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
const APInt &DemandedElts,
bool AssumeSingleUse = false);
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
bool AssumeSingleUse = false);
bool CombineToPreIndexedLoadStore(SDNode *N);
bool CombineToPostIndexedLoadStore(SDNode *N);
SDValue SplitIndexingFromLoad(LoadSDNode *LD);
bool SliceUpLoad(SDNode *N);
// Scalars have size 0 to distinguish from singleton vectors.
SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD);
bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val);
bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val);
/// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
/// load.
/// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced.
/// \param InVecVT type of the input vector to EVE with bitcasts resolved.
/// \param EltNo index of the vector element to load.
/// \param OriginalLoad load that EVE came from to be replaced.
/// \returns EVE on success SDValue() on failure.
SDValue scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
SDValue EltNo,
LoadSDNode *OriginalLoad);
void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
SDValue ZExtPromoteOperand(SDValue Op, EVT PVT);
SDValue PromoteIntBinOp(SDValue Op);
SDValue PromoteIntShiftOp(SDValue Op);
SDValue PromoteExtend(SDValue Op);
bool PromoteLoad(SDValue Op);
/// Call the node-specific routine that knows how to fold each
/// particular type of node. If that doesn't do anything, try the
/// target-specific DAG combines.
SDValue combine(SDNode *N);
// Visitation implementation - Implement dag node combining for different
// node types. The semantics are as follows:
// Return Value:
// SDValue.getNode() == 0 - No change was made
// SDValue.getNode() == N - N was replaced, is dead and has been handled.
// otherwise - N should be replaced by the returned Operand.
SDValue visitTokenFactor(SDNode *N);
SDValue visitMERGE_VALUES(SDNode *N);
SDValue visitADD(SDNode *N);
SDValue visitADDLike(SDNode *N);
SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference);
SDValue visitSUB(SDNode *N);
SDValue visitADDSAT(SDNode *N);
SDValue visitSUBSAT(SDNode *N);
SDValue visitADDC(SDNode *N);
SDValue visitADDO(SDNode *N);
SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitSUBC(SDNode *N);
SDValue visitSUBO(SDNode *N);
SDValue visitADDE(SDNode *N);
SDValue visitADDCARRY(SDNode *N);
SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N);
SDValue visitSUBE(SDNode *N);
SDValue visitSUBCARRY(SDNode *N);
SDValue visitMUL(SDNode *N);
SDValue visitMULFIX(SDNode *N);
SDValue useDivRem(SDNode *N);
SDValue visitSDIV(SDNode *N);
SDValue visitSDIVLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitUDIV(SDNode *N);
SDValue visitUDIVLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitREM(SDNode *N);
SDValue visitMULHU(SDNode *N);
SDValue visitMULHS(SDNode *N);
SDValue visitSMUL_LOHI(SDNode *N);
SDValue visitUMUL_LOHI(SDNode *N);
SDValue visitMULO(SDNode *N);
SDValue visitIMINMAX(SDNode *N);
SDValue visitAND(SDNode *N);
SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitOR(SDNode *N);
SDValue visitORLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitXOR(SDNode *N);
SDValue SimplifyVBinOp(SDNode *N);
SDValue visitSHL(SDNode *N);
SDValue visitSRA(SDNode *N);
SDValue visitSRL(SDNode *N);
SDValue visitFunnelShift(SDNode *N);
SDValue visitRotate(SDNode *N);
SDValue visitABS(SDNode *N);
SDValue visitBSWAP(SDNode *N);
SDValue visitBITREVERSE(SDNode *N);
SDValue visitCTLZ(SDNode *N);
SDValue visitCTLZ_ZERO_UNDEF(SDNode *N);
SDValue visitCTTZ(SDNode *N);
SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
SDValue visitCTPOP(SDNode *N);
SDValue visitSELECT(SDNode *N);
SDValue visitVSELECT(SDNode *N);
SDValue visitSELECT_CC(SDNode *N);
SDValue visitSETCC(SDNode *N);
SDValue visitSETCCCARRY(SDNode *N);
SDValue visitSIGN_EXTEND(SDNode *N);
SDValue visitZERO_EXTEND(SDNode *N);
SDValue visitANY_EXTEND(SDNode *N);
SDValue visitAssertExt(SDNode *N);
SDValue visitAssertAlign(SDNode *N);
SDValue visitTRUNCATE(SDNode *N);
SDValue visitBITCAST(SDNode *N);
SDValue visitFREEZE(SDNode *N);
SDValue visitBUILD_PAIR(SDNode *N);
SDValue visitFADD(SDNode *N);
SDValue visitFSUB(SDNode *N);
SDValue visitFMUL(SDNode *N);
SDValue visitFMA(SDNode *N);
SDValue visitFDIV(SDNode *N);
SDValue visitFREM(SDNode *N);
SDValue visitFSQRT(SDNode *N);
SDValue visitFCOPYSIGN(SDNode *N);
SDValue visitFPOW(SDNode *N);
SDValue visitSINT_TO_FP(SDNode *N);
SDValue visitUINT_TO_FP(SDNode *N);
SDValue visitFP_TO_SINT(SDNode *N);
SDValue visitFP_TO_UINT(SDNode *N);
SDValue visitFP_ROUND(SDNode *N);
SDValue visitFP_EXTEND(SDNode *N);
SDValue visitFNEG(SDNode *N);
SDValue visitFABS(SDNode *N);
SDValue visitFCEIL(SDNode *N);
SDValue visitFTRUNC(SDNode *N);
SDValue visitFFLOOR(SDNode *N);
SDValue visitFMINNUM(SDNode *N);
SDValue visitFMAXNUM(SDNode *N);
SDValue visitFMINIMUM(SDNode *N);
SDValue visitFMAXIMUM(SDNode *N);
SDValue visitBRCOND(SDNode *N);
SDValue visitBR_CC(SDNode *N);
SDValue visitLOAD(SDNode *N);
SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
SDValue replaceStoreOfFPConstant(StoreSDNode *ST);
SDValue visitSTORE(SDNode *N);
SDValue visitLIFETIME_END(SDNode *N);
SDValue visitBUILD_VECTOR(SDNode *N);
SDValue visitMLOAD(SDNode *N);
SDValue visitMSTORE(SDNode *N);
SDValue visitMGATHER(SDNode *N);
SDValue visitMSCATTER(SDNode *N);
SDValue visitFP_TO_FP16(SDNode *N);
SDValue visitFP16_TO_FP(SDNode *N);
SDValue visitVECREDUCE(SDNode *N);
SDValue visitFADDForFMACombine(SDNode *N);
SDValue visitFSUBForFMACombine(SDNode *N);
SDValue visitFMULForFMADistributiveCombine(SDNode *N);
SDValue XformToShuffleWithZero(SDNode *N);
bool reassociationCanBreakAddressingModePattern(unsigned Opc,
const SDLoc &DL, SDValue N0,
SDValue N1);
SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1);
SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1, SDNodeFlags Flags);
SDValue visitShiftByConstant(SDNode *N);
SDValue foldSelectOfConstants(SDNode *N);
SDValue foldVSelectOfConstants(SDNode *N);
SDValue foldBinOpIntoSelect(SDNode *BO);
bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
SDValue hoistLogicOpWithSameOpcodeHands(SDNode *N);
SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
SDValue N2, SDValue N3, ISD::CondCode CC,
bool NotExtCompare = false);
SDValue convertSelectOfFPConstantsToLoadOffset(
const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
ISD::CondCode CC);
SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
SDValue N2, SDValue N3, ISD::CondCode CC);
SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
const SDLoc &DL);
SDValue unfoldMaskedMerge(SDNode *N);
SDValue unfoldExtremeBitClearingToShifts(SDNode *N);
SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
const SDLoc &DL, bool foldBooleans);
SDValue rebuildSetCC(SDValue N);
bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
SDValue &CC, bool MatchStrict = false) const;
bool isOneUseSetCC(SDValue N) const;
SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
unsigned HiOp);
SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
SDValue CombineExtLoad(SDNode *N);
SDValue CombineZExtLogicopShiftLoad(SDNode *N);
SDValue combineRepeatedFPDivisors(SDNode *N);
SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
SDValue BuildSDIV(SDNode *N);
SDValue BuildSDIVPow2(SDNode *N);
SDValue BuildUDIV(SDNode *N);
SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags);
SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
SDValue buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations,
SDNodeFlags Flags, bool Reciprocal);
SDValue buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations,
SDNodeFlags Flags, bool Reciprocal);
SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
bool DemandHighBits = true);
SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
SDValue InnerPos, SDValue InnerNeg,
unsigned PosOpcode, unsigned NegOpcode,
const SDLoc &DL);
SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg,
SDValue InnerPos, SDValue InnerNeg,
unsigned PosOpcode, unsigned NegOpcode,
const SDLoc &DL);
SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
SDValue MatchLoadCombine(SDNode *N);
SDValue MatchStoreCombine(StoreSDNode *N);
SDValue ReduceLoadWidth(SDNode *N);
SDValue ReduceLoadOpStoreWidth(SDNode *N);
SDValue splitMergedValStore(StoreSDNode *ST);
SDValue TransformFPLoadStorePair(SDNode *N);
SDValue convertBuildVecZextToZext(SDNode *N);
SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
SDValue reduceBuildVecTruncToBitCast(SDNode *N);
SDValue reduceBuildVecToShuffle(SDNode *N);
SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
ArrayRef<int> VectorMask, SDValue VecIn1,
SDValue VecIn2, unsigned LeftIdx,
bool DidSplitVec);
SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
/// Walk up chain skipping non-aliasing memory nodes,
/// looking for aliasing nodes and adding them to the Aliases vector.
void GatherAllAliases(SDNode *N, SDValue OriginalChain,
SmallVectorImpl<SDValue> &Aliases);
/// Return true if there is any possibility that the two addresses overlap.
bool isAlias(SDNode *Op0, SDNode *Op1) const;
/// Walk up chain skipping non-aliasing memory nodes, looking for a better
/// chain (aliasing node.)
SDValue FindBetterChain(SDNode *N, SDValue Chain);
/// Try to replace a store and any possibly adjacent stores on
/// consecutive chains with better chains. Return true only if St is
/// replaced.
/// Notice that other chains may still be replaced even if the function
/// returns false.
bool findBetterNeighborChains(StoreSDNode *St);
// Helper for findBetterNeighborChains. Walk up the store chain and add
// additional chained stores that do not overlap and can be parallelized.
bool parallelizeChainedStores(StoreSDNode *St);
/// Holds a pointer to an LSBaseSDNode as well as information on where it
/// is located in a sequence of memory operations connected by a chain.
struct MemOpLink {
// Ptr to the mem node.
LSBaseSDNode *MemNode;
// Offset from the base ptr.
int64_t OffsetFromBase;
MemOpLink(LSBaseSDNode *N, int64_t Offset)
: MemNode(N), OffsetFromBase(Offset) {}
// Categories describing where a stored value comes from.
enum class StoreSource { Unknown, Constant, Extract, Load };

/// Classify \p StoreVal: integer/FP constants are Constant, vector element or
/// subvector extracts are Extract, loads are Load, anything else is Unknown.
StoreSource getStoreSource(SDValue StoreVal) {
  // Constants are recognised by node class rather than by opcode.
  if (isa<ConstantSDNode>(StoreVal) || isa<ConstantFPSDNode>(StoreVal))
    return StoreSource::Constant;

  switch (StoreVal.getOpcode()) {
  case ISD::EXTRACT_VECTOR_ELT:
  case ISD::EXTRACT_SUBVECTOR:
    return StoreSource::Extract;
  default:
    break;
  }

  return isa<LoadSDNode>(StoreVal) ? StoreSource::Load : StoreSource::Unknown;
}
/// This is a helper function for visitMUL to check the profitability
/// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
/// MulNode is the original multiply, AddNode is (add x, c1),
/// and ConstNode is c2.
bool isMulAddWithConstProfitable(SDNode *MulNode,
SDValue &AddNode,
SDValue &ConstNode);
/// This is a helper function for visitAND and visitZERO_EXTEND. Returns
/// true if the (and (load x) c) pattern matches an extload. ExtVT returns
/// the type of the loaded value to be extended.
bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
EVT LoadResultTy, EVT &ExtVT);
/// Helper function to calculate whether the given Load/Store can have its
/// width reduced to ExtVT.
bool isLegalNarrowLdSt(LSBaseSDNode *LDSTN, ISD::LoadExtType ExtType,
EVT &MemVT, unsigned ShAmt = 0);
/// Used by BackwardsPropagateMask to find suitable loads.
bool SearchForAndLoads(SDNode *N, SmallVectorImpl<LoadSDNode*> &Loads,
SmallPtrSetImpl<SDNode*> &NodesWithConsts,
ConstantSDNode *Mask, SDNode *&NodeToMask);
/// Attempt to propagate a given AND node back to load leaves so that they
/// can be combined into narrow loads.
bool BackwardsPropagateMask(SDNode *N);
/// Helper function for mergeConsecutiveStores which merges the component
/// store chains.
SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumStores);
/// This is a helper function for mergeConsecutiveStores. When the source
/// elements of the consecutive stores are all constants or all extracted
/// vector elements, try to merge them into one larger store introducing
/// bitcasts if necessary. \return True if a merged store was created.
bool mergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
EVT MemVT, unsigned NumStores,
bool IsConstantSrc, bool UseVector,
bool UseTrunc);
/// This is a helper function for mergeConsecutiveStores. Stores that
/// potentially may be merged with St are placed in StoreNodes. RootNode is
/// a chain predecessor to all store candidates.
void getStoreMergeCandidates(StoreSDNode *St,
SmallVectorImpl<MemOpLink> &StoreNodes,
SDNode *&Root);
/// Helper function for mergeConsecutiveStores. Checks if candidate stores
/// have indirect dependency through their operands. RootNode is the
/// predecessor to all stores calculated by getStoreMergeCandidates and is
/// used to prune the dependency check. \return True if safe to merge.
bool checkMergeStoreCandidatesForDependencies(
SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
SDNode *RootNode);
/// This is a helper function for mergeConsecutiveStores. Given a list of
/// store candidates, find the first N that are consecutive in memory.
/// Returns 0 if there are not at least 2 consecutive stores to try merging.
unsigned getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
int64_t ElementSizeBytes) const;
/// This is a helper function for mergeConsecutiveStores. It is used for
/// store chains that are composed entirely of constant values.
bool tryStoreMergeOfConstants(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumConsecutiveStores,
EVT MemVT, SDNode *Root, bool AllowVectors);
/// This is a helper function for mergeConsecutiveStores. It is used for
/// store chains that are composed entirely of extracted vector elements.
/// When extracting multiple vector elements, try to store them in one
/// vector store rather than a sequence of scalar stores.
bool tryStoreMergeOfExtracts(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumConsecutiveStores, EVT MemVT,
SDNode *Root);
/// This is a helper function for mergeConsecutiveStores. It is used for
/// store chains that are composed entirely of loaded values.
bool tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumConsecutiveStores, EVT MemVT,
SDNode *Root, bool AllowVectors,
bool IsNonTemporalStore, bool IsNonTemporalLoad);
/// Merge consecutive store operations into a wide store.
/// This optimization uses wide integers or vectors when possible.
/// \return true if stores were merged.
bool mergeConsecutiveStores(StoreSDNode *St);
/// Try to transform a truncation where C is a constant:
/// (trunc (and X, C)) -> (and (trunc X), (trunc C))
/// \p N needs to be a truncation and its first operand an AND. Other
/// requirements are checked by the function (e.g. that trunc is
/// single-use) and if missed an empty SDValue is returned.
SDValue distributeTruncateThroughAnd(SDNode *N);
/// Report whether the target supports operation \p Opcode on type \p VT.
/// Before operations have been legalized, "legal or custom" is sufficient;
/// once LegalOperations is set, the operation must be strictly legal.
bool hasOperation(unsigned Opcode, EVT VT) {
  return LegalOperations ? TLI.isOperationLegal(Opcode, VT)
                         : TLI.isOperationLegalOrCustom(Opcode, VT);
}
/// Runs the dag combiner on all nodes in the work list
void Run(CombineLevel AtLevel);
/// Accessor returning a reference to the combiner's DAG member.
SelectionDAG &getDAG() const { return DAG; }
/// Ask the target for a type that can hold any valid shift amount for a value
/// of type \p LHSTy. Before type legalization such amounts can be huge, so the
/// current LegalTypes state is passed along to the target hook.
EVT getShiftAmountTy(EVT LHSTy) {
  assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
  const auto &Layout = DAG.getDataLayout();
  return TLI.getShiftAmountTy(LHSTy, Layout, LegalTypes);
}
/// True when types have not been legalized yet (every type is acceptable), or
/// when the target reports \p VT as legal.
bool isTypeLegal(const EVT &VT) {
  return !LegalTypes || TLI.isTypeLegal(VT);
}
/// Shorthand for TargetLowering::getSetCCResultType that supplies the DAG's
/// data layout and LLVM context for the caller.
EVT getSetCCResultType(EVT VT) const {
  const auto &Layout = DAG.getDataLayout();
  auto &Ctx = *DAG.getContext();
  return TLI.getSetCCResultType(Layout, Ctx, VT);
}
void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
SDValue OrigLoad, SDValue ExtLoad,
ISD::NodeType ExtType);
/// This class is a DAGUpdateListener that removes any deleted
/// nodes from the worklist.
/// It is constructed from a DAGCombiner, passes that combiner's DAG
/// (dc.getDAG()) to the listener base class, and keeps a reference to the
/// combiner in DC.
/// NOTE(review): the body of NodeDeleted is not visible here — confirm it
/// performs the worklist removal described above.
class WorklistRemover : public SelectionDAG::DAGUpdateListener {
DAGCombiner &DC;
explicit WorklistRemover(DAGCombiner &dc)
: SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
void NodeDeleted(SDNode *N, SDNode *E) override {
/// DAGUpdateListener that reports every newly inserted node to the owning
/// DAGCombiner by calling ConsiderForPruning on it.
class WorklistInserter : public SelectionDAG::DAGUpdateListener {
// The combiner that is notified about inserted nodes.
DAGCombiner &DC;
// Passes dc's DAG to the listener base class and remembers the combiner.
explicit WorklistInserter(DAGCombiner &dc)
: SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
// FIXME: Ideally we could add N to the worklist, but this causes exponential
// compile time costs in large DAGs, e.g. Halide.
void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); }
} // end anonymous namespace
// TargetLowering::DAGCombinerInfo implementation
/// DAGCombinerInfo entry point taking the node \p N to be queued.
/// NOTE(review): the function body is not visible here; presumably it
/// forwards to the DAGCombiner held in DC, like the other DAGCombinerInfo
/// wrappers in this file — confirm.
void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
/// Forward an array-based replacement of \p N's results to the DAGCombiner
/// stored in DC. \p To supplies the replacement values and \p AddTo is passed
/// through unchanged.
SDValue TargetLowering::DAGCombinerInfo::CombineTo(SDNode *N,
                                                   ArrayRef<SDValue> To,
                                                   bool AddTo) {
  auto *Combiner = static_cast<DAGCombiner *>(DC);
  return Combiner->CombineTo(N, &To[0], To.size(), AddTo);
}
/// Forward a single-result replacement of \p N by \p Res to the DAGCombiner
/// stored in DC; \p AddTo is passed through unchanged.
SDValue TargetLowering::DAGCombinerInfo::CombineTo(SDNode *N, SDValue Res,
                                                   bool AddTo) {
  auto *Combiner = static_cast<DAGCombiner *>(DC);
  return Combiner->CombineTo(N, Res, AddTo);
}
/// Forward a two-result replacement of \p N by \p Res0 and \p Res1 to the
/// DAGCombiner stored in DC; \p AddTo is passed through unchanged.
SDValue TargetLowering::DAGCombinerInfo::CombineTo(SDNode *N, SDValue Res0,
                                                   SDValue Res1, bool AddTo) {
  auto *Combiner = static_cast<DAGCombiner *>(DC);
  return Combiner->CombineTo(N, Res0, Res1, AddTo);
}
/// Forward to DAGCombiner::recursivelyDeleteUnusedNodes on the combiner stored
/// in DC and return its result.
bool TargetLowering::DAGCombinerInfo::recursivelyDeleteUnusedNodes(SDNode *N) {
  auto *Combiner = static_cast<DAGCombiner *>(DC);
  return Combiner->recursivelyDeleteUnusedNodes(N);
}
/// Forward \p TLO to DAGCombiner::CommitTargetLoweringOpt on the combiner
/// stored in DC.
void TargetLowering::DAGCombinerInfo::CommitTargetLoweringOpt(
    const TargetLowering::TargetLoweringOpt &TLO) {
  auto *Combiner = static_cast<DAGCombiner *>(DC);
  Combiner->CommitTargetLoweringOpt(TLO);
}
// Helper Functions
/// Walks the operands of \p N and selects those that have exactly one use or
/// that produce more than one value, so they can be reconsidered.
/// NOTE(review): the statement executed for each selected operand, and any
/// handling of N itself, are not visible here — confirm against the full
/// function.
void DAGCombiner::deleteAndRecombine(SDNode *N) {
// If the operands of this node are only used by the node, they will now be
// dead. Make sure to re-visit them and recursively delete dead nodes.
for (const SDValue &Op : N->ops())
// For an operand generating multiple values, one of the values may
// become dead allowing further simplification (e.g. split index
// arithmetic from an indexed load).
if (Op->hasOneUse() || Op->getNumValues() > 1)
// Most APInt operations require both operands to have the same bit width.
// Widen \p LHS and \p RHS (by zero extension) to a common width: the larger of
// their two widths plus \p Offset. The extra Offset bits let callers pick a
// width in which the subsequent arithmetic cannot overflow.
static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) {
  const unsigned WidestBits = std::max(LHS.getBitWidth(), RHS.getBitWidth());
  const unsigned TargetBits = Offset + WidestBits;
  LHS = LHS.zextOrSelf(TargetBits);
  RHS = RHS.zextOrSelf(TargetBits);
}
// Return true if this node is a setcc, or is a select_cc
// that selects between the target values used for true and false, making it
// equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to
// the appropriate nodes based on the type of node we are checking. This
// simplifies life a bit for the callers.
// LHS, RHS and CC are only written on the paths that return true.
// When MatchStrict is set, STRICT_FSETCC and STRICT_FSETCCS nodes are
// accepted as well.
// NOTE(review): the SELECT_CC condition and the boolean-contents comparison
// below look incomplete here — confirm against the full function.
bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
SDValue &CC, bool MatchStrict) const {
// Plain SETCC: operands 0, 1 and 2 are LHS, RHS and the condition code.
if (N.getOpcode() == ISD::SETCC) {
LHS = N.getOperand(0);
RHS = N.getOperand(1);
CC = N.getOperand(2);
return true;
// Strict FP compares: LHS, RHS and the condition code are read from operands
// 1, 2 and 3.
if (MatchStrict &&
(N.getOpcode() == ISD::STRICT_FSETCC ||
N.getOpcode() == ISD::STRICT_FSETCCS)) {
LHS = N.getOperand(1);
RHS = N.getOperand(2);
CC = N.getOperand(3);
return true;
// Anything else must be a SELECT_CC whose operand 2 is the target's constant
// "true" value.
if (N.getOpcode() != ISD::SELECT_CC ||
!TLI.isConstTrueVal(N.getOperand(2).getNode()) ||
return false;
if (TLI.getBooleanContents(N.getValueType()) ==
return false;
// SELECT_CC: compared values are operands 0 and 1, condition code is
// operand 4.
LHS = N.getOperand(0);
RHS = N.getOperand(1);
CC = N.getOperand(4);
return true;
/// True when \p N is a SetCC-equivalent operation (per isSetCCEquivalent) and
/// its node has exactly one use. In that situation callers may invert the
/// operation for free when doing so is profitable.
bool DAGCombiner::isOneUseSetCC(SDValue N) const {
  // The decomposed operands are not needed here; only the match result is.
  SDValue CmpLHS, CmpRHS, CondCode;
  return isSetCCEquivalent(N, CmpLHS, CmpRHS, CondCode) &&
         N.getNode()->hasOneUse();
}
// If \p N is a floating-point constant, or a build vector made of
// floating-point constants, return its node; otherwise return nullptr.
static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) {
  SDNode *Node = N.getNode();
  const bool IsFPConstant =
      isa<ConstantFPSDNode>(N) || ISD::isBuildVectorOfConstantFPSDNodes(Node);
  return IsFPConstant ? Node : nullptr;
}
// Determines if it is a constant integer or a build vector of constant
// integers (and undefs).
// Do not permit build vector implicit truncation.
//
// \param N          Value to inspect.
// \param NoOpaques  When true, opaque constants are rejected.
// \returns true if \p N is an acceptable scalar constant, or a BUILD_VECTOR
//          whose operands are each either undef or an acceptable constant
//          whose bit width equals the vector's scalar size.
static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) {
  if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N))
    return !(Const->isOpaque() && NoOpaques);
  if (N.getOpcode() != ISD::BUILD_VECTOR)
    return false;
  unsigned BitWidth = N.getScalarValueSizeInBits();
  for (const SDValue &Op : N->op_values()) {
    // Undef elements are allowed; skip them rather than treating them as
    // non-constant operands.
    if (Op.isUndef())
      continue;
    ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Op);
    // A width mismatch means the build vector would implicitly truncate the
    // operand, which is not permitted here.
    if (!Const || Const->getAPIntValue().getBitWidth() != BitWidth ||
        (Const->isOpaque() && NoOpaques))
      return false;
  }
  return true;
}
// Determines if a BUILD_VECTOR is composed of all-constants possibly mixed with
// undef's.
// Returns false immediately for anything that is not a BUILD_VECTOR; NoOpaques
// is forwarded to isConstantOrConstantVector.
// NOTE(review): the return expression appears cut off after `||` here —
// confirm the second alternative against the full function.
static bool isAnyConstantBuildVector(SDValue V, bool NoOpaques = false) {
if (V.getOpcode() != ISD::BUILD_VECTOR)
return false;
return isConstantOrConstantVector(V, NoOpaques) ||
// Determine whether the index of indexed load LD may be split off.
// Requires MaySplitLoadIndex to be set; the visible part of the condition then
// accepts any load whose operand 2 is not an ISD::TargetConstant.
// NOTE(review): the expression appears cut off after `||` here; per the
// original wording, the remaining alternative presumably concerns opaque
// target-constant indices — confirm.
static bool canSplitIdx(LoadSDNode *LD) {
return MaySplitLoadIndex &&
(LD->getOperand(2).getOpcode() != ISD::TargetConstant ||
bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
const SDLoc &DL,
SDValue N0,
SDValue N1) {
// Currently this only tries to ensure we don't undo the GEP splits done by
// CodeGenPrepare when shouldConsiderGEPOffsetSplit is true. To ensure this,
// we check if the following transformation would be problematic:
// (load/store (add, (add, x, offset1), offset2)) ->
// (load/store (add, x, offset1+offset2)).
if (Opc != ISD::ADD || N0.getOpcode() != ISD::ADD)
return false;
if (N0.hasOneUse())
return false;
auto *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
auto *C2 = dyn_cast<ConstantSDNode>(N1);
if (!C1 || !C2)
return false;
const APInt &C1APIntVal = C1->getAPIntValue();
const APInt &C2APIntVal = C2->getAPIntValue();
if (C1APIntVal.getBitWidth() > 64 || C2APIntVal.getBitWidth() > 64)
return false;
const APInt CombinedValueIntVal = C1APIntVal + C2APIntVal;
if (CombinedValueIntVal.getBitWidth() > 64)
return false;
const int64_t CombinedValue = CombinedValueIntVal.getSExtValue();
for (SDNode *Node : N0->uses()) {
auto LoadStore = dyn_cast<MemSDNode>(Node);
if (LoadStore) {
// Is x[offset2] already not a legal addressing mode? If so then
// reassociating the constants breaks nothing (we test offset2 because
// that's the one we hope to fold into the load or store).
TargetLoweringBase::AddrMode AM;
AM.HasBaseReg = true;
AM.BaseOffs = C2APIntVal.getSExtValue();
EVT VT = LoadStore->getMemoryVT();
unsigned AS = LoadStore->getAddressSpace();
Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
// Would x[offset1+offset2] still be a legal addressing mode?
AM.BaseOffs = CombinedValue;
if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
return true;
return false;
// Helper for DAGCombiner::reassociateOps. Given (Opc N0, N1) where \p N0 is
// itself an \p Opc node with a constant second operand c1, try one of:
//   (op (op x, c1), c2) -> (op x, (op c1, c2))   when N1 is a constant c2
//   (op (op x, c1), y)  -> (op (op x, y), c1)    when (op x, c1) has one use
// Returns an empty SDValue when neither rewrite applies.
SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
                                               SDValue N0, SDValue N1) {
  EVT VT = N0.getValueType();

  // N0 must be the same kind of operation as the outer one.
  if (N0.getOpcode() != Opc)
    return SDValue();

  SDValue X = N0.getOperand(0);
  SDValue C1 = N0.getOperand(1);
  if (!DAG.isConstantIntBuildVectorOrConstantInt(C1))
    return SDValue();

  if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
    // Both constants are known: fold them together and re-apply to x.
    SDValue Folded = DAG.FoldConstantArithmetic(Opc, DL, VT, {C1, N1});
    return Folded ? DAG.getNode(Opc, DL, VT, X, Folded) : SDValue();
  }

  // Moving c1 outwards is only done when the inner op has no other users.
  if (!N0.hasOneUse())
    return SDValue();

  SDValue Inner = DAG.getNode(Opc, SDLoc(N0), VT, X, N1);
  if (!Inner.getNode())
    return SDValue();
  return DAG.getNode(Opc, DL, VT, Inner, C1);
}
// Try to reassociate commutative binops.
// Calls reassociateOpsCommutative on (N0, N1) and then on (N1, N0) and returns
// the first non-empty result, or an empty SDValue if neither applies.
// Opc must be a commutative binary opcode (asserted).
SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1, SDNodeFlags Flags) {
assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");
// Floating-point reassociation is not allowed without loose FP math.
// Both the allow-reassociation and no-signed-zeros flags are checked; if
// either is missing the function gives up.
// NOTE(review): the floating-point type condition appears cut off after `||`
// here — confirm against the full function.
if (N0.getValueType().isFloatingPoint() ||
if (!Flags.hasAllowReassociation() || !Flags.hasNoSignedZeros())
return SDValue();
// Try the operands in their given order first, then swapped.
if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N0, N1))
return Combined;
if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N1, N0))
return Combined;
return SDValue();
/// Replace all uses of \p N's results with the \p NumTo values in \p To.
/// \p NumTo must equal N's number of values, and every non-null replacement
/// must have the same type as the value it replaces (both asserted).
/// Returns SDValue(N, 0).
/// NOTE(review): several statements (the worklist pushes guarded by AddTo and
/// the action taken when N becomes dead) are not visible here — confirm
/// against the full function.
SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
bool AddTo) {
assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
LLVM_DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: ";
dbgs() << " and " << NumTo - 1 << " other values\n");
for (unsigned i = 0, e = NumTo; i != e; ++i)
assert((!To[i].getNode() ||
N->getValueType(i) == To[i].getValueType()) &&
"Cannot combine value to value of different type!");
// A WorklistRemover is alive for the duration of the replacement.
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesWith(N, To);
if (AddTo) {
// Push the new nodes and any users onto the worklist
for (unsigned i = 0, e = NumTo; i != e; ++i) {
if (To[i].getNode()) {
// Finally, if the node is now dead, remove it from the graph. The node
// may not be dead if the replacement process recursively simplified to
// something else needing this node.
if (N->use_empty())
return SDValue(N, 0);
/// Apply the replacement recorded in \p TLO: every use of TLO.Old is rewritten
/// to use TLO.New.
/// NOTE(review): the worklist updates and the statement run when the old node
/// becomes dead are not visible here — confirm against the full function.
void DAGCombiner::
CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
// Replace the old value with the new one.
LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
dbgs() << '\n');
// Replace all uses. If any nodes become isomorphic to other nodes and
// are deleted, make sure to remove them from our worklist.
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);
// Push the new node and any (possibly new) users onto the worklist.
// Finally, if the node is now dead, remove it from the graph. The node
// may not be dead if the replacement process recursively simplified to
// something else needing this node.
if (TLO.Old.getNode()->use_empty())
/// Check the specified integer node value to see if it can be simplified or if
/// things it uses can be simplified by bit propagation. If so, return true.
/// The query is delegated to TLI.SimplifyDemandedBits with a
/// TargetLoweringOpt built from the current LegalTypes/LegalOperations state;
/// when that hook returns false, this function returns false as well.
/// NOTE(review): the end of the hook's argument list and the statements that
/// follow "Revisit the node" are not visible here — confirm.
bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
const APInt &DemandedElts,
bool AssumeSingleUse) {
TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
KnownBits Known;
if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, 0,
return false;
// Revisit the node.
return true;
/// Check the specified vector node value to see if it can be simplified or
/// if things it uses can be simplified as it only uses some of the elements.
/// If so, return true.
/// The query is delegated to TLI.SimplifyDemandedVectorElts with a
/// TargetLoweringOpt built from the current LegalTypes/LegalOperations state;
/// when that hook returns false, this function returns false as well.
/// NOTE(review): the statements that follow "Revisit the node" are not
/// visible here — confirm against the full function.
bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op,
const APInt &DemandedElts,
bool AssumeSingleUse) {
TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
// Outputs of the target hook; not read again in the visible code.
APInt KnownUndef, KnownZero;
if (!TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
TLO, 0, AssumeSingleUse))
return false;
// Revisit the node.
return true;
/// Redirect the users of \p Load to the promoted load \p ExtLoad.
/// Result 0 of Load is replaced by a TRUNCATE of ExtLoad's result 0 back to
/// Load's original value type; result 1 of Load is replaced by ExtLoad's
/// result 1 (presumably the chain — confirm).
void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
SDLoc DL(Load);
EVT VT = Load->getValueType(0);
// Narrow the promoted value back to the type existing users expect.
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0));
LLVM_DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: ";
Trunc.getNode()->dump(&DAG); dbgs() << '\n');
// A WorklistRemover is alive while the uses are being rewritten.
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));
/// Build a version of \p Op in the promoted type \p PVT.
/// \p Replace is reset to false on entry and set to true only when Op is an
/// unindexed load that was rebuilt as an extending load.
/// Returns the promoted value, or an empty SDValue when the generic
/// ANY_EXTEND path is reached and ANY_EXTEND is not legal for PVT.
/// NOTE(review): the way each switch case ends is not visible here — confirm
/// the case boundaries against the full function.
SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
Replace = false;
SDLoc DL(Op);
// Unindexed loads become extending loads of the same memory type. A
// non-extending load uses EXTLOAD; otherwise the load's own extension type
// is kept.
if (ISD::isUNINDEXEDLoad(Op.getNode())) {
LoadSDNode *LD = cast<LoadSDNode>(Op);
EVT MemVT = LD->getMemoryVT();
ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
: LD->getExtensionType();
Replace = true;
return DAG.getExtLoad(ExtType, DL, PVT,
LD->getChain(), LD->getBasePtr(),
MemVT, LD->getMemOperand());
unsigned Opc = Op.getOpcode();
switch (Opc) {
default: break;
// Assert nodes are rebuilt in PVT around a correspondingly promoted operand,
// keeping their original second operand.
case ISD::AssertSext:
if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT))
return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1));
case ISD::AssertZext:
if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT))
return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1));
// Constants: sign-extend when the original type is byte sized, otherwise
// zero-extend.
case ISD::Constant: {
unsigned ExtOpc =
Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
return DAG.getNode(ExtOpc, DL, PVT, Op);
// Generic path: any-extend, provided the target has it for PVT.
if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT))
return SDValue();
return DAG.getNode(ISD::ANY_EXTEND, DL, PVT, Op);
/// Promote \p Op to \p PVT and wrap the result in a SIGN_EXTEND_INREG node.
/// Returns an empty SDValue if SIGN_EXTEND_INREG is not legal for PVT or if
/// PromoteOperand fails. When the promotion rebuilt a load, the original load
/// is replaced through ReplaceLoadWithPromotedLoad.
/// NOTE(review): the final getNode call appears cut off after `NewOp,` here;
/// OldVT is presumably used by the missing argument — confirm.
SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT))
return SDValue();
// Captured before promotion, while Op is still untouched.
EVT OldVT = Op.getValueType();
SDLoc DL(Op);
bool Replace = false;
SDValue NewOp = PromoteOperand(Op, PVT, Replace);
if (!NewOp.getNode())
return SDValue();
if (Replace)
ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, NewOp.getValueType(), NewOp,
/// Promote \p Op to \p PVT and zero-extend it "in register" from its original
/// type. Returns an empty SDValue when PromoteOperand cannot promote the
/// operand. If the promotion rebuilt a load, the original load is replaced by
/// the promoted one.
SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) {
  // Capture the original type and location before anything is replaced.
  const EVT OrigVT = Op.getValueType();
  SDLoc DL(Op);

  bool LoadWasRebuilt = false;
  SDValue Promoted = PromoteOperand(Op, PVT, LoadWasRebuilt);
  SDNode *PromotedNode = Promoted.getNode();
  if (PromotedNode) {
    if (LoadWasRebuilt)
      ReplaceLoadWithPromotedLoad(Op.getNode(), PromotedNode);
    return DAG.getZeroExtendInReg(Promoted, DL, OrigVT);
  }
  return SDValue();
}
/// Promote the specified integer binary operation if the target indicates it is
/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
/// i32 since i16 instructions are longer.
/// Only scalar integer operations are considered, and only once
/// LegalOperations is set. On success the operation is rebuilt in the promoted
/// type, truncated back to the original type, substituted for Op via
/// CombineTo, and Op is returned; otherwise an empty SDValue is returned.
/// NOTE(review): the declaration of PVT and some worklist statements are not
/// visible here — confirm against the full function.
SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
if (!LegalOperations)
return SDValue();
EVT VT = Op.getValueType();
if (VT.isVector() || !VT.isInteger())
return SDValue();
// If operation type is 'undesirable', e.g. i16 on x86, consider
// promoting it.
unsigned Opc = Op.getOpcode();
if (TLI.isTypeDesirableForOp(Opc, VT))
return SDValue();
// Consult target whether it is a good idea to promote this operation and
// what's the right type to promote it to.
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
// Promote both operands; ReplaceN records whether operand N was a load that
// PromoteOperand rebuilt as an extending load.
bool Replace0 = false;
SDValue N0 = Op.getOperand(0);
SDValue NN0 = PromoteOperand(N0, PVT, Replace0);
bool Replace1 = false;
SDValue N1 = Op.getOperand(1);
SDValue NN1 = PromoteOperand(N1, PVT, Replace1);
SDLoc DL(Op);
// Perform the operation in PVT and truncate the result back to VT.
SDValue RV =
DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));
// We are always replacing N0/N1's use in N and only need additional
// replacements if there are additional uses.
// Note: We are checking uses of the *nodes* (SDNode) rather than values
// (SDValue) here because the node may reference multiple values
// (for example, the chain value of a load node).
Replace0 &= !N0->hasOneUse();
// When both operands are the same value, a single replacement suffices.
Replace1 &= (N0 != N1) && !N1->hasOneUse();
// Combine Op here so it is preserved past replacements.
CombineTo(Op.getNode(), RV);
// If operands have a use ordering, make sure we deal with
// predecessor first.
if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) {
std::swap(N0, N1);
std::swap(NN0, NN1);
if (Replace0) {
ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
if (Replace1) {
ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
return Op;
return SDValue();
/// Promote the specified integer shift operation if the target indicates it is
/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
/// i32 since i16 instructions are longer.
/// Only scalar integer operations are considered, and only once
/// LegalOperations is set. The shifted operand is promoted according to the
/// opcode (SExtPromoteOperand for SRA, ZExtPromoteOperand for SRL) while the
/// shift amount operand is reused as is; the shift is rebuilt in the promoted
/// type and truncated back to the original type.
/// NOTE(review): the declaration of PVT and the exact branch structure around
/// the PromoteOperand call are not visible here — confirm against the full
/// function.
SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
if (!LegalOperations)
return SDValue();
EVT VT = Op.getValueType();
if (VT.isVector() || !VT.isInteger())
return SDValue();
// If operation type is 'undesirable', e.g. i16 on x86, consider
// promoting it.
unsigned Opc = Op.getOpcode();
if (TLI.isTypeDesirableForOp(Opc, VT))
return SDValue();
// Consult target whether it is a good idea to promote this operation and
// what's the right type to promote it to.
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
bool Replace = false;
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
if (Opc == ISD::SRA)
N0 = SExtPromoteOperand(N0, PVT);
else if (Opc == ISD::SRL)
N0 = ZExtPromoteOperand(N0, PVT);
N0 = PromoteOperand(N0, PVT, Replace);
// Give up if the shifted operand could not be promoted.
if (!N0.getNode())
return SDValue();
SDLoc DL(Op);
SDValue RV =
DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));
// If the shifted operand was a load that got rebuilt, replace the original
// load as well.
if (Replace)
ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());
// Deal with Op being deleted.
if (Op && Op.getOpcode() != ISD::DELETED_NODE)
return RV;
return SDValue();
/// Handle an extend node \p Op whose type the target considers undesirable.
/// Only scalar integer operations are considered, and only once
/// LegalOperations is set. When the target wants the operation promoted, the
/// node is rebuilt with the same opcode, type and operand 0, which lets node
/// creation apply the folds listed below. Otherwise an empty SDValue is
/// returned.
/// NOTE(review): the declaration of PVT is not visible here — confirm.
SDValue DAGCombiner::PromoteExtend(SDValue Op) {
if (!LegalOperations)
return SDValue();
EVT VT = Op.getValueType();
if (VT.isVector() || !VT.isInteger())
return SDValue();
// If operation type is 'undesirable', e.g. i16 on x86, consider
// promoting it.
unsigned Opc = Op.getOpcode();
if (TLI.isTypeDesirableForOp(Opc, VT))
return SDValue();
// Consult target whether it is a good idea to promote this operation and
// what's the right type to promote it to.
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
// fold (aext (aext x)) -> (aext x)
// fold (aext (zext x)) -> (zext x)
// fold (aext (sext x)) -> (sext x)
LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0));
return SDValue();
/// Promote an unindexed scalar integer load to a wider type when the target
/// considers the original type undesirable.
/// Only runs once LegalOperations is set. On success the load is re-emitted as
/// an extending load in the promoted type, users of value 0 are redirected to
/// a TRUNCATE of the new load, users of value 1 to the new load's value 1, and
/// true is returned. Returns false when nothing was changed.
/// NOTE(review): the declaration of PVT and any worklist statements after the
/// replacements are not visible here — confirm against the full function.
bool DAGCombiner::PromoteLoad(SDValue Op) {
if (!LegalOperations)
return false;
if (!ISD::isUNINDEXEDLoad(Op.getNode()))
return false;
EVT VT = Op.getValueType();
if (VT.isVector() || !VT.isInteger())
return false;
// If operation type is 'undesirable', e.g. i16 on x86, consider
// promoting it.
unsigned Opc = Op.getOpcode();
if (TLI.isTypeDesirableForOp(Opc, VT))
return false;
// Consult target whether it is a good idea to promote this operation and
// what's the right type to promote it to.
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
SDLoc DL(Op);
SDNode *N = Op.getNode();
LoadSDNode *LD = cast<LoadSDNode>(N);
EVT MemVT = LD->getMemoryVT();
// A non-extending load uses EXTLOAD; otherwise the load's own extension type
// is kept.
ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
: LD->getExtensionType();
SDValue NewLD = DAG.getExtLoad(ExtType, DL, PVT,
LD->getChain(), LD->getBasePtr(),
MemVT, LD->getMemOperand());
// Narrow the promoted value back to the type existing users expect.
SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
LLVM_DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: ";
Result.getNode()->dump(&DAG); dbgs() << '\n');
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
return true;
return false;
/// Recursively delete a node which has no uses and any operands for
/// which it is the only use.
/// Note that this both deletes the nodes and removes them from the worklist.
/// It also adds any nodes who have had a user deleted to the worklist as they
/// may now have only one use and subject to other combines.
/// Returns false without doing anything if \p N still has uses; returns true
/// once the pending set of nodes has been processed.
/// NOTE(review): the statements that fill the set, delete nodes and re-queue
/// survivors are not visible here — confirm against the full function.
bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
if (!N->use_empty())
return false;
// Set of nodes still to be examined; entries are popped from the back.
SmallSetVector<SDNode *, 16> Nodes;
do {
N = Nodes.pop_back_val();
if (!N)
// Nodes without uses have their operands visited; nodes that still have
// uses take the else branch.
if (N->use_empty()) {
for (const SDValue &ChildN : N->op_values())
} else {
} while (!Nodes.empty());
return true;
// Main DAG Combiner implementation
/// Run the combiner over the DAG at combine level \p AtLevel.
/// Records the level and derives the LegalDAG / LegalOperations / LegalTypes
/// flags from it, then repeatedly takes nodes from getNextWorklistEntry(),
/// combines each one and substitutes the result for the original node.
/// NOTE(review): many single-statement lines (worklist insertions, `continue`
/// statements, deletion of dead nodes, the final root update) are not visible
/// here — confirm the control flow against the full function.
void DAGCombiner::Run(CombineLevel AtLevel) {
// set the instance variables, so that the various visit routines may use it.
Level = AtLevel;
LegalDAG = Level >= AfterLegalizeDAG;
LegalOperations = Level >= AfterLegalizeVectorOps;
LegalTypes = Level >= AfterLegalizeTypes;
// A WorklistInserter is alive for the whole run.
WorklistInserter AddNodes(*this);
// Add all the dag nodes to the worklist.
for (SDNode &Node : DAG.allnodes())
// Create a dummy node (which is not added to allnodes), that adds a reference
// to the root node, preventing it from being deleted, and tracking any
// changes of the root.
HandleSDNode Dummy(DAG.getRoot());
// While we have a valid worklist entry node, try to combine it.
while (SDNode *N = getNextWorklistEntry()) {
// If N has no uses, it is dead. Make sure to revisit all N's operands once
// N is deleted from the DAG, since they too may now be dead or may have a
// reduced number of uses, allowing other xforms.
if (recursivelyDeleteUnusedNodes(N))
WorklistRemover DeadNodes(*this);
// If this combine is running after legalizing the DAG, re-legalize any
// nodes pulled off the worklist.
if (LegalDAG) {
SmallSetVector<SDNode *, 16> UpdatedNodes;
bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
for (SDNode *LN : UpdatedNodes)
if (!NIsValid)
LLVM_DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
// Add any operands of the new node which have not yet been combined to the
// worklist as well. Because the worklist uniques things already, this
// won't repeatedly process the same operand.
for (const SDValue &ChildN : N->op_values())
if (!CombinedNodes.count(ChildN.getNode()))
SDValue RV = combine(N);
// An empty result is tested for here.
if (!RV.getNode())
// If we get back the same node we passed in, rather than a new node or
// zero, we know that the node must have defined multiple values and
// CombineTo was used. Since CombineTo takes care of the worklist
// mechanics for us, we have no work to do in this case.
if (RV.getNode() == N)
assert(N->getOpcode() != ISD::DELETED_NODE &&
RV.getOpcode() != ISD::DELETED_NODE &&
"Node was deleted but visit returned new node!");
LLVM_DEBUG(dbgs() << " ... into: "; RV.getNode()->dump(&DAG));
// Same number of results: replace node for node. Otherwise N must have a
// single value whose type matches RV, and only that value is replaced.
if (N->getNumValues() == RV.getNode()->getNumValues())
DAG.ReplaceAllUsesWith(N, RV.getNode());
else {
assert(N->getValueType(0) == RV.getValueType() &&
N->getNumValues() == 1 && "Type mismatch");
DAG.ReplaceAllUsesWith(N, &RV);
// Push the new node and any users onto the worklist
// Finally, if the node is now dead, remove it from the graph. The node
// may not be dead if the replacement process recursively simplified to
// something else needing this node. This will also take care of adding any
// operands which have lost a user to the worklist.
// If the root changed (e.g. it was a dead load, update the root).
/// Dispatch \p N to the visit routine that matches its opcode.
/// Several opcodes share one routine (e.g. SADDO and UADDO both go to
/// visitADDO, and the four integer min/max opcodes go to visitIMINMAX).
/// \returns whatever the selected routine returns; opcodes with no case reach
/// the `default` label and yield a null SDValue.
SDValue DAGCombiner::visit(SDNode *N) {
switch (N->getOpcode()) {
default: break;
// Chain / multi-value plumbing.
case ISD::TokenFactor: return visitTokenFactor(N);
case ISD::MERGE_VALUES: return visitMERGE_VALUES(N);
// Integer add/sub families, including saturating, carry and overflow forms.
case ISD::ADD: return visitADD(N);
case ISD::SUB: return visitSUB(N);
case ISD::UADDSAT: return visitADDSAT(N);
case ISD::USUBSAT: return visitSUBSAT(N);
case ISD::ADDC: return visitADDC(N);
case ISD::SADDO:
case ISD::UADDO: return visitADDO(N);
case ISD::SUBC: return visitSUBC(N);
case ISD::SSUBO:
case ISD::USUBO: return visitSUBO(N);
case ISD::ADDE: return visitADDE(N);
case ISD::ADDCARRY: return visitADDCARRY(N);
case ISD::SUBE: return visitSUBE(N);
case ISD::SUBCARRY: return visitSUBCARRY(N);
// Multiplication, division and remainder.
case ISD::UMULFIXSAT: return visitMULFIX(N);
case ISD::MUL: return visitMUL(N);
case ISD::SDIV: return visitSDIV(N);
case ISD::UDIV: return visitUDIV(N);
case ISD::SREM:
case ISD::UREM: return visitREM(N);
case ISD::MULHU: return visitMULHU(N);
case ISD::MULHS: return visitMULHS(N);
case ISD::SMUL_LOHI: return visitSMUL_LOHI(N);
case ISD::UMUL_LOHI: return visitUMUL_LOHI(N);
case ISD::SMULO:
case ISD::UMULO: return visitMULO(N);
// Integer min/max.
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX: return visitIMINMAX(N);
// Bitwise logic, shifts, rotates and bit manipulation.
case ISD::AND: return visitAND(N);
case ISD::OR: return visitOR(N);
case ISD::XOR: return visitXOR(N);
case ISD::SHL: return visitSHL(N);
case ISD::SRA: return visitSRA(N);
case ISD::SRL: return visitSRL(N);
case ISD::ROTR:
case ISD::ROTL: return visitRotate(N);
case ISD::FSHL:
case ISD::FSHR: return visitFunnelShift(N);
case ISD::ABS: return visitABS(N);
case ISD::BSWAP: return visitBSWAP(N);
case ISD::BITREVERSE: return visitBITREVERSE(N);
case ISD::CTLZ: return visitCTLZ(N);
case ISD::CTTZ: return visitCTTZ(N);
case ISD::CTPOP: return visitCTPOP(N);
// Selects and comparisons.
case ISD::SELECT: return visitSELECT(N);
case ISD::VSELECT: return visitVSELECT(N);
case ISD::SELECT_CC: return visitSELECT_CC(N);
case ISD::SETCC: return visitSETCC(N);
case ISD::SETCCCARRY: return visitSETCCCARRY(N);
// Extensions, assertions, truncation and casts.
case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N);
case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
case ISD::AssertSext:
case ISD::AssertZext: return visitAssertExt(N);
case ISD::AssertAlign: return visitAssertAlign(N);
case ISD::TRUNCATE: return visitTRUNCATE(N);
case ISD::BITCAST: return visitBITCAST(N);
case ISD::BUILD_PAIR: return visitBUILD_PAIR(N);
// Floating-point arithmetic and conversions.
case ISD::FADD: return visitFADD(N);
case ISD::FSUB: return visitFSUB(N);
case ISD::FMUL: return visitFMUL(N);
case ISD::FMA: return visitFMA(N);
case ISD::FDIV: return visitFDIV(N);
case ISD::FREM: return visitFREM(N);
case ISD::FSQRT: return visitFSQRT(N);
case ISD::FCOPYSIGN: return visitFCOPYSIGN(N);
case ISD::FPOW: return visitFPOW(N);
case ISD::SINT_TO_FP: return visitSINT_TO_FP(N);
case ISD::UINT_TO_FP: return visitUINT_TO_FP(N);
case ISD::FP_TO_SINT: return visitFP_TO_SINT(N);
case ISD::FP_TO_UINT: return visitFP_TO_UINT(N);
case ISD::FP_ROUND: return visitFP_ROUND(N);
case ISD::FP_EXTEND: return visitFP_EXTEND(N);
case ISD::FNEG: return visitFNEG(N);
case ISD::FABS: return visitFABS(N);
case ISD::FFLOOR: return visitFFLOOR(N);
case ISD::FMINNUM: return visitFMINNUM(N);
case ISD::FMAXNUM: return visitFMAXNUM(N);
case ISD::FMINIMUM: return visitFMINIMUM(N);
case ISD::FMAXIMUM: return visitFMAXIMUM(N);
case ISD::FCEIL: return visitFCEIL(N);
case ISD::FTRUNC: return visitFTRUNC(N);
// Branches, memory operations and vector construction.
case ISD::BRCOND: return visitBRCOND(N);
case ISD::BR_CC: return visitBR_CC(N);
case ISD::LOAD: return visitLOAD(N);
case ISD::STORE: return visitSTORE(N);
case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N);
case ISD::MGATHER: return visitMGATHER(N);
case ISD::MLOAD: return visitMLOAD(N);
case ISD::MSCATTER: return visitMSCATTER(N);
case ISD::MSTORE: return visitMSTORE(N);
case ISD::LIFETIME_END: return visitLIFETIME_END(N);
case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
case ISD::FP16_TO_FP: return visitFP16_TO_FP(N);
case ISD::FREEZE: return visitFREEZE(N);
// No case matched: report that no combine was performed.
return SDValue();
/// Try to simplify \p N, attempting in order: the generic visit() routines
/// (skipped when DisableGenericCombines is set), the target's
/// PerformDAGCombine hook, promotion for a fixed set of opcodes, and finally
/// reuse of an already-existing node with the two operands commuted.
/// \returns the replacement value, or a null SDValue when nothing applied.
// NOTE(review): the declaration that introduces `DagCombineInfo`, several
// `break;` statements and a call continuation are missing from this listing --
// confirm against upstream.
SDValue DAGCombiner::combine(SDNode *N) {
SDValue RV;
if (!DisableGenericCombines)
RV = visit(N);
// If nothing happened, try a target-specific DAG combine.
if (!RV.getNode()) {
assert(N->getOpcode() != ISD::DELETED_NODE &&
"Node was deleted but visit returned NULL!");
// The target hook is consulted for opcodes at or beyond BUILTIN_OP_END, and
// for built-in opcodes the target has registered a combine for.
if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {
// Expose the DAG combiner to the target combiner impls.
DagCombineInfo(DAG, Level, false, this);
RV = TLI.PerformDAGCombine(N, DagCombineInfo);
// If nothing happened still, try promoting the operation.
if (!RV.getNode()) {
switch (N->getOpcode()) {
default: break;
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
RV = PromoteIntBinOp(SDValue(N, 0));
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
RV = PromoteIntShiftOp(SDValue(N, 0));
RV = PromoteExtend(SDValue(N, 0));
case ISD::LOAD:
// PromoteLoad reports success with a bool; on success N itself is returned.
if (PromoteLoad(SDValue(N, 0)))
RV = SDValue(N, 0);
// If N is a commutative binary node, try to eliminate it if the commuted
// version is already present in the DAG.
if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) &&
N->getNumValues() == 1) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Constant operands are canonicalized to RHS.
if (N0 != N1 && (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1))) {
// Look up an existing node with the operands swapped.
SDValue Ops[] = {N1, N0};
SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops,
if (CSENode)
return SDValue(CSENode, 0);
return RV;
/// Given a node, return its input chain if it has one, otherwise return a null
/// sd operand.
/// An operand is treated as the chain when its value type is MVT::Other.
static SDValue getInputChainForNode(SDNode *N) {
// Nodes without operands have no chain.
if (unsigned NumOps = N->getNumOperands()) {
// Check the first operand, then the last one, before scanning the operands
// in between.
if (N->getOperand(0).getValueType() == MVT::Other)
return N->getOperand(0);
if (N->getOperand(NumOps-1).getValueType() == MVT::Other)
return N->getOperand(NumOps-1);
for (unsigned i = 1; i < NumOps-1; ++i)
if (N->getOperand(i).getValueType() == MVT::Other)
return N->getOperand(i);
return SDValue();
/// Simplify a TokenFactor node. In order, this drops the redundant operand of
/// a two-operand token factor, inlines one-use operand token factors into this
/// one, skips entry tokens and duplicate operands, and prunes operands that
/// are reached while walking up another operand's chain.
/// \returns the replacement chain when something changed, otherwise a null
/// SDValue (also returned when optimization is disabled).
// NOTE(review): many statements (`break;`, `continue;`, worklist pushes and
// closing braces) are missing from this listing -- confirm against upstream
// before drawing conclusions about fall-through between the cases below.
SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
// If N has two operands, where one has an input chain equal to the other,
// the 'other' chain is redundant.
if (N->getNumOperands() == 2) {
if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
return N->getOperand(0);
if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
return N->getOperand(1);
// Don't simplify token factors if optnone.
if (OptLevel == CodeGenOpt::None)
return SDValue();
// If the sole user is a token factor, we should make sure we have a
// chance to merge them together. This prevents TF chains from inhibiting
// optimizations.
if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::TokenFactor)
SmallVector<SDNode *, 8> TFs; // List of token factors to visit.
SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
SmallPtrSet<SDNode*, 16> SeenOps;
bool Changed = false; // If we should replace this token factor.
// Start out with this token factor.
// Iterate through token factors. The TFs grows when new token factors are
// encountered.
for (unsigned i = 0; i < TFs.size(); ++i) {
// Limit number of nodes to inline, to avoid quadratic compile times.
// We have to add the outstanding Token Factors to Ops, otherwise we might
// drop Ops from the resulting Token Factors.
if (Ops.size() > TokenFactorInlineLimit) {
for (unsigned j = i; j < TFs.size(); j++)
Ops.emplace_back(TFs[j], 0);
// Drop unprocessed Token Factors from TFs, so we do not add them to the
// combiner worklist later.
SDNode *TF = TFs[i];
// Check each of the operands.
for (const SDValue &Op : TF->op_values()) {
switch (Op.getOpcode()) {
case ISD::EntryToken:
// Entry tokens don't need to be added to the list. They are
// redundant.
Changed = true;
case ISD::TokenFactor:
// Only one-use token factors not already queued are inlined.
if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) {
// Queue up for processing.
Changed = true;
// Only add if it isn't already in the list.
if (SeenOps.insert(Op.getNode()).second)
Changed = true;
// Re-visit inlined Token Factors, to clean them up in case they have been
// removed. Skip the first Token Factor, as this is the current node.
for (unsigned i = 1, e = TFs.size(); i < e; i++)
// Remove Nodes that are chained to another node in the list. Do so
// by walking up chains breadth-first stopping when we've seen
// another operand. In general we must climb to the EntryNode, but we can exit
// early if we find all remaining work is associated with just one operand as
// no further pruning is possible.
// List of nodes to search through and original Ops from which they originate.
SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
SmallPtrSet<SDNode *, 16> SeenChains;
bool DidPruneOps = false;
// Seed the search with every operand, tagged with its index in Ops.
unsigned NumLeftToConsider = 0;
for (const SDValue &Op : Ops) {
Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
// Helper taking the current worklist position (CurIdx), a chain node (Op)
// and the index of the operand whose search reached it (OpNumber).
auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
// If this is an Op, we can remove the op from the list. Re-mark any
// search associated with it as from the current OpNumber.
if (SeenOps.count(Op) != 0) {
Changed = true;
DidPruneOps = true;
// Linear search for the index of Op within Ops.
unsigned OrigOpNumber = 0;
while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
assert((OrigOpNumber != Ops.size()) &&
"expected to find TokenFactor Operand");
// Re-mark worklist from OrigOpNumber to OpNumber
for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
if (Worklist[i].second == OrigOpNumber) {
Worklist[i].second = OpNumber;
// Transfer the outstanding work count to the surviving operand.
OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
OpWorkCount[OrigOpNumber] = 0;
// Add if it's a new chain
if (SeenChains.insert(Op).second) {
Worklist.push_back(std::make_pair(Op, OpNumber));
// The walk is capped at 1024 worklist entries.
for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
// We need to consider at least 2 Ops to prune.
if (NumLeftToConsider <= 1)
auto CurNode = Worklist[i].first;
auto CurOpNumber = Worklist[i].second;
assert((OpWorkCount[CurOpNumber] > 0) &&
"Node should not appear in worklist");
switch (CurNode->getOpcode()) {
case ISD::EntryToken:
// Hitting EntryToken is the only way for the search to terminate without
// hitting another operand's search. Prevent us from marking this operand
// considered.
case ISD::TokenFactor:
for (const SDValue &Op : CurNode->op_values())
AddToWorklist(i, Op.getNode(), CurOpNumber);
case ISD::CopyFromReg:
case ISD::CopyToReg:
// Operand 0 is followed upwards for these register copies.
AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
// Memory nodes are followed through their chain operand.
if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
if (OpWorkCount[CurOpNumber] == 0)
// If we've changed things around then replace token factor.
if (Changed) {
SDValue Result;
if (Ops.empty()) {
// The entry token is the only possible outcome.
Result = DAG.getEntryNode();
} else {
if (DidPruneOps) {
// Keep only the operands that were not reached as part of a chain walk.
SmallVector<SDValue, 8> PrunedOps;
for (const SDValue &Op : Ops) {
if (SeenChains.count(Op.getNode()) == 0)
Result = DAG.getTokenFactor(SDLoc(N), PrunedOps);
} else {
Result = DAG.getTokenFactor(SDLoc(N), Ops);
return Result;
return SDValue();
/// MERGE_VALUES can always be eliminated.
/// \returns \p N itself (result 0) so the caller does not revisit it.
// NOTE(review): the statements inside the loops below are missing from this
// listing -- confirm against upstream.
SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
WorklistRemover DeadNodes(*this);
// Replacing results may cause a different MERGE_VALUES to suddenly
// be CSE'd with N, and carry its uses with it. Iterate until no
// uses remain, to ensure that the node can be safely deleted.
// First add the users of this node to the work list so that they
// can be tried again once they have new operands.
do {
// Do as a single replacement to avoid rewalking use lists.
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
} while (!N->use_empty());
return SDValue(N, 0); // Return N so it doesn't get rechecked!
/// If \p N is a ConstantSDNode with isOpaque() == false return it cast to a
/// ConstantSDNode pointer else nullptr.
static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
// dyn_cast yields nullptr when N is not a ConstantSDNode.
ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N);
return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
/// Try to fold binary operator \p BO into a one-use SELECT operand whose arms
/// are constants, producing a select between the two folded arms so that the
/// binop itself disappears.
/// \returns the new select, or a null SDValue when the pattern does not match.
// NOTE(review): the second half of several `&&` conditions below is missing
// from this listing -- confirm against upstream.
SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
assert(TLI.isBinOp(BO->getOpcode()) && BO->getNumValues() == 1 &&
"Unexpected binary operator");
// Don't do this unless the old select is going away. We want to eliminate the
// binary operator, not replace a binop with a select.
// SelOpNo records which operand of BO is the select; operand 0 is tried
// first, then operand 1.
unsigned SelOpNo = 0;
SDValue Sel = BO->getOperand(0);
if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) {
SelOpNo = 1;
Sel = BO->getOperand(1);
if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
return SDValue();
// CT / CF are the select's true and false arms.
SDValue CT = Sel.getOperand(1);
if (!isConstantOrConstantVector(CT, true) &&
return SDValue();
SDValue CF = Sel.getOperand(2);
if (!isConstantOrConstantVector(CF, true) &&
return SDValue();
// Bail out if any constants are opaque because we can't constant fold those.
// The exception is "and" and "or" with either 0 or -1 in which case we can
// propagate non constant operands into select. I.e.:
// and (select Cond, 0, -1), X --> select Cond, 0, X
// or X, (select Cond, -1, 0) --> select Cond, -1, X
auto BinOpcode = BO->getOpcode();
bool CanFoldNonConst =
(BinOpcode == ISD::AND || BinOpcode == ISD::OR) &&
(isNullOrNullSplat(CT) || isAllOnesOrAllOnesSplat(CT)) &&
(isNullOrNullSplat(CF) || isAllOnesOrAllOnesSplat(CF));
// CBO is the operand of BO that is not the select.
SDValue CBO = BO->getOperand(SelOpNo ^ 1);
if (!CanFoldNonConst &&
!isConstantOrConstantVector(CBO, true) &&
return SDValue();
EVT VT = Sel.getValueType();
// In case of shift value and shift amount may have different VT. For instance
// on x86 shift amount is i8 regardless of LHS type. Bail out if we have
// swapped operands and value types do not match. NB: x86 is fine if operands
// are not swapped with shift amount VT being not bigger than shifted value.
// TODO: it is possible to check for a shift operation, correct VTs and
// still perform optimization on x86 if needed.
if (SelOpNo && VT != CBO.getValueType())
return SDValue();
// We have a select-of-constants followed by a binary operator with a
// constant. Eliminate the binop by pulling the constant math into the select.
// Example: add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO
SDLoc DL(Sel);
// Operand order is preserved: when the select was operand 1, CBO stays on
// the left-hand side of the new binop.
SDValue NewCT = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CT)
: DAG.getNode(BinOpcode, DL, VT, CT, CBO);
if (!CanFoldNonConst && !NewCT.isUndef() &&
!isConstantOrConstantVector(NewCT, true) &&
return SDValue();
SDValue NewCF = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CF)
: DAG.getNode(BinOpcode, DL, VT, CF, CBO);
if (!CanFoldNonConst && !NewCF.isUndef() &&
!isConstantOrConstantVector(NewCF, true) &&
return SDValue();
// Rebuild the select on the original condition with the folded arms.
SDValue SelectOp = DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
return SelectOp;
/// Fold an add/sub of a constant and a zero-extended `(X & 1) == 0` compare
/// into a sub/add of the low bit itself with an adjusted constant.
/// \param N an ISD::ADD or ISD::SUB node (asserted).
/// \returns the replacement node, or a null SDValue if the pattern is absent.
// NOTE(review): the last clause of the SetCC operand check is missing from
// this listing -- confirm against upstream.
static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
"Expecting add or sub");
// Match a constant operand and a zext operand for the math instruction:
// add Z, C
// sub C, Z
bool IsAdd = N->getOpcode() == ISD::ADD;
SDValue C = IsAdd ? N->getOperand(1) : N->getOperand(0);
SDValue Z = IsAdd ? N->getOperand(0) : N->getOperand(1);
auto *CN = dyn_cast<ConstantSDNode>(C);
if (!CN || Z.getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
// Match the zext operand as a setcc of a boolean.
if (Z.getOperand(0).getOpcode() != ISD::SETCC ||
Z.getOperand(0).getValueType() != MVT::i1)
return SDValue();
// Match the compare as: setcc (X & 1), 0, eq.
SDValue SetCC = Z.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
if (CC != ISD::SETEQ || !isNullConstant(SetCC.getOperand(1)) ||
SetCC.getOperand(0).getOpcode() != ISD::AND ||
return SDValue();
// We are adding/subtracting a constant and an inverted low bit. Turn that
// into a subtract/add of the low bit with incremented/decremented constant:
// add (zext i1 (seteq (X & 1), 0)), C --> sub C+1, (zext (X & 1))
// sub C, (zext i1 (seteq (X & 1), 0)) --> add C-1, (zext (X & 1))
EVT VT = C.getValueType();
SDLoc DL(N);
// The masked value is resized to the type of the constant operand.
SDValue LowBit = DAG.getZExtOrTrunc(SetCC.getOperand(0), DL, VT);
SDValue C1 = IsAdd ? DAG.getConstant(CN->getAPIntValue() + 1, DL, VT) :
DAG.getConstant(CN->getAPIntValue() - 1, DL, VT);
// The opcode is flipped: an add becomes a sub and vice versa.
return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit);
/// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
/// a shift and add with a different constant.
/// \param N an ISD::ADD or ISD::SUB node (asserted).
/// \returns the new add, or a null SDValue when the pattern does not match or
/// the adjusted constant cannot be folded.
static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
"Expecting add or sub");
// We need a constant operand for the add/sub, and the other operand is a
// logical shift right: add (srl), C or sub C, (srl).
bool IsAdd = N->getOpcode() == ISD::ADD;
SDValue ConstantOp = IsAdd ? N->getOperand(1) : N->getOperand(0);
SDValue ShiftOp = IsAdd ? N->getOperand(0) : N->getOperand(1);
if (!DAG.isConstantIntBuildVectorOrConstantInt(ConstantOp) ||
ShiftOp.getOpcode() != ISD::SRL)
return SDValue();
// The shift must be of a 'not' value.
SDValue Not = ShiftOp.getOperand(0);
if (!Not.hasOneUse() || !isBitwiseNot(Not))
return SDValue();
// The shift must be moving the sign bit to the least-significant-bit.
EVT VT = ShiftOp.getValueType();
SDValue ShAmt = ShiftOp.getOperand(1);
ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
if (!ShAmtC || ShAmtC->getAPIntValue() != (VT.getScalarSizeInBits() - 1))
return SDValue();
// Eliminate the 'not' by adjusting the shift and add/sub constant:
// add (srl (not X), 31), C --> add (sra X, 31), (C + 1)
// sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1)
SDLoc DL(N);
// The add form switches to an arithmetic shift; the sub form keeps the
// logical shift.
auto ShOpcode = IsAdd ? ISD::SRA : ISD::SRL;
SDValue NewShift = DAG.getNode(ShOpcode, DL, VT, Not.getOperand(0), ShAmt);
// Compute C + 1 or C - 1; the fold is abandoned if this does not constant
// fold.
if (SDValue NewC =
DAG.FoldConstantArithmetic(IsAdd ? ISD::ADD : ISD::SUB, DL, VT,
{ConstantOp, DAG.getConstant(1, DL, VT)}))
return DAG.getNode(ISD::ADD, DL, VT, NewShift, NewC);
return SDValue();
/// Try to fold a node that behaves like an ADD (note that N isn't necessarily
/// an ISD::ADD here, it could for example be an ISD::OR if we know that there
/// are no common bits set in the operands).
/// \returns the simplified value, or a null SDValue if no fold applied.
// NOTE(review): several `DAG.getNode(...)` calls below have lost their
// continuation lines, and closing braces are absent from this listing --
// confirm against upstream.
SDValue DAGCombiner::visitADDLike(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
SDLoc DL(N);
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (add x, 0) -> x, vector edition
if (ISD::isBuildVectorAllZeros(N1.getNode()))
return N0;
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return N1;
// fold (add x, undef) -> undef
if (N0.isUndef())
return N0;
if (N1.isUndef())
return N1;
if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
// canonicalize constant to RHS
if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
// fold (add c1, c2) -> c1+c2
return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1});
// fold (add x, 0) -> x
if (isNullConstant(N1))
return N0;
// Folds that need a non-opaque constant on the right-hand side.
if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) {
// fold ((A-c1)+c2) -> (A+(c2-c1))
if (N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) {
SDValue Sub =
DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N1, N0.getOperand(1)});
assert(Sub && "Constant folding failed");
return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Sub);
// fold ((c1-A)+c2) -> (c1+c2)-A
if (N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) {
SDValue Add =
DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N0.getOperand(0)});
assert(Add && "Constant folding failed");
return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
// add (sext i1 X), 1 -> zext (not i1 X)
// We don't transform this pattern:
// add (zext i1 X), -1 -> sext (not i1 X)
// because most (?) targets generate better code for the zext form.
if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
isOneOrOneSplat(N1)) {
SDValue X = N0.getOperand(0);
// Once operations must be legal, both the XOR on X's type and the
// ZERO_EXTEND to VT have to be legal; X must also be one bit wide.
if ((!LegalOperations ||
(TLI.isOperationLegal(ISD::XOR, X.getValueType()) &&
TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) &&
X.getScalarValueSizeInBits() == 1) {
SDValue Not = DAG.getNOT(DL, X, X.getValueType());
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not);
// Fold (add (or x, c0), c1) -> (add x, (c0 + c1)) if (or x, c0) is
// equivalent to (add x, c0).
if (N0.getOpcode() == ISD::OR &&
isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true) &&
DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) {
if (SDValue Add0 = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT,
{N1, N0.getOperand(1)}))
return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0);
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// reassociate add
if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N0, N1)) {
if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
return RADD;
// fold ((0-A) + B) -> B-A
if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0)))
return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
// fold (A + (0-B)) -> A-B
if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1));
// fold (A+(B-A)) -> B
if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
return N1.getOperand(0);
// fold ((B-A)+A) -> B
if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
return N0.getOperand(0);
// fold ((A-B)+(C-A)) -> (C-B)
if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
N0.getOperand(0) == N1.getOperand(1))
return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
// fold ((A-B)+(B-C)) -> (A-C)
if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
N0.getOperand(1) == N1.getOperand(0))
return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
// fold (A+(B-(A+C))) to (B-C)
if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
N0 == N1.getOperand(1).getOperand(0))
return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
// fold (A+(B-(C+A))) to (B-C)
if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
N0 == N1.getOperand(1).getOperand(1))
return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
// fold (A+((B-A)+or-C)) to (B+or-C)
if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) &&
N1.getOperand(0).getOpcode() == ISD::SUB &&
N0 == N1.getOperand(0).getOperand(1))
return DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0).getOperand(0),
// fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
if (isConstantOrConstantVector(N00) || isConstantOrConstantVector(N10))
return DAG.getNode(ISD::SUB, DL, VT,
DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10),
DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11));
// fold (add (umax X, C), -C) --> (usubsat X, C)
if (N0.getOpcode() == ISD::UMAX && hasOperation(ISD::USUBSAT, VT)) {
// Per-element predicate: both elements absent, or Max == -Op.
auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) {
return (!Max && !Op) ||
(Max && Op && Max->getAPIntValue() == (-Op->getAPIntValue()));
if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchUSUBSAT,
/*AllowUndefs*/ true))
return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0),
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// Folds for an addend of one (scalar or splat).
if (isOneOrOneSplat(N1)) {
// fold (add (xor a, -1), 1) -> (sub 0, a)
if (isBitwiseNot(N0))
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
// fold (add (add (xor a, -1), b), 1) -> (sub b, a)
if (N0.getOpcode() == ISD::ADD ||
N0.getOpcode() == ISD::UADDO ||
N0.getOpcode() == ISD::SADDO) {
// Xor is whichever operand of N0 is a bitwise-not; A is the other one.
SDValue A, Xor;
if (isBitwiseNot(N0.getOperand(0))) {
A = N0.getOperand(1);
Xor = N0.getOperand(0);
} else if (isBitwiseNot(N0.getOperand(1))) {
A = N0.getOperand(0);
Xor = N0.getOperand(1);
if (Xor)
return DAG.getNode(ISD::SUB, DL, VT, A, Xor.getOperand(0));
// Look for:
// add (add x, y), 1
// And if the target does not like this form then turn into:
// sub y, (xor x, -1)
if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() &&
N0.getOpcode() == ISD::ADD) {
SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
DAG.getAllOnesConstant(DL, VT));
return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not);
// (x - y) + -1 -> add (xor y, -1), x
if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
isAllOnesOrAllOnesSplat(N1)) {
SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), N1);
return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
// Try the order-sensitive helper with the operands in both orders.
if (SDValue Combined = visitADDLikeCommutative(N0, N1, N))
return Combined;
if (SDValue Combined = visitADDLikeCommutative(N1, N0, N))
return Combined;
return SDValue();
/// Combine an ISD::ADD node: first the folds shared with other add-like nodes
/// (visitADDLike), then folds that apply only to a real ADD.
/// \returns the simplified value, or a null SDValue if no fold applied.
SDValue DAGCombiner::visitADD(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
SDLoc DL(N);
if (SDValue Combined = visitADDLike(N))
return Combined;
if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
return V;
if (SDValue V = foldAddSubOfSignBit(N, DAG))
return V;
// fold (a+b) -> (a|b) iff a and b share no bits.
if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
DAG.haveNoCommonBitsSet(N0, N1))
return DAG.getNode(ISD::OR, DL, VT, N0, N1);
// Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)).
if (N0.getOpcode() == ISD::VSCALE && N1.getOpcode() == ISD::VSCALE) {
// Operand 0 of each VSCALE node is read as its constant multiplier.
APInt C0 = N0->getConstantOperandAPInt(0);
APInt C1 = N1->getConstantOperandAPInt(0);
return DAG.getVScale(DL, VT, C0 + C1);
// fold a+vscale(c1)+vscale(c2) -> a+vscale(c1+c2)
if ((N0.getOpcode() == ISD::ADD) &&
(N0.getOperand(1).getOpcode() == ISD::VSCALE) &&
(N1.getOpcode() == ISD::VSCALE)) {
auto VS0 = N0.getOperand(1)->getConstantOperandAPInt(0);
auto VS1 = N1->getConstantOperandAPInt(0);
auto VS = DAG.getVScale(DL, VT, VS0 + VS1);
return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), VS);
return SDValue();
/// Combine a saturating-add node. The node's own opcode is reused whenever a
/// saturating node is rebuilt or constant folded.
/// \returns the simplified value, or a null SDValue if no fold applied.
SDValue DAGCombiner::visitADDSAT(SDNode *N) {
unsigned Opcode = N->getOpcode();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
SDLoc DL(N);
// fold vector ops
if (VT.isVector()) {
// TODO SimplifyVBinOp
// fold (add_sat x, 0) -> x, vector edition
if (ISD::isBuildVectorAllZeros(N1.getNode()))
return N0;
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return N1;
// fold (add_sat x, undef) -> -1
if (N0.isUndef() || N1.isUndef())
return DAG.getAllOnesConstant(DL, VT);
if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
// canonicalize constant to RHS
if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(Opcode, DL, VT, N1, N0);
// fold (add_sat c1, c2) -> c3
return DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1});
// fold (add_sat x, 0) -> x
if (isNullConstant(N1))
return N0;
// If it cannot overflow, transform into an add.
// Only the unsigned saturating opcode is handled by this check.
if (Opcode == ISD::UADDSAT)
if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
return SDValue();
/// Look through TRUNCATE / ZERO_EXTEND / (AND x, 1) wrappers around \p V and
/// return the underlying value if it is result number 1 of an ADDCARRY,
/// SUBCARRY, UADDO or USUBO node that is legal or custom for its result type.
/// \returns that value, or a null SDValue when \p V is not such a carry.
// NOTE(review): the loop's `continue;`/`break;` statements and the right-hand
// side of the final boolean-contents comparison are missing from this listing
// -- confirm against upstream.
static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) {
// Set when an (AND x, 1) wrapper was peeled off.
bool Masked = false;
// First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization.
while (true) {
if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) {
V = V.getOperand(0);
if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) {
Masked = true;
V = V.getOperand(0);
// If this is not a carry, return.
if (V.getResNo() != 1)
return SDValue();
if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY &&
V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO)
return SDValue();
// Legality is checked against the type of the node's first result.
EVT VT = V.getNode()->getValueType(0);
if (!TLI.isOperationLegalOrCustom(V.getOpcode(), VT))
return SDValue();
// If the result is masked, then no matter what kind of bool it is we can
// return. If it isn't, then we need to make sure the bool type is either 0 or
// 1 and not other values.
if (Masked ||
TLI.getBooleanContents(V.getValueType()) ==
return V;
return SDValue();
/// Given the operands of an add/sub operation, see if the 2nd operand is a
/// masked 0/1 whose source operand is actually known to be 0/-1. If so, invert
/// the opcode and bypass the mask operation.
/// \param IsAdd true when the original operation is an add, false for a sub.
/// \returns the replacement node, or a null SDValue if the pattern is absent.
static SDValue foldAddSubMasked1(bool IsAdd, SDValue N0, SDValue N1,
SelectionDAG &DAG, const SDLoc &DL) {
// N1 must be (and X, 1), with the 1 possibly splatted.
if (N1.getOpcode() != ISD::AND || !isOneOrOneSplat(N1->getOperand(1)))
return SDValue();
EVT VT = N0.getValueType();
// Every bit of X must be a copy of the sign bit, i.e. X is 0 or -1.
if (DAG.ComputeNumSignBits(N1.getOperand(0)) != VT.getScalarSizeInBits())
return SDValue();
// add N0, (and (AssertSext X, i1), 1) --> sub N0, X
// sub N0, (and (AssertSext X, i1), 1) --> add N0, X
return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, N0, N1.getOperand(0));
/// Helper for doing combines based on N0 and N1 being added to each other.
/// The patterns are order-sensitive, so the result can differ when the two
/// operands are swapped.
/// \param LocReference node used only to build the SDLoc for new nodes.
/// \returns the simplified value, or a null SDValue if no fold applied.
// NOTE(review): the first fold below has lost part of its condition and the
// tail of its `getNode` call in this listing -- confirm against upstream.
SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1,
SDNode *LocReference) {
EVT VT = N0.getValueType();
SDLoc DL(LocReference);
// fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&
return DAG.getNode(ISD::SUB, DL, VT, N0,
DAG.getNode(ISD::SHL, DL, VT,
if (SDValue V = foldAddSubMasked1(true, N0, N1, DAG, DL))
return V;
// Look for:
// add (add x, 1), y
// And if the target does not like this form then turn into:
// sub y, (xor x, -1)
if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.hasOneUse() &&
N0.getOpcode() == ISD::ADD && isOneOrOneSplat(N0.getOperand(1))) {
SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
DAG.getAllOnesConstant(DL, VT));
return DAG.getNode(ISD::SUB, DL, VT, N1, Not);
// Hoist one-use subtraction by non-opaque constant:
// (x - C) + y -> (x + y) - C
// This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), N1);
return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
// Hoist one-use subtraction from non-opaque constant:
// (C - x) + y -> (y - x) + C
if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(0));
// If the target's bool is represented as 0/1, prefer to make this 'sub 0/1'
// rather than 'add 0/-1' (the zext should get folded).
// add (sext i1 Y), X --> sub X, (zext i1 Y)
if (N0.getOpcode() == ISD::SIGN_EXTEND &&
N0.getOperand(0).getScalarValueSizeInBits() == 1 &&
TLI.getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent) {
SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
// add X, (sextinreg Y i1) -> sub X, (and Y 1)
if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
// Operand 1 of SIGN_EXTEND_INREG carries the type being extended from.
VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
if (TN->getVT() == MVT::i1) {
SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
DAG.getConstant(1, DL, VT));
return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt);
// (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
// Only the sum result (result 0) of the ADDCARRY may be folded this way.
if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) &&
N1.getResNo() == 0)
return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(),
N0, N1.getOperand(0), N1.getOperand(2));
// (add X, Carry) -> (addcarry X, 0, Carry)
if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
if (SDValue Carry = getAsCarry(TLI, N1))
return DAG.getNode(ISD::ADDCARRY, DL,
DAG.getVTList(VT, Carry.getValueType()), N0,
DAG.getConstant(0, DL, VT), Carry);
return SDValue();
/// Combine an ISD::ADDC node (add producing a glue carry-out).
///
/// \returns the replacement value, or an empty SDValue if nothing changed.
SDValue DAGCombiner::visitADDC(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // If the flag result is dead, turn this into an ADD.
  if (!N->hasAnyUseOfValue(1))
    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // canonicalize constant to RHS.
  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N0C && !N1C)
    return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0);

  // fold (addc x, 0) -> x + no carry out
  if (isNullConstant(N1))
    return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
                                        DL, MVT::Glue));

  // If it cannot overflow, transform into an add.
  if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  return SDValue();
}
/// Unconditionally invert the boolean \p V by XOR-ing it with the constant
/// that represents "true" for V's type on this target.
///
/// \param V   Boolean value to invert.
/// \param DL  Debug location for the created nodes.
/// \param DAG DAG in which the nodes are created.
/// \param TLI Queried for the boolean representation of V's type.
/// \returns (xor V, Cst).
static SDValue flipBoolean(SDValue V, const SDLoc &DL,
                           SelectionDAG &DAG, const TargetLowering &TLI) {
  EVT VT = V.getValueType();

  SDValue Cst;
  switch (TLI.getBooleanContents(VT)) {
  case TargetLowering::ZeroOrOneBooleanContent:
  case TargetLowering::UndefinedBooleanContent:
    Cst = DAG.getConstant(1, DL, VT);
    // Without this break the 0/1 case would fall through and always end up
    // XOR-ing with all-ones.
    break;
  case TargetLowering::ZeroOrNegativeOneBooleanContent:
    Cst = DAG.getAllOnesConstant(DL, VT);
    break;
  }

  return DAG.getNode(ISD::XOR, DL, VT, V, Cst);
}
* Flips a boolean if it is cheaper to compute. If the Force parameters is set,
* then the flip also occurs if computing the inverse is the same cost.
* This function returns an empty SDValue in case it cannot flip the boolean
* without increasing the cost of the computation. If you want to flip a boolean
* no matter what, use flipBoolean.
static SDValue extractBooleanFlip(SDValue V, SelectionDAG &DAG,
const TargetLowering &TLI,
bool Force) {
if (Force && isa<ConstantSDNode>(V))
return flipBoolean(V, SDLoc(V), DAG, TLI);
if (V.getOpcode() != ISD::XOR)
return SDValue();
ConstantSDNode *Const = isConstOrConstSplat(V.getOperand(1), false);
if (!Const)
return SDValue();
EVT VT = V.getValueType();
bool IsFlip = false;
switch(TLI.getBooleanContents(VT)) {
case TargetLowering::ZeroOrOneBooleanContent:
IsFlip = Const->isOne();
case TargetLowering::ZeroOrNegativeOneBooleanContent:
IsFlip = Const->isAllOnesValue();
case TargetLowering::UndefinedBooleanContent:
IsFlip = (Const->getAPIntValue() & 0x01) == 1;
if (IsFlip)
return V.getOperand(0);
if (Force)
return flipBoolean(V, SDLoc(V), DAG, TLI);
return SDValue();
/// Combine an add-with-overflow node. The signed form is recognised by the
/// ISD::SADDO opcode; any other opcode reaching here is treated as unsigned.
///
/// \returns the replacement value, or an empty SDValue if nothing changed.
SDValue DAGCombiner::visitADDO(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  bool IsSigned = (ISD::SADDO == N->getOpcode());

  EVT CarryVT = N->getValueType(1);
  SDLoc DL(N);

  // If the flag result is dead, turn this into an ADD. The flag has no users,
  // so the value that replaces it is irrelevant.
  if (!N->hasAnyUseOfValue(1))
    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
                     DAG.getUNDEF(CarryVT));

  // canonicalize constant to RHS.
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);

  // fold (addo x, 0) -> x + no carry out
  if (isNullOrNullSplat(N1))
    return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));

  if (!IsSigned) {
    // If it cannot overflow, transform into an add.
    if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
      return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
                       DAG.getConstant(0, DL, CarryVT));

    // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry.
    if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) {
      SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(),
                                DAG.getConstant(0, DL, VT), N0.getOperand(0));
      return CombineTo(N, Sub,
                       flipBoolean(Sub.getValue(1), DL, DAG, TLI));
    }

    // Try the unsigned-only folds with both operand orders.
    if (SDValue Combined = visitUADDOLike(N0, N1, N))
      return Combined;

    if (SDValue Combined = visitUADDOLike(N1, N0, N))
      return Combined;
  }

  return SDValue();
}
/// Folds for an unsigned add-with-overflow of \p N0 and \p N1 that depend on
/// which operand is which. Scalar types only.
///
/// \param N Node being combined; supplies the debug location and result types.
/// \returns the replacement value, or an empty SDValue if no fold applies.
SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) {
  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
  // If Y + 1 cannot overflow.
  if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) {
    SDValue Y = N1.getOperand(0);
    SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType());
    if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never)
      return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, Y,
                         N1.getOperand(2));
  }

  // (uaddo X, Carry) -> (addcarry X, 0, Carry)
  if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
    if (SDValue Carry = getAsCarry(TLI, N1))
      return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0,
                         DAG.getConstant(0, SDLoc(N), VT), Carry);

  return SDValue();
}
/// Combine an ISD::ADDE node (add with carry-in and carry-out).
///
/// \returns the replacement value, or an empty SDValue if nothing changed.
SDValue DAGCombiner::visitADDE(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);

  // canonicalize constant to RHS
  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N0C && !N1C)
    return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(),
                       N1, N0, CarryIn);

  // fold (adde x, y, false) -> (addc x, y)
  if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
    return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N0, N1);

  return SDValue();
}
/// Combine an ISD::ADDCARRY node (operands: x, y, carry-in).
///
/// \returns the replacement value, or an empty SDValue if nothing changed.
SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);
  SDLoc DL(N);

  // canonicalize constant to RHS
  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N0C && !N1C)
    return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn);

  // fold (addcarry x, y, false) -> (uaddo x, y)
  if (isNullConstant(CarryIn)) {
    if (!LegalOperations ||
        TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0)))
      return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
  }

  // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
  if (isNullConstant(N0) && isNullConstant(N1)) {
    EVT VT = N0.getValueType();
    EVT CarryVT = CarryIn.getValueType();
    SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
    return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt,
                                    DAG.getConstant(1, DL, VT)),
                     DAG.getConstant(0, DL, CarryVT));
  }

  // Try the order-dependent folds with both operand orders.
  if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
    return Combined;

  if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N))
    return Combined;

  return SDValue();
}
* If we are facing some sort of diamond carry propapagtion pattern try to
* break it up to generate something like:
* (addcarry X, 0, (addcarry A, B, Z):Carry)
* The end result is usually an increase in operation required, but because the
* carry is now linearized, other tranforms can kick in and optimize the DAG.
* Patterns typically look something like
* (uaddo A, B)
* / \
* Carry Sum
* | \
* | (addcarry *, 0, Z)
* | /
* \ Carry
* | /
* (addcarry X, *, *)
* But numerous variation exist. Our goal is to identify A, B, X and Z and
* produce a combine with a single path for carry propagation.
static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
SDValue X, SDValue Carry0, SDValue Carry1,
SDNode *N) {
if (Carry1.getResNo() != 1 || Carry0.getResNo() != 1)
return SDValue();
if (Carry1.getOpcode() != ISD::UADDO)
return SDValue();
SDValue Z;
* First look for a suitable Z. It will present itself in the form of
* (addcarry Y, 0, Z) or its equivalent (uaddo Y, 1) for Z=true
if (Carry0.getOpcode() == ISD::ADDCARRY &&
isNullConstant(Carry0.getOperand(1))) {
Z = Carry0.getOperand(2);
} else if (Carry0.getOpcode() == ISD::UADDO &&
isOneConstant(Carry0.getOperand(1))) {
EVT VT = Combiner.getSetCCResultType(Carry0.getValueType());
Z = DAG.getConstant(1, SDLoc(Carry0.getOperand(1)), VT);
} else {
// We couldn't find a suitable Z.
return SDValue();
auto cancelDiamond = [&](SDValue A,SDValue B) {
SDLoc DL(N);
SDValue NewY = DAG.getNode(ISD::ADDCARRY, DL, Carry0->getVTList(), A, B, Z);
return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), X,
DAG.getConstant(0, DL, X.getValueType()),
* (uaddo A, B)
* |
* Sum
* |
* (addcarry *, 0, Z)
if (Carry0.getOperand(0) == Carry1.getValue(0)) {
return cancelDiamond(Carry1.getOperand(0), Carry1.getOperand(1));
* (addcarry A, 0, Z)
* |
* Sum
* |
* (uaddo *, B)
if (Carry1.getOperand(0) == Carry0.getValue(0)) {
return cancelDiamond(Carry0.getOperand(0), Carry1.getOperand(1));
if (Carry1.getOperand(1) == Carry0.getValue(0)) {
return cancelDiamond(Carry1.getOperand(0), Carry0.getOperand(0));
return SDValue();
// If we are facing some sort of diamond carry/borrow in/out pattern try to
// match patterns like:
//
//          (uaddo A, B)            CarryIn
//            |  \                     |
//            |   \                    |
//    PartialSum   PartialCarryOutX   /
//            |        |             /
//            |    ____|____________/
//            |   /    |
//     (uaddo *, *)    \________
//       |  \                   \
//       |   \                   |
//       |    PartialCarryOutY   |
//       |        \              |
//       |         \            /
//   AddCarrySum    |    ______/
//                  |   /
//   CarryOut = (or *, *)
//
// And generate ADDCARRY (or SUBCARRY) with two result values:
//
//    {AddCarrySum, CarryOut} = (addcarry A, B, CarryIn)
//
// Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with
// a single path for carry/borrow out propagation.
//
// N is the node merging the two partial carry flags. Returns the merged
// carry-out value, or an empty SDValue if the pattern does not match.
static SDValue combineCarryDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
                                   const TargetLowering &TLI, SDValue Carry0,
                                   SDValue Carry1, SDNode *N) {
  // Both inputs must be the overflow result (value #1) of the same kind of
  // unsigned overflow arithmetic node.
  if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1)
    return SDValue();
  unsigned Opcode = Carry0.getOpcode();
  if (Opcode != Carry1.getOpcode())
    return SDValue();
  if (Opcode != ISD::UADDO && Opcode != ISD::USUBO)
    return SDValue();

  // Canonicalize the add/sub of A and B as Carry0 and the add/sub of the
  // carry/borrow in as Carry1. (The top and middle uaddo nodes respectively in
  // the above ASCII art.)
  if (Carry1.getOperand(0) != Carry0.getValue(0) &&
      Carry1.getOperand(1) != Carry0.getValue(0))
    std::swap(Carry0, Carry1);
  if (Carry1.getOperand(0) != Carry0.getValue(0) &&
      Carry1.getOperand(1) != Carry0.getValue(0))
    return SDValue();

  // The carry in value must be on the righthand side for subtraction.
  unsigned CarryInOperandNum =
      Carry1.getOperand(0) == Carry0.getValue(0) ? 1 : 0;
  if (Opcode == ISD::USUBO && CarryInOperandNum != 1)
    return SDValue();
  SDValue CarryIn = Carry1.getOperand(CarryInOperandNum);

  unsigned NewOp = Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY;
  if (!TLI.isOperationLegalOrCustom(NewOp, Carry0.getValue(0).getValueType()))
    return SDValue();

  // Verify that the carry/borrow in is plausibly a carry/borrow bit.
  // TODO: make getAsCarry() aware of how partial carries are merged.
  if (CarryIn.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();
  CarryIn = CarryIn.getOperand(0);
  if (CarryIn.getValueType() != MVT::i1)
    return SDValue();

  SDLoc DL(N);
  SDValue Merged =
      DAG.getNode(NewOp, DL, Carry1->getVTList(), Carry0.getOperand(0),
                  Carry0.getOperand(1), CarryIn);

  // Please note that because we have proven that the result of the UADDO/USUBO
  // of A and B feeds into the UADDO/USUBO that does the carry/borrow in, we can
  // therefore prove that if the first UADDO/USUBO overflows, the second
  // UADDO/USUBO cannot. For example consider 8-bit numbers where 0xFF is the
  // maximum value.
  //
  //   0xFF + 0xFF == 0xFE with carry but 0xFE + 1 does not carry
  //   0x00 - 0xFF == 1 with a carry/borrow but 1 - 1 == 0 (no carry/borrow)
  //
  // This is important because it means that OR and XOR can be used to merge
  // carry flags; and that AND can return a constant zero.
  //
  // TODO: match other operations that can merge flags (ADD, etc)
  DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0));
  if (N->getOpcode() == ISD::AND)
    return DAG.getConstant(0, DL, MVT::i1);
  return Merged.getValue(1);
}
/// Folds for (addcarry N0, N1, CarryIn) that depend on which operand is
/// which.
///
/// \param N Node being combined; supplies the debug location and result types.
/// \returns the replacement value, or an empty SDValue if no fold applies.
SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
                                       SDNode *N) {
  // fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry.
  if (isBitwiseNot(N0))
    if (SDValue NotC = extractBooleanFlip(CarryIn, DAG, TLI, true)) {
      SDLoc DL(N);
      SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(), N1,
                                N0.getOperand(0), NotC);
      return CombineTo(N, Sub,
                       flipBoolean(Sub.getValue(1), DL, DAG, TLI));
    }

  // Iff the flag result is dead:
  // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry)
  // Don't do this if the Carry comes from the uaddo. It won't remove the uaddo
  // or the dependency between the instructions.
  if ((N0.getOpcode() == ISD::ADD ||
       (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0 &&
        N0.getValue(1) != CarryIn)) &&
      isNullConstant(N1) && !N->hasAnyUseOfValue(1))
    return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(),
                       N0.getOperand(0), N0.getOperand(1), CarryIn);

  /**
   * When one of the addcarry arguments is itself a carry, we may be facing
   * a diamond carry propagation. In which case we try to transform the DAG
   * to ensure linear carry propagation if that is possible.
   */
  if (auto Y = getAsCarry(TLI, N1)) {
    // Because both are carries, Y and Z can be swapped.
    if (auto R = combineADDCARRYDiamond(*this, DAG, N0, Y, CarryIn, N))
      return R;
    if (auto R = combineADDCARRYDiamond(*this, DAG, N0, CarryIn, Y, N))
      return R;
  }

  return SDValue();
}
// Since it may not be valid to emit a fold to zero for vector initializers
// check if we can before folding.
//
// Returns a zero constant of type VT, or an empty SDValue when VT is a vector
// type, LegalOperations is set and BUILD_VECTOR is not legal for VT.
static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
                             SelectionDAG &DAG, bool LegalOperations) {
  // Scalars can always be folded; vectors only when a BUILD_VECTOR of this
  // type may still be created.
  if (!VT.isVector() || !LegalOperations ||
      TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
    return DAG.getConstant(0, DL, VT);
  return SDValue();
}
/// Combine an ISD::SUB node (N0 - N1).
///
/// The folds are tried in order and the first one that applies wins.
///
/// \returns the replacement value, or an empty SDValue if nothing changed.
SDValue DAGCombiner::visitSUB(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N))
      return FoldedVOp;

    // fold (sub x, 0) -> x, vector edition
    if (ISD::isBuildVectorAllZeros(N1.getNode()))
      return N0;
  }

  // fold (sub x, x) -> 0
  // FIXME: Refactor this and xor and other similar operations together.
  if (N0 == N1)
    return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);

  // fold (sub c1, c2) -> c3
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1}))
    return C;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);

  // fold (sub x, c) -> (add x, -c)
  if (N1C) {
    return DAG.getNode(ISD::ADD, DL, VT, N0,
                       DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
  }

  if (isNullOrNullSplat(N0)) {
    unsigned BitWidth = VT.getScalarSizeInBits();
    // Right-shifting everything out but the sign bit followed by negation is
    // the same as flipping arithmetic/logical shift type without the negation:
    // -(X >>u 31) -> (X >>s 31)
    // -(X >>s 31) -> (X >>u 31)
    if (N1->getOpcode() == ISD::SRA || N1->getOpcode() == ISD::SRL) {
      ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1));
      if (ShiftAmt && ShiftAmt->getAPIntValue() == (BitWidth - 1)) {
        auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA;
        if (!LegalOperations || TLI.isOperationLegal(NewSh, VT))
          return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1));
      }
    }

    // 0 - X --> 0 if the sub is NUW.
    if (N->getFlags().hasNoUnsignedWrap())
      return N0;

    if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) {
      // N1 is either 0 or the minimum signed value. If the sub is NSW, then
      // N1 must be 0 because negating the minimum signed value is undefined.
      if (N->getFlags().hasNoSignedWrap())
        return N0;

      // 0 - X --> X if X is 0 or the minimum signed value.
      return N1;
    }
  }

  // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
  if (isAllOnesOrAllOnesSplat(N0))
    return DAG.getNode(ISD::XOR, DL, VT, N1, N0);

  // fold (A - (0-B)) -> A+B
  if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
    return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1));

  // fold A-(A-B) -> B
  if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0))
    return N1.getOperand(1);

  // fold (A+B)-A -> B
  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1)
    return N0.getOperand(1);

  // fold (A+B)-B -> A
  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
    return N0.getOperand(0);

  // fold (A+C1)-C2 -> A+(C1-C2)
  if (N0.getOpcode() == ISD::ADD &&
      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
      isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
    SDValue NewC =
        DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(1), N1});
    assert(NewC && "Constant folding failed");
    return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), NewC);
  }

  // fold C2-(A+C1) -> (C2-C1)-A
  if (N1.getOpcode() == ISD::ADD) {
    SDValue N11 = N1.getOperand(1);
    if (isConstantOrConstantVector(N0, /* NoOpaques */ true) &&
        isConstantOrConstantVector(N11, /* NoOpaques */ true)) {
      SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N11});
      assert(NewC && "Constant folding failed");
      return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0));
    }
  }

  // fold (A-C1)-C2 -> A-(C1+C2)
  if (N0.getOpcode() == ISD::SUB &&
      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
      isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
    SDValue NewC =
        DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0.getOperand(1), N1});
    assert(NewC && "Constant folding failed");
    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), NewC);
  }

  // fold (c1-A)-c2 -> (c1-c2)-A
  if (N0.getOpcode() == ISD::SUB &&
      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
      isConstantOrConstantVector(N0.getOperand(0), /* NoOpaques */ true)) {
    SDValue NewC =
        DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(0), N1});
    assert(NewC && "Constant folding failed");
    return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1));
  }

  // fold ((A+(B+or-C))-B) -> A+or-C
  if (N0.getOpcode() == ISD::ADD &&
      (N0.getOperand(1).getOpcode() == ISD::SUB ||
       N0.getOperand(1).getOpcode() == ISD::ADD) &&
      N0.getOperand(1).getOperand(0) == N1)
    return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0),
                       N0.getOperand(1).getOperand(1));

  // fold ((A+(C+B))-B) -> A+C
  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD &&
      N0.getOperand(1).getOperand(1) == N1)
    return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0),
                       N0.getOperand(1).getOperand(0));

  // fold ((A-(B-C))-C) -> A-B
  if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB &&
      N0.getOperand(1).getOperand(1) == N1)
    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
                       N0.getOperand(1).getOperand(0));

  // fold (A-(B-C)) -> A+(C-B)
  if (N1.getOpcode() == ISD::SUB && N1.hasOneUse())
    return DAG.getNode(ISD::ADD, DL, VT, N0,
                       DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1),
                                   N1.getOperand(0)));

  // A - (A & B)  ->  A & (~B)
  if (N1.getOpcode() == ISD::AND) {
    SDValue A = N1.getOperand(0);
    SDValue B = N1.getOperand(1);
    // AND is commutative: make A the operand that matches N0, if either does.
    if (A != N0)
      std::swap(A, B);
    if (A == N0 &&
        (N1.hasOneUse() || isConstantOrConstantVector(B, /*NoOpaques=*/true))) {
      SDValue InvB =
          DAG.getNode(ISD::XOR, DL, VT, B, DAG.getAllOnesConstant(DL, VT));
      return DAG.getNode(ISD::AND, DL, VT, A, InvB);
    }
  }

  // fold (X - (-Y * Z)) -> (X + (Y * Z))
  if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) {
    // The negation is the first multiplicand.
    if (N1.getOperand(0).getOpcode() == ISD::SUB &&
        isNullOrNullSplat(N1.getOperand(0).getOperand(0))) {
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
                                N1.getOperand(0).getOperand(1),
                                N1.getOperand(1));
      return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
    }
    // The negation is the second multiplicand.
    if (N1.getOperand(1).getOpcode() == ISD::SUB &&
        isNullOrNullSplat(N1.getOperand(1).getOperand(0))) {
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
                                N1.getOperand(0),
                                N1.getOperand(1).getOperand(1));
      return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
    }
  }

  // If either operand of a sub is undef, the result is undef
  if (N0.isUndef())
    return N0;
  if (N1.isUndef())
    return N1;

  if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
    return V;

  if (SDValue V = foldAddSubOfSignBit(N, DAG))
    return V;

  if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
    return V;

  // (x - y) - 1  ->  add (xor y, -1), x
  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && isOneOrOneSplat(N1)) {
    SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1),
                              DAG.getAllOnesConstant(DL, VT));
    return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
  }

  // Look for:
  //   sub y, (xor x, -1)
  // And if the target does not like this form then turn into:
  //   add (add x, y), 1
  if (TLI.preferIncOfAddToSubOfNot(VT) && N1.hasOneUse() && isBitwiseNot(N1)) {
    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(0));
    return DAG.getNode(ISD::ADD, DL, VT, Add, DAG.getConstant(1, DL, VT));
  }

  // Hoist one-use addition by non-opaque constant:
  //   (x + C) - y  ->  (x - y) + C
  if (N0.hasOneUse() && N0.getOpcode() == ISD::ADD &&
      isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
    return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(1));
  }

  // y - (x + C)  ->  (y - x) - C
  if (N1.hasOneUse() && N1.getOpcode() == ISD::ADD &&
      isConstantOrConstantVector(N1.getOperand(1), /*NoOpaques=*/true)) {
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, Sub, N1.getOperand(1));
  }

  // (x - C) - y  ->  (x - y) - C
  // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
      isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
    return DAG.getNode(ISD::SUB, DL, VT, Sub, N0.getOperand(1));
  }

  // (C - x) - y  ->  C - (x + y)
  if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
      isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), N1);
    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), Add);
  }

  // If the target's bool is represented as 0/-1, prefer to make this 'add 0/-1'
  // rather than 'sub 0/1' (the sext should get folded).
  // sub X, (zext i1 Y) --> add X, (sext i1 Y)
  if (N1.getOpcode() == ISD::ZERO_EXTEND &&
      N1.getOperand(0).getScalarValueSizeInBits() == 1 &&
      TLI.getBooleanContents(VT) ==
          TargetLowering::ZeroOrNegativeOneBooleanContent) {
    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N1.getOperand(0));
    return DAG.getNode(ISD::ADD, DL, VT, N0, SExt);
  }

  // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X)
  if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
    if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) {
      SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1);
      SDValue S0 = N1.getOperand(0);
      // The xor may list X and Y in either order.
      if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0)) {
        unsigned OpSizeInBits = VT.getScalarSizeInBits();
        if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
          if (C->getAPIntValue() == (OpSizeInBits - 1))
            return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0);
      }
    }
  }

  // If the relocation model supports it, consider symbol offsets.
  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
    if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
      // fold (sub Sym, c) -> Sym-c
      if (N1C && GA->getOpcode() == ISD::GlobalAddress)
        return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT,
                                    GA->getOffset() -
                                        (uint64_t)N1C->getSExtValue());
      // fold (sub Sym+c1, Sym+c2) -> c1-c2
      if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
        if (GA->getGlobal() == GB->getGlobal())
          return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(),
                                 DL, VT);
    }

  // sub X, (sextinreg Y i1) -> add X, (and Y 1)
  if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
    if (TN->getVT() == MVT::i1) {
      SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
                                 DAG.getConstant(1, DL, VT));
      return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt);
    }
  }

  // canonicalize (sub X, (vscale * C)) to (add X, (vscale * -C))
  if (N1.getOpcode() == ISD::VSCALE) {
    APInt IntVal = N1.getConstantOperandAPInt(0);
    return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal));
  }

  // Prefer an add for more folding potential and possibly better codegen:
  // sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1)
  if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) {
    SDValue ShAmt = N1.getOperand(1);
    ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
    if (ShAmtC &&
        ShAmtC->getAPIntValue() == (N1.getScalarValueSizeInBits() - 1)) {
      SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), ShAmt);
      return DAG.getNode(ISD::ADD, DL, VT, N0, SRA);
    }
  }

  if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) {
    // (sub Carry, X)  ->  (addcarry (sub 0, X), 0, Carry)
    if (SDValue Carry = getAsCarry(TLI, N0)) {
      SDValue X = N1;
      SDValue Zero = DAG.getConstant(0, DL, VT);
      SDValue NegX = DAG.getNode(ISD::SUB, DL, VT, Zero, X);
      return DAG.getNode(ISD::ADDCARRY, DL,
                         DAG.getVTList(VT, Carry.getValueType()), NegX, Zero,
                         Carry);
    }
  }

  return SDValue();
}
/// Combine a saturating-subtract node. The node's own opcode is reused for
/// constant folding, so this does not hard-code which saturating variant it
/// handles.
///
/// \returns the replacement value, or an empty SDValue if nothing changed.
SDValue DAGCombiner::visitSUBSAT(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // fold vector ops
  if (VT.isVector()) {
    // TODO SimplifyVBinOp

    // fold (sub_sat x, 0) -> x, vector edition
    if (ISD::isBuildVectorAllZeros(N1.getNode()))
      return N0;
  }

  // fold (sub_sat x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  // fold (sub_sat x, x) -> 0
  if (N0 == N1)
    return DAG.getConstant(0, DL, VT);

  // fold (sub_sat c1, c2) -> c3
  if (SDValue C = DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1}))
    return C;

  // fold (sub_sat x, 0) -> x
  if (isNullConstant(N1))
    return N0;

  return SDValue();
}
/// Combine an ISD::SUBC node (subtract producing a glue borrow-out).
///
/// \returns the replacement value, or an empty SDValue if nothing changed.
SDValue DAGCombiner::visitSUBC(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // If the flag result is dead, turn this into an SUB.
  if (!N->hasAnyUseOfValue(1))
    return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // fold (subc x, x) -> 0 + no borrow
  if (N0 == N1)
    return CombineTo(N, DAG.getConstant(0, DL, VT),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // fold (subc x, 0) -> x + no borrow
  if (isNullConstant(N1))
    return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) + no borrow
  if (isAllOnesConstant(N0))
    return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  return SDValue();
}
/// Combine a subtract-with-overflow node. The signed form is recognised by
/// the ISD::SSUBO opcode; any other opcode reaching here is treated as
/// unsigned.
///
/// \returns the replacement value, or an empty SDValue if nothing changed.
SDValue DAGCombiner::visitSUBO(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  bool IsSigned = (ISD::SSUBO == N->getOpcode());

  EVT CarryVT = N->getValueType(1);
  SDLoc DL(N);

  // If the flag result is dead, turn this into an SUB. The flag has no users,
  // so the value that replaces it is irrelevant.
  if (!N->hasAnyUseOfValue(1))
    return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
                     DAG.getUNDEF(CarryVT));

  // fold (subo x, x) -> 0 + no borrow
  if (N0 == N1)
    return CombineTo(N, DAG.getConstant(0, DL, VT),
                     DAG.getConstant(0, DL, CarryVT));

  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);

  // fold (ssubo x, c) -> (saddo x, -c)
  // The minimum signed value is excluded because it has no representable
  // negation.
  if (IsSigned && N1C && !N1C->getAPIntValue().isMinSignedValue()) {
    return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0,
                       DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
  }

  // fold (subo x, 0) -> x + no borrow
  if (isNullOrNullSplat(N1))
    return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));

  // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow
  if (!IsSigned && isAllOnesOrAllOnesSplat(N0))
    return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
                     DAG.getConstant(0, DL, CarryVT));

  return SDValue();
}
/// Combine an ISD::SUBE node (subtract with borrow-in and borrow-out).
///
/// \returns the replacement value, or an empty SDValue if nothing changed.
SDValue DAGCombiner::visitSUBE(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);

  // fold (sube x, y, false) -> (subc x, y)
  if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
    return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), N0, N1);

  return SDValue();
}
/// Combine an ISD::SUBCARRY node (operands: x, y, borrow-in).
///
/// \returns the replacement value, or an empty SDValue if nothing changed.
SDValue DAGCombiner::visitSUBCARRY(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);

  // fold (subcarry x, y, false) -> (usubo x, y)
  if (isNullConstant(CarryIn)) {
    if (!LegalOperations ||
        TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0)))
      return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1);
  }

  return SDValue();
}
// Notice that "mulfix" can be any of SMULFIX, SMULFIXSAT, UMULFIX and
// UMULFIXSAT here.
//
// Operands are (x, y, scale). Returns the replacement value, or an empty
// SDValue if nothing changed.
SDValue DAGCombiner::visitMULFIX(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Scale = N->getOperand(2);
  EVT VT = N0.getValueType();

  // fold (mulfix x, undef, scale) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, SDLoc(N), VT);

  // Canonicalize constant to RHS (vector doesn't have to splat)
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0, Scale);

  // fold (mulfix x, 0, scale) -> 0
  if (isNullConstant(N1))
    return DAG.getConstant(0, SDLoc(N), VT);

  return SDValue();
}
SDValue DAGCombiner::visitMUL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
// fold (mul x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
bool N1IsConst = false;
bool N1IsOpaqueConst = false;
APInt ConstValue1;
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
assert((!N1IsConst ||
ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&
"Splat APInt should be element width");
} else {
N1IsConst = isa<ConstantSDNode>(N1);
if (N1IsConst) {
ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue();
N1IsOpaqueConst = cast<ConstantSDNode>(N1)->isOpaque();
// fold (mul c1, c2) -> c1*c2
if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, {N0, N1}))
return C;
// canonicalize constant to RHS (vector doesn't have to splat)
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0);
// fold (mul x, 0) -> 0
if (N1IsConst && ConstValue1.isNullValue())
return N1;
// fold (mul x, 1) -> x
if (N1IsConst && ConstValue1.isOneValue())
return N0;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// fold (mul x, -1) -> 0-x
if (N1IsConst && ConstValue1.isAllOnesValue()) {
SDLoc DL(N);
return DAG.getNode(ISD::SUB, DL, VT,
DAG.getConstant(0, DL, VT), N0);
// fold (mul x, (1 << c)) -> x << c
if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
DAG.isKnownToBeAPowerOfTwo(N1) &&
(!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
SDLoc DL(N);
SDValue LogBase2 = BuildLogBase2(N1, DL);
EVT ShiftVT = getShiftAmountTy(N0.getValueType());
SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
// fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) {
unsigned Log2Val = (-ConstValue1).logBase2();
SDLoc DL(N);
// FIXME: If the input is something that is easily negated (e.g. a
// single-use add), we should put the negate there.
return DAG.getNode(ISD::SUB, DL, VT,
DAG.getConstant(0, DL, VT),
DAG.getNode(ISD::SHL, DL, VT, N0,
DAG.getConstant(Log2Val, DL,
// Try to transform multiply-by-(power-of-2 +/- 1) into shift and add/sub.
// mul x, (2^N + 1) --> add (shl x, N), x
// mul x, (2^N - 1) --> sub (shl x, N), x
// Examples: x * 33 --> (x << 5) + x
// x * 15 --> (x << 4) - x
// x * -33 --> -((x << 5) + x)
// x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
// TODO: We could handle more general decomposition of any constant by
// having the target set a limit on number of ops and making a
// callback to determine that sequence (similar to sqrt expansion).
unsigned MathOp = ISD::DELETED_NODE;
APInt MulC = ConstValue1.abs();
if ((MulC - 1).isPowerOf2())
MathOp = ISD::ADD;
else if ((MulC + 1).isPowerOf2())
MathOp = ISD::SUB;
if (MathOp != ISD::DELETED_NODE) {
unsigned ShAmt =
MathOp == ISD::ADD ? (MulC - 1).logBase2() : (MulC + 1).logBase2();
assert(ShAmt < VT.getScalarSizeInBits() &&
"multiply-by-constant generated out of bounds shift");
SDLoc DL(N);
SDValue Shl =
DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShAmt, DL, VT));
SDValue R = DAG.getNode(MathOp, DL, VT, Shl, N0);
if (ConstValue1.isNegative())
R = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), R);
return R;
// (mul (shl X, c1), c2) -> (mul X, c2 << c1)
if (N0.getOpcode() == ISD::SHL &&
isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1));
if (isConstantOrConstantVector(C3))
return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3);
// Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
// use.
SDValue Sh(nullptr, 0), Y(nullptr, 0);
// Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
if (N0.getOpcode() == ISD::SHL &&
isConstantOrConstantVector(N0.getOperand(1)) &&
N0.getNode()->hasOneUse()) {
Sh = N0; Y = N1;
} else if (N1.getOpcode() == ISD::SHL &&
isConstantOrConstantVector(N1.getOperand(1)) &&
N1.getNode()->hasOneUse()) {
Sh = N1; Y = N0;
if (Sh.getNode()) {
SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y);
return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1));
// fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
N0.getOpcode() == ISD::ADD &&
DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
isMulAddWithConstProfitable(N, N0, N1))
return DAG.getNode(ISD::ADD, SDLoc(N), VT,
DAG.getNode(ISD::MUL, SDLoc(N0), VT,
N0.getOperand(0), N1),
DAG.getNode(ISD::MUL, SDLoc(N1), VT,
N0.getOperand(1), N1));
// Fold (mul (vscale * C0), C1) to (vscale * (C0 * C1)).
if (N0.getOpcode() == ISD::VSCALE)
if (ConstantSDNode *NC1 = isConstOrConstSplat(N1)) {
APInt C0 = N0.getConstantOperandAPInt(0);
APInt C1 = NC1->getAPIntValue();
return DAG.getVScale(SDLoc(N), VT, C0 * C1);
// reassociate mul
if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))
return RMUL;
return SDValue();
/// Return true if a combined divide/remainder libcall is available.
///
/// \param Node     node whose first result type selects the libcall.
/// \param isSigned picks the SDIVREM_* entry when true, UDIVREM_* otherwise.
/// \param TLI      target lowering queried for the libcall name.
/// \returns false when the type is not a simple type or is not one of
/// i8/i16/i32/i64/i128; otherwise whether TLI reports a non-null name for
/// the selected libcall.
static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
const TargetLowering &TLI) {
RTLIB::Libcall LC;
EVT NodeType = Node->getValueType(0);
// Only simple (MVT) types can be switched on below.
if (!NodeType.isSimple())
return false;
switch (NodeType.getSimpleVT().SimpleTy) {
default: return false; // No libcall for vector types.
case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
// A null name means the target has no such libcall.
return TLI.getLibcallName(LC) != nullptr;
/// Issue divrem if both quotient and remainder are needed.
///
/// \param Node a div or rem node (SDIV/SREM are treated as signed, anything
/// else as unsigned).
/// \returns the combined DIVREM value (result 0) if one was created or
/// reused while walking the users of the dividend, otherwise an empty
/// SDValue. Matching div users are replaced with result 0 and matching rem
/// users with result 1 of the combined node.
SDValue DAGCombiner::useDivRem(SDNode *Node) {
if (Node->use_empty())
return SDValue(); // This is a dead node, leave it alone.
unsigned Opcode = Node->getOpcode();
bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM);
unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
// DivMod lib calls can still work on non-legal types if using lib-calls.
EVT VT = Node->getValueType(0);
// Only scalar integer types are handled.
if (VT.isVector() || !VT.isInteger())
return SDValue();
// An illegal type is only acceptable when the target custom-lowers DIVREM.
if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
return SDValue();
// If DIVREM is going to get expanded into a libcall,
// but there is no libcall available, then don't combine.
if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) &&
!isDivRemLibcallAvailable(Node, isSigned, TLI))
return SDValue();
// If div is legal, it's better to do the normal expansion
// OtherOpcode is the complementary opcode: rem for a div node, div for a
// rem node. In both branches it is the div opcode whose legality is tested.
unsigned OtherOpcode = 0;
if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) {
OtherOpcode = isSigned ? ISD::SREM : ISD::UREM;
if (TLI.isOperationLegalOrCustom(Opcode, VT))
return SDValue();
} else {
OtherOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
if (TLI.isOperationLegalOrCustom(OtherOpcode, VT))
return SDValue();
SDValue Op0 = Node->getOperand(0);
SDValue Op1 = Node->getOperand(1);
// Stays empty until a matching complementary or DIVREM user is found.
SDValue combined;
// Walk every user of the dividend looking for nodes with identical operands.
for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
// NOTE(review): the remainder of this condition and the statement it
// guards are not present in this copy of the file; confirm against the
// upstream source.
if (User == Node || User->getOpcode() == ISD::DELETED_NODE ||
// Convert the other matching node(s), too;
// otherwise, the DIVREM may get target-legalized into something
// target-specific that we won't be able to recognize.
unsigned UserOpc = User->getOpcode();
if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) &&
User->getOperand(0) == Op0 &&
User->getOperand(1) == Op1) {
if (!combined) {
if (UserOpc == OtherOpcode) {
// First complementary user seen: build the two-result DIVREM node.
SDVTList VTs = DAG.getVTList(VT, VT);
combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1);
} else if (UserOpc == DivRemOpc) {
// An equivalent DIVREM already exists; reuse it.
combined = SDValue(User, 0);
} else {
assert(UserOpc == Opcode);
// Redirect div users to result 0 and rem users to result 1.
if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV)
CombineTo(User, combined);
else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM)
CombineTo(User, combined.getValue(1));
return combined;
/// Apply the operand-only simplifications shared by the division and
/// remainder combines (undef operands, zero dividend, identical operands,
/// divisor of one).
///
/// \param N   the div/rem node; SDIV and UDIV are treated as division, any
/// other opcode as remainder.
/// \param DAG the DAG used to build replacement constants.
/// \returns the simplified value, or an empty SDValue if nothing applied.
static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
unsigned Opc = N->getOpcode();
bool IsDiv = (ISD::SDIV == Opc) || (ISD::UDIV == Opc);
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// X / undef -> undef
// X % undef -> undef
// X / 0 -> undef
// X % 0 -> undef
// NOTE: This includes vectors where any divisor element is zero/undef.
if (DAG.isUndef(Opc, {N0, N1}))
return DAG.getUNDEF(VT);
// undef / X -> 0
// undef % X -> 0
if (N0.isUndef())
return DAG.getConstant(0, DL, VT);
// 0 / X -> 0
// 0 % X -> 0
ConstantSDNode *N0C = isConstOrConstSplat(N0);
if (N0C && N0C->isNullValue())
return N0;
// X / X -> 1
// X % X -> 0
if (N0 == N1)
return DAG.getConstant(IsDiv ? 1 : 0, DL, VT);
// X / 1 -> X
// X % 1 -> 0
// If this is a boolean op (single-bit element type), we can't have
// division-by-zero or remainder-by-zero, so assume the divisor is 1.
// TODO: Similarly, if we're zero-extending a boolean divisor, then assume
// it's a 1.
if ((N1C && N1C->isOne()) || (VT.getScalarType() == MVT::i1))
return IsDiv ? N0 : DAG.getConstant(0, DL, VT);
return SDValue();
/// Combine a signed division node.
///
/// Tries, in order: vector binop simplification, constant folding, the
/// special divisors -1 and MIN_SIGNED, the shared div/rem simplifications,
/// folding into a select, strength reduction to UDIV, the visitSDIVLike
/// expansions, and finally merging with a matching remainder into DIVREM.
///
/// \returns the replacement value, or an empty SDValue if no fold applied.
SDValue DAGCombiner::visitSDIV(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT CCVT = getSetCCResultType(VT);
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
SDLoc DL(N);
// fold (sdiv c1, c2) -> c1/c2
ConstantSDNode *N1C = isConstOrConstSplat(N1);
if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1}))
return C;
// fold (sdiv X, -1) -> 0-X
if (N1C && N1C->isAllOnesValue())
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);
// fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)
if (N1C && N1C->getAPIntValue().isMinSignedValue())
return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
DAG.getConstant(1, DL, VT),
DAG.getConstant(0, DL, VT));
// Simplifications shared with the other div/rem combines.
if (SDValue V = simplifyDivRem(N, DAG))
return V;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// If we know the sign bits of both operands are zero, strength reduce to a
// udiv instead. Handles (X&15) /s 4 -> X&15 >> 2
if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1);
if (SDValue V = visitSDIVLike(N0, N1, N)) {
// If the corresponding remainder node exists, update its users with
// (Dividend - (Quotient * Divisor)).
if (SDNode *RemNode = DAG.getNodeIfExists(ISD::SREM, N->getVTList(),
{ N0, N1 })) {
SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
CombineTo(RemNode, Sub);
return V;
// sdiv, srem -> sdivrem
// If the divisor is constant, then return DIVREM only if isIntDivCheap() is
// true. Otherwise, we break the simplification logic in visitREM().
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue DivRem = useDivRem(N))
return DivRem;
return SDValue();
/// Build a cheaper sequence computing the signed quotient N0 / N1.
///
/// Used by the signed division combine and, for signed remainders, by
/// visitREM. N supplies the result type, the node flags and the debug
/// location.
///
/// \returns the replacement quotient, or an empty SDValue if neither the
/// power-of-two expansion nor BuildSDIV produced one.
SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
EVT CCVT = getSetCCResultType(VT);
unsigned BitWidth = VT.getScalarSizeInBits();
// Helper for determining whether a value is a power-2 constant scalar or a
// vector of such elements.
// Zero and opaque constants are rejected; negated powers of two are accepted.
auto IsPowerOfTwo = [](ConstantSDNode *C) {
if (C->isNullValue() || C->isOpaque())
return false;
if (C->getAPIntValue().isPowerOf2())
return true;
if ((-C->getAPIntValue()).isPowerOf2())
return true;
return false;
// fold (sdiv X, pow2) -> simple ops after legalize
// FIXME: We check for the exact bit here because the generic lowering gives
// better results in that case. The target-specific lowering should learn how
// to handle exact sdivs efficiently.
if (!N->getFlags().hasExact() && ISD::matchUnaryPredicate(N1, IsPowerOfTwo)) {
// Target-specific implementation of sdiv x, pow2.
if (SDValue Res = BuildSDIVPow2(N))
return Res;
// Create constants that are functions of the shift amount value.
// C1 = cttz(N1) is the shift amount; Inexact = BitWidth - C1.
EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
SDValue Bits = DAG.getConstant(BitWidth, DL, ShiftAmtTy);
SDValue C1 = DAG.getNode(ISD::CTTZ, DL, VT, N1);
C1 = DAG.getZExtOrTrunc(C1, DL, ShiftAmtTy);
SDValue Inexact = DAG.getNode(ISD::SUB, DL, ShiftAmtTy, Bits, C1);
// Give up unless the shift amount came out as a constant.
if (!isConstantOrConstantVector(Inexact))
return SDValue();
// Splat the sign bit into the register
SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, N0,
DAG.getConstant(BitWidth - 1, DL, ShiftAmtTy));
// Add (N0 < 0) ? abs2 - 1 : 0;
SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, Sign, Inexact);
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Srl);
SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Add, C1);
// Special case: (sdiv X, 1) -> X
// Special Case: (sdiv X, -1) -> 0-X
// For both divisors N0 is selected here; the negation for -1 is supplied
// by the IsNeg select below.
SDValue One = DAG.getConstant(1, DL, VT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
SDValue IsOne = DAG.getSetCC(DL, CCVT, N1, One, ISD::SETEQ);
SDValue IsAllOnes = DAG.getSetCC(DL, CCVT, N1, AllOnes, ISD::SETEQ);
SDValue IsOneOrAllOnes = DAG.getNode(ISD::OR, DL, CCVT, IsOne, IsAllOnes);
Sra = DAG.getSelect(DL, VT, IsOneOrAllOnes, N0, Sra);
// If dividing by a positive value, we're done. Otherwise, the result must
// be negated.
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, Zero, Sra);
// FIXME: Use SELECT_CC once we improve SELECT_CC constant-folding.
SDValue IsNeg = DAG.getSetCC(DL, CCVT, N1, Zero, ISD::SETLT);
SDValue Res = DAG.getSelect(DL, VT, IsNeg, Sub, Sra);
return Res;
// If integer divide is expensive and we satisfy the requirements, emit an
// alternate sequence. Targets may check function attributes for size/speed
// trade-offs.
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isConstantOrConstantVector(N1) &&
!TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue Op = BuildSDIV(N))
return Op;
return SDValue();
/// Combine an unsigned division node.
///
/// Tries, in order: vector binop simplification, constant folding, the
/// all-ones divisor, the shared div/rem simplifications, folding into a
/// select, the visitUDIVLike expansions, and finally merging with a matching
/// remainder into DIVREM.
///
/// \returns the replacement value, or an empty SDValue if no fold applied.
SDValue DAGCombiner::visitUDIV(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT CCVT = getSetCCResultType(VT);
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
SDLoc DL(N);
// fold (udiv c1, c2) -> c1/c2
ConstantSDNode *N1C = isConstOrConstSplat(N1);
if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1}))
return C;
// fold (udiv X, -1) -> select(X == -1, 1, 0)
if (N1C && N1C->getAPIntValue().isAllOnesValue())
return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
DAG.getConstant(1, DL, VT),
DAG.getConstant(0, DL, VT));
// Simplifications shared with the other div/rem combines.
if (SDValue V = simplifyDivRem(N, DAG))
return V;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
if (SDValue V = visitUDIVLike(N0, N1, N)) {
// If the corresponding remainder node exists, update its users with
// (Dividend - (Quotient * Divisor)).
if (SDNode *RemNode = DAG.getNodeIfExists(ISD::UREM, N->getVTList(),
{ N0, N1 })) {
SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
CombineTo(RemNode, Sub);
return V;
// udiv, urem -> udivrem
// If the divisor is constant, then return DIVREM only if isIntDivCheap() is
// true. Otherwise, we break the simplification logic in visitREM().
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue DivRem = useDivRem(N))
return DivRem;
return SDValue();
/// Build a cheaper sequence computing the unsigned quotient N0 / N1.
///
/// Used by the unsigned division combine and, for unsigned remainders, by
/// visitREM. N supplies the result type and the debug location.
///
/// \returns a logical right shift when the divisor is a known power of two
/// (either directly or as a shifted power-of-two constant), the result of
/// BuildUDIV for other constant divisors when division is not cheap, or an
/// empty SDValue otherwise.
SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
// fold (udiv x, (1 << c)) -> x >>u c
if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
DAG.isKnownToBeAPowerOfTwo(N1)) {
SDValue LogBase2 = BuildLogBase2(N1, DL);
// The shift amount must be expressed in the target's shift-amount type.
EVT ShiftVT = getShiftAmountTy(N0.getValueType());
SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
// fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
if (N1.getOpcode() == ISD::SHL) {
SDValue N10 = N1.getOperand(0);
if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) &&
DAG.isKnownToBeAPowerOfTwo(N10)) {
SDValue LogBase2 = BuildLogBase2(N10, DL);
// The add is done in the type of the existing shift amount y.
EVT ADDVT = N1.getOperand(1).getValueType();
SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT);
SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc);
return DAG.getNode(ISD::SRL, DL, VT, N0, Add);
// fold (udiv x, c) -> alternate
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isConstantOrConstantVector(N1) &&
!TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue Op = BuildUDIV(N))
return Op;
return SDValue();
/// Combine a remainder node; handles both ISD::SREM and ISD::UREM.
///
/// Tries constant folding, the unsigned all-ones divisor, the shared div/rem
/// simplifications, folding into a select, sign-based strength reduction
/// (signed) or power-of-two masking (unsigned), rewriting as X - (X/C)*C via
/// the division helpers, and finally merging with a matching division into
/// DIVREM.
///
/// \returns the replacement value, or an empty SDValue if no fold applied.
SDValue DAGCombiner::visitREM(SDNode *N) {
unsigned Opcode = N->getOpcode();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT CCVT = getSetCCResultType(VT);
bool isSigned = (Opcode == ISD::SREM);
SDLoc DL(N);
// fold (rem c1, c2) -> c1%c2
ConstantSDNode *N1C = isConstOrConstSplat(N1);
if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
return C;
// fold (urem X, -1) -> select(X == -1, 0, x)
if (!isSigned && N1C && N1C->getAPIntValue().isAllOnesValue())
return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
DAG.getConstant(0, DL, VT), N0);
// Simplifications shared with the division combines.
if (SDValue V = simplifyDivRem(N, DAG))
return V;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
if (isSigned) {
// If we know the sign bits of both operands are zero, strength reduce to a
// urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15
if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
} else {
// Adding all-ones is the same as subtracting one: it forms the pow2-1 mask.
SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
if (DAG.isKnownToBeAPowerOfTwo(N1)) {
// fold (urem x, pow2) -> (and x, pow2-1)
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
return DAG.getNode(ISD::AND, DL, VT, N0, Add);
if (N1.getOpcode() == ISD::SHL &&
DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
// fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
return DAG.getNode(ISD::AND, DL, VT, N0, Add);
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
// If X/C can be simplified by the division-by-constant logic, lower
// X%C to the equivalent of X-X/C*C.
// Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
// speculative DIV must not cause a DIVREM conversion. We guard against this
// by skipping the simplification if isIntDivCheap(). When div is not cheap,
// combine will not return a DIVREM. Regardless, checking cheapness here
// makes sense since the simplification results in fatter code.
if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) {
SDValue OptimizedDiv =
isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N);
if (OptimizedDiv.getNode()) {
// If the equivalent Div node also exists, update its users.
unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
if (SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(),
{ N0, N1 }))
CombineTo(DivNode, OptimizedDiv);
SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1);
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
return Sub;
// sdiv, srem -> sdivrem (and udiv, urem -> udivrem)
// The remainder is result 1 of the combined node.
if (SDValue DivRem = useDivRem(N))
return DivRem.getValue(1);
return SDValue();
/// Combine a signed multiply-high node.
///
/// Folds multiplication by zero, one or undef, and otherwise tries to
/// express the operation as a multiply in a type twice as wide followed by
/// a shift and truncate.
///
/// \returns the replacement value, or an empty SDValue if no fold applied.
SDValue DAGCombiner::visitMULHS(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
if (VT.isVector()) {
// fold (mulhs x, 0) -> 0
// do not return N0/N1, because undef node may exist.
// NOTE(review): the second half of this condition is not present in this
// copy of the file; confirm against the upstream source.
if (ISD::isBuildVectorAllZeros(N0.getNode()) ||
return DAG.getConstant(0, DL, VT);
// fold (mulhs x, 0) -> 0
if (isNullConstant(N1))
return N1;
// fold (mulhs x, 1) -> (sra x, size(x)-1)
if (isOneConstant(N1))
return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
DAG.getConstant(N0.getScalarValueSizeInBits() - 1, DL,
// fold (mulhs x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, DL, VT);
// If the type twice as wide is legal, transform the mulhs to a wider multiply
// plus a shift.
if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {
MVT Simple = VT.getSimpleVT();
unsigned SimpleSize = Simple.getSizeInBits();
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
// Sign-extend both operands, multiply wide, then shift the high half
// down and truncate back to VT.
N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
DAG.getConstant(SimpleSize, DL,
return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
return SDValue();
/// Combine an unsigned multiply-high node.
///
/// Folds multiplication by zero, one, undef or a power of two, and otherwise
/// tries to express the operation as a multiply in a type twice as wide
/// followed by a shift and truncate.
///
/// \returns the replacement value, or an empty SDValue if no fold applied.
SDValue DAGCombiner::visitMULHU(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
if (VT.isVector()) {
// fold (mulhu x, 0) -> 0
// do not return N0/N1, because undef node may exist.
// NOTE(review): the second half of this condition is not present in this
// copy of the file; confirm against the upstream source.
if (ISD::isBuildVectorAllZeros(N0.getNode()) ||
return DAG.getConstant(0, DL, VT);
// fold (mulhu x, 0) -> 0
if (isNullConstant(N1))
return N1;
// fold (mulhu x, 1) -> 0
if (isOneConstant(N1))
return DAG.getConstant(0, DL, N0.getValueType());
// fold (mulhu x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, DL, VT);
// fold (mulhu x, (1 << c)) -> x >> (bitwidth - c)
if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
DAG.isKnownToBeAPowerOfTwo(N1) && hasOperation(ISD::SRL, VT)) {
unsigned NumEltBits = VT.getScalarSizeInBits();
SDValue LogBase2 = BuildLogBase2(N1, DL);
// The subtraction is done in VT and then converted to the shift-amount type.
SDValue SRLAmt = DAG.getNode(
ISD::SUB, DL, VT, DAG.getConstant(NumEltBits, DL, VT), LogBase2);
EVT ShiftVT = getShiftAmountTy(N0.getValueType());
SDValue Trunc = DAG.getZExtOrTrunc(SRLAmt, DL, ShiftVT);
return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
// If the type twice as wide is legal, transform the mulhu to a wider multiply
// plus a shift.
if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {
MVT Simple = VT.getSimpleVT();
unsigned SimpleSize = Simple.getSizeInBits();
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
// Zero-extend both operands, multiply wide, then shift the high half
// down and truncate back to VT.
N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
DAG.getConstant(SimpleSize, DL,
return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
return SDValue();
/// Perform optimizations common to nodes that compute two values. LoOp and HiOp
/// give the opcodes for the two computations that are being performed.
///
/// \param N    the two-result node.
/// \param LoOp opcode that computes result 0 on its own from N's operands.
/// \param HiOp opcode that computes result 1 on its own from N's operands.
/// \returns the value produced by CombineTo if a simplification was made,
/// otherwise an empty SDValue.
SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
unsigned HiOp) {
// If the high half is not needed, just compute the low half.
bool HiExists = N->hasAnyUseOfValue(1);
if (!HiExists && (!LegalOperations ||
TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
// The same value is supplied for both results; result 1 has no uses here.
return CombineTo(N, Res, Res);
// If the low half is not needed, just compute the high half.
bool LoExists = N->hasAnyUseOfValue(0);
if (!LoExists && (!LegalOperations ||
TLI.isOperationLegalOrCustom(HiOp, N->getValueType(1)))) {
SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
// The same value is supplied for both results; result 0 has no uses here.
return CombineTo(N, Res, Res);
// If both halves are used, return as it is.
if (LoExists && HiExists)
return SDValue();
// If the two computed results can be simplified separately, separate them.
// Only reached when exactly one half is used but the standalone opcode was
// rejected above; accept it only if combining yields a different, legal node.
if (LoExists) {
SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
SDValue LoOpt = combine(Lo.getNode());
if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
(!LegalOperations ||
TLI.isOperationLegalOrCustom(LoOpt.getOpcode(), LoOpt.getValueType())))
return CombineTo(N, LoOpt, LoOpt);
if (HiExists) {
SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
SDValue HiOpt = combine(Hi.getNode());
if (HiOpt.getNode() && HiOpt != Hi &&
(!LegalOperations ||
TLI.isOperationLegalOrCustom(HiOpt.getOpcode(), HiOpt.getValueType())))
return CombineTo(N, HiOpt, HiOpt);
return SDValue();
/// Combine a signed multiply node that produces both the low and the high
/// half of the product.
///
/// First defers to SimplifyNodeWithTwoResults with MUL/MULHS; otherwise
/// tries a single multiply in a type twice as wide.
///
/// \returns the replacement value, or an empty SDValue if no fold applied.
SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
return Res;
EVT VT = N->getValueType(0);
SDLoc DL(N);
// If the type twice as wide is legal, transform the smul_lohi to a wider
// multiply plus a shift.
if (VT.isSimple() && !VT.isVector()) {
MVT Simple = VT.getSimpleVT();
unsigned SimpleSize = Simple.getSizeInBits();
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
// Lo and Hi first hold the two sign-extended operands; after the multiply
// Lo holds the full wide product.
SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(0));
SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(1));
Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
// Compute the high part (result 1): shift the wide product down by
// SimpleSize bits and truncate.
Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
DAG.getConstant(SimpleSize, DL,
Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
// Compute the low part (result 0): truncate the wide product.
Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
return CombineTo(N, Lo, Hi);
return SDValue();
/// Combine an unsigned multiply node that produces both the low and the high
/// half of the product.
///
/// First defers to SimplifyNodeWithTwoResults with MUL/MULHU, then folds
/// multiplication by a constant zero or one; otherwise tries a single
/// multiply in a type twice as wide.
///
/// \returns the replacement value, or an empty SDValue if no fold applied.
SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
return Res;
EVT VT = N->getValueType(0);
SDLoc DL(N);
// (umul_lohi N0, 0) -> (0, 0)
if (isNullConstant(N->getOperand(1))) {
SDValue Zero = DAG.getConstant(0, DL, VT);
return CombineTo(N, Zero, Zero);
// (umul_lohi N0, 1) -> (N0, 0)
if (isOneConstant(N->getOperand(1))) {
SDValue Zero = DAG.getConstant(0, DL, VT);
return CombineTo(N, N->getOperand(0), Zero);
// If the type twice as wide is legal, transform the umul_lohi to a wider
// multiply plus a shift.
if (VT.isSimple() && !VT.isVector()) {
MVT Simple = VT.getSimpleVT();
unsigned SimpleSize = Simple.getSizeInBits();
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
// Lo and Hi first hold the two zero-extended operands; after the multiply
// Lo holds the full wide product.
SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(0));
SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(1));
Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
// Compute the high part (result 1): shift the wide product down by
// SimpleSize bits and truncate.
Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
DAG.getConstant(SimpleSize, DL,
Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
// Compute the low part (result 0): truncate the wide product.
Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
return CombineTo(N, Lo, Hi);
return SDValue();
/// Combine a multiply-with-overflow node; ISD::SMULO is treated as signed and
/// any other opcode reaching here as unsigned.
///
/// Result 0 has the operand type and result 1 is the overflow/carry flag.
///
/// \returns the replacement value, or an empty SDValue if no fold applied.
SDValue DAGCombiner::visitMULO(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
bool IsSigned = (ISD::SMULO == N->getOpcode());
EVT CarryVT = N->getValueType(1);
SDLoc DL(N);
// canonicalize constant to RHS.
// NOTE(review): the second half of this condition is not present in this
// copy of the file; confirm against the upstream source.
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);
// fold (mulo x, 0) -> 0 + no carry out
if (isNullOrNullSplat(N1))
return CombineTo(N, DAG.getConstant(0, DL, VT),
DAG.getConstant(0, DL, CarryVT));
// (mulo x, 2) -> (addo x, x)
if (ConstantSDNode *C2 = isConstOrConstSplat(N1))
if (C2->getAPIntValue() == 2)
return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, DL,
N->getVTList(), N0, N0);
return SDValue();
/// Combine an integer min/max node (SMIN, SMAX, UMIN or UMAX).
///
/// Tries vector binop simplification, constant folding, moving a constant
/// operand to the right-hand side, and switching between the signed and
/// unsigned form when that makes the operation legal.
///
/// \returns the replacement value, or an empty SDValue if no fold applied.
SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned Opcode = N->getOpcode();
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold operation with constant operands.
if (SDValue C = DAG.FoldConstantArithmetic(Opcode, SDLoc(N), VT, {N0, N1}))
return C;
// canonicalize constant to RHS
// NOTE(review): the second half of this condition is not present in this
// copy of the file; confirm against the upstream source.
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
// If the sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX.
// Only do this if the current op isn't legal and the flipped is.
if (!TLI.isOperationLegal(Opcode, VT) &&
(N0.isUndef() || DAG.SignBitIsZero(N0)) &&
(N1.isUndef() || DAG.SignBitIsZero(N1))) {
unsigned AltOpcode;
switch (Opcode) {
case ISD::SMIN: AltOpcode = ISD::UMIN; break;
case ISD::SMAX: AltOpcode = ISD::UMAX; break;
case ISD::UMIN: AltOpcode = ISD::SMIN; break;
case ISD::UMAX: AltOpcode = ISD::SMAX; break;
default: llvm_unreachable("Unknown MINMAX opcode");
if (TLI.isOperationLegal(AltOpcode, VT))
return DAG.getNode(AltOpcode, SDLoc(N), VT, N0, N1);
return SDValue();
/// If this is a bitwise logic instruction and both operands have the same
/// opcode, try to sink the other opcode after the logic instruction.
///
/// \param N an AND, OR or XOR node whose two operands share one opcode (both
/// preconditions are asserted below).
/// \returns the rewritten value "hand_op (logic_op X, Y)" for the supported
/// operand opcodes (extends, truncate, shifts/and with a common second
/// operand, bswap, bitcast/scalar_to_vector, shuffles), or an empty SDValue
/// if none of the transforms applies.
SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned LogicOpcode = N->getOpcode();
unsigned HandOpcode = N0.getOpcode();
assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR ||
LogicOpcode == ISD::XOR) && "Expected logic opcode");
assert(HandOpcode == N1.getOpcode() && "Bad input!");
// Bail early if none of these transforms apply.
if (N0.getNumOperands() == 0)
return SDValue();
// FIXME: We should check number of uses of the operands to not increase
// the instruction count for all transforms.
// Handle size-changing casts.
// X and Y are the first operands of the two hands; XVT is their source type.
SDValue X = N0.getOperand(0);
SDValue Y = N1.getOperand(0);
EVT XVT = X.getValueType();
SDLoc DL(N);
if (HandOpcode == ISD::ANY_EXTEND || HandOpcode == ISD::ZERO_EXTEND ||
HandOpcode == ISD::SIGN_EXTEND) {
// If both operands have other uses, this transform would create extra
// instructions without eliminating anything.
if (!N0.hasOneUse() && !N1.hasOneUse())
return SDValue();
// We need matching integer source types.
if (XVT != Y.getValueType())
return SDValue();
// Don't create an illegal op during or after legalization. Don't ever
// create an unsupported vector op.
if ((VT.isVector() || LegalOperations) &&
!TLI.isOperationLegalOrCustom(LogicOpcode, XVT))
return SDValue();
// Avoid infinite looping with PromoteIntBinOp.
// TODO: Should we apply desirable/legal constraints to all opcodes?
if (HandOpcode == ISD::ANY_EXTEND && LegalTypes &&
!TLI.isTypeDesirableForOp(LogicOpcode, XVT))
return SDValue();
// logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y)
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
return DAG.getNode(HandOpcode, DL, VT, Logic);
// logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
if (HandOpcode == ISD::TRUNCATE) {
// If both operands have other uses, this transform would create extra
// instructions without eliminating anything.
if (!N0.hasOneUse() && !N1.hasOneUse())
return SDValue();
// We need matching source types.
if (XVT != Y.getValueType())
return SDValue();
// Don't create an illegal op during or after legalization.
if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
return SDValue();
// Be extra careful sinking truncate. If it's free, there's no benefit in
// widening a binop. Also, don't create a logic op on an illegal type.
if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
return SDValue();
if (!TLI.isTypeLegal(XVT))
return SDValue();
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
return DAG.getNode(HandOpcode, DL, VT, Logic);
// For binops SHL/SRL/SRA/AND:
// logic_op (OP x, z), (OP y, z) --> OP (logic_op x, y), z
if ((HandOpcode == ISD::SHL || HandOpcode == ISD::SRL ||
HandOpcode == ISD::SRA || HandOpcode == ISD::AND) &&
N0.getOperand(1) == N1.getOperand(1)) {
// If either operand has other uses, this transform is not an improvement.
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
return DAG.getNode(HandOpcode, DL, VT, Logic, N0.getOperand(1));
// Unary ops: logic_op (bswap x), (bswap y) --> bswap (logic_op x, y)
if (HandOpcode == ISD::BSWAP) {
// If either operand has other uses, this transform is not an improvement.
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
return DAG.getNode(HandOpcode, DL, VT, Logic);
// Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
// Only perform this optimization up until type legalization, before
// LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by
// adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
// we don't want to undo this promotion.
// We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
// on scalars.
if ((HandOpcode == ISD::BITCAST || HandOpcode == ISD::SCALAR_TO_VECTOR) &&
Level <= AfterLegalizeTypes) {
// Input types must be integer and the same.
// The negated clause rejects going from a legal vector type to an illegal
// scalar source type.
if (XVT.isInteger() && XVT == Y.getValueType() &&
!(VT.isVector() && TLI.isTypeLegal(VT) &&
!XVT.isVector() && !TLI.isTypeLegal(XVT))) {
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
return DAG.getNode(HandOpcode, DL, VT, Logic);
// Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
// Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
// If both shuffles use the same mask, and both shuffle within a single
// vector, then it is worthwhile to move the swizzle after the operation.
// The type-legalizer generates this pattern when loading illegal
// vector types from memory. In many cases this allows additional shuffle
// optimizations.
// There are other cases where moving the shuffle after the xor/and/or
// is profitable even if shuffles don't perform a swizzle.
// If both shuffles use the same mask, and both shuffles have the same first
// or second operand, then it might still be profitable to move the shuffle
// after the xor/and/or operation.
if (HandOpcode == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
auto *SVN0 = cast<ShuffleVectorSDNode>(N0);
auto *SVN1 = cast<ShuffleVectorSDNode>(N1);
assert(X.getValueType() == Y.getValueType() &&
"Inputs to shuffles are not the same type");
// Check that both shuffles use the same mask. The masks are known to be of
// the same length because the result vector type is the same.
// Check also that shuffles have only one use to avoid introducing extra
// instructions.
// NOTE(review): the mask-comparison part of this condition is not present
// in this copy of the file; confirm against the upstream source.
if (!SVN0->hasOneUse() || !SVN1->hasOneUse() ||
return SDValue();
// Don't try to fold this node if it requires introducing a
// build vector of all zeros that might be illegal at this stage.
// For XOR with a non-undef shared operand, ShOp is replaced by the result
// of tryFoldToZero; the ShOp.getNode() test below skips the fold if that
// result is empty.
SDValue ShOp = N0.getOperand(1);
if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
// (logic_op (shuf (A, C), shuf (B, C))) --> shuf (logic_op (A, B), C)
if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
SDValue Logic = DAG.getNode(LogicOpcode, DL, VT,
N0.getOperand(0), N1.getOperand(0));
return DAG.getVectorShuffle(VT, DL, Logic, ShOp, SVN0->getMask());
// Don't try to fold this node if it requires introducing a
// build vector of all zeros that might be illegal at this stage.
ShOp = N0.getOperand(0);
if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
// (logic_op (shuf (C, A), shuf (C, B))) --> shuf (C, logic_op (A, B))
if (N0.getOperand(0) == N1.getOperand(0) && ShOp.getNode()) {
SDValue Logic = DAG.getNode(LogicOpcode, DL, VT, N0.getOperand(1),
return DAG.getVectorShuffle(VT, DL, ShOp, Logic, SVN0->getMask());
return SDValue();
/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient.
///
/// \param IsAnd true when the two compares are joined by 'and', false for 'or'.
/// \param N0 first operand of the logic op; must be setcc-equivalent.
/// \param N1 second operand of the logic op; must be setcc-equivalent.
/// \param DL location used for the nodes created by a successful fold.
/// \returns the replacement value, or an empty SDValue when no fold applies.
SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
const SDLoc &DL) {
// NOTE(review): this listing carries no indentation and is missing its
// closing-brace lines (and, where flagged below, a leading line of a
// statement); confirm the exact control flow against a pristine copy.
SDValue LL, LR, RL, RR, N0CC, N1CC;
// Both operands must be accepted by isSetCCEquivalent, which is handed the
// compare operands and condition-code values used by everything below.
if (!isSetCCEquivalent(N0, LL, LR, N0CC) ||
!isSetCCEquivalent(N1, RL, RR, N1CC))
return SDValue();
assert(N0.getValueType() == N1.getValueType() &&
"Unexpected operand types for bitwise logic op");
assert(LL.getValueType() == LR.getValueType() &&
RL.getValueType() == RR.getValueType() &&
"Unexpected operand types for setcc");
// If we're here post-legalization or the logic op type is not i1, the logic
// op type must match a setcc result type. Also, all folds require new
// operations on the left and right operands, so those types must match.
EVT VT = N0.getValueType();
EVT OpVT = LL.getValueType();
if (LegalOperations || VT.getScalarType() != MVT::i1)
if (VT != getSetCCResultType(OpVT))
return SDValue();
if (OpVT != RL.getValueType())
return SDValue();
ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get();
ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
bool IsInteger = OpVT.isInteger();
// Same right-hand side and same predicate on integer operands: try to merge
// the two compares into a single compare of (or LL, RL) or (and LL, RL).
if (LR == RR && CC0 == CC1 && IsInteger) {
bool IsZero = isNullOrNullSplat(LR);
bool IsNeg1 = isAllOnesOrAllOnesSplat(LR);
// All bits clear?
bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
// All sign bits clear?
bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1;
// Any bits set?
bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero;
// Any sign bits set?
bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero;
// (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
// (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
// (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0)
// (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0)
if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) {
SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL);
return DAG.getSetCC(DL, VT, Or, LR, CC1);
// All bits set?
bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1;
// All sign bits set?
bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero;
// Any bits clear?
bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1;
// Any sign bits clear?
bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1;
// (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1)
// (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0)
// (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1)
// (or (setgt X, -1), (setgt Y, -1)) --> (setgt (and X, Y), -1)
if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) {
SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL);
return DAG.getSetCC(DL, VT, And, LR, CC1);
// TODO: What is the 'or' equivalent of this fold?
// (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
// The scalar width must exceed one bit, as the new compare is against 2.
if (IsAnd && LL == RL && CC0 == CC1 && OpVT.getScalarSizeInBits() > 1 &&
IsInteger && CC0 == ISD::SETNE &&
((isNullConstant(LR) && isAllOnesConstant(RR)) ||
(isAllOnesConstant(LR) && isNullConstant(RR)))) {
SDValue One = DAG.getConstant(1, DL, OpVT);
SDValue Two = DAG.getConstant(2, DL, OpVT);
SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One);
return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE);
// Try more general transforms if the predicates match and the only user of
// the compares is the 'and' or 'or'.
if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 &&
N0.hasOneUse() && N1.hasOneUse()) {
// and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
// or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0
if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) {
SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR);
SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR);
SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR);
SDValue Zero = DAG.getConstant(0, DL, OpVT);
return DAG.getSetCC(DL, VT, Or, Zero, CC1);
// Turn compare of constants whose difference is 1 bit into add+and+setcc.
// TODO - support non-uniform vector amounts.
if ((IsAnd && CC1 == ISD::SETNE) || (!IsAnd && CC1 == ISD::SETEQ)) {
// Match a shared variable operand and 2 non-opaque constant operands.
ConstantSDNode *C0 = isConstOrConstSplat(LR);
ConstantSDNode *C1 = isConstOrConstSplat(RR);
if (LL == RL && C0 && C1 && !C0->isOpaque() && !C1->isOpaque()) {
// Canonicalize larger constant as C0.
if (C1->getAPIntValue().ugt(C0->getAPIntValue()))
std::swap(C0, C1);
// The difference of the constants must be a single bit.
const APInt &C0Val = C0->getAPIntValue();
const APInt &C1Val = C1->getAPIntValue();
if ((C0Val - C1Val).isPowerOf2()) {
// and/or (setcc X, C0, ne), (setcc X, C1, ne/eq) -->
// setcc ((add X, -C1), ~(C0 - C1)), 0, ne/eq
SDValue OffsetC = DAG.getConstant(-C1Val, DL, OpVT);
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LL, OffsetC);
SDValue MaskC = DAG.getConstant(~(C0Val - C1Val), DL, OpVT);
SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Add, MaskC);
SDValue Zero = DAG.getConstant(0, DL, OpVT);
return DAG.getSetCC(DL, VT, And, Zero, CC0);
// Canonicalize equivalent operands to LL == RL.
// Swapping the second compare's operands requires swapping its predicate too.
if (LL == RR && LR == RL) {
CC1 = ISD::getSetCCSwappedOperands(CC1);
std::swap(RL, RR);
// (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
// (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
if (LL == RL && LR == RR) {
ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, OpVT)
: ISD::getSetCCOrOperation(CC0, CC1, OpVT);
// NOTE(review): the condition below closes one more parenthesis than it
// opens, so its leading line (presumably an `if (` testing NewCC) appears
// to be missing from this listing; confirm.
(!LegalOperations ||
(TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) &&
TLI.isOperationLegal(ISD::SETCC, OpVT))))
return DAG.getSetCC(DL, VT, LL, LR, NewCC);
return SDValue();
/// This contains all DAGCombine rules which reduce two values combined by
/// an And operation to a single value. This makes them reusable in the context
/// of visitSELECT(). Rules involving constants are not included as
/// visitSELECT() already handles those cases.
///
/// \param N0 first value being and-ed.
/// \param N1 second value being and-ed; its type is used as the result type.
/// \param N node supplying the debug location; SDValue(N, 0) is returned when
///        an operand was rewritten in place through CombineTo.
/// \returns the combined value, SDValue(N, 0), or an empty SDValue when no
///          rule applies.
SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
// NOTE(review): this listing carries no indentation and is missing its
// closing-brace lines plus the continuation lines flagged below; confirm the
// exact control flow against a pristine copy.
EVT VT = N1.getValueType();
SDLoc DL(N);
// fold (and x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, DL, VT);
// Two compares joined by 'and' are handled by the shared setcc-logic folder.
if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
return V;
if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
VT.getSizeInBits() <= 64) {
if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
// Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal
// immediate for an add, but it is legal if its top c2 bits are set,
// transform the ADD so the immediate doesn't need to be materialized
// in a register.
APInt ADDC = ADDI->getAPIntValue();
APInt SRLC = SRLI->getAPIntValue();
if (ADDC.getMinSignedBits() <= 64 &&
SRLC.ult(VT.getSizeInBits()) &&
!TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
// NOTE(review): the getHighBitsSet call below is left unclosed; its second
// argument line appears to be missing from this listing.
APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
ADDC |= Mask;
if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
SDLoc DL0(N0);
SDValue NewAdd =
DAG.getNode(ISD::ADD, DL0, VT,
N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
CombineTo(N0.getNode(), NewAdd);
// Return N so it doesn't get rechecked!
return SDValue(N, 0);
// Reduce bit extract of low half of an integer to the narrower type.
// (and (srl i64:x, K), KMask) ->
// (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask)
if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) {
if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
unsigned Size = VT.getSizeInBits();
const APInt &AndMask = CAnd->getAPIntValue();
unsigned ShiftBits = CShift->getZExtValue();
// Bail out, this node will probably disappear anyway.
if (ShiftBits == 0)
return SDValue();
unsigned MaskBits = AndMask.countTrailingOnes();
EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);
if (AndMask.isMask() &&
// Required bits must not span the two halves of the integer and
// must fit in the half size type.
(ShiftBits + MaskBits <= Size / 2) &&
TLI.isNarrowingProfitable(VT, HalfVT) &&
TLI.isTypeDesirableForOp(ISD::AND, HalfVT) &&
TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) &&
TLI.isTruncateFree(VT, HalfVT) &&
TLI.isZExtFree(HalfVT, VT)) {
// The isNarrowingProfitable is to avoid regressions on PPC and
// AArch64 which match a few 64-bit bit insert / bit extract patterns
// on downstream users of this. Those patterns could probably be
// extended to handle extensions mixed in.
// SL is an SDValue that is passed wherever the calls below take a location.
SDValue SL(N0);
assert(MaskBits <= Size);
// Extracting the highest bit of the low half.
EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
// NOTE(review): the TRUNCATE getNode call below is left unclosed; its operand
// line appears to be missing from this listing.
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT,
SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT);
SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT);
SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK);
SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask);
return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And);
return SDValue();
/// Decide whether masking the result of LoadN with the constant AndC can be
/// expressed as a zero-extending load.
///
/// \param AndC the mask constant; anything that is not a low-bit mask is
///        rejected before ExtVT is written.
/// \param LoadN the load whose result is being masked.
/// \param LoadResultTy result type used for the ZEXTLOAD legality queries.
/// \param[out] ExtVT set to the integer type whose width is the number of
///        trailing ones in AndC.
/// \returns true when the checks below accept the load, false otherwise.
bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
EVT LoadResultTy, EVT &ExtVT) {
// NOTE(review): this listing carries no indentation and is missing its
// closing-brace lines; confirm the exact control flow against a pristine copy.
if (!AndC->getAPIntValue().isMask())
return false;
unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
EVT LoadedVT = LoadN->getMemoryVT();
if (ExtVT == LoadedVT &&
(!LegalOperations ||
TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) {
// ZEXTLOAD will match without needing to change the size of the value being
// loaded.
return true;
// Do not change the width of volatile or atomic loads.
if (!LoadN->isSimple())
return false;
// Do not generate loads of non-round integer types since these can
// be expensive (and would be wrong if the type is not byte sized).
if (!LoadedVT.bitsGT(ExtVT) || !ExtVT.isRound())
return false;
// Once operations have been legalized the narrowed ZEXTLOAD must be legal.
if (LegalOperations &&
!TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))
return false;
// Finally let the target veto the narrowing.
if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
return false;
return true;
/// Check whether the load or store LDST may be narrowed to the memory type
/// MemVT.
///
/// \param LDST the load or store node; a null pointer yields false.
/// \param ExtType extension kind used for the load legality and target
///        profitability queries (not consulted on the store path).
/// \param MemVT the proposed narrower memory type; must be a round type.
/// \param ShAmt offset of the narrow access in bits; must be a multiple of 8.
/// \returns true when every check below passes, false otherwise.
bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
ISD::LoadExtType ExtType, EVT &MemVT,
unsigned ShAmt) {
// NOTE(review): this listing carries no indentation and is missing most
// closing-brace lines plus the continuation line flagged below; confirm the
// exact control flow against a pristine copy.
if (!LDST)
return false;
// Only allow byte offsets.
if (ShAmt % 8)
return false;
// Do not generate loads of non-round integer types since these can
// be expensive (and would be wrong if the type is not byte sized).
if (!MemVT.isRound())
return false;
// Don't change the width of volatile or atomic loads.
if (!LDST->isSimple())
return false;
// Verify that we are actually reducing a load width here.
if (LDST->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits())
return false;
// Ensure that this isn't going to produce an unsupported memory access.
if (ShAmt) {
assert(ShAmt % 8 == 0 && "ShAmt is byte offset");
const unsigned ByteShAmt = ShAmt / 8;
const Align LDSTAlign = LDST->getAlign();
// Alignment that can still be assumed once the byte offset is applied.
const Align NarrowAlign = commonAlignment(LDSTAlign, ByteShAmt);
// NOTE(review): the allowsMemoryAccess call below is left unclosed; its final
// argument line appears to be missing from this listing.
if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
LDST->getAddressSpace(), NarrowAlign,
return false;
// It's not possible to generate a constant of extended or untyped type.
EVT PtrType = LDST->getBasePtr().getValueType();
if (PtrType == MVT::Untyped || PtrType.isExtended())
return false;
if (isa<LoadSDNode>(LDST)) {
LoadSDNode *Load = cast<LoadSDNode>(LDST);
// Don't transform one with multiple uses, this would require adding a new
// load.
if (!SDValue(Load, 0).hasOneUse())
return false;
if (LegalOperations &&
!TLI.isLoadExtLegal(ExtType, Load->getValueType(0), MemVT))
return false;
// For the transform to be legal, the load must produce only two values
// (the value loaded and the chain). Don't transform a pre-increment
// load, for example, which produces an extra value. Otherwise the
// transformation is not equivalent, and the downstream logic to replace
// uses gets things wrong.
if (Load->getNumValues() > 2)
return false;
// If the load that we're shrinking is an extload and we're not just
// discarding the extension we can't simply shrink the load. Bail.
// TODO: It would be possible to merge the extensions in some cases.
if (Load->getExtensionType() != ISD::NON_EXTLOAD &&
Load->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
return false;
if (!TLI.shouldReduceLoadWidth(Load, ExtType, MemVT))
return false;
} else {
assert(isa<StoreSDNode>(LDST) && "It is not a Load nor a Store SDNode");
StoreSDNode *Store = cast<StoreSDNode>(LDST);
// Can't write outside the original store
if (Store->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
return false;
if (LegalOperations &&
!TLI.isTruncStoreLegal(Store->getValue().getValueType(), MemVT))
return false;
return true;
/// Walk the operands of N (recursing through or/xor/and) looking for loads
/// that could be narrowed under the constant mask Mask.
///
/// \param N node whose operands are inspected.
/// \param Loads collection of candidate loads (the statements that add to it
///        are not visible in this listing).
/// \param NodesWithConsts collection of logic nodes whose constant operand
///        may need fixing up (the insertion is not visible in this listing).
/// \param Mask the constant mask being propagated.
/// \param[in,out] NodeToMask at most one extra node that must be masked
///        explicitly; reset to null when that node has several data results.
/// \returns false as soon as an operand rules the transform out, true
///          otherwise.
bool DAGCombiner::SearchForAndLoads(SDNode *N,
SmallVectorImpl<LoadSDNode*> &Loads,
SmallPtrSetImpl<SDNode*> &NodesWithConsts,
ConstantSDNode *Mask,
SDNode *&NodeToMask) {
// NOTE(review): this listing carries no indentation and is missing its
// closing-brace lines as well as several statements, case labels and
// continuation lines (ExtVT is used below without a visible declaration, and
// two conditions end in a dangling operator); treat the control flow shown
// here as incomplete and confirm against a pristine copy.
// Recursively search for the operands, looking for loads which can be
// narrowed.
for (SDValue Op : N->op_values()) {
if (Op.getValueType().isVector())
return false;
// Some constants may need fixing up later if they are too large.
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
// True when the constant has bits set outside of Mask.
if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) &&
(Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue())
if (!Op.hasOneUse())
return false;
switch(Op.getOpcode()) {
case ISD::LOAD: {
auto *Load = cast<LoadSDNode>(Op);
if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
isLegalNarrowLdSt(Load, ISD::ZEXTLOAD, ExtVT)) {
// ZEXTLOAD is already small enough.
if (Load->getExtensionType() == ISD::ZEXTLOAD &&
// Use LE to convert equal sized loads to zext.
if (ExtVT.bitsLE(Load->getMemoryVT()))
return false;
case ISD::AssertZext: {
unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
EVT VT = Op.getOpcode() == ISD::AssertZext ?
cast<VTSDNode>(Op.getOperand(1))->getVT() :
// We can accept extending nodes if the mask is wider or an equal
// width to the original type.
if (ExtVT.bitsGE(VT))
case ISD::OR:
case ISD::XOR:
case ISD::AND:
if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask,
return false;
// Allow one node which will be masked along with any loads found.
if (NodeToMask)
return false;
// Also ensure that the node to be masked only produces one data result.
NodeToMask = Op.getNode();
if (NodeToMask->getNumValues() > 1) {
bool HasValue = false;
// Glue and chain (MVT::Other) results are not counted as data results.
for (unsigned i = 0, e = NodeToMask->getNumValues(); i < e; ++i) {
MVT VT = SDValue(NodeToMask, i).getSimpleValueType();
if (VT != MVT::Glue && VT != MVT::Other) {
if (HasValue) {
NodeToMask = nullptr;
return false;
HasValue = true;
assert(HasValue && "Node to be masked has no data result?");
return true;
/// Try to push the constant mask of the node N (its operand 1) back to the
/// loads that feed operand 0, so that the loads can be narrowed.
///
/// \param N the masking node; operand 1 must be a constant low-bit mask.
/// \returns true when the DAG was rewritten and the uses of N were replaced
///          by its first operand, false when nothing was changed.
bool DAGCombiner::BackwardsPropagateMask(SDNode *N) {
// NOTE(review): this listing carries no indentation and is missing its
// closing-brace lines; confirm the exact control flow against a pristine copy.
auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!Mask)
return false;
if (!Mask->getAPIntValue().isMask())
return false;
// No need to do anything if the and directly uses a load.
if (isa<LoadSDNode>(N->getOperand(0)))
return false;
SmallVector<LoadSDNode*, 8> Loads;
SmallPtrSet<SDNode*, 2> NodesWithConsts;
SDNode *FixupNode = nullptr;
if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) {
// Without any load to narrow there is nothing to gain.
if (Loads.size() == 0)
return false;
LLVM_DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
SDValue MaskOp = N->getOperand(1);
// If it exists, fixup the single node we allow in the tree that needs
// masking.
if (FixupNode) {
LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
// NOTE(review): unlike the two getNode calls further down, no value-type
// argument is visible in this call; a line may be missing here - confirm.
SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
SDValue(FixupNode, 0), MaskOp);
DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
// The replacement above also rewrote the new AND's own operand, so point it
// back at FixupNode.
if (And.getOpcode() == ISD ::AND)
DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp);
// Narrow any constants that need it.
for (auto *LogicN : NodesWithConsts) {
SDValue Op0 = LogicN->getOperand(0);
SDValue Op1 = LogicN->getOperand(1);
// Make Op1 the constant operand.
if (isa<ConstantSDNode>(Op0))
std::swap(Op0, Op1);
SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(),
Op1, MaskOp);
DAG.UpdateNodeOperands(LogicN, Op0, And);
// Create narrow loads.
for (auto *Load : Loads) {
LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
SDValue(Load, 0), MaskOp);
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
// As above: restore the load as the operand of the freshly created AND.
if (And.getOpcode() == ISD ::AND)
And = SDValue(
DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp), 0);
SDValue NewLoad = ReduceLoadWidth(And.getNode());
assert(NewLoad &&
"Shouldn't be masking the load if it can't be narrowed");
CombineTo(Load, NewLoad, NewLoad.getValue(1));
// Every leaf is masked now, so N itself is replaced by its first operand.
DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
return true;
return false;
// Unfold
// x & (-1 'logical shift' y)
// To
// (x 'opposite logical shift' y) 'logical shift' y
// if it is better for performance.
//
// N must be an ISD::AND node (asserted). Returns the outer shift of the
// resulting shift pair, or an empty SDValue when the target does not prefer
// shifts or neither operand matches the mask pattern.
SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) {
// NOTE(review): this listing carries no indentation and is missing its
// closing-brace lines; in two places a bare `return` directly follows an
// if / else-if chain, which suggests `else` lines were lost as well. Confirm
// the exact control flow against a pristine copy.
assert(N->getOpcode() == ISD::AND);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Do we actually prefer shifts over mask?
if (!TLI.shouldFoldMaskToVariableShiftPair(N0))
return SDValue();
// Try to match (-1 '[outer] logical shift' y)
unsigned OuterShift;
unsigned InnerShift; // The opposite direction to the OuterShift.
SDValue Y; // Shift amount.
// Matches a single-use SHL/SRL of all-ones; on success records both shift
// opcodes and the shift amount in the captured variables.
auto matchMask = [&OuterShift, &InnerShift, &Y](SDValue M) -> bool {
if (!M.hasOneUse())
return false;
OuterShift = M->getOpcode();
if (OuterShift == ISD::SHL)
InnerShift = ISD::SRL;
else if (OuterShift == ISD::SRL)
InnerShift = ISD::SHL;
return false;
if (!isAllOnesConstant(M->getOperand(0)))
return false;
Y = M->getOperand(1);
return true;
// X is whichever operand is not the mask.
SDValue X;
if (matchMask(N1))
X = N0;
else if (matchMask(N0))
X = N1;
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
// tmp = x 'opposite logical shift' y
SDValue T0 = DAG.getNode(InnerShift, DL, VT, X, Y);
// ret = tmp 'logical shift' y
SDValue T1 = DAG.getNode(OuterShift, DL, VT, T0, Y);
return T1;
/// Try to replace shift/logic that tests if a bit is clear with mask + setcc.
/// For a target with a bit test, this is expected to become test + set and save
/// at least 1 instruction.
///
/// \param And the ISD::AND node to inspect (asserted).
/// \param DAG the DAG in which replacement nodes are created.
/// \returns the setcc result zero-extended or truncated to the type of And, or
///          an empty SDValue when the pattern does not match.
static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
// NOTE(review): this listing carries no indentation and is missing its
// closing brace plus the continuation line flagged below; confirm the exact
// control flow against a pristine copy.
assert(And->getOpcode() == ISD::AND && "Expected an 'and' op");
// This is probably not worthwhile without a supported type.
EVT VT = And->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(VT))
return SDValue();
// Look through an optional extension and find a 'not'.
// TODO: Should we favor test+set even without the 'not' op?
SDValue Not = And->getOperand(0), And1 = And->getOperand(1);
if (Not.getOpcode() == ISD::ANY_EXTEND)
Not = Not.getOperand(0);
if (!isBitwiseNot(Not) || !Not.hasOneUse() || !isOneConstant(And1))
return SDValue();
// Look through an optional truncation. The source operand may not be the same
// type as the original 'and', but that is ok because we are masking off
// everything but the low bit.
SDValue Srl = Not.getOperand(0);
if (Srl.getOpcode() == ISD::TRUNCATE)
Srl = Srl.getOperand(0);
// Match a shift-right by constant.
// NOTE(review): the condition below ends in a dangling `||`; its last operand
// (presumably the check that the shift amount is a constant) appears to be
// missing from this listing.
if (Srl.getOpcode() != ISD::SRL || !Srl.hasOneUse() ||
return SDValue();
// We might have looked through casts that make this transform invalid.
// TODO: If the source type is wider than the result type, do the mask and
// compare in the source type.
const APInt &ShiftAmt = Srl.getConstantOperandAPInt(1);
unsigned VTBitWidth = VT.getSizeInBits();
if (ShiftAmt.uge(VTBitWidth))
return SDValue();
// Turn this into a bit-test pattern using mask op + setcc:
// and (not (srl X, C)), 1 --> (and X, 1<<C) == 0
SDLoc DL(And);
SDValue X = DAG.getZExtOrTrunc(Srl.getOperand(0), DL, VT);
EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
SDValue Mask = DAG.getConstant(
APInt::getOneBitSet(VTBitWidth, ShiftAmt.getZExtValue()), DL, VT);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, Mask);
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue Setcc = DAG.getSetCC(DL, CCVT, NewAnd, Zero, ISD::SETEQ);
// The setcc result type may differ from VT, so convert back to VT.
return DAG.getZExtOrTrunc(Setcc, DL, VT);
/// Visit the two-operand 'and' node N and try the folds below in order.
///
/// \param N the node being combined; operands 0 and 1 are the and-ed values.
/// \returns the replacement value, SDValue(N, 0) when the DAG was updated in
///          place, or an empty SDValue when no fold applies.
SDValue DAGCombiner::visitAND(SDNode *N) {
// NOTE(review): this listing carries no indentation and is missing its
// closing-brace lines plus the continuation lines flagged below; confirm the
// exact control flow against a pristine copy.
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N1.getValueType();
// x & x --> x
if (N0 == N1)
return N0;
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (and x, 0) -> 0, vector edition
if (ISD::isBuildVectorAllZeros(N0.getNode()))
// do not return N0, because undef node may exist in N0
return DAG.getConstant(APInt::getNullValue(N0.getScalarValueSizeInBits()),
SDLoc(N), N0.getValueType());
if (ISD::isBuildVectorAllZeros(N1.getNode()))
// do not return N1, because undef node may exist in N1
return DAG.getConstant(APInt::getNullValue(N1.getScalarValueSizeInBits()),
SDLoc(N), N1.getValueType());
// fold (and x, -1) -> x, vector edition
if (ISD::isBuildVectorAllOnes(N0.getNode()))
return N1;
if (ISD::isBuildVectorAllOnes(N1.getNode()))
return N0;
// fold (and c1, c2) -> c1&c2
// N1C is non-null when N1 is a constant or a constant splat.
ConstantSDNode *N1C = isConstOrConstSplat(N1);
if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1}))
return C;
// canonicalize constant to RHS
// NOTE(review): the condition below ends in a dangling `&&`; its second
// operand appears to be missing from this listing.
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
// fold (and x, -1) -> x
if (isAllOnesConstant(N1))
return N0;
// if (and x, c) is known to be zero, return 0
unsigned BitWidth = VT.getScalarSizeInBits();
// NOTE(review): the MaskedValueIsZero call below is left unclosed; its mask
// argument line appears to be missing from this listing.
if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
return DAG.getConstant(0, SDLoc(N), VT);
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// reassociate and
if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
return RAND;
// Try to convert a constant mask AND into a shuffle clear mask.
if (VT.isVector())
if (SDValue Shuffle = XformToShuffleWithZero(N))
return Shuffle;
if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
return Combined;
// fold (and (or x, C), D) -> D if (C & D) == D
// MatchSubset(LHS, RHS) is true when every bit set in RHS is also set in LHS.
auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
if (N0.getOpcode() == ISD::OR &&
ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset))
return N1;
// fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
SDValue N0Op0 = N0.getOperand(0);
// Bits of the narrow source value that the 'and' would clear.
APInt Mask = ~N1C->getAPIntValue();
Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits());
if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
N0.getValueType(), N0Op0);
// Replace uses of the AND with uses of the Zero extend node.
CombineTo(N, Zext);
// We actually want to replace all uses of the any_extend with the
// zero_extend, to avoid duplicating things. This will later cause this
// AND to be folded.
CombineTo(N0.getNode(), Zext);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
// similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) ->
// (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must
// already be zero by virtue of the width of the base type of the load.
// the 'X' node here can either be nothing or an extract_vector_elt to catch
// more cases.
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() &&
N0.getOperand(0).getOpcode() == ISD::LOAD &&
N0.getOperand(0).getResNo() == 0) ||
(N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
N0 : N0.getOperand(0) );
// Get the constant (if applicable) the zero'th operand is being ANDed with.
// This can be a pure constant or a vector splat, in which case we treat the
// vector as a scalar and use the splat value.
// Starts as a 1-bit zero and is only replaced when N1 is a usable constant.
APInt Constant = APInt::getNullValue(1);
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
Constant = C->getAPIntValue();
} else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) {
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef,
SplatBitSize, HasAnyUndefs);
if (IsSplat) {
// Undef bits can contribute to a possible optimisation if set, so
// set them.
SplatValue |= SplatUndef;
// The splat value may be something like "0x00FFFFFF", which means 0 for
// the first vector value and FF for the rest, repeating. We need a mask
// that will apply equally to all members of the vector, so AND all the
// lanes of the constant together.
unsigned EltBitWidth = Vector->getValueType(0).getScalarSizeInBits();
// If the splat value has been compressed to a bitlength lower
// than the size of the vector lane, we need to re-expand it to
// the lane size.
if (EltBitWidth > SplatBitSize)
for (SplatValue = SplatValue.zextOrTrunc(EltBitWidth);
SplatBitSize < EltBitWidth; SplatBitSize = SplatBitSize * 2)
SplatValue |= SplatValue.shl(SplatBitSize);
// Make sure that variable 'Constant' is only set if 'SplatBitSize' is a
// multiple of 'BitWidth'. Otherwise, we could propagate a wrong value.
if ((SplatBitSize % EltBitWidth) == 0) {
Constant = APInt::getAllOnesValue(EltBitWidth);
for (unsigned i = 0, n = (SplatBitSize / EltBitWidth); i < n; ++i)
Constant &= SplatValue.extractBits(EltBitWidth, i * EltBitWidth);
// If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is
// actually legal and isn't going to get expanded, else this is a false
// optimisation.
// NOTE(review): the isLoadExtLegal call below is left unclosed; its remaining
// argument line appears to be missing from this listing.
bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
// Resize the constant to the same size as the original memory access before
// extension. If it is still the AllOnesValue then this AND is completely
// unneeded.
Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits());
// B records whether the load's extension kind permits dropping the AND.
bool B;
switch (Load->getExtensionType()) {
default: B = false; break;
case ISD::EXTLOAD: B = CanZextLoadProfitably; break;
case ISD::NON_EXTLOAD: B = true; break;
if (B && Constant.isAllOnesValue()) {
// If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
// preserve semantics once we get rid of the AND.
SDValue NewLoad(Load, 0);
// Fold the AND away. NewLoad may get replaced immediately.
CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
if (Load->getExtensionType() == ISD::EXTLOAD) {
// NOTE(review): the getLoad call below is left unclosed; its final argument
// line appears to be missing from this listing.
NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
Load->getValueType(0), SDLoc(Load),
Load->getChain(), Load->getBasePtr(),
Load->getOffset(), Load->getMemoryVT(),
// Replace uses of the EXTLOAD with the new ZEXTLOAD.
if (Load->getNumValues() == 3) {
// PRE/POST_INC loads have 3 values.
SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1),
NewLoad.getValue(2) };
CombineTo(Load, To, 3, true);
} else {
CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
// fold (and (load x), 255) -> (zextload x, i8)
// fold (and (extload x, i16), 255) -> (zextload x, i8)
// fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD ||
(N0.getOpcode() == ISD::ANY_EXTEND &&
N0.getOperand(0).getOpcode() == ISD::LOAD))) {
if (SDValue Res = ReduceLoadWidth(N)) {
// Pick the load node, looking through the optional any_extend.
LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND
? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 0), Res);
return SDValue(N, 0);
if (LegalTypes) {
// Attempt to propagate the AND back up to the leaves which, if they're
// loads, can be combined to narrow loads and the AND node can be removed.
// Perform after legalization so that extend nodes will already be
// combined into the loads.
if (BackwardsPropagateMask(N))
return SDValue(N, 0);
if (SDValue Combined = visitANDLike(N0, N1, N))
return Combined;
// Simplify: (and (op x...), (op y...)) -> (op (and x, y))
if (N0.getOpcode() == N1.getOpcode())
if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
return V;
// Masking the negated extension of a boolean is just the zero-extended
// boolean:
// and (sub 0, zext(bool X)), 1 --> zext(bool X)
// and (sub 0, sext(bool X)), 1 --> zext(bool X)
// Note: the SimplifyDemandedBits fold below can make an information-losing
// transform, and then we have no way to find this better fold.
if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) {
if (isNullOrNullSplat(N0.getOperand(0))) {
SDValue SubRHS = N0.getOperand(1);
if (SubRHS.getOpcode() == ISD::ZERO_EXTEND &&
SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
return SubRHS;
if (SubRHS.getOpcode() == ISD::SIGN_EXTEND &&
SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0));
// fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
// fold (and (sra)) -> (and (srl)) when possible.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// fold (zext_inreg (extload x)) -> (zextload x)
// fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use
if (ISD::isUNINDEXEDLoad(N0.getNode()) &&
(ISD::isEXTLoad(N0.getNode()) ||
(ISD::isSEXTLoad(N0.getNode()) && N0.hasOneUse()))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
EVT MemVT = LN0->getMemoryVT();
// If we zero all the possible extended bits, then we can turn this into
// a zextload if we are running before legalize or the operation is legal.
unsigned ExtBitSize = N1.getScalarValueSizeInBits();
unsigned MemBitSize = MemVT.getScalarSizeInBits();
// The bits above the memory width, i.e. those produced by the extension.
APInt ExtBits = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize);
if (DAG.MaskedValueIsZero(N1, ExtBits) &&
((!LegalOperations && LN0->isSimple()) ||
TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
SDValue ExtLoad =
DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(),
LN0->getBasePtr(), MemVT, LN0->getMemOperand());
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
// fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const)
if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) {
if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
N0.getOperand(1), false))
return BSwap;
if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N))
return Shifts;
// Only attempt the bit-test rewrite when the target reports a bit test for
// these operands.
if (TLI.hasBitTest(N0, N1))
if (SDValue V = combineShiftAnd1ToBitTest(N, DAG))
return V;
return SDValue();
/// Match (a >> 8) | (a << 8) as (bswap a) >> 16.
///
/// \p N supplies the result type and debug location; \p N0 and \p N1 are the
/// two halves to match, in either order. Each half may carry an AND byte mask
/// either outside the shift (0xff00/0xffff on the shl side, 0xff on the srl
/// side) or inside it (0xff under the shl, 0xff00/0xffff under the srl).
/// When \p DemandHighBits is set and the type is wider than 16 bits, extra
/// checks on those masks are applied before the rewrite is accepted.
/// Returns an empty SDValue when the pattern is not matched.
///
/// NOTE(review): this copy contains no brace-only lines, so the function body
/// and its nested `if` bodies are never closed — confirm against upstream.
SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
bool DemandHighBits) {
if (!LegalOperations)
return SDValue();
// Only i16/i32/i64 are handled, and only when BSWAP is legal or custom.
EVT VT = N->getValueType(0);
if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16)
return SDValue();
if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
return SDValue();
// Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff)
// LookPassAndX records that a byte mask was already found on side X.
bool LookPassAnd0 = false;
bool LookPassAnd1 = false;
// Order the operands so that a masked shl ends up in N0 and a masked srl
// in N1.
if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL)
std::swap(N0, N1);
if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() == ISD::AND) {
if (!N0.getNode()->hasOneUse())
return SDValue();
ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
// Also handle 0xffff since the LHS is guaranteed to have zeros there.
// This is needed for X86.
if (!N01C || (N01C->getZExtValue() != 0xFF00 &&
N01C->getZExtValue() != 0xFFFF))
return SDValue();
// Strip the outer mask and continue with the value underneath it.
N0 = N0.getOperand(0);
LookPassAnd0 = true;
if (N1.getOpcode() == ISD::AND) {
if (!N1.getNode()->hasOneUse())
return SDValue();
ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
if (!N11C || N11C->getZExtValue() != 0xFF)
return SDValue();
N1 = N1.getOperand(0);
LookPassAnd1 = true;
// With the masks stripped, N0 must be the shl and N1 the srl, each with a
// single use.
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
return SDValue();
if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse())
return SDValue();
// Both shift amounts must be the constant 8.
ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
if (!N01C || !N11C)
return SDValue();
if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8)
return SDValue();
// Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8)
SDValue N00 = N0->getOperand(0);
if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) {
if (!N00.getNode()->hasOneUse())
return SDValue();
ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1));
if (!N001C || N001C->getZExtValue() != 0xFF)
return SDValue();
N00 = N00.getOperand(0);
LookPassAnd0 = true;
SDValue N10 = N1->getOperand(0);
if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) {
if (!N10.getNode()->hasOneUse())
return SDValue();
ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1));
// Also allow 0xFFFF since the bits will be shifted out. This is needed
// for X86.
if (!N101C || (N101C->getZExtValue() != 0xFF00 &&
N101C->getZExtValue() != 0xFFFF))
return SDValue();
N10 = N10.getOperand(0);
LookPassAnd1 = true;
// Both shifts must operate on the same source value.
if (N00 != N10)
return SDValue();
// Make sure everything beyond the low halfword gets set to zero since the SRL
// 16 will clear the top bits.
unsigned OpSizeInBits = VT.getSizeInBits();
if (DemandHighBits && OpSizeInBits > 16) {
// If the left-shift isn't masked out then the only way this is a bswap is
// if all bits beyond the low 8 are 0. In that case the entire pattern
// reduces to a left shift anyway: leave it for other parts of the combiner.
if (!LookPassAnd0)
return SDValue();
// However, if the right shift isn't masked out then it might be because
// it's not needed. See if we can spot that too.
// NOTE(review): the callee that takes the argument list on the next line is
// not visible in this copy (presumably a negated DAG.MaskedValueIsZero
// call) — confirm against upstream.
if (!LookPassAnd1 &&
N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16)))
return SDValue();
// Emit the byte swap; for types wider than 16 bits, shift the swapped
// halfword back down into the low bits.
SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00);
if (OpSizeInBits > 16) {
SDLoc DL(N);
// NOTE(review): the last argument of getConstant (the type of the shift
// amount) is not visible in this copy — confirm against upstream.
Res = DAG.getNode(ISD::SRL, DL, VT, Res,
DAG.getConstant(OpSizeInBits - 16, DL,
return Res;
/// Return true if the specified node is an element that makes up a 32-bit
/// packed halfword byteswap.
/// ((x & 0x000000ff) << 8) |
/// ((x & 0x0000ff00) >> 8) |
/// ((x & 0x00ff0000) << 8) |
/// ((x & 0xff000000) >> 8)
///
/// On success the source node of the element is stored in \p Parts at the
/// index of the byte selected by the mask (0..3). The match fails if that
/// slot is already occupied.
///
/// NOTE(review): this copy contains no brace-only lines and appears to have
/// lost the switch's `default:` label and `break;` statements — confirm
/// against upstream.
static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
if (!N.getNode()->hasOneUse())
return false;
// N must be an and/shl/srl whose first operand is also an and/shl/srl.
unsigned Opc = N.getOpcode();
if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL)
return false;
SDValue N0 = N.getOperand(0);
unsigned Opc0 = N0.getOpcode();
if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL)
return false;
// Find the constant mask: on N itself if N is the AND, otherwise on the
// inner AND.
ConstantSDNode *N1C = nullptr;
// SHL or SRL: look upstream for AND mask operand
if (Opc == ISD::AND)
N1C = dyn_cast<ConstantSDNode>(N.getOperand(1));
else if (Opc0 == ISD::AND)
N1C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!N1C)
return false;
// Translate the mask into the index of the byte it selects.
unsigned MaskByteOffset;
switch (N1C->getZExtValue()) {
// NOTE(review): a `default:` label presumably precedes this return.
return false;
case 0xFF: MaskByteOffset = 0; break;
case 0xFF00: MaskByteOffset = 1; break;
case 0xFFFF:
// In case demanded bits didn't clear the bits that will be shifted out.
// This is needed for X86.
if (Opc == ISD::SRL || (Opc == ISD::AND && Opc0 == ISD::SHL)) {
MaskByteOffset = 1;
// NOTE(review): a `break;` and closing brace presumably sit between the
// assignment above and the return below.
return false;
case 0xFF0000: MaskByteOffset = 2; break;
case 0xFF000000: MaskByteOffset = 3; break;
// Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00).
if (Opc == ISD::AND) {
if (MaskByteOffset == 0 || MaskByteOffset == 2) {
// (x >> 8) & 0xff
// (x >> 8) & 0xff0000
if (Opc0 != ISD::SRL)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C || C->getZExtValue() != 8)
return false;
} else {
// (x << 8) & 0xff00
// (x << 8) & 0xff000000
if (Opc0 != ISD::SHL)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C || C->getZExtValue() != 8)
return false;
} else if (Opc == ISD::SHL) {
// (x & 0xff) << 8
// (x & 0xff0000) << 8
if (MaskByteOffset != 0 && MaskByteOffset != 2)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!C || C->getZExtValue() != 8)
return false;
} else { // Opc == ISD::SRL
// (x & 0xff00) >> 8
// (x & 0xff000000) >> 8
if (MaskByteOffset != 1 && MaskByteOffset != 3)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!C || C->getZExtValue() != 8)
return false;
// Each byte position may be claimed by only one element.
if (Parts[MaskByteOffset])
return false;
// Record the value underneath both the shift and the mask.
Parts[MaskByteOffset] = N0.getOperand(0).getNode();
return true;
/// Match 2 elements of a packed halfword bswap.
///
/// Accepts either an OR of two halfword-bswap elements, or an already-formed
/// (srl (bswap x), 16). In the first case each element records its source in
/// \p Parts via isBSwapHWordElement; in the second case Parts[0] and Parts[1]
/// are both set to the operand of the bswap.
///
/// \returns true if \p N matched one of the two forms.
static bool isBSwapHWordPair(SDValue N, MutableArrayRef<SDNode *> Parts) {
  if (N.getOpcode() == ISD::OR)
    return isBSwapHWordElement(N.getOperand(0), Parts) &&
           isBSwapHWordElement(N.getOperand(1), Parts);

  if (N.getOpcode() == ISD::SRL && N.getOperand(0).getOpcode() == ISD::BSWAP) {
    // The shift amount must be the constant (or constant splat) 16.
    ConstantSDNode *C = isConstOrConstSplat(N.getOperand(1));
    if (!C || C->getAPIntValue() != 16)
      return false;
    Parts[0] = Parts[1] = N.getOperand(0).getOperand(0).getNode();
    return true;
  }

  return false;
}
/// Match this pattern:
///   (or (and (shl (A, 8)), 0xff00ff00), (and (srl (A, 8)), 0x00ff00ff))
/// And rewrite this to:
///   (rotr (bswap A), 16)
///
/// \p N must be an i32 ISD::OR (asserted). \p N0 is tested as the shl/0xff00ff00
/// half and \p N1 as the srl/0x00ff00ff half; callers that want the commuted
/// form must call again with the operands swapped. \p ShiftAmountTy is the
/// type used for the rotate amount constant.
///
/// \returns the rotr node, or an empty SDValue if ROTR is not legal/custom
/// for \p VT or the pattern does not match.
static SDValue matchBSwapHWordOrAndAnd(const TargetLowering &TLI,
                                       SelectionDAG &DAG, SDNode *N, SDValue N0,
                                       SDValue N1, EVT VT, EVT ShiftAmountTy) {
  assert(N->getOpcode() == ISD::OR && VT == MVT::i32 &&
         "MatchBSwapHWordOrAndAnd: expecting i32");
  if (!TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
    return SDValue();
  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
    return SDValue();
  // TODO: this is too restrictive; lifting this restriction requires more tests
  if (!N0->hasOneUse() || !N1->hasOneUse())
    return SDValue();

  // Both AND masks must be the expected byte-lane constants.
  ConstantSDNode *Mask0 = isConstOrConstSplat(N0.getOperand(1));
  ConstantSDNode *Mask1 = isConstOrConstSplat(N1.getOperand(1));
  if (!Mask0 || !Mask1)
    return SDValue();
  if (Mask0->getAPIntValue() != 0xff00ff00 ||
      Mask1->getAPIntValue() != 0x00ff00ff)
    return SDValue();

  // The masked values must be a shl and an srl, each by the constant 8.
  SDValue Shift0 = N0.getOperand(0);
  SDValue Shift1 = N1.getOperand(0);
  if (Shift0.getOpcode() != ISD::SHL || Shift1.getOpcode() != ISD::SRL)
    return SDValue();
  ConstantSDNode *ShiftAmt0 = isConstOrConstSplat(Shift0.getOperand(1));
  ConstantSDNode *ShiftAmt1 = isConstOrConstSplat(Shift1.getOperand(1));
  if (!ShiftAmt0 || !ShiftAmt1)
    return SDValue();
  if (ShiftAmt0->getAPIntValue() != 8 || ShiftAmt1->getAPIntValue() != 8)
    return SDValue();

  // Both shifts must read the same source value.
  if (Shift0.getOperand(0) != Shift1.getOperand(0))
    return SDValue();

  SDLoc DL(N);
  SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, Shift0.getOperand(0));
  SDValue ShAmt = DAG.getConstant(16, DL, ShiftAmountTy);
  return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
}
/// Match a 32-bit packed halfword bswap. That is
/// ((x & 0x000000ff) << 8) |
/// ((x & 0x0000ff00) >> 8) |
/// ((x & 0x00ff0000) << 8) |
/// ((x & 0xff000000) >> 8)
/// => (rotl (bswap x), 16)
///
/// \p N is the OR being combined and \p N0 / \p N1 are its operands. Returns
/// the replacement value, or an empty SDValue when nothing matched, the type
/// is not i32, or BSWAP is not legal/custom for it.
///
/// NOTE(review): this copy contains no brace-only lines, and two calls below
/// are cut off before their final argument — confirm against upstream.
SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
if (!LegalOperations)
return SDValue();
EVT VT = N->getValueType(0);
if (VT != MVT::i32)
return SDValue();
if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
return SDValue();
// First try the two-AND form handled by matchBSwapHWordOrAndAnd.
// NOTE(review): the last argument (the shift-amount type) and the closing
// parentheses of this call are not visible in this copy.
if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N0, N1, VT,
return BSwap;
// Try again with commuted operands.
// NOTE(review): same truncation as the call above.
if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N1, N0, VT,
return BSwap;
// Look for either
// (or (bswaphpair), (bswaphpair))
// (or (or (bswaphpair), (and)), (and))
// (or (or (and), (bswaphpair)), (and))
// Parts[i] collects the source node found for byte position i.
SDNode *Parts[4] = {};
if (isBSwapHWordPair(N0, Parts)) {
// (or (or (and), (and)), (or (and), (and)))
if (!isBSwapHWordPair(N1, Parts))
return SDValue();
} else if (N0.getOpcode() == ISD::OR) {
// (or (or (or (and), (and)), (and)), (and))
if (!isBSwapHWordElement(N1, Parts))
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
// The inner OR may hold the single element and the pair in either order.
if (!(isBSwapHWordElement(N01, Parts) && isBSwapHWordPair(N00, Parts)) &&
!(isBSwapHWordElement(N00, Parts) && isBSwapHWordPair(N01, Parts)))
return SDValue();
} else
return SDValue();
// Make sure the parts are all coming from the same node.
if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3])
return SDValue();
SDLoc DL(N);
SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT,
SDValue(Parts[0], 0));
// Result of the bswap should be rotated by 16. If it's not legal, then
// do (x << 16) | (x >> 16).
SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT));
// ROTL is preferred over ROTR when both are available.
if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
return DAG.getNode(ISD::ROTL, DL, VT, BSwap, ShAmt);
if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
return DAG.getNode(ISD::OR, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, BSwap, ShAmt),
DAG.getNode(ISD::SRL, DL, VT, BSwap, ShAmt));
/// This contains all DAGCombine rules which reduce two values combined by
/// an Or operation to a single value \see visitANDLike().
///
/// \p N0 and \p N1 are the two values being OR'ed and \p N supplies the debug
/// location. Returns the folded value, or an empty SDValue if no rule applied.
///
/// NOTE(review): this copy contains no brace-only lines, so none of the
/// scopes opened below are visibly closed — confirm against upstream.
SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
EVT VT = N1.getValueType();
SDLoc DL(N);
// fold (or x, undef) -> -1
if (!LegalOperations && (N0.isUndef() || N1.isUndef()))
return DAG.getAllOnesConstant(DL, VT);
if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL))
return V;
// (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible.
if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
// Don't increase # computations.
(N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
// We can only do this xform if we know that bits from X that are set in C2
// but not in C1 are already zero. Likewise for Y.
if (const ConstantSDNode *N0O1C =
getAsNonOpaqueConstant(N0.getOperand(1))) {
if (const ConstantSDNode *N1O1C =
getAsNonOpaqueConstant(N1.getOperand(1))) {
// We can only do this xform if we know that bits from X that are set in
// C2 but not in C1 are already zero. Likewise for Y.
const APInt &LHSMask = N0O1C->getAPIntValue();
const APInt &RHSMask = N1O1C->getAPIntValue();
if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) &&
DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
// C3 is the union of the two masks.
SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
N0.getOperand(0), N1.getOperand(0));
return DAG.getNode(ISD::AND, DL, VT, X,
DAG.getConstant(LHSMask | RHSMask, DL, VT));
// (or (and X, M), (and X, N)) -> (and X, (or M, N))
if (N0.getOpcode() == ISD::AND &&
N1.getOpcode() == ISD::AND &&
N0.getOperand(0) == N1.getOperand(0) &&
// Don't increase # computations.
(N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
N0.getOperand(1), N1.getOperand(1));
return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X);
return SDValue();
/// OR combines for which the commuted variant will be tried as well.
///
/// Only \p N0 is inspected as the AND side; \p N1 is the plain operand. The
/// caller is expected to invoke this a second time with the operands swapped.
/// \p N supplies the debug location for the new node.
///
/// \returns the simplified OR, or an empty SDValue if \p N0 is not an AND with
/// a bitwise-not of \p N1 as one of its operands.
static SDValue visitORCommutative(
    SelectionDAG &DAG, SDValue N0, SDValue N1, SDNode *N) {
  EVT VT = N0.getValueType();
  if (N0.getOpcode() == ISD::AND) {
    // fold (or (and X, (xor Y, -1)), Y) -> (or X, Y)
    if (isBitwiseNot(N0.getOperand(1)) && N0.getOperand(1).getOperand(0) == N1)
      return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(0), N1);

    // fold (or (and (xor Y, -1), X), Y) -> (or X, Y)
    if (isBitwiseNot(N0.getOperand(0)) && N0.getOperand(0).getOperand(0) == N1)
      return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(1), N1);
  }

  return SDValue();
}
/// Combine an ISD::OR node \p N. The folds below are tried in order and the
/// first one that produces a value wins. Returns SDValue(N, 0) when N was
/// simplified in place through SimplifyDemandedBits, and an empty SDValue when
/// nothing applied.
///
/// NOTE(review): this copy contains no brace-only lines (and appears to have
/// lost other single-token lines such as `continue;`/`break;`), so scopes
/// below are not visibly closed — confirm against upstream.
SDValue DAGCombiner::visitOR(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N1.getValueType();
// x | x --> x
if (N0 == N1)
return N0;
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (or x, 0) -> x, vector edition
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return N1;
if (ISD::isBuildVectorAllZeros(N1.getNode()))
return N0;
// fold (or x, -1) -> -1, vector edition
if (ISD::isBuildVectorAllOnes(N0.getNode()))
// do not return N0, because undef node may exist in N0
return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType());
if (ISD::isBuildVectorAllOnes(N1.getNode()))
// do not return N1, because undef node may exist in N1
return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());
// fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
// Do this only if the resulting shuffle is legal.
if (isa<ShuffleVectorSDNode>(N0) &&
isa<ShuffleVectorSDNode>(N1) &&
// Avoid folding a node with illegal type.
TLI.isTypeLegal(VT)) {
// Record which input of each shuffle is the all-zeros vector.
bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode());
bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode());
bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
// Ensure both shuffles have a zero input.
if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {
assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
bool CanFold = true;
int NumElts = VT.getVectorNumElements();
SmallVector<int, 4> Mask(NumElts);
// Build the combined mask one lane at a time.
for (int i = 0; i != NumElts; ++i) {
int M0 = SV0->getMaskElt(i);
int M1 = SV1->getMaskElt(i);
// Determine if either index is pointing to a zero vector.
bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts));
bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts));
// If one element is zero and the otherside is undef, keep undef.
// This also handles the case that both are undef.
if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) {
// NOTE(review): presumably followed by `continue;` in upstream.
Mask[i] = -1;
// Make sure only one of the elements is zero.
if (M0Zero == M1Zero) {
// NOTE(review): presumably followed by `break;` in upstream.
CanFold = false;
assert((M0 >= 0 || M1 >= 0) && "Undef index!");
// We have a zero and non-zero element. If the non-zero came from
// SV0 make the index a LHS index. If it came from SV1, make it
// a RHS index. We need to mod by NumElts because we don't care
// which operand it came from in the original shuffles.
Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts;
if (CanFold) {
// Use the non-zero input of each original shuffle as the new operands.
SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0);
SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0);
SDValue LegalShuffle =
TLI.buildLegalVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS,
Mask, DAG);
if (LegalShuffle)
return LegalShuffle;
// fold (or c1, c2) -> c1|c2
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1}))
return C;
// canonicalize constant to RHS
// NOTE(review): the second half of this condition is not visible in this
// copy (presumably a check that N1 is not itself a constant) — confirm
// against upstream.
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);
// fold (or x, 0) -> x
if (isNullConstant(N1))
return N0;
// fold (or x, -1) -> -1
if (isAllOnesConstant(N1))
return N1;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// fold (or x, c) -> c iff (x & ~c) == 0
if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
return N1;
if (SDValue Combined = visitORLike(N0, N1, N))
return Combined;
if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
return Combined;
// Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
return BSwap;
if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1))
return BSwap;
// reassociate or
if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
return ROR;
// Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
// iff (c1 & c2) != 0 or c1/c2 are undef.
// NOTE(review): the `};` terminating this lambda is not visible in this copy.
auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) {
return !C1 || !C2 || C1->getAPIntValue().intersects(C2->getAPIntValue());
if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect, true)) {
// COR is the constant-folded c1|c2; the rewrite is skipped if it cannot
// be folded.
if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT,
{N1, N0.getOperand(1)})) {
SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);
return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR);
// Try the commutative helper with both operand orders.
if (SDValue Combined = visitORCommutative(DAG, N0, N1, N))
return Combined;
if (SDValue Combined = visitORCommutative(DAG, N1, N0, N))
return Combined;
// Simplify: (or (op x...), (op y...)) -> (op (or x, y))
if (N0.getOpcode() == N1.getOpcode())
if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
return V;
// See if this is some rotate idiom.
if (SDValue Rot = MatchRotate(N0, N1, SDLoc(N)))
return Rot;
if (SDValue Load = MatchLoadCombine(N))
return Load;
// Simplify the operands using demanded-bits information.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// If OR can be rewritten into ADD, try combines based on ADD.
if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) &&
DAG.haveNoCommonBitsSet(N0, N1))
if (SDValue Combined = visitADDLike(N))
return Combined;
return SDValue();
/// If \p Op is an AND whose second operand is a constant integer (scalar or
/// build-vector), store that operand in \p Mask and return the first operand.
/// Otherwise return \p Op unchanged and leave \p Mask untouched.
static SDValue stripConstantMask(SelectionDAG &DAG, SDValue Op, SDValue &Mask) {
  if (Op.getOpcode() == ISD::AND &&
      DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
    Mask = Op.getOperand(1);
    return Op.getOperand(0);
  }
  return Op;
}
/// Match "(X shl/srl V1) & V2" where V2 may not be present.
///
/// Any constant AND mask on \p Op is stripped first and reported through
/// \p Mask. If what remains is an SHL or SRL node it is stored in \p Shift.
///
/// \returns true if a shift was found; \p Shift is left untouched otherwise
/// (although \p Mask may still have been set).
static bool matchRotateHalf(SelectionDAG &DAG, SDValue Op, SDValue &Shift,
                            SDValue &Mask) {
  Op = stripConstantMask(DAG, Op, Mask);
  if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) {
    Shift = Op;
    return true;
  }
  return false;
}
/// Helper function for visitOR to extract the needed side of a rotate idiom
/// from a shl/srl/mul/udiv. This is meant to handle cases where
/// InstCombine merged some outside op with one of the shifts from
/// the rotate pattern.
/// \returns An empty \c SDValue if the needed shift couldn't be extracted.
/// Otherwise, returns an expansion of \p ExtractFrom based on the following
/// patterns:
/// (or (add v v) (shrl v bitwidth-1)):
/// expands (add v v) -> (shl v 1)
/// (or (mul v c0) (shrl (mul v c1) c2)):
/// expands (mul v c0) -> (shl (mul v c1) c3)
/// (or (udiv v c0) (shl (udiv v c1) c2)):
/// expands (udiv v c0) -> (shrl (udiv v c1) c3)
/// (or (shl v c0) (shrl (shl v c1) c2)):
/// expands (shl v c0) -> (shl (shl v c1) c3)
/// (or (shrl v c0) (shl (shrl v c1) c2)):
/// expands (shrl v c0) -> (shrl (shrl v c1) c3)
/// Such that in all cases, c3+c2==bitwidth(op v c1).
///
/// \p OppShift is the shift already matched on the other side of the OR.
/// Any constant AND mask on \p ExtractFrom is stripped and reported through
/// \p Mask. \p DL is the debug location used for new nodes.
///
/// NOTE(review): this copy contains no brace-only lines and several wrapped
/// expressions below are cut off — confirm against upstream.
static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
SDValue ExtractFrom, SDValue &Mask,
const SDLoc &DL) {
assert(OppShift && ExtractFrom && "Empty SDValue");
// NOTE(review): the next two lines look like the body of a second assert
// whose opening `assert(` line is not visible in this copy.
(OppShift.getOpcode() == ISD::SHL || OppShift.getOpcode() == ISD::SRL) &&
"Existing shift must be valid as a rotate half");
ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask);
// Value and Type of the shift.
SDValue OppShiftLHS = OppShift.getOperand(0);
EVT ShiftedVT = OppShiftLHS.getValueType();
// Amount of the existing shift.
ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1));
// (add v v) -> (shl v 1)
// TODO: Should this be a general DAG canonicalization?
if (OppShift.getOpcode() == ISD::SRL && OppShiftCst &&
ExtractFrom.getOpcode() == ISD::ADD &&
ExtractFrom.getOperand(0) == ExtractFrom.getOperand(1) &&
ExtractFrom.getOperand(0) == OppShiftLHS &&
OppShiftCst->getAPIntValue() == ShiftedVT.getScalarSizeInBits() - 1)
return DAG.getNode(ISD::SHL, DL, ShiftedVT, OppShiftLHS,
DAG.getShiftAmountConstant(1, ShiftedVT, DL));
// Preconditions:
// (or (op0 v c0) (shiftl/r (op0 v c1) c2))
// Find opcode of the needed shift to be extracted from (op0 v c0).
unsigned Opcode = ISD::DELETED_NODE;
bool IsMulOrDiv = false;
// Set Opcode and IsMulOrDiv if the extract opcode matches the needed shift
// opcode or its arithmetic (mul or udiv) variant.
// NOTE(review): the `};` terminating this lambda is not visible in this copy.
auto SelectOpcode = [&](unsigned NeededShift, unsigned MulOrDivVariant) {
IsMulOrDiv = ExtractFrom.getOpcode() == MulOrDivVariant;
if (!IsMulOrDiv && ExtractFrom.getOpcode() != NeededShift)
return false;
Opcode = NeededShift;
return true;
// op0 must be either the needed shift opcode or the mul/udiv equivalent
// that the needed shift can be extracted from.
// An existing SRL needs a SHL (or MUL); an existing SHL needs an SRL (or
// UDIV).
if ((OppShift.getOpcode() != ISD::SRL || !SelectOpcode(ISD::SHL, ISD::MUL)) &&
(OppShift.getOpcode() != ISD::SHL || !SelectOpcode(ISD::SRL, ISD::UDIV)))
return SDValue();
// op0 must be the same opcode on both sides, have the same LHS argument,
// and produce the same value type.
if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() ||
OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) ||
ShiftedVT != ExtractFrom.getValueType())
return SDValue();
// Constant mul/udiv/shift amount from the RHS of the shift's LHS op.
ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1));
// Constant mul/udiv/shift amount from the RHS of the ExtractFrom op.
// NOTE(review): the initializer of ExtractFromCst is not visible in this copy
// (presumably the constant/splat on ExtractFrom's operand 1) — confirm
// against upstream.
ConstantSDNode *ExtractFromCst =
// TODO: We should be able to handle non-uniform constant vectors for these values
// Check that we have constant values.
// All three constants must be present and non-zero.
if (!OppShiftCst || !OppShiftCst->getAPIntValue() ||
!OppLHSCst || !OppLHSCst->getAPIntValue() ||
!ExtractFromCst || !ExtractFromCst->getAPIntValue())
return SDValue();
// Compute the shift amount we need to extract to complete the rotate.
const unsigned VTWidth = ShiftedVT.getScalarSizeInBits();
if (OppShiftCst->getAPIntValue().ugt(VTWidth))
return SDValue();
APInt NeededShiftAmt = VTWidth - OppShiftCst->getAPIntValue();
// Normalize the bitwidth of the two mul/udiv/shift constant operands.
APInt ExtractFromAmt = ExtractFromCst->getAPIntValue();
APInt OppLHSAmt = OppLHSCst->getAPIntValue();
zeroExtendToMatch(ExtractFromAmt, OppLHSAmt);
// Now try extract the needed shift from the ExtractFrom op and see if the
// result matches up with the existing shift's LHS op.
if (IsMulOrDiv) {
// Op to extract from is a mul or udiv by a constant.
// Check:
// c2 / (1 << (bitwidth(op0 v c0) - c1)) == c0
// c2 % (1 << (bitwidth(op0 v c0) - c1)) == 0
// NOTE(review): the second argument of getOneBitSet (the bit position) is not
// visible in this copy — confirm against upstream.
const APInt ExtractDiv = APInt::getOneBitSet(ExtractFromAmt.getBitWidth(),
APInt ResultAmt;
APInt Rem;
APInt::udivrem(ExtractFromAmt, ExtractDiv, ResultAmt, Rem);
if (Rem != 0 || ResultAmt != OppLHSAmt)
return SDValue();
} else {
// Op to extract from is a shift by a constant.
// Check:
// c2 - (bitwidth(op0 v c0) - c1) == c0
// NOTE(review): the argument of zextOrTrunc and the end of this condition are
// not visible in this copy — confirm against upstream.
if (OppLHSAmt != ExtractFromAmt - NeededShiftAmt.zextOrTrunc(
return SDValue();
// Return the expanded shift op that should allow a rotate to be formed.
EVT ShiftVT = OppShift.getOperand(1).getValueType();
EVT ResVT = ExtractFrom.getValueType();
SDValue NewShiftNode = DAG.getConstant(NeededShiftAmt, DL, ShiftVT);
return DAG.getNode(Opcode, DL, ResVT, OppShiftLHS, NewShiftNode);
// Return true if we can prove that, whenever Neg and Pos are both in the
// range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that
// for two opposing shifts shift1 and shift2 and a value X with OpBits bits:
// (or (shift1 X, Neg), (shift2 X, Pos))
// reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
// in direction shift1 by Neg. The range [0, EltSize) means that we only need
// to consider shift amounts with defined behavior.
//
// NOTE(review): this copy contains no brace-only lines and a few wrapped
// expressions below are cut off — confirm against upstream.
static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
SelectionDAG &DAG) {
// If EltSize is a power of 2 then:
// (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
// (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize).
// So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check
// for the stronger condition:
// Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A]
// for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1)
// we can just replace Neg with Neg' for the rest of the function.
// In other cases we check for the even stronger condition:
// Neg == EltSize - Pos [B]
// for all Neg and Pos. Note that the (or ...) then invokes undefined
// behavior if Pos == 0 (and consequently Neg == EltSize).
// We could actually use [A] whenever EltSize is a power of 2, but the
// only extra cases that it would match are those uninteresting ones
// where Neg and Pos are never in range at the same time. E.g. for
// EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos)
// as well as (sub 32, Pos), but:
// (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos))
// always invokes undefined behavior for 32-bit X.
// Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise.
// MaskLoBits stays 0 for condition [B] and becomes log2(EltSize) for [A].
unsigned MaskLoBits = 0;
if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) {
KnownBits Known = DAG.computeKnownBits(Neg.getOperand(0));
unsigned Bits = Log2_64(EltSize);
// Accept the mask if it has no bits above the low `Bits` and, together with
// the bits already known to be zero, covers all of the low `Bits`.
if (NegC->getAPIntValue().getActiveBits() <= Bits &&
((NegC->getAPIntValue() | Known.Zero).countTrailingOnes() >= Bits)) {
Neg = Neg.getOperand(0);
MaskLoBits = Bits;
// Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1.
if (Neg.getOpcode() != ISD::SUB)
return false;
ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0));
if (!NegC)
return false;
SDValue NegOp1 = Neg.getOperand(1);
// On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with
// Pos'. The truncation is redundant for the purpose of the equality.
if (MaskLoBits && Pos.getOpcode() == ISD::AND) {
if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) {
KnownBits Known = DAG.computeKnownBits(Pos.getOperand(0));
// NOTE(review): the right-hand side of the `>=` below is not visible in this
// copy (presumably MaskLoBits, mirroring the Neg check above) — confirm
// against upstream.
if (PosC->getAPIntValue().getActiveBits() <= MaskLoBits &&
((PosC->getAPIntValue() | Known.Zero).countTrailingOnes() >=
Pos = Pos.getOperand(0);
// The condition we need is now:
// (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask
// If NegOp1 == Pos then we need:
// EltSize & Mask == NegC & Mask
// (because "x & Mask" is a truncation and distributes through subtraction).
// We also need to account for a potential truncation of NegOp1 if the amount
// has already been legalized to a shift amount type.
APInt Width;
if ((Pos == NegOp1) ||
(NegOp1.getOpcode() == ISD::TRUNCATE && Pos == NegOp1.getOperand(0)))
Width = NegC->getAPIntValue();
// Check for cases where Pos has the form (add NegOp1, PosC) for some PosC.
// Then the condition we want to prove becomes:
// (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask
// which, again because "x & Mask" is a truncation, becomes:
// NegC & Mask == (EltSize - PosC) & Mask
// EltSize & Mask == (NegC + PosC) & Mask
else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) {
// NOTE(review): an `else` presumably separates the assignment and the
// `return false;` below; as shown the return would be unconditional.
if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
Width = PosC->getAPIntValue() + NegC->getAPIntValue();
return false;
} else
return false;
// Now we just need to check that EltSize & Mask == Width & Mask.
if (MaskLoBits)
// EltSize & Mask is 0 since Mask is EltSize - 1.
return Width.getLoBits(MaskLoBits) == 0;
return Width == EltSize;
// A subroutine of MatchRotate used once we have found an OR of two opposite
// shifts of Shifted.  If Neg == <operand size> - Pos then the OR reduces
// to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the
// former being preferred if supported.  InnerPos and InnerNeg are Pos and
// Neg with outer conversions stripped away.
//
// Returns the rotate node, or an empty SDValue when matchRotateSub cannot
// prove the relationship between InnerPos and InnerNeg.
SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
                                       SDValue Neg, SDValue InnerPos,
                                       SDValue InnerNeg, unsigned PosOpcode,
                                       unsigned NegOpcode, const SDLoc &DL) {
  // fold (or (shl x, (*ext y)),
  //          (srl x, (*ext (sub 32, y)))) ->
  //   (rotl x, y) or (rotr x, (sub 32, y))
  //
  // fold (or (shl x, (*ext (sub 32, y))),
  //          (srl x, (*ext y))) ->
  //   (rotr x, y) or (rotl x, (sub 32, y))
  EVT VT = Shifted.getValueType();
  if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) {
    // Fall back to the opposite direction when PosOpcode is not available.
    bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
    return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
                       HasPos ? Pos : Neg);
  }

  return SDValue();
}
// A subroutine of MatchRotate used once we have found an OR of two opposite
// shifts of N0 + N1.  If Neg == <operand size> - Pos then the OR reduces
// to both (PosOpcode N0, N1, Pos) and (NegOpcode N0, N1, Neg), with the
// former being preferred if supported.  InnerPos and InnerNeg are Pos and
// Neg with outer conversions stripped away.
//
// When PosOpcode is ISD::FSHL and the element width is a power of two, a few
// shift+xor forms of the funnel idiom are recognised as well. Returns an
// empty SDValue when nothing matches.
// TODO: Merge with MatchRotatePosNeg.
SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
                                       SDValue Neg, SDValue InnerPos,
                                       SDValue InnerNeg, unsigned PosOpcode,
                                       unsigned NegOpcode, const SDLoc &DL) {
  EVT VT = N0.getValueType();
  unsigned EltBits = VT.getScalarSizeInBits();

  // fold (or (shl x0, (*ext y)),
  //          (srl x1, (*ext (sub 32, y)))) ->
  //   (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
  //
  // fold (or (shl x0, (*ext (sub 32, y))),
  //          (srl x1, (*ext y))) ->
  //   (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
  if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG)) {
    // Fall back to the opposite direction when PosOpcode is not available.
    bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
    return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
                       HasPos ? Pos : Neg);
  }

  // Matching the shift+xor cases, we can't easily use the xor'd shift amount
  // so for now just use the PosOpcode case if its legal.
  // TODO: When can we use the NegOpcode case?
  if (PosOpcode == ISD::FSHL && isPowerOf2_32(EltBits)) {
    // True if Op is a BinOpc node whose second operand is the constant (or
    // constant splat) Imm.
    auto IsBinOpImm = [](SDValue Op, unsigned BinOpc, unsigned Imm) {
      if (Op.getOpcode() != BinOpc)
        return false;
      ConstantSDNode *Cst = isConstOrConstSplat(Op.getOperand(1));
      return Cst && (Cst->getAPIntValue() == Imm);
    };

    // fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
    //   -> (fshl x0, x1, y)
    if (IsBinOpImm(N1, ISD::SRL, 1) &&
        IsBinOpImm(InnerNeg, ISD::XOR, EltBits - 1) &&
        InnerPos == InnerNeg.getOperand(0) &&
        TLI.isOperationLegalOrCustom(ISD::FSHL, VT)) {
      return DAG.getNode(ISD::FSHL, DL, VT, N0, N1.getOperand(0), Pos);
    }

    // fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
    //   -> (fshr x0, x1, y)
    if (IsBinOpImm(N0, ISD::SHL, 1) &&
        IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
        InnerNeg == InnerPos.getOperand(0) &&
        TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
      return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
    }

    // fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y))
    //   -> (fshr x0, x1, y)
    // TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization?
    if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1) &&
        IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
        InnerNeg == InnerPos.getOperand(0) &&
        TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
      return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
    }
  }

  return SDValue();
}
// MatchRotate - Handle an 'or' of two operands. If this is one of the many
// idioms for rotate, and if the target supports rotation instructions, generate
// a rot[lr]. This also matches funnel shift patterns, similar to rotation but
// with different shifted sources.
//
// LHS/RHS are the two 'or' operands and DL is the location given to the
// rotate/funnel-shift nodes built below. An empty SDValue is returned whenever
// the pattern is not recognized or the target lacks the needed operation.
//
// NOTE(review): this view of the function looks truncated in places (no
// closing braces, and the two MatchRotatePosNeg calls end in a dangling comma)
// - confirm the exact control flow against the complete source.
SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
// Must be a legal type. Expanded 'n promoted things won't work with rotates.
EVT VT = LHS.getValueType();
if (!TLI.isTypeLegal(VT))
return SDValue();
// The target must have at least one rotate/funnel flavor.
bool HasROTL = hasOperation(ISD::ROTL, VT);
bool HasROTR = hasOperation(ISD::ROTR, VT);
bool HasFSHL = hasOperation(ISD::FSHL, VT);
bool HasFSHR = hasOperation(ISD::FSHR, VT);
if (!HasROTL && !HasROTR && !HasFSHL && !HasFSHR)
return SDValue();
// Check for truncated rotate.
// Both sides are truncates from the same wider type: try to match on the
// untruncated operands and truncate the resulting rotate back to VT.
if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
assert(LHS.getValueType() == RHS.getValueType());
if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot);
// Match "(X shl/srl V1) & V2" where V2 may not be present.
SDValue LHSShift; // The shift.
SDValue LHSMask; // AND value if any.
matchRotateHalf(DAG, LHS, LHSShift, LHSMask);
SDValue RHSShift; // The shift.
SDValue RHSMask; // AND value if any.
matchRotateHalf(DAG, RHS, RHSShift, RHSMask);
// If neither side matched a rotate half, bail
if (!LHSShift && !RHSShift)
return SDValue();
// InstCombine may have combined a constant shl, srl, mul, or udiv with one
// side of the rotate, so try to handle that here. In all cases we need to
// pass the matched shift from the opposite side to compute the opcode and
// needed shift amount to extract. We still want to do this if both sides
// matched a rotate half because one half may be a potential overshift that
// can be broken down (ie if InstCombine merged two shl or srl ops into a
// single one).
// Have LHS side of the rotate, try to extract the needed shift from the RHS.
if (LHSShift)
if (SDValue NewRHSShift =
extractShiftForRotate(DAG, LHSShift, RHS, RHSMask, DL))
RHSShift = NewRHSShift;
// Have RHS side of the rotate, try to extract the needed shift from the LHS.
if (RHSShift)
if (SDValue NewLHSShift =
extractShiftForRotate(DAG, RHSShift, LHS, LHSMask, DL))
LHSShift = NewLHSShift;
// If a side is still missing, nothing else we can do.
if (!RHSShift || !LHSShift)
return SDValue();
// At this point we've matched or extracted a shift op on each side.
if (LHSShift.getOpcode() == RHSShift.getOpcode())
return SDValue(); // Shifts must disagree.
// A rotate shifts the same value on both sides; with two different sources
// the pattern can only become a funnel shift.
bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0);
if (!IsRotate && !(HasFSHL || HasFSHR))
return SDValue(); // Requires funnel shift support.
// Canonicalize shl to left side in a shl/srl pair.
if (RHSShift.getOpcode() == ISD::SHL) {
std::swap(LHS, RHS);
std::swap(LHSShift, RHSShift);
std::swap(LHSMask, RHSMask);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
SDValue LHSShiftArg = LHSShift.getOperand(0);
SDValue LHSShiftAmt = LHSShift.getOperand(1);
SDValue RHSShiftArg = RHSShift.getOperand(0);
SDValue RHSShiftAmt = RHSShift.getOperand(1);
// fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
// fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
// fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1)
// fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2)
// iff C1+C2 == EltSizeInBits
// Predicate for matchBinaryPredicate: true when the two constant shift
// amounts add up to exactly the element width.
auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;
if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
SDValue Res;
// Prefer a rotate when both halves shift the same value; the left-flavored
// opcode is preferred and uses the shl amount, the right-flavored one uses
// the srl amount.
// NOTE(review): as shown, the funnel-shift assignment below unconditionally
// follows the rotate assignment; an `else` may be missing from this view -
// confirm.
if (IsRotate && (HasROTL || HasROTR))
Res = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
HasROTL ? LHSShiftAmt : RHSShiftAmt);
Res = DAG.getNode(HasFSHL ? ISD::FSHL : ISD::FSHR, DL, VT, LHSShiftArg,
RHSShiftArg, HasFSHL ? LHSShiftAmt : RHSShiftAmt);
// If there is an AND of either shifted operand, apply it to the result.
if (LHSMask.getNode() || RHSMask.getNode()) {
SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
SDValue Mask = AllOnes;
// Each half's mask is OR-ed with the bit positions the opposite half can
// populate, so that masking one half does not clear bits of the other.
if (LHSMask.getNode()) {
SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt);
Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits));
if (RHSMask.getNode()) {
SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt);
Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits));
Res = DAG.getNode(ISD::AND, DL, VT, Res, Mask);
return Res;
// If there is a mask here, and we have a variable shift, we can't be sure
// that we're masking out the right stuff.
if (LHSMask.getNode() || RHSMask.getNode())
return SDValue();
// If the shift amount is sign/zext/any-extended just peel it off.
// (TRUNCATE is peeled as well; both amounts must be one of these casts.)
SDValue LExtOp0 = LHSShiftAmt;
SDValue RExtOp0 = RHSShiftAmt;
if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
(RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
LExtOp0 = LHSShiftAmt.getOperand(0);
RExtOp0 = RHSShiftAmt.getOperand(0);
// Variable shift amounts: try the positive/negative amount matchers, first
// with the shl side as the "positive" amount, then with the srl side.
// NOTE(review): the argument lists of both MatchRotatePosNeg calls are cut
// off in this view - confirm the remaining arguments.
if (IsRotate && (HasROTL || HasROTR)) {
SDValue TryL =
MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0,
if (TryL)
return TryL;
SDValue TryR =
MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0,
if (TryR)
return TryR;
SDValue TryL =
MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt,
LExtOp0, RExtOp0, ISD::FSHL, ISD::FSHR, DL);
if (TryL)
return TryL;
SDValue TryR =
MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
RExtOp0, LExtOp0, ISD::FSHR, ISD::FSHL, DL);
if (TryR)
return TryR;
return SDValue();
namespace {
/// Represents known origin of an individual byte in load combine pattern. The
/// value of the byte is either constant zero or comes from memory.
struct ByteProvider {
// For constant zero providers Load is set to nullptr. For memory providers
// Load represents the node which loads the byte from memory.
// ByteOffset is the offset of the byte in the value produced by the load.
LoadSDNode *Load = nullptr;
unsigned ByteOffset = 0;
ByteProvider() = default;
static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
return ByteProvider(Load, ByteOffset);
static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }
bool isConstantZero() const { return !Load; }
bool isMemory() const { return Load; }
bool operator==(const ByteProvider &Other) const {
return Other.Load == Load && Other.ByteOffset == ByteOffset;
ByteProvider(LoadSDNode *Load, unsigned ByteOffset)
: Load(Load), ByteOffset(ByteOffset) {}
} // end anonymous namespace
/// Recursively traverses the expression calculating the origin of the requested
/// byte of the given value. Returns None if the provider can't be calculated.
/// For all the values except the root of the expression verifies that the value
/// has exactly one use and if it's not true return None. This way if the origin
/// of the byte is returned it's guaranteed that the values which contribute to
/// the byte are not used outside of this expression.
/// Because the parts of the expression are not allowed to have more than one
/// use this function iterates over trees, not DAGs. So it never visits the same
/// node more than once.
///
/// \param Op    value whose byte is being traced; must be a scalar integer.
/// \param Index index of the requested byte within \p Op (asserted to be
///              smaller than the byte width of \p Op).
/// \param Depth current recursion depth; the search gives up at depth 10.
/// \param Root  true only for the outermost call, which is exempt from the
///              one-use requirement.
///
/// NOTE(review): closing braces and the case labels that introduce the
/// extension handling are not visible in this view - confirm against the
/// complete source.
static const Optional<ByteProvider>
calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
bool Root = false) {
// Typical i64 by i8 pattern requires recursion up to 8 calls depth
if (Depth == 10)
return None;
if (!Root && !Op.hasOneUse())
return None;
assert(Op.getValueType().isScalarInteger() && "can't handle other types");
unsigned BitWidth = Op.getValueSizeInBits();
// Only whole-byte values can be decomposed byte by byte.
if (BitWidth % 8 != 0)
return None;
unsigned ByteWidth = BitWidth / 8;
assert(Index < ByteWidth && "invalid index requested");
(void) ByteWidth;
switch (Op.getOpcode()) {
case ISD::OR: {
// The byte of an OR is known only when both sides are known and at least
// one of them contributes a constant zero; the other side then provides it.
auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
if (!LHS)
return None;
auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
if (!RHS)
return None;
if (LHS->isConstantZero())
return RHS;
if (RHS->isConstantZero())
return LHS;
return None;
case ISD::SHL: {
// Only constant shifts by a whole number of bytes are handled. Bytes below
// the shift amount are zero, the rest come from the shifted operand.
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!ShiftOp)
return None;
uint64_t BitShift = ShiftOp->getZExtValue();
if (BitShift % 8 != 0)
return None;
uint64_t ByteShift = BitShift / 8;
return Index < ByteShift
? ByteProvider::getConstantZero()
: calculateByteProvider(Op->getOperand(0), Index - ByteShift,
Depth + 1);
// Extension of a narrower value: bytes beyond the narrow width are zero only
// for ISD::ZERO_EXTEND and unknown otherwise; lower bytes come from the
// narrow operand.
SDValue NarrowOp = Op->getOperand(0);
unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
if (NarrowBitWidth % 8 != 0)
return None;
uint64_t NarrowByteWidth = NarrowBitWidth / 8;
if (Index >= NarrowByteWidth)
return Op.getOpcode() == ISD::ZERO_EXTEND
? Optional<ByteProvider>(ByteProvider::getConstantZero())
: None;
return calculateByteProvider(NarrowOp, Index, Depth + 1);
case ISD::BSWAP:
// A byte swap mirrors the byte index.
return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
Depth + 1);
case ISD::LOAD: {
// Only simple, unindexed loads are accepted as byte sources.
auto L = cast<LoadSDNode>(Op.getNode());
if (!L->isSimple() || L->isIndexed())
return None;
unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
if (NarrowBitWidth % 8 != 0)
return None;
uint64_t NarrowByteWidth = NarrowBitWidth / 8;
// Bytes past the memory width are zero for a zero-extending load and
// unknown for any other extension kind.
if (Index >= NarrowByteWidth)
return L->getExtensionType() == ISD::ZEXTLOAD
? Optional<ByteProvider>(ByteProvider::getConstantZero())
: None;
return ByteProvider::getMemory(L, Index);
return None;
/// Position of byte \p i when a \p BW-byte value is laid out little-endian:
/// the index is returned unchanged. \p BW does not influence the result; it is
/// accepted so the signature matches BigEndianByteAt.
static unsigned LittleEndianByteAt(unsigned BW, unsigned i) {
  (void)BW;
  return i;
}
/// Position of byte \p i when a \p BW-byte value is laid out big-endian: the
/// index is mirrored within the value (0 maps to BW - 1, BW - 1 maps to 0).
/// Callers are expected to pass i < BW; the arithmetic is unsigned and would
/// wrap otherwise.
static unsigned BigEndianByteAt(unsigned BW, unsigned i) {
  return BW - i - 1;
}
// Check if the bytes offsets we are looking at match with either big or
// little endian value loaded. Return true for big endian, false for little
// endian, and None if match failed.
//
// ByteOffsets[i] is the memory offset of byte i of the value and FirstOffset
// is subtracted from every entry before comparing it with the position the
// byte would have in a little-endian or big-endian layout.
static Optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets,
                                  int64_t FirstOffset) {
  // The endian can be decided only when it is 2 bytes at least.
  unsigned Width = ByteOffsets.size();
  if (Width < 2)
    return None;

  bool BigEndian = true, LittleEndian = true;
  for (unsigned i = 0; i < Width; i++) {
    int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
    LittleEndian &= CurrentByteOffset == LittleEndianByteAt(Width, i);
    BigEndian &= CurrentByteOffset == BigEndianByteAt(Width, i);
    // Stop as soon as neither layout can match any more.
    if (!BigEndian && !LittleEndian)
      return None;
  }

  // With at least two bytes the two layouts disagree on every position, so
  // exactly one of the flags can survive the loop.
  assert((BigEndian != LittleEndian) && "It should be either big endian or "
                                        "little endian");
  return BigEndian;
}
/// Look through any chain of truncate / sign-, zero- or any-extend nodes and
/// return the first operand that is none of those. Used to compare stored
/// values that differ only by such width changes.
static SDValue stripTruncAndExt(SDValue Value) {
  switch (Value.getOpcode()) {
  case ISD::TRUNCATE:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND:
    // Width-changing wrapper: keep peeling.
    return stripTruncAndExt(Value.getOperand(0));
  default:
    break;
  }
  return Value;
}
/// Match a pattern where a wide type scalar value is stored by several narrow
/// stores. Fold it into a single store or a BSWAP and a store if the targets
/// supports it.
///
/// Assuming little endian target:
/// i8 *p = ...
/// i32 val = ...
/// p[0] = (val >> 0) & 0xFF;
/// p[1] = (val >> 8) & 0xFF;
/// p[2] = (val >> 16) & 0xFF;
/// p[3] = (val >> 24) & 0xFF;
/// =>
/// *((i32)p) = val;
///
/// i8 *p = ...
/// i32 val = ...
/// p[0] = (val >> 24) & 0xFF;
/// p[1] = (val >> 16) & 0xFF;
/// p[2] = (val >> 8) & 0xFF;
/// p[3] = (val >> 0) & 0xFF;
/// =>
/// *((i32)p) = BSWAP(val);
///
/// \param N the store at which the walk up the store chain starts.
/// \returns the new wide store, or an empty SDValue if the pattern does not
/// match or the wide store is not allowed/fast on the target.
///
/// NOTE(review): closing braces are not visible in this view, nothing visible
/// appends to `Stores`, and one getNode call is cut off - confirm against the
/// complete source.
SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) {
// Collect all the stores in the chain.
SDValue Chain;
SmallVector<StoreSDNode *, 8> Stores;
// Walk from N through each store's chain operand for as long as the chain
// is itself a store; every visited store must be a simple, unindexed i8 store.
for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) {
// TODO: Allow unordered atomics when wider type is legal (see D66309)
if (Store->getMemoryVT() != MVT::i8 ||
!Store->isSimple() || Store->isIndexed())
return SDValue();
Chain = Store->getChain();
// Handle the simple type only.
// The combined type is as wide as all collected byte stores together and
// must be i16, i32 or i64.
unsigned Width = Stores.size();
EVT VT = EVT::getIntegerVT(
*DAG.getContext(), Width * N->getMemoryVT().getSizeInBits());
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
return SDValue();
if (LegalOperations && !TLI.isOperationLegal(ISD::STORE, VT))
return SDValue();
// Check if all the bytes of the combined value we are looking at are stored
// to the same base address. Collect bytes offsets from Base address into
// ByteOffsets.
SDValue CombinedValue;
// INT64_MAX marks a byte slot / first offset that has not been seen yet.
SmallVector<int64_t, 8> ByteOffsets(Width, INT64_MAX);
int64_t FirstOffset = INT64_MAX;
StoreSDNode *FirstStore = nullptr;
Optional<BaseIndexOffset> Base;
for (auto Store : Stores) {
// All the stores store different byte of the CombinedValue. A truncate is
// required to get that byte value.
SDValue Trunc = Store->getValue();
if (Trunc.getOpcode() != ISD::TRUNCATE)
return SDValue();
// A shift operation is required to get the right byte offset, except the
// first byte.
int64_t Offset = 0;
SDValue Value = Trunc.getOperand(0);
if (Value.getOpcode() == ISD::SRL ||
Value.getOpcode() == ISD::SRA) {
auto *ShiftOffset = dyn_cast<ConstantSDNode>(Value.getOperand(1));
// Trying to match the following pattern. The shift offset must be
// a constant and a multiple of 8. It is the byte offset in "y".
// x = srl y, offset
// i8 z = trunc x
// store z, ...
if (!ShiftOffset || (ShiftOffset->getSExtValue() % 8))
return SDValue();
Offset = ShiftOffset->getSExtValue()/8;
Value = Value.getOperand(0);
// Stores must share the same combined value with different offsets.
if (!CombinedValue)
CombinedValue = Value;
else if (stripTruncAndExt(CombinedValue) != stripTruncAndExt(Value))
return SDValue();
// The trunc and all the extend operation should be stripped to get the
// real value we are stored.
// Prefer a candidate that already has type VT, or otherwise the widest one
// seen so far.
else if (CombinedValue.getValueType() != VT) {
if (Value.getValueType() == VT ||
Value.getValueSizeInBits() > CombinedValue.getValueSizeInBits())
CombinedValue = Value;
// Give up if the combined value type is smaller than the store size.
if (CombinedValue.getValueSizeInBits() < VT.getSizeInBits())
return SDValue();
// Stores must share the same base address
BaseIndexOffset Ptr = BaseIndexOffset::match(Store, DAG);
int64_t ByteOffsetFromBase = 0;
if (!Base)
Base = Ptr;
else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
return SDValue();
// Remember the first byte store
// (the one with the smallest offset from the common base).
if (ByteOffsetFromBase < FirstOffset) {
FirstStore = Store;
FirstOffset = ByteOffsetFromBase;
// Map the offset in the store and the offset in the combined value, and
// early return if it has been set before.
if (Offset < 0 || Offset >= Width || ByteOffsets[Offset] != INT64_MAX)
return SDValue();
ByteOffsets[Offset] = ByteOffsetFromBase;
assert(FirstOffset != INT64_MAX && "First byte offset must be set");
assert(FirstStore && "First store must be set");
// Check if the bytes of the combined value we are looking at match with
// either big or little endian value store.
Optional<bool> IsBigEndian = isBigEndian(ByteOffsets, FirstOffset);
if (!IsBigEndian.hasValue())
return SDValue();
// The node we are looking at matches with the pattern, check if we can
// replace it with a single bswap if needed and store.
// If the store needs byte swap check if the target supports it
bool NeedsBswap = DAG.getDataLayout().isBigEndian() != *IsBigEndian;
// Before legalize we can introduce illegal bswaps which will be later
// converted to an explicit bswap sequence. This way we end up with a single
// store and byte shuffling instead of several stores and byte shuffling.
if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
return SDValue();
// Check that a store of the wide type is both allowed and fast on the target
bool Fast = false;
bool Allowed =
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
*FirstStore->getMemOperand(), &Fast);
if (!Allowed || !Fast)
return SDValue();
// A wider source value is truncated to the store type first.
// NOTE(review): the last argument of this getNode call is cut off in this
// view - confirm.
if (VT != CombinedValue.getValueType()) {
assert(CombinedValue.getValueType().getSizeInBits() > VT.getSizeInBits() &&
"Get unexpected store value to combine");
CombinedValue = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
if (NeedsBswap)
CombinedValue = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, CombinedValue);
// The wide store reuses the pointer, pointer info and alignment of the
// lowest-addressed byte store and is chained to `Chain` as left by the
// collection loop above.
SDValue NewStore =
DAG.getStore(Chain, SDLoc(N), CombinedValue, FirstStore->getBasePtr(),
FirstStore->getPointerInfo(), FirstStore->getAlignment());
// Rely on other DAG combine rules to remove the other individual stores.
DAG.ReplaceAllUsesWith(N, NewStore.getNode());
return NewStore;
/// Match a pattern where a wide type scalar value is loaded by several narrow
/// loads and combined by shifts and ors. Fold it into a single load or a load
/// and a BSWAP if the targets supports it.
///
/// Assuming little endian target:
/// i8 *a = ...
/// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
/// =>
/// i32 val = *((i32)a)
///
/// i8 *a = ...
/// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
/// =>
/// i32 val = BSWAP(*((i32)a))
///
/// TODO: This rule matches complex patterns with OR node roots and doesn't
/// interact well with the worklist mechanism. When a part of the pattern is
/// updated (e.g. one of the loads) its direct users are put into the worklist,
/// but the root node of the pattern which triggers the load combine is not
/// necessarily a direct user of the changed node. For example, once the address
/// of t28 load is reassociated load combine won't be triggered:
/// t25: i32 = add t4, Constant:i32<2>
/// t26: i64 = sign_extend t25
/// t27: i64 = add t2, t26
/// t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64
/// t29: i32 = zero_extend t28
/// t32: i32 = shl t29, Constant:i8<8>
/// t33: i32 = or t23, t32
/// As a possible fix visitLoad can check if the load can be a part of a load
/// combine pattern and add corresponding OR roots to the worklist.
///
/// \param N the OR node that roots the pattern (asserted).
/// \returns the combined (possibly zero-extended, shifted and byte-swapped)
/// load, or an empty SDValue if the pattern does not match.
///
/// NOTE(review): closing braces, the declaration of `MemVT`, and the tails of
/// several calls/expressions are not visible in this view - confirm against
/// the complete source.
SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
assert(N->getOpcode() == ISD::OR &&
"Can only match load combining against OR nodes");
// Handles simple types only
EVT VT = N->getValueType(0);
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
return SDValue();
unsigned ByteWidth = VT.getSizeInBits() / 8;
bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
// Translate a provider's byte index within the loaded value into a byte
// offset within the load's memory, according to the target endianness.
auto MemoryByteOffset = [&] (ByteProvider P) {
assert(P.isMemory() && "Must be a memory byte provider");
unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
assert(LoadBitWidth % 8 == 0 &&
"can only analyze providers for individual bytes not bit");
unsigned LoadByteWidth = LoadBitWidth / 8;
return IsBigEndianTarget
? BigEndianByteAt(LoadByteWidth, P.ByteOffset)
: LittleEndianByteAt(LoadByteWidth, P.ByteOffset);
Optional<BaseIndexOffset> Base;
SDValue Chain;
SmallPtrSet<LoadSDNode *, 8> Loads;
Optional<ByteProvider> FirstByteProvider;
// INT64_MAX means no byte offset has been recorded yet.
int64_t FirstOffset = INT64_MAX;
// Check if all the bytes of the OR we are looking at are loaded from the same
// base address. Collect bytes offsets from Base address in ByteOffsets.
SmallVector<int64_t, 8> ByteOffsets(ByteWidth);
unsigned ZeroExtendedBytes = 0;
// Bytes are visited from the most significant down so that leading
// constant-zero bytes can be counted before any memory byte is seen.
for (int i = ByteWidth - 1; i >= 0; --i) {
auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true);
if (!P)
return SDValue();
if (P->isConstantZero()) {
// It's OK for the N most significant bytes to be 0, we can just
// zero-extend the load.
if (++ZeroExtendedBytes != (ByteWidth - static_cast<unsigned>(i)))
return SDValue();
assert(P->isMemory() && "provenance should either be memory or zero");
LoadSDNode *L = P->Load;
assert(L->hasNUsesOfValue(1, 0) && L->isSimple() &&
!L->isIndexed() &&
"Must be enforced by calculateByteProvider");
assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
// All loads must share the same chain
SDValue LChain = L->getChain();
if (!Chain)
Chain = LChain;
else if (Chain != LChain)
return SDValue();
// Loads must share the same base address
BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
int64_t ByteOffsetFromBase = 0;
if (!Base)
Base = Ptr;
else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
return SDValue();
// Calculate the offset of the current byte from the base address
ByteOffsetFromBase += MemoryByteOffset(*P);
ByteOffsets[i] = ByteOffsetFromBase;
// Remember the first byte load
if (ByteOffsetFromBase < FirstOffset) {
FirstByteProvider = P;
FirstOffset = ByteOffsetFromBase;
assert(!Loads.empty() && "All the bytes of the value must be loaded from "
"memory, so there must be at least one load which produces the value");
assert(Base && "Base address of the accessed memory location must be set");
assert(FirstOffset != INT64_MAX && "First byte offset must be set");
bool NeedsZext = ZeroExtendedBytes > 0;
// The memory type covers only the bytes that actually come from memory.
// NOTE(review): the declaration receiving this EVT (used below as MemVT) is
// not visible in this view - confirm.
EVT::getIntegerVT(*DAG.getContext(), (ByteWidth - ZeroExtendedBytes) * 8);
if (!MemVT.isSimple())
return SDValue();
// Before legalize we can introduce too wide illegal loads which will be later
// split into legal sized loads. This enables us to combine i64 load by i8
// patterns to a couple of i32 loads on 32 bit targets.
if (LegalOperations &&
!TLI.isOperationLegal(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD,
return SDValue();
// Check if the bytes of the OR we are looking at match with either big or
// little endian value load
// (the zero-extended high bytes are dropped from the comparison).
Optional<bool> IsBigEndian = isBigEndian(
makeArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset);
if (!IsBigEndian.hasValue())
return SDValue();
assert(FirstByteProvider && "must be set");
// Ensure that the first byte is loaded from zero offset of the first load.
// So the combined value can be loaded from the first load address.
if (MemoryByteOffset(*FirstByteProvider) != 0)
return SDValue();
LoadSDNode *FirstLoad = FirstByteProvider->Load;
// The node we are looking at matches with the pattern, check if we can
// replace it with a single (possibly zero-extended) load and bswap + shift if
// needed.
// If the load needs byte swap check if the target supports it
bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;
// Before legalize we can introduce illegal bswaps which will be later
// converted to an explicit bswap sequence. This way we end up with a single
// load and byte shuffling instead of several loads and byte shuffling.
// We do not introduce illegal bswaps when zero-extending as this tends to
// introduce too many arithmetic instructions.
if (NeedsBswap && (LegalOperations || NeedsZext) &&
!TLI.isOperationLegal(ISD::BSWAP, VT))
return SDValue();
// If we need to bswap and zero extend, we have to insert a shift. Check that
// it is legal.
if (NeedsBswap && NeedsZext && LegalOperations &&
!TLI.isOperationLegal(ISD::SHL, VT))
return SDValue();
// Check that a load of the wide type is both allowed and fast on the target
bool Fast = false;
bool Allowed =
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
*FirstLoad->getMemOperand(), &Fast);
if (!Allowed || !Fast)
return SDValue();
// NOTE(review): the remaining arguments of this getExtLoad call are cut off
// in this view - confirm.
SDValue NewLoad = DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD,
SDLoc(N), VT, Chain, FirstLoad->getBasePtr(),
FirstLoad->getPointerInfo(), MemVT,
// Transfer chain users from old loads to the new load.
for (LoadSDNode *L : Loads)
DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));
if (!NeedsBswap)
return NewLoad;
// Shift the loaded bytes up by the number of zero-extended bytes before the
// byte swap.
// NOTE(review): the condition of this conditional expression is not visible
// in this view - confirm.
SDValue ShiftedLoad =
? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
DAG.getShiftAmountConstant(ZeroExtendedBytes * 8, VT,
SDLoc(N), LegalOperations))
: NewLoad;
return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
// If the target has andn, bsl, or a similar bit-select instruction,
// we want to unfold masked merge, with canonical pattern of:
//   |        A  |  |B|
//   ((x ^ y) & m) ^ y
//    |  D  |
// Into:
//   (x & m) | (y & ~m)
// If y is a constant, and the 'andn' does not work with immediates,
// we unfold into a different pattern:
//   ~(~x & m) & (m | y)
// NOTE: we don't unfold the pattern if 'xor' is actually a 'not', because at
//       the very least that breaks andnpd / andnps patterns, and because those
//       patterns are simplified in IR and shouldn't be created in the DAG
//
// N must be an ISD::XOR node (asserted). Returns the unfolded expression, or
// an empty SDValue when the pattern does not match or the target reports no
// and-not support for the operands involved.
SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) {
  assert(N->getOpcode() == ISD::XOR);

  // Don't touch 'not' (i.e. where y = -1).
  if (isAllOnesOrAllOnesSplat(N->getOperand(1)))
    return SDValue();

  EVT VT = N->getValueType(0);

  // There are 3 commutable operators in the pattern,
  // so we have to deal with 8 possible variants of the basic pattern.
  SDValue X, Y, M;
  // Try to read `And` as ((x ^ y) & m) with the inner xor at operand XorIdx
  // and `Other` as the outer y. Both the 'and' and the inner 'xor' must have a
  // single use. On success X, Y and M are filled in.
  auto matchAndXor = [&X, &Y, &M](SDValue And, unsigned XorIdx, SDValue Other) {
    if (And.getOpcode() != ISD::AND || !And.hasOneUse())
      return false;
    SDValue Xor = And.getOperand(XorIdx);
    if (Xor.getOpcode() != ISD::XOR || !Xor.hasOneUse())
      return false;
    SDValue Xor0 = Xor.getOperand(0);
    SDValue Xor1 = Xor.getOperand(1);
    // Don't touch 'not' (i.e. where y = -1).
    if (isAllOnesOrAllOnesSplat(Xor1))
      return false;
    // Put the operand shared with the outer xor into Xor1.
    if (Other == Xor0)
      std::swap(Xor0, Xor1);
    if (Other != Xor1)
      return false;
    X = Xor0;
    Y = Xor1;
    M = And.getOperand(XorIdx ? 0 : 1);
    return true;
  };

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (!matchAndXor(N0, 0, N1) && !matchAndXor(N0, 1, N1) &&
      !matchAndXor(N1, 0, N0) && !matchAndXor(N1, 1, N0))
    return SDValue();

  // Don't do anything if the mask is constant. This should not be reachable.
  // InstCombine should have already unfolded this pattern, and DAGCombiner
  // probably shouldn't produce it, too.
  if (isa<ConstantSDNode>(M.getNode()))
    return SDValue();

  // We can transform if the target has AndNot
  if (!TLI.hasAndNot(M))
    return SDValue();

  SDLoc DL(N);

  // If Y is a constant, check that 'andn' works with immediates.
  if (!TLI.hasAndNot(Y)) {
    assert(TLI.hasAndNot(X) && "Only mask is a variable? Unreachable.");
    // If not, we need to do a bit more work to make sure andn is still used.
    // Build ~(~x & m) & (m | y).
    SDValue NotX = DAG.getNOT(DL, X, VT);
    SDValue LHS = DAG.getNode(ISD::AND, DL, VT, NotX, M);
    SDValue NotLHS = DAG.getNOT(DL, LHS, VT);
    SDValue RHS = DAG.getNode(ISD::OR, DL, VT, M, Y);
    return DAG.getNode(ISD::AND, DL, VT, NotLHS, RHS);
  }

  // Build (x & m) | (y & ~m).
  SDValue LHS = DAG.getNode(ISD::AND, DL, VT, X, M);
  SDValue NotM = DAG.getNOT(DL, M, VT);
  SDValue RHS = DAG.getNode(ISD::AND, DL, VT, Y, NotM);

  return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
}
// Combine an ISD::XOR node. The folds below are tried in order and the first
// one that applies wins. Returns the replacement value, SDValue(N, 0) when N
// itself was updated (CombineTo / SimplifyDemandedBits), or an empty SDValue
// when nothing applied.
//
// NOTE(review): closing braces, the declarations of LHS/RHS/CC, some case
// labels and the tails of several calls are not visible in this view -
// confirm against the complete source.
SDValue DAGCombiner::visitXOR(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (xor x, 0) -> x, vector edition
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return N1;
if (ISD::isBuildVectorAllZeros(N1.getNode()))
return N0;
// fold (xor undef, undef) -> 0. This is a common idiom (misuse).
SDLoc DL(N);
if (N0.isUndef() && N1.isUndef())
return DAG.getConstant(0, DL, VT);
// fold (xor x, undef) -> undef
if (N0.isUndef())
return N0;
if (N1.isUndef())
return N1;
// fold (xor c1, c2) -> c1^c2
if (SDValue C = DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, {N0, N1}))
return C;
// canonicalize constant to RHS
// NOTE(review): the second half of this condition is cut off in this view.
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
// fold (xor x, 0) -> x
if (isNullConstant(N1))
return N0;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// reassociate xor
if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags()))
return RXOR;
// fold !(x cc y) -> (x !cc y)
// LHS, RHS and CC are filled in by isSetCCEquivalent; the inverted condition
// code is only used after legalization if the target reports it as legal.
unsigned N0Opcode = N0.getOpcode();
if (TLI.isConstTrueVal(N1.getNode()) &&
isSetCCEquivalent(N0, LHS, RHS, CC, /*MatchStrict*/true)) {
ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
if (!LegalOperations ||
TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
switch (N0Opcode) {
llvm_unreachable("Unhandled SetCC Equivalent!");
case ISD::SETCC:
return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
N0.getOperand(3), NotCC);
// Variant whose original node also produces a second result (value 1): its
// users are redirected to the second result of the new setcc.
if (N0.hasOneUse()) {
// FIXME Can we handle multiple uses? Could we token factor the chain
// results from the new/old setcc?
SDValue SetCC = DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC,
CombineTo(N, SetCC);
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
// fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
if (isOneConstant(N1) && N0Opcode == ISD::ZERO_EXTEND && N0.hasOneUse() &&
isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){
SDValue V = N0.getOperand(0);
SDLoc DL0(N0);
V = DAG.getNode(ISD::XOR, DL0, V.getValueType(), V,
DAG.getConstant(1, DL0, V.getValueType()));
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, V);
// fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
// (De Morgan; also applied to 'and', which becomes 'or'. i1 only.)
if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
(N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
if (isOneUseSetCC(N01) || isOneUseSetCC(N00)) {
unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
return DAG.getNode(NewOpcode, DL, VT, N00, N01);
// fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
if (isAllOnesConstant(N1) && N0.hasOneUse() &&
(N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
if (isa<ConstantSDNode>(N01) || isa<ConstantSDNode>(N00)) {
unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
return DAG.getNode(NewOpcode, DL, VT, N00, N01);
// fold (not (neg x)) -> (add X, -1)
// FIXME: This can be generalized to (not (sub Y, X)) -> (add X, ~Y) if
// Y is a constant or the subtract has a single use.
if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::SUB &&
isNullConstant(N0.getOperand(0))) {
return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1),
DAG.getAllOnesConstant(DL, VT));
// fold (not (add X, -1)) -> (neg X)
// NOTE(review): the final operand of this getNode call is cut off in this
// view.
if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::ADD &&
isAllOnesOrAllOnesSplat(N0.getOperand(1))) {
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
// fold (xor (and x, y), y) -> (and (not x), y)
if (N0Opcode == ISD::AND && N0.hasOneUse() && N0->getOperand(1) == N1) {
SDValue X = N0.getOperand(0);
SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
return DAG.getNode(ISD::AND, DL, VT, NotX, N1);
// xor of a one-use constant shift with a constant (see the fold comment
// further down): move the 'not' in front of the shift.
if ((N0Opcode == ISD::SRL || N0Opcode == ISD::SHL) && N0.hasOneUse()) {
ConstantSDNode *XorC = isConstOrConstSplat(N1);
ConstantSDNode *ShiftC = isConstOrConstSplat(N0.getOperand(1));
unsigned BitWidth = VT.getScalarSizeInBits();
if (XorC && ShiftC) {
// Don't crash on an oversized shift. We can not guarantee that a bogus
// shift has been simplified to undef.
uint64_t ShiftAmt = ShiftC->getLimitedValue();
if (ShiftAmt < BitWidth) {
APInt Ones = APInt::getAllOnesValue(BitWidth);
Ones = N0Opcode == ISD::SHL ? Ones.shl(ShiftAmt) : Ones.lshr(ShiftAmt);
if (XorC->getAPIntValue() == Ones) {
// If the xor constant is a shifted -1, do a 'not' before the shift:
// xor (X << ShiftC), XorC --> (not X) << ShiftC
// xor (X >> ShiftC), XorC --> (not X) >> ShiftC
SDValue Not = DAG.getNOT(DL, N0.getOperand(0), VT);
return DAG.getNode(N0Opcode, DL, VT, Not, N0.getOperand(1));
// fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
// The add and the sra may appear on either side of the xor.
SDValue A = N0Opcode == ISD::ADD ? N0 : N1;
SDValue S = N0Opcode == ISD::SRA ? N0 : N1;
if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) {
SDValue A0 = A.getOperand(0), A1 = A.getOperand(1);
SDValue S0 = S.getOperand(0);
if ((A0 == S && A1 == S0) || (A1 == S && A0 == S0)) {
unsigned OpSizeInBits = VT.getScalarSizeInBits();
if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1)))
if (C->getAPIntValue() == (OpSizeInBits - 1))
return DAG.getNode(ISD::ABS, DL, VT, S0);
// fold (xor x, x) -> 0
if (N0 == N1)
return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
// fold (xor (shl 1, x), -1) -> (rotl ~1, x)
// Here is a concrete example of this equivalence:
// i16 x == 14
// i16 shl == 1 << 14 == 16384 == 0b0100000000000000
// i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111
// =>
// i16 ~1 == 0b1111111111111110
// i16 rol(~1, 14) == 0b1011111111111111
// Some additional tips to help conceptualize this transform:
// - Try to see the operation as placing a single zero in a value of all ones.
// - There exists no value for x which would allow the result to contain zero.
// - Values of x larger than the bitwidth are undefined and do not require a
// consistent result.
// - Pushing the zero left requires shifting one bits in from the right.
// A rotate left of ~1 is a nice way of achieving the desired result.
// NOTE(review): the rotate-amount operand of this getNode call is cut off in
// this view.
if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0Opcode == ISD::SHL &&
isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) {
return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT),
// Simplify: xor (op x...), (op y...) -> (op (xor x, y))
if (N0Opcode == N1.getOpcode())
if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
return V;
// Unfold ((x ^ y) & m) ^ y into (x & m) | (y & ~m) if profitable
if (SDValue MM = unfoldMaskedMerge(N))
return MM;
// Simplify the expression using non-local knowledge.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
return Combined;
return SDValue();
/// If we have a shift-by-constant of a bitwise logic op that itself has a
/// shift-by-constant operand with identical opcode, we may be able to convert
/// that into 2 independent shifts followed by the logic op. This is a
/// throughput improvement.
///
/// \param Shift the outer shift node; its amount operand must be a constant
///        or constant splat (asserted below).
/// \param DAG the SelectionDAG used to create the replacement nodes.
/// \returns the rebuilt logic node, or an empty SDValue when the pattern does
///          not match.
static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) {
// Match a one-use bitwise logic op.
SDValue LogicOp = Shift->getOperand(0);
if (!LogicOp.hasOneUse())
return SDValue();
unsigned LogicOpcode = LogicOp.getOpcode();
if (LogicOpcode != ISD::AND && LogicOpcode != ISD::OR &&
LogicOpcode != ISD::XOR)
return SDValue();
// Find a matching one-use shift by constant.
unsigned ShiftOpcode = Shift->getOpcode();
SDValue C1 = Shift->getOperand(1);
ConstantSDNode *C1Node = isConstOrConstSplat(C1);
assert(C1Node && "Expected a shift with constant operand");
const APInt &C1Val = C1Node->getAPIntValue();
// Helper: returns true when V is a one-use node with the same opcode as the
// outer shift and a constant (or constant-splat) amount that can be combined
// with C1. ShiftOp and ShiftAmtVal are written before the last two checks
// run, so they may be modified even when the helper returns false.
auto matchFirstShift = [&](SDValue V, SDValue &ShiftOp,
const APInt *&ShiftAmtVal) {
if (V.getOpcode() != ShiftOpcode || !V.hasOneUse())
return false;
ConstantSDNode *ShiftCNode = isConstOrConstSplat(V.getOperand(1));
if (!ShiftCNode)
return false;
// Capture the shifted operand and shift amount value.
ShiftOp = V.getOperand(0);
ShiftAmtVal = &ShiftCNode->getAPIntValue();
// Shift amount types do not have to match their operand type, so check that
// the constants are the same width.
if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth())
return false;
// The fold is not valid if the sum of the shift values reaches or exceeds the
// scalar bitwidth of V (note the unsigned >= comparison).
if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits()))
return false;
return true;
// Logic ops are commutative, so check each operand for a match.
// X receives the operand of the inner shift, Y the other logic operand, and
// C0Val the inner shift amount.
SDValue X, Y;
const APInt *C0Val;
if (matchFirstShift(LogicOp.getOperand(0), X, C0Val))
Y = LogicOp.getOperand(1);
else if (matchFirstShift(LogicOp.getOperand(1), X, C0Val))
Y = LogicOp.getOperand(0);
// NOTE(review): as listed here this return is unconditional, which would make
// the code below unreachable; presumably it is the `else` arm of the chain
// above (neither operand matched) -- confirm against the full file.
return SDValue();
// shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
SDLoc DL(Shift);
EVT VT = Shift->getValueType(0);
EVT ShiftAmtVT = Shift->getOperand(1).getValueType();
// The combined amount is materialized in the outer shift's amount type.
SDValue ShiftSumC = DAG.getConstant(*C0Val + C1Val, DL, ShiftAmtVT);
SDValue NewShift1 = DAG.getNode(ShiftOpcode, DL, VT, X, ShiftSumC);
SDValue NewShift2 = DAG.getNode(ShiftOpcode, DL, VT, Y, C1);
return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2);
/// Handle transforms common to the three shifts, when the shift amount is a
/// constant.
/// We are looking for: (shift being one of shl/sra/srl)
/// shift (binop X, C0), C1
/// And want to transform into:
/// binop (shift X, C1), (shift C0, C1)
///
/// \param N the shift node; operand 1 must be a constant or constant splat
///        (asserted below).
/// \returns the replacement node, or an empty SDValue when no transform is
///          performed.
SDValue DAGCombiner::visitShiftByConstant(SDNode *N) {
assert(isConstOrConstSplat(N->getOperand(1)) && "Expected constant operand");
// Do not turn a 'not' into a regular xor.
if (isBitwiseNot(N->getOperand(0)))
return SDValue();
// The inner binop must be one-use, since we want to replace it.
// The target hook is also consulted here and can veto the whole transform.
SDValue LHS = N->getOperand(0);
if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level))
return SDValue();
// TODO: This is limited to early combining because it may reveal regressions
// otherwise. But since we just checked a target hook to see if this is
// desirable, that should have filtered out cases where this interferes
// with some other pattern matching.
if (!LegalTypes)
if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
return R;
// We want to pull some binops through shifts, so that we have (and (shift))
// instead of (shift (and)), likewise for add, or, xor, etc. This sort of
// thing happens with address calculations, so it's important to canonicalize
// it.
switch (LHS.getOpcode()) {
// NOTE(review): the return below precedes every visible case label;
// presumably it belongs to a `default:` label that is not visible in this
// listing (i.e. any other opcode bails out) -- confirm against the full file.
return SDValue();
case ISD::OR:
case ISD::XOR:
case ISD::AND:
case ISD::ADD:
if (N->getOpcode() != ISD::SHL)
return SDValue(); // only shl(add) not sr[al](add).
// We require the RHS of the binop to be a constant and not opaque as well.
ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS.getOperand(1));
if (!BinOpCst)
return SDValue();
// FIXME: disable this unless the input to the binop is a shift by a constant
// or is copy/select. Enable this in other cases when figure out it's exactly
// profitable.
SDValue BinOpLHSVal = LHS.getOperand(0);
// NOTE(review): the expression below ends in `&&`; its final operand is not
// visible in this listing.
bool IsShiftByConstant = (BinOpLHSVal.getOpcode() == ISD::SHL ||
BinOpLHSVal.getOpcode() == ISD::SRA ||
BinOpLHSVal.getOpcode() == ISD::SRL) &&
bool IsCopyOrSelect = BinOpLHSVal.getOpcode() == ISD::CopyFromReg ||
BinOpLHSVal.getOpcode() == ISD::SELECT;
if (!IsShiftByConstant && !IsCopyOrSelect)
return SDValue();
// For the copy/select form, only proceed when the shift has more than one use.
if (IsCopyOrSelect && N->hasOneUse())
return SDValue();
// Fold the constants, shifting the binop RHS by the shift amount.
SDLoc DL(N);
EVT VT = N->getValueType(0);
// NewRHS is expected to constant-fold (asserted right after it is built);
// NewShift applies the same shift opcode to the binop's LHS.
SDValue NewRHS = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(1),
assert(isa<ConstantSDNode>(NewRHS) && "Folding was not successful!");
SDValue NewShift = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(0),
return DAG.getNode(LHS.getOpcode(), DL, VT, NewShift, NewRHS);
/// Push a truncate through an AND whose second operand is a constant.
///
/// \param N a TRUNCATE node whose operand is an AND (both asserted below).
/// \returns (and (truncate N00), (truncate N01)) built in the truncated type,
///          or an empty SDValue when the preconditions are not met.
SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) {
assert(N->getOpcode() == ISD::TRUNCATE);
assert(N->getOperand(0).getOpcode() == ISD::AND);
// (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC)
EVT TruncVT = N->getValueType(0);
// Both the truncate and the AND must be single-use, and the target must
// consider TruncVT a desirable type for AND.
if (N->hasOneUse() && N->getOperand(0).hasOneUse() &&
TLI.isTypeDesirableForOp(ISD::AND, TruncVT)) {
SDValue N01 = N->getOperand(0).getOperand(1);
// Only non-opaque constants (scalar or vector) are accepted as the mask.
if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) {
SDLoc DL(N);
SDValue N00 = N->getOperand(0).getOperand(0);
SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00);
SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01);
return DAG.getNode(ISD::AND, DL, TruncVT, Trunc00, Trunc01);
return SDValue();
/// Combine a rotate node. The folds are opcode-agnostic: replacement rotates
/// are rebuilt with N->getOpcode().
///
/// \returns the first replacement produced by the folds below, SDValue(N, 0)
///          when SimplifyDemandedBits returns true, or an empty SDValue when
///          nothing applies.
SDValue DAGCombiner::visitRotate(SDNode *N) {
SDLoc dl(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
unsigned Bitsize = VT.getScalarSizeInBits();
// fold (rot x, 0) -> x
if (isNullOrNullSplat(N1))
return N0;
// fold (rot x, c) -> x iff (c % BitSize) == 0
// For a power-of-two width, (c % BitSize) == 0 is equivalent to the low
// log2(BitSize) bits of the amount being zero, which is what the mask tests.
if (isPowerOf2_32(Bitsize) && Bitsize > 1) {
APInt ModuloMask(N1.getScalarValueSizeInBits(), Bitsize - 1);
if (DAG.MaskedValueIsZero(N1, ModuloMask))
return N0;
// fold (rot x, c) -> (rot x, c % BitSize)
// The predicate always returns true and only records, via OutOfRange, whether
// any constant it was called with is >= Bitsize.
bool OutOfRange = false;
auto MatchOutOfRange = [Bitsize, &OutOfRange](ConstantSDNode *C) {
OutOfRange |= C->getAPIntValue().uge(Bitsize);
return true;
if (ISD::matchUnaryPredicate(N1, MatchOutOfRange) && OutOfRange) {
EVT AmtVT = N1.getValueType();
SDValue Bits = DAG.getConstant(Bitsize, dl, AmtVT);
if (SDValue Amt =
DAG.FoldConstantArithmetic(ISD::UREM, dl, AmtVT, {N1, Bits}))
return DAG.getNode(N->getOpcode(), dl, VT, N0, Amt);
// rot i16 X, 8 --> bswap X
auto *RotAmtC = isConstOrConstSplat(N1);
if (RotAmtC && RotAmtC->getAPIntValue() == 8 &&
VT.getScalarSizeInBits() == 16 && hasOperation(ISD::BSWAP, VT))
return DAG.getNode(ISD::BSWAP, dl, VT, N0);
// Simplify the operands using demanded-bits information.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
if (N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getOpcode() == ISD::AND) {
if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1);
unsigned NextOp = N0.getOpcode();
// fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize)
if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) {
SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
// Both amounts must be constants of the same type so they can be folded.
if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) {
EVT ShiftVT = C1->getValueType(0);
// Rotates in the same direction add their amounts; opposite directions
// subtract the inner amount from the outer one.
bool SameSide = (N->getOpcode() == NextOp);
unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB;
if (SDValue CombinedShift = DAG.FoldConstantArithmetic(
CombineOp, dl, ShiftVT, {N1, N0.getOperand(1)})) {
SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
// NOTE(review): the normalization uses a signed remainder, and in the SUB
// case the combined amount can be negative. Confirm the resulting amount is
// still a valid rotate amount, in particular for non-power-of-two widths.
// The result of this fold is also used below without a null check.
SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic(
ISD::SREM, dl, ShiftVT, {CombinedShift, BitsizeC});
return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0),
return SDValue();
/// Combine an ISD::SHL node.
///
/// The folds below are tried in order.
///
/// \returns the first replacement value produced, SDValue(N, 0) when
///          SimplifyDemandedBits returns true, or an empty SDValue when no
///          fold applies.
SDValue DAGCombiner::visitSHL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
if (SDValue V = DAG.simplifyShift(N0, N1))
return V;
// VT is the type of the shifted value, ShiftVT the type of the shift amount;
// the two may differ.
EVT VT = N0.getValueType();
EVT ShiftVT = N1.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1);
// If setcc produces all-one true value then:
// (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV)
if (N1CV && N1CV->isConstant()) {
if (N0.getOpcode() == ISD::AND) {
SDValue N00 = N0->getOperand(0);
SDValue N01 = N0->getOperand(1);
BuildVectorSDNode *N01CV = dyn_cast<BuildVectorSDNode>(N01);
if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrNegativeOneBooleanContent) {
if (SDValue C =
DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N01, N1}))
return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
// N1C is non-null only when the shift amount is a constant or constant splat.
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// fold (shl c1, c2) -> c1<<c2
if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1}))
return C;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// if (shl x, c) is known to be zero, return 0
if (DAG.MaskedValueIsZero(SDValue(N, 0),
return DAG.getConstant(0, SDLoc(N), VT);
// fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
if (N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getOpcode() == ISD::AND) {
if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
if (N0.getOpcode() == ISD::SHL) {
// Both predicates widen the two amounts to a common width plus one extra bit
// so that c1 + c2 cannot wrap before it is compared with the operand width.
auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).uge(OpSizeInBits);
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
return DAG.getConstant(0, SDLoc(N), VT);
auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).ult(OpSizeInBits);
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
SDLoc DL(N);
SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum);
// fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2))
// For this to be valid, the second form must not preserve any of the bits
// that are shifted out by the inner shift in the first form. This means
// the outer shift size must be >= the number of bits added by the ext.
// As a corollary, we don't care what kind of ext it is.
if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
N0.getOpcode() == ISD::ANY_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND) &&
N0.getOperand(0).getOpcode() == ISD::SHL) {
SDValue N0Op0 = N0.getOperand(0);
SDValue InnerShiftAmt = N0Op0.getOperand(1);
EVT InnerVT = N0Op0.getValueType();
uint64_t InnerBitwidth = InnerVT.getScalarSizeInBits();
// In both predicates c1 is the inner amount and c2 the outer amount;
// OpSizeInBits - InnerBitwidth is the number of bits added by the extend.
auto MatchOutOfRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return c2.uge(OpSizeInBits - InnerBitwidth) &&
(c1 + c2).uge(OpSizeInBits);
if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchOutOfRange,
/*AllowUndefs*/ false,
/*AllowTypeMismatch*/ true))
return DAG.getConstant(0, SDLoc(N), VT);
auto MatchInRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return c2.uge(OpSizeInBits - InnerBitwidth) &&
(c1 + c2).ult(OpSizeInBits);
if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchInRange,
/*AllowUndefs*/ false,
/*AllowTypeMismatch*/ true)) {
SDLoc DL(N);
SDValue Ext = DAG.getNode(N0.getOpcode(), DL, VT, N0Op0.getOperand(0));
// The inner amount may have a different type; bring it to ShiftVT before
// adding the outer amount.
SDValue Sum = DAG.getZExtOrTrunc(InnerShiftAmt, DL, ShiftVT);
Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, Sum, N1);
return DAG.getNode(ISD::SHL, DL, VT, Ext, Sum);
// fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
// Only fold this if the inner zext has no other uses to avoid increasing
// the total number of instructions.
if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
N0.getOperand(0).getOpcode() == ISD::SRL) {
SDValue N0Op0 = N0.getOperand(0);
SDValue InnerShiftAmt = N0Op0.getOperand(1);
// Matches when both amounts are equal and smaller than the scalar width
// of VT.
auto MatchEqual = [VT](ConstantSDNode *LHS, ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2);
return c1.ult(VT.getScalarSizeInBits()) && (c1 == c2);
if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchEqual,
/*AllowUndefs*/ false,
/*AllowTypeMismatch*/ true)) {
SDLoc DL(N);
EVT InnerShiftAmtVT = N0Op0.getOperand(1).getValueType();
// The new shl is created in the narrow type, so its amount uses the inner
// shift's amount type.
SDValue NewSHL = DAG.getZExtOrTrunc(N1, DL, InnerShiftAmtVT);
NewSHL = DAG.getNode(ISD::SHL, DL, N0Op0.getValueType(), N0Op0, NewSHL);
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
// fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
// fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2
// TODO - support non-uniform vector shift amounts.
if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) &&
N0->getFlags().hasExact()) {
if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
uint64_t C1 = N0C1->getZExtValue();
uint64_t C2 = N1C->getZExtValue();
SDLoc DL(N);
if (C1 <= C2)
return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
DAG.getConstant(C2 - C1, DL, ShiftVT));
// C1 > C2: a right shift by the (positive) difference C1 - C2 remains.
return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0),
DAG.getConstant(C1 - C2, DL, ShiftVT));
// fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
// (and (srl x, (sub c1, c2), MASK)
// Only fold this if the inner shift has no other uses -- if it does, folding
// this will increase the total number of instructions.
// TODO - drop hasOneUse requirement if c1 == c2?
// TODO - support non-uniform vector shift amounts.
if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&
TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
if (N0C1->getAPIntValue().ult(OpSizeInBits)) {
uint64_t c1 = N0C1->getZExtValue();
uint64_t c2 = N1C->getZExtValue();
// Start from the bits that survive the srl (the high OpSizeInBits - c1
// bits), then move the mask by the net shift applied below.
APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1);
SDValue Shift;
if (c2 > c1) {
Mask <<= c2 - c1;
SDLoc DL(N);
Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
DAG.getConstant(c2 - c1, DL, ShiftVT));
} else {
Mask.lshrInPlace(c1 - c2);
SDLoc DL(N);
Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
DAG.getConstant(c1 - c2, DL, ShiftVT));
SDLoc DL(N0);
return DAG.getNode(ISD::AND, DL, VT, Shift,
DAG.getConstant(Mask, DL, VT));
// fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
isConstantOrConstantVector(N1, /* No Opaques */ true)) {
SDLoc DL(N);
SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
// fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
// fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
// Variant of version done on multiply, except mul by a power of 2 is turned
// into a shift.
if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
N0.getNode()->hasOneUse() &&
isConstantOrConstantVector(N1, /* No Opaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true) &&
TLI.isDesirableToCommuteWithShift(N, Level)) {
SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
// fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() &&
isConstantOrConstantVector(N1, /* No Opaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
// Only commit to the new mul if c1 << c2 is itself a constant.
if (isConstantOrConstantVector(Shl))
return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl);
// Transforms shared with sra/srl; skipped for opaque constant amounts.
if (N1C && !N1C->isOpaque())
if (SDValue NewSHL = visitShiftByConstant(N))
return NewSHL;
// Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1)).
if (N0.getOpcode() == ISD::VSCALE)
if (ConstantSDNode *NC1 = isConstOrConstSplat(N->getOperand(1))) {
auto DL = SDLoc(N);
APInt C0 = N0.getConstantOperandAPInt(0);
APInt C1 = NC1->getAPIntValue();
return DAG.getVScale(DL, VT, C0 << C1);
return SDValue();
// Transform a right shift of a multiply into a multiply-high.
// Examples:
// (srl (mul (zext i32:$a to i64), (zext i32:$b to i64)), 32) -> (mulhu $a, $b)
// (sra (mul (sext i32:$a to i64), (sext i32:$b to i64)), 32) -> (mulhs $a, $b)
//
// N must be an SRL or SRA node (asserted below). Returns the multiply-high
// result extended or truncated back to the wide type, or an empty SDValue
// when the pattern does not match or the target hook rejects it.
static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI) {
assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
"SRL or SRA node is required here!");
// Check the shift amount. Proceed with the transformation if the shift
// amount is constant.
ConstantSDNode *ShiftAmtSrc = isConstOrConstSplat(N->getOperand(1));
if (!ShiftAmtSrc)
return SDValue();
SDLoc DL(N);
// The operation feeding into the shift must be a multiply.
SDValue ShiftOperand = N->getOperand(0);
if (ShiftOperand.getOpcode() != ISD::MUL)
return SDValue();
// Both operands must be equivalent extend nodes.
SDValue LeftOp = ShiftOperand.getOperand(0);
SDValue RightOp = ShiftOperand.getOperand(1);
bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
if ((!(IsSignExt || IsZeroExt)) || LeftOp.getOpcode() != RightOp.getOpcode())
return SDValue();
EVT WideVT1 = LeftOp.getValueType();
EVT WideVT2 = RightOp.getValueType();
// Proceed with the transformation if the wide types match.
assert((WideVT1 == WideVT2) &&
"Cannot have a multiply node with two different operand types.");
EVT NarrowVT = LeftOp.getOperand(0).getValueType();
// Check that the two extend nodes are the same type.
if (NarrowVT != RightOp.getOperand(0).getValueType())
return SDValue();
// Only transform into mulh if mulh for the narrow type is cheaper than
// a multiply followed by a shift. This should also check if mulh is
// legal for NarrowVT on the target.
if (!TLI.isMulhCheaperThanMulShift(NarrowVT))
return SDValue();
// Proceed with the transformation if the wide type is twice as large
// as the narrow type.
unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits();
if (WideVT1.getScalarSizeInBits() != 2 * NarrowVTSize)
return SDValue();
// Check the shift amount with the narrow type size.
// Proceed with the transformation if the shift amount is the width
// of the narrow type.
unsigned ShiftAmt = ShiftAmtSrc->getZExtValue();
if (ShiftAmt != NarrowVTSize)
return SDValue();
// If the operation feeding into the MUL is a sign extend (sext),
// we use mulhs. Otherwise, zero extends (zext) use mulhu.
unsigned MulhOpcode = IsSignExt ? ISD::MULHS : ISD::MULHU;
SDValue Result = DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0),
// The extension back to the wide type follows the shift opcode (sign-extend
// for SRA, zero-extend for SRL), not the kind of extend feeding the multiply.
return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT1)
: DAG.getZExtOrTrunc(Result, DL, WideVT1));
/// Combine an ISD::SRA (arithmetic shift right) node.
///
/// The folds below are tried in order.
///
/// \returns the first replacement value produced, SDValue(N, 0) when
///          SimplifyDemandedBits returns true, or an empty SDValue when no
///          fold applies.
SDValue DAGCombiner::visitSRA(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
if (SDValue V = DAG.simplifyShift(N0, N1))
return V;
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
// Arithmetic shifting an all-sign-bit value is a no-op.
// fold (sra 0, x) -> 0
// fold (sra -1, x) -> -1
if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
return N0;
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// N1C is non-null only when the shift amount is a constant or constant splat.
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// fold (sra c1, c2) -> c1 >>s c2
if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1}))
return C;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
// sext_inreg.
if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
// LowBits is the width the value is sign-extended from.
unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
if (VT.isVector())
ExtVT = EVT::getVectorVT(*DAG.getContext(),
ExtVT, VT.getVectorNumElements());
if (!LegalOperations ||
TLI.getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) ==
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
N0.getOperand(0), DAG.getValueType(ExtVT));
// fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
// clamp (add c1, c2) to max shift.
if (N0.getOpcode() == ISD::SRA) {
SDLoc DL(N);
EVT ShiftVT = N1.getValueType();
EVT ShiftSVT = ShiftVT.getScalarType();
SmallVector<SDValue, 16> ShiftValues;
// The predicate always returns true and appends one clamped sum to
// ShiftValues per call; sums >= OpSizeInBits are clamped to OpSizeInBits - 1.
auto SumOfShifts = [&](ConstantSDNode *LHS, ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
APInt Sum = c1 + c2;
unsigned ShiftSum =
Sum.uge(OpSizeInBits) ? (OpSizeInBits - 1) : Sum.getZExtValue();
ShiftValues.push_back(DAG.getConstant(ShiftSum, DL, ShiftSVT));
return true;
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), SumOfShifts)) {
SDValue ShiftValue;
if (VT.isVector())
ShiftValue = DAG.getBuildVector(ShiftVT, DL, ShiftValues);
// NOTE(review): as listed here the next assignment would overwrite the
// build vector unconditionally; presumably it is the `else` (scalar) arm --
// confirm against the full file.
ShiftValue = ShiftValues[0];
return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), ShiftValue);
// fold (sra (shl X, m), (sub result_size, n))
// -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for
// result_size - n != m.
// If truncate is free for the target sext(shl) is likely to result in better
// code.
if (N0.getOpcode() == ISD::SHL && N1C) {
// Get the two constants of the shifts, CN0 = m, CN = n.
const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1));
if (N01C) {
LLVMContext &Ctx = *DAG.getContext();
// Determine what the truncate's result bitsize and type would be.
EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue());
if (VT.isVector())
TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());
// Determine the residual right-shift amount.
int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
// If the shift is not a no-op (in which case this should be just a sign
// extend already), the truncated to type is legal, sign_extend is legal
// on that type, and the truncate to that type is both legal and free,
// perform the transform.
if ((ShiftAmt > 0) &&
TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
TLI.isTruncateFree(VT, TruncVT)) {
SDLoc DL(N);
SDValue Amt = DAG.getConstant(ShiftAmt, DL,
// NOTE(review): the node built here is an SRL by the residual amount,
// whereas the fold description above writes `shl`; confirm which one the
// description intends.
SDValue Shift = DAG.getNode(ISD::SRL, DL, VT,
N0.getOperand(0), Amt);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT,
return DAG.getNode(ISD::SIGN_EXTEND, DL,
N->getValueType(0), Trunc);
// We convert trunc/ext to opposing shifts in IR, but casts may be cheaper.
// sra (add (shl X, N1C), AddC), N1C -->
// sext (add (trunc X to (width - N1C)), AddC')
if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() && N1C &&
N0.getOperand(0).getOpcode() == ISD::SHL &&
N0.getOperand(0).getOperand(1) == N1 && N0.getOperand(0).hasOneUse()) {
if (ConstantSDNode *AddC = isConstOrConstSplat(N0.getOperand(1))) {
SDValue Shl = N0.getOperand(0);
// Determine what the truncate's type would be and ask the target if that
// is a free operation.
LLVMContext &Ctx = *DAG.getContext();
unsigned ShiftAmt = N1C->getZExtValue();
EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - ShiftAmt);
if (VT.isVector())
TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());
// TODO: The simple type check probably belongs in the default hook
// implementation and/or target-specific overrides (because
// non-simple types likely require masking when legalized), but that
// restriction may conflict with other transforms.
if (TruncVT.isSimple() && isTypeLegal(TruncVT) &&
TLI.isTruncateFree(VT, TruncVT)) {
SDLoc DL(N);
SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT);
// AddC' is AddC shifted right (logically) by the shift amount and truncated
// to the narrow scalar width.
SDValue ShiftC = DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt).
trunc(TruncVT.getScalarSizeInBits()), DL, TruncVT);
SDValue Add = DAG.getNode(ISD::ADD, DL, TruncVT, Trunc, ShiftC);
return DAG.getSExtOrTrunc(Add, DL, VT);
// fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
if (N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getOpcode() == ISD::AND) {
if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1);
// fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
// fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
// if c1 is equal to the number of bits the trunc removes
// TODO - support non-uniform vector shift amounts.
if (N0.getOpcode() == ISD::TRUNCATE &&
(N0.getOperand(0).getOpcode() == ISD::SRL ||
N0.getOperand(0).getOpcode() == ISD::SRA) &&
N0.getOperand(0).hasOneUse() &&
N0.getOperand(0).getOperand(1).hasOneUse() && N1C) {
SDValue N0Op0 = N0.getOperand(0);
if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) {
EVT LargeVT = N0Op0.getValueType();
// Number of high bits dropped by the truncate.
unsigned TruncBits = LargeVT.getScalarSizeInBits() - OpSizeInBits;
if (LargeShift->getAPIntValue() == TruncBits) {
SDLoc DL(N);
SDValue Amt = DAG.getConstant(N1C->getZExtValue() + TruncBits, DL,
SDValue SRA =
DAG.getNode(ISD::SRA, DL, LargeVT, N0Op0.getOperand(0), Amt);
return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA);
// Simplify, based on bits shifted out of the LHS.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// If the sign bit is known to be zero, switch this to a SRL.
if (DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
// Transforms shared with shl/srl; skipped for opaque constant amounts.
if (N1C && !N1C->isOpaque())
if (SDValue NewSRA = visitShiftByConstant(N))
return NewSRA;
// Try to transform this shift into a multiply-high if
// it matches the appropriate pattern detected in combineShiftToMULH.
if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
return MULH;
return SDValue();
SDValue DAGCombiner::visitSRL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
if (SDValue V = DAG.simplifyShift(N0, N1))
return V;
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// fold (srl c1, c2) -> c1 >>u c2
if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1}))
return C;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// if (srl x, c) is known to be zero, return 0
if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
return DAG.getConstant(0, SDLoc(N), VT);
// fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
if (N0.getOpcode() == ISD::SRL) {
auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).uge(OpSizeInBits);
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
return DAG.getConstant(0, SDLoc(N), VT);
auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).ult(OpSizeInBits);
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
SDLoc DL(N);
EVT ShiftVT = N1.getValueType();
SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
if (N1C && N0.getOpcode() == ISD::TRUNCATE &&
N0.getOperand(0).getOpcode() == ISD::SRL) {
SDValue InnerShift = N0.getOperand(0);
// TODO - support non-uniform vector shift amounts.
if (auto *N001C = isConstOrConstSplat(InnerShift.getOperand(1))) {
uint64_t c1 = N001C->getZExtValue();
uint64_t c2 = N1C->getZExtValue();
EVT InnerShiftVT = InnerShift.getValueType();
EVT ShiftAmtVT = InnerShift.getOperand(1).getValueType();
uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
// srl (trunc (srl x, c1)), c2 --> 0 or (trunc (srl x, (add c1, c2)))
// This is only valid if the OpSizeInBits + c1 = size of inner shift.
if (c1 + OpSizeInBits == InnerShiftSize) {
SDLoc DL(N);
if (c1 + c2 >= InnerShiftSize)
return DAG.getConstant(0, DL, VT);
SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
InnerShift.getOperand(0), NewShiftAmt);
return DAG.getNode(ISD::TRUNCATE, DL, VT, NewShift);
// In the more general case, we can clear the high bits after the shift:
// srl (trunc (srl x, c1)), c2 --> trunc (and (srl x, (c1+c2)), Mask)
if (N0.hasOneUse() && InnerShift.hasOneUse() &&
c1 + c2 < InnerShiftSize) {
SDLoc DL(N);
SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
InnerShift.getOperand(0), NewShiftAmt);
SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(InnerShiftSize,
OpSizeInBits - c2),
DL, InnerShiftVT);
SDValue And = DAG.getNode(ISD::AND, DL, InnerShiftVT, NewShift, Mask);
return DAG.getNode(ISD::TRUNCATE, DL, VT, And);
// fold (srl (shl x, c), c) -> (and x, cst2)
// TODO - (srl (shl x, c1), c2).
if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
SDLoc DL(N);
SDValue Mask =
DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
// fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
// TODO - support non-uniform vector shift amounts.
if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
// Shifting in all undef bits?
EVT SmallVT = N0.getOperand(0).getValueType();
unsigned BitSize = SmallVT.getScalarSizeInBits();
if (N1C->getAPIntValue().uge(BitSize))
return DAG.getUNDEF(VT);
if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
uint64_t ShiftAmt = N1C->getZExtValue();
SDLoc DL0(N0);
SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT,
DAG.getConstant(ShiftAmt, DL0,
APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
SDLoc DL(N);
return DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
DAG.getConstant(Mask, DL, VT));
// fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign
// bit, which is unmodified by sra.
if (N1C && N1C->getAPIntValue() == (OpSizeInBits - 1)) {
if (N0.getOpcode() == ISD::SRA)
return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1);
// fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit).
if (N1C && N0.getOpcode() == ISD::CTLZ &&
N1C->getAPIntValue() == Log2_32(OpSizeInBits)) {
KnownBits Known = DAG.computeKnownBits(N0.getOperand(0));
// If any of the input bits are KnownOne, then the input couldn't be all
// zeros, thus the result of the srl will always be zero.
if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);
// If all of the bits input the to ctlz node are known to be zero, then
// the result of the ctlz is "32" and the result of the shift is one.
APInt UnknownBits = ~Known.Zero;
if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT);
// Otherwise, check to see if there is exactly one bit input to the ctlz.
if (UnknownBits.isPowerOf2()) {
// Okay, we know that only that the single bit specified by UnknownBits
// could be set on input to the CTLZ node. If this bit is set, the SRL
// will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair
// to an SRL/XOR pair, which is likely to simplify more.
unsigned ShAmt = UnknownBits.countTrailingZeros();
SDValue Op = N0.getOperand(0);
if (ShAmt) {
SDLoc DL(N0);
Op = DAG.getNode(ISD::SRL, DL, VT, Op,
DAG.getConstant(ShAmt, DL,
SDLoc DL(N);
return DAG.getNode(ISD::XOR, DL, VT,
Op, DAG.getConstant(1, DL, VT));
// fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
if (N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getOpcode() == ISD::AND) {
if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
// fold operands of srl based on knowledge that the low bits are not
// demanded.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
if (N1C && !N1C->isOpaque())
if (SDValue NewSRL = visitShiftByConstant(N))
return NewSRL;
// Attempt to convert a srl of a load into a narrower zero-extending load.
if (SDValue NarrowLoad = ReduceLoadWidth(N))
return NarrowLoad;
// Here is a common situation. We want to optimize:
// %a = ...
// %b = and i32 %a, 2
// %c = srl i32 %b, 1
// brcond i32 %c ...
// into
// %a = ...
// %b = and %a, 2
// %c = setcc eq %b, 0
// brcond %c ...
// However, after the source operand of SRL is optimized into AND, the SRL
// itself may not be optimized further. Look for it and add the BRCOND into
// the worklist.
if (N->hasOneUse()) {
SDNode *Use = *N->use_begin();
if (Use->getOpcode() == ISD::BRCOND)
else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
// Also look past the truncate.
Use = *Use->use_begin();
if (Use->getOpcode() == ISD::BRCOND)
// Try to transform this shift into a multiply-high if
// it matches the appropriate pattern detected in combineShiftToMULH.
if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
return MULH;
return SDValue();
/// Combine a funnel-shift node. The opcode is tested against ISD::FSHL; any
/// other opcode is handled as the "fshr" case of the fold comments below.
/// Operands: N0 and N1 are the two data inputs, N2 is the shift amount.
/// Returns the replacement value when a fold applies, SDValue(N, 0) when
/// SimplifyDemandedBits returns true, and an empty SDValue otherwise.
SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
bool IsFSHL = N->getOpcode() == ISD::FSHL;
unsigned BitWidth = VT.getScalarSizeInBits();
// fold (fshl N0, N1, 0) -> N0
// fold (fshr N0, N1, 0) -> N1
// Only attempted for power-of-two widths: the bits of N2 selected by the mask
// (BitWidth - 1) must be known zero.
if (isPowerOf2_32(BitWidth))
if (DAG.MaskedValueIsZero(
N2, APInt(N2.getScalarValueSizeInBits(), BitWidth - 1)))
return IsFSHL ? N0 : N1;
// True when V is undef or a zero constant / zero splat (undef lanes allowed).
auto IsUndefOrZero = [](SDValue V) {
return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
// TODO - support non-uniform vector shift amounts.
if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) {
EVT ShAmtTy = N2.getValueType();
// fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth)
if (Cst->getAPIntValue().uge(BitWidth)) {
uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth);
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1,
DAG.getConstant(RotAmt, SDLoc(N), ShAmtTy));
// From here on the constant amount is below BitWidth (larger values returned
// above).
unsigned ShAmt = Cst->getZExtValue();
if (ShAmt == 0)
return IsFSHL ? N0 : N1;
// fold fshl(undef_or_zero, N1, C) -> lshr(N1, BW-C)
// fold fshr(undef_or_zero, N1, C) -> lshr(N1, C)
// fold fshl(N0, undef_or_zero, C) -> shl(N0, C)
// fold fshr(N0, undef_or_zero, C) -> shl(N0, BW-C)
if (IsUndefOrZero(N0))
return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1,
DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt,
SDLoc(N), ShAmtTy));
if (IsUndefOrZero(N1))
return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt,
SDLoc(N), ShAmtTy));
// fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
// fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
// TODO - bigendian support once we have test coverage.
// TODO - can we merge this with CombineConsecutiveLoads/MatchLoadCombine?
// TODO - permit LHS EXTLOAD if extensions are shifted out.
// Requires a scalar type, byte-multiple width and shift amount, and a
// little-endian data layout.
if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() &&
!DAG.getDataLayout().isBigEndian()) {
auto *LHS = dyn_cast<LoadSDNode>(N0);
auto *RHS = dyn_cast<LoadSDNode>(N1);
// Both inputs must be simple, non-extending loads in the same address space,
// and at least one of them must have a single use.
if (LHS && RHS && LHS->isSimple() && RHS->isSimple() &&
LHS->getAddressSpace() == RHS->getAddressSpace() &&
(LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) &&
ISD::isNON_EXTLoad(LHS)) {
if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) {
// Byte offset of the combined load, relative to RHS's base pointer.
uint64_t PtrOff =
IsFSHL ? (((BitWidth - ShAmt) % BitWidth) / 8) : (ShAmt / 8);
Align NewAlign = commonAlignment(RHS->getAlign(), PtrOff);
bool Fast = false;
// Only form the new load if the target both allows the access at the reduced
// alignment and sets Fast for it.
if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
RHS->getAddressSpace(), NewAlign,
RHS->getMemOperand()->getFlags(), &Fast) &&
Fast) {
// NOTE(review): DL is not declared in the visible part of this function --
// confirm where it comes from.
SDValue NewPtr =
DAG.getMemBasePlusOffset(RHS->getBasePtr(), PtrOff, DL);
SDValue Load = DAG.getLoad(
VT, DL, RHS->getChain(), NewPtr,
RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign,
RHS->getMemOperand()->getFlags(), RHS->getAAInfo());
// Replace the old load's chain with the new load's chain.
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1));
return Load;
// fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2)
// fold fshl(N0, undef_or_zero, N2) -> shl(N0, N2)
// iff We know the shift amount is in range.
// TODO: when is it worth doing SUB(BW, N2) as well?
if (isPowerOf2_32(BitWidth)) {
// "In range" is established by requiring every bit of N2 outside
// (BitWidth - 1) to be known zero.
APInt ModuloBits(N2.getScalarValueSizeInBits(), BitWidth - 1);
if (IsUndefOrZero(N0) && !IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, N2);
if (IsUndefOrZero(N1) && IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N2);
// fold (fshl N0, N0, N2) -> (rotl N0, N2)
// fold (fshr N0, N0, N2) -> (rotr N0, N2)
// TODO: Investigate flipping this rotate if only one is legal, if funnel shift
// is legal as well we might be better off avoiding non-constant (BW - N2).
unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR;
if (N0 == N1 && hasOperation(RotOpc, VT))
return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2);
// Simplify, based on bits shifted out of N0/N1.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
return SDValue();
/// Combine an ISD::ABS node. Returns the replacement value, or an empty
/// SDValue when none of the folds below applies.
SDValue DAGCombiner::visitABS(SDNode *N) {
  EVT ResultVT = N->getValueType(0);
  SDValue Src = N->getOperand(0);

  // fold (abs c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
    return DAG.getNode(ISD::ABS, SDLoc(N), ResultVT, Src);

  // fold (abs (abs x)) -> (abs x)
  // fold (abs x) -> x iff not-negative
  // Both folds hand back the operand unchanged, so they share one exit.
  if (Src.getOpcode() == ISD::ABS || DAG.SignBitIsZero(Src))
    return Src;

  return SDValue();
}
/// Combine an ISD::BSWAP node. Returns the replacement value, or an empty
/// SDValue when no fold applies.
SDValue DAGCombiner::visitBSWAP(SDNode *N) {
  SDValue Src = N->getOperand(0);

  // fold (bswap c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
    return DAG.getNode(ISD::BSWAP, SDLoc(N), N->getValueType(0), Src);

  // fold (bswap (bswap x)) -> x
  const bool IsDoubleSwap = Src.getOpcode() == ISD::BSWAP;
  return IsDoubleSwap ? Src.getOperand(0) : SDValue();
}
/// Combine an ISD::BITREVERSE node. Returns the replacement value, or an
/// empty SDValue when no fold applies.
SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
  SDValue Src = N->getOperand(0);

  // fold (bitreverse c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
    return DAG.getNode(ISD::BITREVERSE, SDLoc(N), N->getValueType(0), Src);

  // fold (bitreverse (bitreverse x)) -> x
  const bool IsDoubleReverse = Src.getOpcode() == ISD::BITREVERSE;
  return IsDoubleReverse ? Src.getOperand(0) : SDValue();
}
/// Combine an ISD::CTLZ node. Returns the replacement value, or an empty
/// SDValue when no fold applies.
SDValue DAGCombiner::visitCTLZ(SDNode *N) {
  EVT ResultVT = N->getValueType(0);
  SDValue Src = N->getOperand(0);

  // fold (ctlz c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
    return DAG.getNode(ISD::CTLZ, SDLoc(N), ResultVT, Src);

  // If the value is known never to be zero, switch to the undef version.
  // Once LegalOperations is set, that opcode must be legal for this type.
  const bool MayUseZeroUndef =
      !LegalOperations || TLI.isOperationLegal(ISD::CTLZ_ZERO_UNDEF, ResultVT);
  if (MayUseZeroUndef && DAG.isKnownNeverZero(Src))
    return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), ResultVT, Src);

  return SDValue();
}
/// Combine an ISD::CTLZ_ZERO_UNDEF node. The only fold is for a constant
/// operand; otherwise an empty SDValue is returned.
SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
  SDValue Src = N->getOperand(0);
  if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
    return SDValue();

  // fold (ctlz_zero_undef c1) -> c2
  return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), N->getValueType(0), Src);
}
/// Combine an ISD::CTTZ node. Returns the replacement value, or an empty
/// SDValue when no fold applies.
SDValue DAGCombiner::visitCTTZ(SDNode *N) {
  EVT ResultVT = N->getValueType(0);
  SDValue Src = N->getOperand(0);

  // fold (cttz c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
    return DAG.getNode(ISD::CTTZ, SDLoc(N), ResultVT, Src);

  // If the value is known never to be zero, switch to the undef version.
  // Once LegalOperations is set, that opcode must be legal for this type.
  const bool MayUseZeroUndef =
      !LegalOperations || TLI.isOperationLegal(ISD::CTTZ_ZERO_UNDEF, ResultVT);
  if (MayUseZeroUndef && DAG.isKnownNeverZero(Src))
    return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), ResultVT, Src);

  return SDValue();
}
/// Combine an ISD::CTTZ_ZERO_UNDEF node. The only fold is for a constant
/// operand; otherwise an empty SDValue is returned.
SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
  SDValue Src = N->getOperand(0);
  if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
    return SDValue();

  // fold (cttz_zero_undef c1) -> c2
  return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), N->getValueType(0), Src);
}
/// Combine an ISD::CTPOP node. The only fold is for a constant operand;
/// otherwise an empty SDValue is returned.
SDValue DAGCombiner::visitCTPOP(SDNode *N) {
  SDValue Src = N->getOperand(0);
  if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
    return SDValue();

  // fold (ctpop c1) -> c2
  return DAG.getNode(ISD::CTPOP, SDLoc(N), N->getValueType(0), Src);
}
/// Decide whether a select on LHS/RHS may be turned into a min/max node.
/// All of the following must hold: the NoSignedZerosFPMath target option is
/// set, the type of LHS is floating point, the target reports the combine as
/// profitable for that type, and both values are known never to be NaN.
// FIXME: This should be checking for no signed zeros on individual operands, as
// well as no nans.
static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS,
                                         SDValue RHS,
                                         const TargetLowering &TLI) {
  const TargetOptions &TargetOpts = DAG.getTarget().Options;
  EVT OpVT = LHS.getValueType();

  // The checks run in the same order as a single short-circuit conjunction.
  if (!TargetOpts.NoSignedZerosFPMath)
    return false;
  if (!OpVT.isFloatingPoint())
    return false;
  if (!TLI.isProfitableToCombineMinNumMaxNum(OpVT))
    return false;
  if (!DAG.isKnownNeverNaN(LHS))
    return false;
  return DAG.isKnownNeverNaN(RHS);
}
/// Generate Min/Max node
/// LHS/RHS are the compared values, True/False are the select arms and CC is
/// the comparison. The arms must be exactly {LHS, RHS} in either order,
/// otherwise an empty SDValue is returned. For the handled condition codes the
/// *_IEEE opcode is tried first (legal-or-custom on VT), then the plain
/// FMINNUM/FMAXNUM opcode (legal-or-custom on the type VT transforms to).
static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
SDValue RHS, SDValue True, SDValue False,
ISD::CondCode CC, const TargetLowering &TLI,
SelectionDAG &DAG) {
if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
return SDValue();
EVT TransformVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
switch (CC) {
case ISD::SETLT:
case ISD::SETLE:
case ISD::SETULE: {
// Since it's known never nan to get here already, either fminnum or
// fminnum_ieee are OK. Try the ieee version first, since fminnum is
// expanded in terms of it.
// LHS == True means the select keeps the smaller value (min); with the arms
// swapped it keeps the larger one (max).
unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
return DAG.getNode(Opcode, DL, VT, LHS, RHS);
return SDValue();
case ISD::SETGT:
case ISD::SETGE:
case ISD::SETUGE: {
// Mirror image of the less-than cases: LHS == True selects the larger value.
unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
return DAG.getNode(Opcode, DL, VT, LHS, RHS);
return SDValue();
return SDValue();
/// If a (v)select has a condition value that is a sign-bit test, try to smear
/// the condition operand sign-bit across the value width and use it as a mask.
/// Both select arms are asserted to be constants. Returns an empty SDValue
/// unless the condition is a single-use SETCC whose first operand has the same
/// type as the select and one of the two patterns below matches.
static SDValue foldSelectOfConstantsUsingSra(SDNode *N, SelectionDAG &DAG) {
  SDValue Cond = N->getOperand(0);
  SDValue C1 = N->getOperand(1);
  SDValue C2 = N->getOperand(2);
  assert(isConstantOrConstantVector(C1) && isConstantOrConstantVector(C2) &&
         "Expected select-of-constants");

  EVT VT = N->getValueType(0);
  const bool IsUsableCompare = Cond.getOpcode() == ISD::SETCC &&
                               Cond.hasOneUse() &&
                               VT == Cond.getOperand(0).getValueType();
  if (!IsUsableCompare)
    return SDValue();

  // The inverted-condition + commuted-select variants of these patterns are
  // canonicalized to these forms in IR.
  SDValue X = Cond.getOperand(0);
  SDValue CondC = Cond.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

  // Builds (X >>s (scalar bit width - 1)). It is only called after a pattern
  // has matched, so no nodes are created on the failure path.
  auto BuildSignSmear = [&](const SDLoc &Loc) {
    SDValue ShAmtC =
        DAG.getConstant(X.getScalarValueSizeInBits() - 1, Loc, VT);
    return DAG.getNode(ISD::SRA, Loc, VT, X, ShAmtC);
  };

  if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(CondC) &&
      isAllOnesOrAllOnesSplat(C2)) {
    // i32 X > -1 ? C1 : -1 --> (X >>s 31) | C1
    SDLoc DL(N);
    return DAG.getNode(ISD::OR, DL, VT, BuildSignSmear(DL), C1);
  }

  if (CC == ISD::SETLT && isNullOrNullSplat(CondC) && isNullOrNullSplat(C2)) {
    // i8 X < 0 ? C1 : 0 --> (X >>s 7) & C1
    SDLoc DL(N);
    return DAG.getNode(ISD::AND, DL, VT, BuildSignSmear(DL), C1);
  }

  return SDValue();
}
/// Try to replace a scalar select whose true/false operands are both
/// ConstantSDNodes with arithmetic or logic on the condition (not, zext, sext,
/// add, shl, xor). Returns an empty SDValue when the result type is not an
/// integer, when either arm is not a ConstantSDNode, or when no pattern
/// matches.
SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
SDValue Cond = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
EVT VT = N->getValueType(0);
EVT CondVT = Cond.getValueType();
SDLoc DL(N);
if (!VT.isInteger())
return SDValue();
auto *C1 = dyn_cast<ConstantSDNode>(N1);
auto *C2 = dyn_cast<ConstantSDNode>(N2);
if (!C1 || !C2)
return SDValue();
// Only do this before legalization to avoid conflicting with target-specific
// transforms in the other direction (create a select from a zext/sext). There
// is also a target-independent combine here in DAGCombiner in the other
// direction for (select Cond, -1, 0) when the condition is not i1.
if (CondVT == MVT::i1 && !LegalOperations) {
if (C1->isNullValue() && C2->isOne()) {
// select Cond, 0, 1 --> zext (!Cond)
SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
// No extension is needed when the result is itself i1.
if (VT != MVT::i1)
NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond);
return NotCond;
if (C1->isNullValue() && C2->isAllOnesValue()) {
// select Cond, 0, -1 --> sext (!Cond)
SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
if (VT != MVT::i1)
NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond);
return NotCond;
if (C1->isOne() && C2->isNullValue()) {
// select Cond, 1, 0 --> zext (Cond)
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
return Cond;
if (C1->isAllOnesValue() && C2->isNullValue()) {
// select Cond, -1, 0 --> sext (Cond)
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
return Cond;
// Use a target hook because some targets may prefer to transform in the
// other direction.
if (TLI.convertSelectOfConstantsToMath(VT)) {
// For any constants that differ by 1, we can transform the select into an
// extend and add.
const APInt &C1Val = C1->getAPIntValue();
const APInt &C2Val = C2->getAPIntValue();
if (C1Val - 1 == C2Val) {
// select Cond, C1, C1-1 --> add (zext Cond), C1-1
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
// N2 is the false-arm constant, i.e. C1-1 here.
return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
if (C1Val + 1 == C2Val) {
// select Cond, C1, C1+1 --> add (sext Cond), C1+1
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
// N2 is the false-arm constant, i.e. C1+1 here.
return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
// select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2)
if (C1Val.isPowerOf2() && C2Val.isNullValue()) {
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
SDValue ShAmtC = DAG.getConstant(C1Val.exactLogBase2(), DL, VT);
return DAG.getNode(ISD::SHL, DL, VT, Cond, ShAmtC);
if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
return V;
return SDValue();
// fold (select Cond, 0, 1) -> (xor Cond, 1)
// We can't do this reliably if integer based booleans have different contents
// to floating point based booleans. This is because we can't tell whether we
// have an integer-based boolean or a floating-point-based boolean unless we
// can find the SETCC that produced it and inspect its operands. This is
// fairly easy if C is the SETCC node, but it can potentially be
// undiscoverable (or not reasonably discoverable). For example, it could be
// in another basic block or it could require searching a complicated
// expression.
if (CondVT.isInteger() &&
TLI.getBooleanContents(/*isVec*/false, /*isFloat*/true) ==
TargetLowering::ZeroOrOneBooleanContent &&
TLI.getBooleanContents(/*isVec*/false, /*isFloat*/false) ==
TargetLowering::ZeroOrOneBooleanContent &&
C1->isNullValue() && C2->isOne()) {
SDValue NotCond =
DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));
// Resize to the select's type only when its width differs from the
// condition's.
if (VT.bitsEq(CondVT))
return NotCond;
return DAG.getZExtOrTrunc(NotCond, DL, VT);
return SDValue();
/// Combine an ISD::SELECT node. Operands: N0 is the condition, N1 the true
/// value, N2 the false value. Returns the replacement value when a fold
/// applies and SDValue(N, 0) when SimplifySelectOps returns true.
SDValue DAGCombiner::visitSELECT(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
EVT VT = N->getValueType(0);
EVT VT0 = N0.getValueType();
SDLoc DL(N);
SDNodeFlags Flags = N->getFlags();
if (SDValue V = DAG.simplifySelect(N0, N1, N2))
return V;
// fold (select X, X, Y) -> (or X, Y)
// fold (select X, 1, Y) -> (or X, Y)
// The i1 boolean folds in this function all require the condition and the
// result to be i1.
if (VT == VT0 && VT == MVT::i1 && (N0 == N1 || isOneConstant(N1)))
return DAG.getNode(ISD::OR, DL, VT, N0, N2);
if (SDValue V = foldSelectOfConstants(N))
return V;
// fold (select C, 0, X) -> (and (not C), X)
if (VT == VT0 && VT == MVT::i1 && isNullConstant(N1)) {
SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
return DAG.getNode(ISD::AND, DL, VT, NOTNode, N2);
// fold (select C, X, 1) -> (or (not C), X)
if (VT == VT0 && VT == MVT::i1 && isOneConstant(N2)) {
SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
return DAG.getNode(ISD::OR, DL, VT, NOTNode, N1);
// fold (select X, Y, X) -> (and X, Y)
// fold (select X, Y, 0) -> (and X, Y)
if (VT == VT0 && VT == MVT::i1 && (N0 == N2 || isNullConstant(N2)))
return DAG.getNode(ISD::AND, DL, VT, N0, N1);
// If we can fold this based on the true/false value, do so.
if (SimplifySelectOps(N, N1, N2))
return SDValue(N, 0); // Don't revisit N.
if (VT0 == MVT::i1) {
// The code in this block deals with the following 2 equivalences:
// select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
// select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
// The target can specify its preferred form with the
// shouldNormalizeToSelectSequence() callback. However we always transform
// to the right anyway if we find the inner select exists in the DAG anyway
// and we always transform to the left side if we know that we can further
// optimize the combination of the conditions.
bool normalizeToSequence =
TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
// select (and Cond0, Cond1), X, Y
// -> select Cond0, (select Cond1, X, Y), Y
if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
SDValue Cond0 = N0->getOperand(0);
SDValue Cond1 = N0->getOperand(1);
// The inner select is built speculatively; a non-empty use list is the test
// for "the inner select exists in the DAG" described above.
SDValue InnerSelect =
DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2, Flags);
if (normalizeToSequence || !InnerSelect.use_empty())
return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0,
InnerSelect, N2, Flags);
// Cleanup on failure.
// NOTE(review): the statement guarded by this check is not visible here --
// confirm against the full file.
if (InnerSelect.use_empty())
// select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
SDValue Cond0 = N0->getOperand(0);
SDValue Cond1 = N0->getOperand(1);
SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(),
Cond1, N1, N2, Flags);
if (normalizeToSequence || !InnerSelect.use_empty())
return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1,
InnerSelect, Flags);
// Cleanup on failure.
if (InnerSelect.use_empty())
// select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y
if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) {
SDValue N1_0 = N1->getOperand(0);
SDValue N1_1 = N1->getOperand(1);
SDValue N1_2 = N1->getOperand(2);
// The inner false arm must equal the outer false arm, and both conditions must
// have the same type so they can be and-ed.
if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
// Create the actual and node if we can generate good code for it.
if (!normalizeToSequence) {
SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1,
N2, Flags);
// Otherwise see if we can optimize the "and" to a better pattern.
if (SDValue Combined = visitANDLike(N0, N1_0, N)) {
return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1,
N2, Flags);
// select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) {
SDValue N2_0 = N2->getOperand(0);
SDValue N2_1 = N2->getOperand(1);
SDValue N2_2 = N2->getOperand(2);
// The inner true arm must equal the outer true arm, and both conditions must
// have the same type so they can be or-ed.
if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
// Create the actual or node if we can generate good code for it.
if (!normalizeToSequence) {
SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1,
N2_2, Flags);
// Otherwise see if we can optimize to a better pattern.
if (SDValue Combined = visitORLike(N0, N2_0, N))
return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1,
N2_2, Flags);
// select (not Cond), N1, N2 -> select Cond, N2, N1
if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) {
SDValue SelectOp = DAG.getSelect(DL, VT, F, N2, N1);
return SelectOp;
// Fold selects based on a setcc into other things, such as min/max/abs.
if (N0.getOpcode() == ISD::SETCC) {
SDValue Cond0 = N0.getOperand(0), Cond1 = N0.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
// select (fcmp lt x, y), x, y -> fminnum x, y
// select (fcmp gt x, y), x, y -> fmaxnum x, y
// This is OK if we don't care what happens if either operand is a NaN.
if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, TLI))
if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, Cond0, Cond1, N1, N2,
return FMinMax;
// Use 'unsigned add with overflow' to optimize an unsigned saturating add.
// This is conservatively limited to pre-legal-operations to give targets
// a chance to reverse the transform if they want to do that. Also, it is
// unlikely that the pattern would be formed late, so it's probably not
// worth going through the other checks.
if (!LegalOperations && TLI.isOperationLegalOrCustom(ISD::UADDO, VT) &&
CC == ISD::SETUGT && N0.hasOneUse() && isAllOnesConstant(N1) &&
N2.getOpcode() == ISD::ADD && Cond0 == N2.getOperand(0)) {
auto *C = dyn_cast<ConstantSDNode>(N2.getOperand(1));
auto *NotC = dyn_cast<ConstantSDNode>(Cond1);
// The compare constant must be the bitwise complement of the add constant.
if (C && NotC && C->getAPIntValue() == ~NotC->getAPIntValue()) {
// select (setcc Cond0, ~C, ugt), -1, (add Cond0, C) -->
// uaddo Cond0, C; select uaddo.1, -1, uaddo.0
// The IR equivalent of this transform would have this form:
// %a = add %x, C
// %c = icmp ugt %x, ~C
// %r = select %c, -1, %a
// =>
// %u = call {iN,i1} llvm.uadd.with.overflow(%x, C)
// %u0 = extractvalue %u, 0
// %u1 = extractvalue %u, 1
// %r = select %u1, -1, %u0
// UADDO yields two results: the sum (type VT) and the overflow bit (type VT0).
SDVTList VTs = DAG.getVTList(VT, VT0);
SDValue UAO = DAG.getNode(ISD::UADDO, DL, VTs, Cond0, N2.getOperand(1));
return DAG.getSelect(DL, VT, UAO.getValue(1), N1, UAO.getValue(0));
if (TLI.isOperationLegal(ISD::SELECT_CC, VT) ||
(!LegalOperations &&
TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) {
// Any flags available in a select/setcc fold will be on the setcc as they
// migrated from fcmp
Flags = N0.getNode()->getFlags();
SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1,
N2, N0.getOperand(2));
return SelectNode;
return SimplifySelect(DL, N0, N1, N2);
return SDValue();
// This function assumes all the vselect's arguments are CONCAT_VECTOR
// nodes and that the condition is a BV of ConstantSDNodes (or undefs).
// It only handles two-operand concats and returns an empty SDValue when
// either concat has a different operand count or when the non-undef
// condition elements within one half are not all the same node.
static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
EVT VT = N->getValueType(0);
int NumElems = VT.getVectorNumElements();
assert(LHS.getOpcode() == ISD::CONCAT_VECTORS &&
RHS.getOpcode() == ISD::CONCAT_VECTORS &&
Cond.getOpcode() == ISD::BUILD_VECTOR);
// CONCAT_VECTOR can take an arbitrary number of arguments. We only care about
// binary ones here.
if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2)
return SDValue();
// We're sure we have an even number of elements due to the
// concat_vectors we have as arguments to vselect.
// Skip BV elements until we find one that's not an UNDEF
// After we find an UNDEF element, keep looping until we get to half the
// length of the BV and see if all the non-undef nodes are the same.
ConstantSDNode *BottomHalf = nullptr;
for (int i = 0; i < NumElems / 2; ++i) {
// NOTE(review): the comment above says undef elements are skipped, but no
// statement is visible after this check -- confirm against the full file.
if (Cond->getOperand(i)->isUndef())
if (BottomHalf == nullptr)
BottomHalf = cast<ConstantSDNode>(Cond.getOperand(i));
else if (Cond->getOperand(i).getNode() != BottomHalf)
return SDValue();
// Do the same for the second half of the BuildVector
ConstantSDNode *TopHalf = nullptr;
for (int i = NumElems / 2; i < NumElems; ++i) {
if (Cond->getOperand(i)->isUndef())
if (TopHalf == nullptr)
TopHalf = cast<ConstantSDNode>(Cond.getOperand(i));
else if (Cond->getOperand(i).getNode() != TopHalf)
return SDValue();
assert(TopHalf && BottomHalf &&
"One half of the selector was all UNDEFs and the other was all the "
"same value. This should have been addressed before this function.");
// For each half, a zero condition constant picks the operand from RHS (the
// vselect's false value) and a non-zero one picks it from LHS.
// NOTE(review): the leading arguments of this getNode call (opcode, location,
// type) are not visible here -- confirm against the full file.
return DAG.getNode(
BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0),
TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
/// Combine a masked scatter node. A scatter whose mask is an all-zeros build
/// vector is replaced by its input chain; otherwise an empty SDValue is
/// returned.
SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
  SDValue Mask = MSC->getMask();
  SDValue Chain = MSC->getChain();

  // Zap scatters with a zero mask.
  if (ISD::isBuildVectorAllZeros(Mask.getNode()))
    return Chain;

  return SDValue();
}
/// Combine a masked store node. A store whose mask is an all-zeros build
/// vector is replaced by its input chain. Otherwise the node is offered to the
/// pre- and post-indexed load/store combines, and SDValue(N, 0) is returned if
/// either of them succeeds.
SDValue DAGCombiner::visitMSTORE(SDNode *N) {
  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
  SDValue Mask = MST->getMask();
  SDValue Chain = MST->getChain();

  // Zap masked stores with a zero mask.
  if (ISD::isBuildVectorAllZeros(Mask.getNode()))
    return Chain;

  // Try transforming N to an indexed store.
  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
    return SDValue(N, 0);

  return SDValue();
}
/// Combine a masked gather node. A gather whose mask is an all-zeros build
/// vector is replaced, via CombineTo, by its pass-through value and its input
/// chain; otherwise an empty SDValue is returned.
SDValue DAGCombiner::visitMGATHER(SDNode *N) {
  MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
  SDValue Mask = MGT->getMask();

  // Zap gathers with a zero mask.
  if (ISD::isBuildVectorAllZeros(Mask.getNode()))
    return CombineTo(N, MGT->getPassThru(), MGT->getChain());

  return SDValue();
}
/// Combine a masked load node. A load whose mask is an all-zeros build vector
/// is replaced, via CombineTo, by its pass-through value and its input chain.
/// Otherwise the node is offered to the pre- and post-indexed load/store
/// combines, and SDValue(N, 0) is returned if either of them succeeds.
SDValue DAGCombiner::visitMLOAD(SDNode *N) {
  MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
  SDValue Mask = MLD->getMask();

  // Zap masked loads with a zero mask.
  if (ISD::isBuildVectorAllZeros(Mask.getNode()))
    return CombineTo(N, MLD->getPassThru(), MLD->getChain());

  // Try transforming N to an indexed load.
  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
    return SDValue(N, 0);

  return SDValue();
}
/// A vector select of 2 constant vectors can be simplified to math/logic to
/// avoid a variable select instruction and possibly avoid constant loads.
/// Among the visible preconditions: the condition has a single use and 1-bit
/// elements, the target opts in through convertSelectOfConstantsToMath, and
/// N1 is a build vector of constants. Returns an empty SDValue when they do
/// not hold or no pattern matches.
SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
SDValue Cond = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
EVT VT = N->getValueType(0);
// NOTE(review): the final term of this condition is not visible here --
// confirm against the full file.
if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 ||
!TLI.convertSelectOfConstantsToMath(VT) ||
!ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) ||
return SDValue();
// Check if we can use the condition value to increment/decrement a single
// constant value. This simplifies a select to an add and removes a constant
// load/materialization from the general case.
bool AllAddOne = true;
bool AllSubOne = true;
unsigned Elts = VT.getVectorNumElements();
for (unsigned i = 0; i != Elts; ++i) {
SDValue N1Elt = N1.getOperand(i);
SDValue N2Elt = N2.getOperand(i);
// NOTE(review): no statements are visible after the next two checks --
// confirm how undef and mismatched-type elements are handled.
if (N1Elt.isUndef() || N2Elt.isUndef())
if (N1Elt.getValueType() != N2Elt.getValueType())
const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();
const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue();
// AllAddOne stays set while every true element is the false element plus one;
// AllSubOne while it is the false element minus one.
if (C1 != C2 + 1)
AllAddOne = false;
if (C1 != C2 - 1)
AllSubOne = false;
// Further simplifications for the extra-special cases where the constants are
// all 0 or all -1 should be implemented as folds of these patterns.
SDLoc DL(N);
if (AllAddOne || AllSubOne) {
// vselect <N x i1> Cond, C+1, C --> add (zext Cond), C
// vselect <N x i1> Cond, C-1, C --> add (sext Cond), C
auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond);
return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2);
// select Cond, Pow2C, 0 --> (zext Cond) << log2(Pow2C)
APInt Pow2C;
if (ISD::isConstantSplatVector(N1.getNode(), Pow2C) && Pow2C.isPowerOf2() &&
isNullOrNullSplat(N2)) {
SDValue ZextCond = DAG.getZExtOrTrunc(Cond, DL, VT);
SDValue ShAmtC = DAG.getConstant(Pow2C.exactLogBase2(), DL, VT);
return DAG.getNode(ISD::SHL, DL, VT, ZextCond, ShAmtC);
if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
return V;
// The general case for select-of-constants:
// vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
// ...but that only makes sense if a vselect is slower than 2 logic ops, so
// leave that to a machine-specific pass.
return SDValue();
/// Combine an ISD::VSELECT node. Operands: N0 is the condition, N1 the true
/// value, N2 the false value. Returns the replacement value when a fold
/// applies, SDValue(N, 0) when SimplifySelectOps returns true, and an empty
/// SDValue otherwise.
SDValue DAGCombiner::visitVSELECT(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
EVT VT = N->getValueType(0);
SDLoc DL(N);
if (SDValue V = DAG.simplifySelect(N0, N1, N2))
return V;
// vselect (not Cond), N1, N2 -> vselect Cond, N2, N1
if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
return DAG.getSelect(DL, VT, F, N2, N1);
// Canonicalize integer abs.
// vselect (setg[te] X, 0), X, -X ->
// vselect (setgt X, -1), X, -X ->
// vselect (setl[te] X, 0), -X, X ->
// Y = sra (X, size(X)-1); xor (add (X, Y), Y)
if (N0.getOpcode() == ISD::SETCC) {
SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
bool isAbs = false;
bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
// Greater-than forms: the true arm is X and the false arm is (sub ?, X); isAbs
// is set only if that subtraction's first operand is all zeros.
if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
(ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) &&
N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1))
isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode());
// Less-than forms: the arms are swapped relative to the case above.
else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) &&
N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1))
isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
if (isAbs) {
// Use an ABS node when the target has it as legal or custom; otherwise emit
// the sra/add/xor sequence from the comment above.
if (TLI.isOperationLegalOrCustom(ISD::ABS, VT))
return DAG.getNode(ISD::ABS, DL, VT, LHS);
SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, LHS,
DAG.getConstant(VT.getScalarSizeInBits() - 1,
DL, getShiftAmountTy(VT)));
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
// vselect (fcmp lt x, y), x, y -> fminnum x, y
// vselect (fcmp gt x, y), x, y -> fmaxnum x, y
// This is OK if we don't care about what happens if either operand is a
// NaN.
if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, TLI)) {
if (SDValue FMinMax =
combineMinNumMaxNum(DL, VT, LHS, RHS, N1, N2, CC, TLI, DAG))
return FMinMax;
// If this select has a condition (setcc) with narrower operands than the
// select, try to widen the compare to match the select width.
// TODO: This should be extended to handle any constant.
// TODO: This could be extended to handle non-loading patterns, but that
// requires thorough testing to avoid regressions.
if (isNullOrNullSplat(RHS)) {
EVT NarrowVT = LHS.getValueType();
EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger();
EVT SetCCVT = getSetCCResultType(LHS.getValueType());
unsigned SetCCWidth = SetCCVT.getScalarSizeInBits();
unsigned WideWidth = WideVT.getScalarSizeInBits();
// The signedness of the compare selects both the extending-load kind checked
// here and the extension opcode used below.
bool IsSigned = isSignedIntSetCC(CC);
auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() &&
SetCCWidth != 1 && SetCCWidth < WideWidth &&
TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) &&
TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) {
// Both compare operands can be widened for free. The LHS can use an
// extended load, and the RHS is a constant:
// vselect (ext (setcc load(X), C)), N1, N2 -->
// vselect (setcc extload(X), C'), N1, N2
auto ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
SDValue WideLHS = DAG.getNode(ExtOpcode, DL, WideVT, LHS);
SDValue WideRHS = DAG.getNode(ExtOpcode, DL, WideVT, RHS);
EVT WideSetCCVT = getSetCCResultType(WideVT);
SDValue WideSetCC = DAG.getSetCC(DL, WideSetCCVT, WideLHS, WideRHS, CC);
return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
if (SimplifySelectOps(N, N1, N2))
return SDValue(N, 0); // Don't revisit N.
// Fold (vselect (build_vector all_ones), N1, N2) -> N1
if (ISD::isBuildVectorAllOnes(N0.getNode()))
return N1;
// Fold (vselect (build_vector all_zeros), N1, N2) -> N2
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return N2;
// The ConvertSelectToConcatVector function is assuming both the above
// checks for (vselect (build_vector all{ones,zeros}) ...) have been made
// and addressed.
if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
N2.getOpcode() == ISD::CONCAT_VECTORS &&
ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
if (SDValue CV = ConvertSelectToConcatVector(N, DAG))
return CV;
if (SDValue V = foldVSelectOfConstants(N))
return V;
return SDValue();
/// Combine a SELECT_CC node. Operands 0 and 1 are the values being compared,
/// operands 2 and 3 are the values selected when the comparison is true or
/// false respectively, and operand 4 carries the condition code.
SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
SDValue N3 = N->getOperand(3);
SDValue N4 = N->getOperand(4);
ISD::CondCode CC = cast<CondCodeSDNode>(N4)->get();
// fold select_cc lhs, rhs, x, x, cc -> x
if (N2 == N3)
return N2;
// Determine if the condition we're dealing with is constant
if (SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1,
CC, SDLoc(N), false)) {
if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode())) {
if (!SCCC->isNullValue())
return N2; // cond always true -> true val
return N3; // cond always false -> false val
} else if (SCC->isUndef()) {
// When the condition is UNDEF, just return the first operand. This is
// coherent with the DAG creation: no setcc node is created in this case.
return N2;
} else if (SCC.getOpcode() == ISD::SETCC) {
// Fold to a simpler select_cc
// The simplified setcc supplies the new compare operands and condition
// code; the selected values (N2/N3) are reused unchanged.
SDValue SelectOp = DAG.getNode(
ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0),
SCC.getOperand(1), N2, N3, SCC.getOperand(2));
return SelectOp;
// If we can fold this based on the true/false value, do so.
if (SimplifySelectOps(N, N2, N3))
return SDValue(N, 0); // Don't revisit N.
// fold select_cc into other things, such as min/max/abs
return SimplifySelectCC(SDLoc(N), N0, N1, N2, N3, CC);
/// Combine a SETCC node through SimplifySetCC. When the node's single use is
/// a BRCOND, the result is kept as (or rebuilt into) a SETCC where possible.
SDValue DAGCombiner::visitSETCC(SDNode *N) {
// setcc is very commonly used as an argument to brcond. This pattern
// also lends itself to numerous combines and, as a result, it is desired
// that we keep the argument to a brcond as a setcc as much as possible.
bool PreferSetCC =
N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BRCOND;
SDValue Combined = SimplifySetCC(
N->getValueType(0), N->getOperand(0), N->getOperand(1),
cast<CondCodeSDNode>(N->getOperand(2))->get(), SDLoc(N), !PreferSetCC);
// Nothing was simplified.
if (!Combined)
return SDValue();
// If we prefer to have a setcc, and we don't, we'll try our best to
// recreate one using rebuildSetCC.
if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) {
SDValue NewSetCC = rebuildSetCC(Combined);
// We don't have anything interesting to combine to.
if (NewSetCC.getNode() == N)
return SDValue();
if (NewSetCC)
return NewSetCC;
return Combined;
/// Combine a SETCCCARRY node. Operands are (LHS, RHS, Carry, Cond); the only
/// fold performed here is dropping a carry-in that is a null constant.
SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
SDValue Carry = N->getOperand(2);
SDValue Cond = N->getOperand(3);
// If Carry is false, fold to a regular SETCC.
if (isNullConstant(Carry))
return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond);
// No fold applies.
return SDValue();
/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
/// a build_vector of constants.
/// This function is called by the DAGCombiner when visiting sext/zext/aext
/// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
/// Vector extends are not folded if operations are legal; this is to
/// avoid introducing illegal build_vector dag nodes.
///
/// Returns the folded value, or an empty SDValue when no fold applies.
///
/// NOTE(review): several statements in this extract end in a dangling
/// operator (the assert, the build_vector guard, the IsZext initializer) and
/// closing braces are absent; the text looks truncated -- confirm against the
/// upstream file before relying on it.
static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
SelectionDAG &DAG, bool LegalTypes) {
unsigned Opcode = N->getOpcode();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
SDLoc DL(N);
assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
&& "Expected EXTEND dag node in input!");
// fold (sext c1) -> c1
// fold (zext c1) -> c1
// fold (aext c1) -> c1
if (isa<ConstantSDNode>(N0))
return DAG.getNode(Opcode, DL, VT, N0);
// fold (sext (select cond, c1, c2)) -> (select cond, sext c1, sext c2)
// fold (zext (select cond, c1, c2)) -> (select cond, zext c1, zext c2)
// fold (aext (select cond, c1, c2)) -> (select cond, sext c1, sext c2)
if (N0->getOpcode() == ISD::SELECT) {
SDValue Op1 = N0->getOperand(1);
SDValue Op2 = N0->getOperand(2);
// Both select arms must be constants; a zero-extend is additionally skipped
// when the target reports the zext as free.
if (isa<ConstantSDNode>(Op1) && isa<ConstantSDNode>(Op2) &&
(Opcode != ISD::ZERO_EXTEND || !TLI.isZExtFree(N0.getValueType(), VT))) {
// For any_extend, choose sign extension of the constants to allow a
// possible further transform to sign_extend_inreg, i.e.
// t1: i8 = select t0, Constant:i8<-1>, Constant:i8<0>
// t2: i64 = any_extend t1
// -->
// t3: i64 = select t0, Constant:i64<-1>, Constant:i64<0>
// -->
// t4: i64 = sign_extend_inreg t3
unsigned FoldOpc = Opcode;
if (FoldOpc == ISD::ANY_EXTEND)
return DAG.getSelect(DL, VT, N0->getOperand(0),
DAG.getNode(FoldOpc, DL, VT, Op1),
DAG.getNode(FoldOpc, DL, VT, Op2));
// fold (sext (build_vector AllConstants) -> (build_vector AllConstants)
// fold (zext (build_vector AllConstants) -> (build_vector AllConstants)
// fold (aext (build_vector AllConstants) -> (build_vector AllConstants)
EVT SVT = VT.getScalarType();
if (!(VT.isVector() && (!LegalTypes || TLI.isTypeLegal(SVT)) &&
return SDValue();
// We can fold this node into a build_vector.
unsigned VTBits = SVT.getSizeInBits();
unsigned EVTBits = N0->getValueType(0).getScalarSizeInBits();
SmallVector<SDValue, 8> Elts;
unsigned NumElts = VT.getVectorNumElements();
// For zero-extensions, UNDEF elements still guarantee to have the upper
// bits set to zero.
bool IsZext =
for (unsigned i = 0; i != NumElts; ++i) {
SDValue Op = N0.getOperand(i);
if (Op.isUndef()) {
// An undef element becomes a zero constant for zext, otherwise stays undef.
Elts.push_back(IsZext ? DAG.getConstant(0, DL, SVT) : DAG.getUNDEF(SVT));
SDLoc DL(Op);
// Get the constant value and if needed trunc it to the size of the type.
// Nodes like build_vector might have constants wider than the scalar type.
APInt C = cast<ConstantSDNode>(Op)->getAPIntValue().zextOrTrunc(EVTBits);
Elts.push_back(DAG.getConstant(C.sext(VTBits), DL, SVT));
Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT));
return DAG.getBuildVector(VT, DL, Elts);
// ExtendUsesToFormExtLoad - Try to extend the uses of a load to enable this:
// "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
// transformation. Returns true if the extensions are possible and the
// above-mentioned transformation is profitable.
//
// VT is the extended type, N is the extending node, N0 is the value whose
// other users are inspected, and ExtOpc is the extension opcode. ExtendNodes
// is an output list whose size feeds the final profitability check.
//
// NOTE(review): the statements controlled by several `if`s below (e.g. the
// `User == N` and result-number checks) and the closing braces are absent in
// this extract -- confirm against the upstream file.
static bool ExtendUsesToFormExtLoad(EVT VT, SDNode *N, SDValue N0,
unsigned ExtOpc,
SmallVectorImpl<SDNode *> &ExtendNodes,
const TargetLowering &TLI) {
bool HasCopyToRegUses = false;
bool isTruncFree = TLI.isTruncateFree(VT, N0.getValueType());
// Walk every user of the node that produces N0.
for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
UE = N0.getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
if (User == N)
if (UI.getUse().getResNo() != N0.getResNo())
// FIXME: Only extend SETCC N, N and SETCC N, c for now.
if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) {
ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC))
// Sign bits will be lost after a zext.
return false;
bool Add = false;
// Inspect both compare operands of the SETCC user.
for (unsigned i = 0; i != 2; ++i) {
SDValue UseOp = User->getOperand(i);
if (UseOp == N0)
if (!isa<ConstantSDNode>(UseOp))
return false;
Add = true;
if (Add)
// If truncates aren't free and there are users we can't
// extend, it isn't worthwhile.
if (!isTruncFree)
return false;
// Remember if this value is live-out.
if (User->getOpcode() == ISD::CopyToReg)
HasCopyToRegUses = true;
if (HasCopyToRegUses) {
bool BothLiveOut = false;
// Check whether result 0 of N itself also feeds a CopyToReg.
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
UI != UE; ++UI) {
SDUse &Use = UI.getUse();
if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) {
BothLiveOut = true;
if (BothLiveOut)
// Both unextended and extended values are live out. There had better be
// a good reason for the transformation.
return ExtendNodes.size();
return true;
/// Rewrite each SETCC in \p SetCCs via CombineTo with a new SETCC built from
/// an operand list assembled here. Operands equal to \p OrigLoad are
/// special-cased; other operands are wrapped in an \p ExtType node of
/// ExtLoad's value type.
///
/// NOTE(review): the statements between the operand checks (and the push of
/// the condition-code operand) appear to be missing from this extract --
/// confirm against the upstream file.
void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
SDValue OrigLoad, SDValue ExtLoad,
ISD::NodeType ExtType) {
// Extend SetCC uses if necessary.
SDLoc DL(ExtLoad);
for (SDNode *SetCC : SetCCs) {
SmallVector<SDValue, 4> Ops;
// Only the two compare operands (indices 0 and 1) are visited.
for (unsigned j = 0; j != 2; ++j) {
SDValue SOp = SetCC->getOperand(j);
if (SOp == OrigLoad)
Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp));
CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops));
// FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
/// Split a sign/zero extend of a vector load into several narrower extending
/// loads that are concatenated. Returns N (result 0) on success so it is not
/// revisited, or an empty SDValue when the pattern does not apply.
///
/// NOTE(review): the ExtType initializer, the bodies that fill `Loads` and
/// `Chains`, and closing braces are missing from this extract -- confirm
/// against the upstream file.
SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT DstVT = N->getValueType(0);
EVT SrcVT = N0.getValueType();
assert((N->getOpcode() == ISD::SIGN_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND) &&
"Unexpected node type (not an extend)!");
// fold (sext (load x)) to multiple smaller sextloads; same for zext.
// For example, on a target with legal v4i32, but illegal v8i32, turn:
// (v8i32 (sext (v8i16 (load x))))
// into:
// (v8i32 (concat_vectors (v4i32 (sextload x)),
// (v4i32 (sextload (x + 16)))))
// Where uses of the original load, i.e.:
// (v8i16 (load x))
// are replaced with:
// (v8i16 (truncate
// (v8i32 (concat_vectors (v4i32 (sextload x)),
// (v4i32 (sextload (x + 16)))))))
// This combine is only applicable to illegal, but splittable, vectors.
// All legal types, and illegal non-vector types, are handled elsewhere.
// This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
if (N0->getOpcode() != ISD::LOAD)
return SDValue();
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
// Require a simple, unindexed, non-extending load with a single use, a
// power-of-two vector destination type, and target approval.
if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
!N0.hasOneUse() || !LN0->isSimple() ||
!DstVT.isVector() || !DstVT.isPow2VectorType() ||
!TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
return SDValue();
SmallVector<SDNode *, 4> SetCCs;
if (!ExtendUsesToFormExtLoad(DstVT, N, N0, N->getOpcode(), SetCCs, TLI))
return SDValue();
ISD::LoadExtType ExtType =
// Try to split the vector types to get down to legal types.
EVT SplitSrcVT = SrcVT;
EVT SplitDstVT = DstVT;
while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
SplitSrcVT.getVectorNumElements() > 1) {
SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
// Give up if splitting never reached a legal/custom extending load.
if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
return SDValue();
assert(!DstVT.isScalableVector() && "Unexpected scalable vector type");
SDLoc DL(N);
const unsigned NumSplits =
DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
// Byte distance between consecutive split loads.
const unsigned Stride = SplitSrcVT.getStoreSize();
SmallVector<SDValue, 4> Loads;
SmallVector<SDValue, 4> Chains;
SDValue BasePtr = LN0->getBasePtr();
for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
const unsigned Offset = Idx * Stride;
const unsigned Align = MinAlign(LN0->getAlignment(), Offset);
SDValue SplitLoad = DAG.getExtLoad(
ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr,
LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align,
LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
BasePtr = DAG.getMemBasePlusOffset(BasePtr, Stride, DL);
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
// Simplify TF.
CombineTo(N, NewValue);
// Replace uses of the original load (before extension)
// with a truncate of the concatenated sextloaded vectors.
SDValue Trunc =
DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
ExtendSetCCUses(SetCCs, N0, NewValue, (ISD::NodeType)N->getOpcode());
CombineTo(N0.getNode(), Trunc, NewChain);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
// fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
// (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
//
// Returns N (result 0) when the transform was applied, otherwise an empty
// SDValue.
//
// NOTE(review): the ExtendUsesToFormExtLoad call, one getExtLoad argument and
// the shift-amount operand are cut off in this extract -- confirm against
// the upstream file.
SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) {
assert(N->getOpcode() == ISD::ZERO_EXTEND);
EVT VT = N->getValueType(0);
EVT OrigVT = N->getOperand(0).getValueType();
// Nothing to gain when the target reports this zext as free.
if (TLI.isZExtFree(OrigVT, VT))
return SDValue();
// and/or/xor
SDValue N0 = N->getOperand(0);
if (!(N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
N0.getOpcode() == ISD::XOR) ||
N0.getOperand(1).getOpcode() != ISD::Constant ||
(LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT)))
return SDValue();
// shl/shr
SDValue N1 = N0->getOperand(0);
if (!(N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) ||
N1.getOperand(1).getOpcode() != ISD::Constant ||
(LegalOperations && !TLI.isOperationLegal(N1.getOpcode(), VT)))
return SDValue();
// load
if (!isa<LoadSDNode>(N1.getOperand(0)))
return SDValue();
LoadSDNode *Load = cast<LoadSDNode>(N1.getOperand(0));
EVT MemVT = Load->getMemoryVT();
// The zero-extending load must be legal, and the existing load must be
// neither sign-extending nor indexed.
if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) ||
Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed())
return SDValue();
// If the shift op is SHL, the logic op must be AND, otherwise the result
// will be wrong.
if (N1.getOpcode() == ISD::SHL && N0.getOpcode() != ISD::AND)
return SDValue();
// Both the logic op and the shift must have a single use.
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
SmallVector<SDNode*, 4> SetCCs;
if (!ExtendUsesToFormExtLoad(VT, N1.getNode(), N1.getOperand(0),
return SDValue();
// Actually do the transformation.
SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load), VT,
Load->getChain(), Load->getBasePtr(),
Load->getMemoryVT(), Load->getMemOperand());
SDLoc DL1(N1);
SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad,
// Zero-extend the logic-op constant to the wide type.
APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
SDLoc DL0(N0);
SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift,
DAG.getConstant(Mask, DL0, VT));
ExtendSetCCUses(SetCCs, N1.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
CombineTo(N, And);
// If the loaded value has a single use, only the chain result needs
// redirecting; otherwise other users get a truncate of the extended load.
if (SDValue(Load, 0).hasOneUse()) {
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1));
} else {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(Load),
Load->getValueType(0), ExtLoad);
CombineTo(Load, Trunc, ExtLoad.getValue(1));
// N0 is dead at this point.
return SDValue(N,0); // Return N so it doesn't get rechecked!
/// If we're narrowing or widening the result of a vector select and the final
/// size is the same size as a setcc (compare) feeding the select, then try to
/// apply the cast operation to the select's operands because matching vector
/// sizes for a select condition and other operands should be more efficient.
///
/// \p Cast is the extend/truncate/fp-cast node whose operand 0 is inspected.
/// Returns the new VSELECT, or an empty SDValue when the pattern does not
/// match.
SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
unsigned CastOpcode = Cast->getOpcode();
assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND ||
CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND ||
CastOpcode == ISD::FP_ROUND) &&
"Unexpected opcode for vector select narrowing/widening");
// We only do this transform before legal ops because the pattern may be
// obfuscated by target-specific operations after legalization. Do not create
// an illegal select op, however, because that may be difficult to lower.
EVT VT = Cast->getValueType(0);
if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
return SDValue();
// The cast operand must be a single-use VSELECT whose condition is a SETCC.
SDValue VSel = Cast->getOperand(0);
if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() ||
VSel.getOperand(0).getOpcode() != ISD::SETCC)
return SDValue();
// Does the setcc have the same vector size as the casted select?
SDValue SetCC = VSel.getOperand(0);
EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType());
if (SetCCVT.getSizeInBits() != VT.getSizeInBits())
return SDValue();
// cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B)
SDValue A = VSel.getOperand(1);
SDValue B = VSel.getOperand(2);
SDValue CastA, CastB;
SDLoc DL(Cast);
if (CastOpcode == ISD::FP_ROUND) {
// FP_ROUND (fptrunc) has an extra flag operand to pass along.
CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1));
CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1));
} else {
CastA = DAG.getNode(CastOpcode, DL, VT, A);
CastB = DAG.getNode(CastOpcode, DL, VT, B);
return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB);
// fold ([s|z]ext ([s|z]extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
// fold ([s|z]ext ( extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
//
// N is the extend node, N0 its operand, VT the result type and ExtLoadType
// the extending-load kind to create (SEXTLOAD or ZEXTLOAD). Returns an empty
// SDValue when N0 is not a suitable single-use, unindexed extending load.
//
// NOTE(review): the statement guarded by `LN0->use_empty()` appears to be
// missing from this extract -- confirm against the upstream file.
static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner,
const TargetLowering &TLI, EVT VT,
bool LegalOperations, SDNode *N,
SDValue N0, ISD::LoadExtType ExtLoadType) {
SDNode *N0Node = N0.getNode();
// True when N0 is already an extending load of the requested signedness.
bool isAExtLoad = (ExtLoadType == ISD::SEXTLOAD) ? ISD::isSEXTLoad(N0Node)
: ISD::isZEXTLoad(N0Node);
if ((!isAExtLoad && !ISD::isEXTLoad(N0Node)) ||
!ISD::isUNINDEXEDLoad(N0Node) || !N0.hasOneUse())
return SDValue();
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
EVT MemVT = LN0->getMemoryVT();
// After operation legalization, for non-simple loads, or for vectors, the
// wider extending load must be legal for the target.
if ((LegalOperations || !LN0->isSimple() ||
VT.isVector()) &&
!TLI.isLoadExtLegal(ExtLoadType, VT, MemVT))
return SDValue();
SDValue ExtLoad =
DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
LN0->getBasePtr(), MemVT, LN0->getMemOperand());
Combiner.CombineTo(N, ExtLoad);
// Redirect users of the old load's chain result to the new load's chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
if (LN0->use_empty())
return SDValue(N, 0); // Return N so it doesn't get rechecked!
// fold ([s|z]ext (load x)) -> ([s|z]ext (truncate ([s|z]extload x)))
// Only generate vector extloads when 1) they're legal, and 2) they are
// deemed desirable by the target.
//
// N is the extend node (opcode ExtOpc), N0 its operand, VT the result type
// and ExtLoadType the extending-load kind to create. Returns N (result 0)
// when the fold was applied, otherwise an empty SDValue.
//
// NOTE(review): the final argument of the getExtLoad call is cut off in this
// extract -- confirm against the upstream file.
static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner,
const TargetLowering &TLI, EVT VT,
bool LegalOperations, SDNode *N, SDValue N0,
ISD::LoadExtType ExtLoadType,
ISD::NodeType ExtOpc) {
// N0 must be an unindexed, non-extending load; after operation
// legalization, for vectors, or for non-simple loads the extending load
// must also be legal.
if (!ISD::isNON_EXTLoad(N0.getNode()) ||
!ISD::isUNINDEXEDLoad(N0.getNode()) ||
((LegalOperations || VT.isVector() ||
!cast<LoadSDNode>(N0)->isSimple()) &&
!TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType())))
return {};
bool DoXform = true;
SmallVector<SDNode *, 4> SetCCs;
// With multiple users, check that the other uses can be extended too.
if (!N0.hasOneUse())
DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ExtOpc, SetCCs, TLI);
if (VT.isVector())
DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
if (!DoXform)
return {};
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
LN0->getBasePtr(), N0.getValueType(),
Combiner.ExtendSetCCUses(SetCCs, N0, ExtLoad, ExtOpc);
// If the load value is used only by N, replace it via CombineTo N.
bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
Combiner.CombineTo(N, ExtLoad);
if (NoReplaceTrunc) {
// Only the chain result of the old load needs redirecting.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
} else {
// Other users of the narrow value get a truncate of the extended load.
SDValue Trunc =
DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad);
Combiner.CombineTo(LN0, Trunc, ExtLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
/// Fold an extend (opcode \p ExtOpc) of a single-use, non-extending masked
/// load into one extending masked load of type \p VT. The pass-through value
/// is extended with the same opcode. Returns the new masked load, or an empty
/// SDValue when the pattern or target legality checks fail.
static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG,
const TargetLowering &TLI, EVT VT,
SDNode *N, SDValue N0,
ISD::LoadExtType ExtLoadType,
ISD::NodeType ExtOpc) {
if (!N0.hasOneUse())
return SDValue();
MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0);
if (!Ld || Ld->getExtensionType() != ISD::NON_EXTLOAD)
return SDValue();
// The extending load must be legal and considered desirable by the target.
if (!TLI.isLoadExtLegal(ExtLoadType, VT, Ld->getValueType(0)))
return SDValue();
if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
return SDValue();
SDLoc dl(Ld);
SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru());
SDValue NewLoad = DAG.getMaskedLoad(
VT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(),
PassThru, Ld->getMemoryVT(), Ld->getMemOperand(), Ld->getAddressingMode(),
ExtLoadType, Ld->isExpandingLoad());
// Redirect users of the old load's chain (result 1) to the new load's chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1));
return NewLoad;
/// Fold a sext/zext of a single-use i1 `setgt X, -1` into a shift of
/// `not X`, when the extended type equals X's type. Returns the shift node,
/// or an empty SDValue when the pattern does not match or the target asks to
/// avoid the shift.
static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG,
bool LegalOperations) {
assert((N->getOpcode() == ISD::SIGN_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND) && "Expected sext or zext");
SDValue SetCC = N->getOperand(0);
// Only before operation legalization, on a single-use i1 SETCC.
if (LegalOperations || SetCC.getOpcode() != ISD::SETCC ||
!SetCC.hasOneUse() || SetCC.getValueType() != MVT::i1)
return SDValue();
SDValue X = SetCC.getOperand(0);
SDValue Ones = SetCC.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
EVT VT = N->getValueType(0);
EVT XVT = X.getValueType();
// setge X, C is canonicalized to setgt, so we do not need to match that
// pattern. The setlt sibling is folded in SimplifySelectCC() because it does
// not require the 'not' op.
if (CC == ISD::SETGT && isAllOnesConstant(Ones) && VT == XVT) {
// Invert and smear/shift the sign bit:
// sext i1 (setgt iN X, -1) --> sra (not X), (N - 1)
// zext i1 (setgt iN X, -1) --> srl (not X), (N - 1)
SDLoc DL(N);
// Shift amount that moves the sign bit down to bit 0.
unsigned ShCt = VT.getSizeInBits() - 1;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
SDValue NotX = DAG.getNOT(DL, X, VT);
SDValue ShiftAmount = DAG.getConstant(ShCt, DL, VT);
auto ShiftOpcode =
N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SRA : ISD::SRL;
return DAG.getNode(ShiftOpcode, DL, VT, NotX, ShiftAmount);
return SDValue();
/// Combine a SIGN_EXTEND node. The folds below are tried in order and the
/// first that applies is returned; an empty SDValue means no change.
///
/// NOTE(review): several calls in this extract are cut off after a trailing
/// comma or operator (e.g. the SIGN_EXTEND_INREG, tryToFoldExtOfLoad,
/// tryToFoldExtOfMaskedLoad and getExtLoad calls, and the "sign bit is known
/// zero" condition), and closing braces are absent -- confirm against the
/// upstream file.
SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
SDLoc DL(N);
if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
return Res;
// fold (sext (sext x)) -> (sext x)
// fold (sext (aext x)) -> (sext x)
if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));
if (N0.getOpcode() == ISD::TRUNCATE) {
// fold (sext (truncate (load x))) -> (sext (smaller load x))
// fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
SDNode *oye = N0.getOperand(0).getNode();
if (NarrowLoad.getNode() != N0.getNode()) {
CombineTo(N0.getNode(), NarrowLoad);
// CombineTo deleted the truncate, if needed, but not what's under it.
return SDValue(N, 0); // Return N so it doesn't get rechecked!
// See if the value being truncated is already sign extended. If so, just
// eliminate the trunc/sext pair.
SDValue Op = N0.getOperand(0);
// Scalar bit widths of the pre-truncate value, the truncated value and
// the final extended type.
unsigned OpBits = Op.getScalarValueSizeInBits();
unsigned MidBits = N0.getScalarValueSizeInBits();
unsigned DestBits = VT.getScalarSizeInBits();
unsigned NumSignBits = DAG.ComputeNumSignBits(Op);
if (OpBits == DestBits) {
// Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign
// bits, it is already ready.
if (NumSignBits > DestBits-MidBits)
return Op;
} else if (OpBits < DestBits) {
// Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign
// bits, just sext from i32.
if (NumSignBits > OpBits-MidBits)
return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);
} else {
// Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign
// bits, just truncate to i32.
if (NumSignBits > OpBits-MidBits)
return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
// fold (sext (truncate x)) -> (sextinreg x).
if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
N0.getValueType())) {
// Bring Op to the destination width first.
if (OpBits < DestBits)
Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
else if (OpBits > DestBits)
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
// Try to simplify (sext (load x)).
if (SDValue foldedExt =
tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
return foldedExt;
if (SDValue foldedExt =
tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::SEXTLOAD,
return foldedExt;
// fold (sext (load x)) to multiple smaller sextloads.
// Only on illegal but splittable vectors.
if (SDValue ExtLoad = CombineExtLoad(N))
return ExtLoad;
// Try to simplify (sext (sextload x)).
if (SDValue foldedExt = tryToFoldExtOfExtload(
DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::SEXTLOAD))
return foldedExt;
// fold (sext (and/or/xor (load x), cst)) ->
// (and/or/xor (sextload x), (sext cst))
if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
N0.getOpcode() == ISD::XOR) &&
isa<LoadSDNode>(N0.getOperand(0)) &&
N0.getOperand(1).getOpcode() == ISD::Constant &&
(!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
EVT MemVT = LN00->getMemoryVT();
// The sign-extending load must be legal, and the existing load must be
// neither zero-extending nor indexed.
if (TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT) &&
LN00->getExtensionType() != ISD::ZEXTLOAD && LN00->isUnindexed()) {
SmallVector<SDNode*, 4> SetCCs;
bool DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
if (DoXform) {
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN00), VT,
LN00->getChain(), LN00->getBasePtr(),
// Sign-extend the logic-op constant to the wide type.
APInt Mask = N0.getConstantOperandAPInt(1).sext(VT.getSizeInBits());
SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
ExtLoad, DAG.getConstant(Mask, DL, VT));
ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::SIGN_EXTEND);
// NOTE(review): despite its name, NoReplaceTruncAnd is true when N0 has
// more than one use, i.e. when the truncated replacement IS needed.
bool NoReplaceTruncAnd = !N0.hasOneUse();
bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
CombineTo(N, And);
// If N0 has multiple uses, change other uses as well.
if (NoReplaceTruncAnd) {
SDValue TruncAnd =
DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
CombineTo(N0.getNode(), TruncAnd);
if (NoReplaceTrunc) {
DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
} else {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
LN00->getValueType(0), ExtLoad);
CombineTo(LN00, Trunc, ExtLoad.getValue(1));
return SDValue(N,0); // Return N so it doesn't get rechecked!
if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
return V;
if (N0.getOpcode() == ISD::SETCC) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
EVT N00VT = N0.getOperand(0).getValueType();
// sext(setcc) -> sext_in_reg(vsetcc) for vectors.
// Only do this before legalize for now.
if (VT.isVector() && !LegalOperations &&
TLI.getBooleanContents(N00VT) ==
TargetLowering::ZeroOrNegativeOneBooleanContent) {
// On some architectures (such as SSE/NEON/etc) the SETCC result type is
// of the same size as the compared operands. Only optimize sext(setcc())
// if this is the case.
EVT SVT = getSetCCResultType(N00VT);
// If we already have the desired type, don't change it.
if (SVT != N0.getValueType()) {
// We know that the # elements of the results is the same as the
// # elements of the compare (and the # elements of the compare result
// for that matter). Check to see that they are the same size. If so,
// we know that the element size of the sext'd result matches the
// element size of the compare operands.
if (VT.getSizeInBits() == SVT.getSizeInBits())
return DAG.getSetCC(DL, VT, N00, N01, CC);
// If the desired elements are smaller or larger than the source
// elements, we can use a matching integer vector type and then
// truncate/sign extend.
EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
if (SVT == MatchingVecType) {
SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
return DAG.getSExtOrTrunc(VsetCC, DL, VT);
// sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0)
// Here, T can be 1 or -1, depending on the type of the setcc and
// getBooleanContents().
unsigned SetCCWidth = N0.getScalarValueSizeInBits();
// To determine the "true" side of the select, we need to know the high bit
// of the value returned by the setcc if it evaluates to true.
// If the type of the setcc is i1, then the true case of the select is just
// sext(i1 1), that is, -1.
// If the type of the setcc is larger (say, i8) then the value of the high
// bit depends on getBooleanContents(), so ask TLI for a real "true" value
// of the appropriate width.
SDValue ExtTrueVal = (SetCCWidth == 1)
? DAG.getAllOnesConstant(DL, VT)
: DAG.getBoolConstant(true, DL, VT, N00VT);
SDValue Zero = DAG.getConstant(0, DL, VT);
if (SDValue SCC =
SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
return SCC;
if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) {
EVT SetCCVT = getSetCCResultType(N00VT);
// Don't do this transform for i1 because there's a select transform
// that would reverse it.
// TODO: We should not do this transform at all without a target hook
// because a sext is likely cheaper than a select?
if (SetCCVT.getScalarSizeInBits() != 1 &&
(!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) {
SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC);
return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);
// fold (sext x) -> (zext x) if the sign bit is known zero.
if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);
if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;
// Eliminate this sign extend by doing a negation in the destination type:
// sext i32 (0 - (zext i8 X to i32)) to i64 --> 0 - (zext i8 X to i64)
if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
isNullOrNullSplat(N0.getOperand(0)) &&
N0.getOperand(1).getOpcode() == ISD::ZERO_EXTEND &&
TLI.isOperationLegalOrCustom(ISD::SUB, VT)) {
SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(1).getOperand(0), DL, VT);
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Zext);
// Eliminate this sign extend by doing a decrement in the destination type:
// sext i32 ((zext i8 X to i32) + (-1)) to i64 --> (zext i8 X to i64) + (-1)
if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() &&
isAllOnesOrAllOnesSplat(N0.getOperand(1)) &&
N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
TLI.isOperationLegalOrCustom(ISD::ADD, VT)) {
SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT);
return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT));
return SDValue();
// isTruncateOf - If N is a truncate of some other value, return true, record
// the value being truncated in Op and which of Op's bits are zero/one in Known.
// This function computes KnownBits to avoid a duplicated call to
// computeKnownBits in the caller.
//
// Besides a literal TRUNCATE, an i1-element `setne X, 0` (zero on either
// side) is also accepted when every bit of X other than bit 0 is known zero.
//
// NOTE(review): the `else` introducing the `return false` after the
// null-operand checks appears to be missing from this extract -- confirm
// against the upstream file.
static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
KnownBits &Known) {
if (N->getOpcode() == ISD::TRUNCATE) {
Op = N->getOperand(0);
Known = DAG.computeKnownBits(Op);
return true;
if (N.getOpcode() != ISD::SETCC ||
N.getValueType().getScalarType() != MVT::i1 ||
cast<CondCodeSDNode>(N.getOperand(2))->get() != ISD::SETNE)
return false;
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
assert(Op0.getValueType() == Op1.getValueType());
// Pick the operand that is compared against zero.
if (isNullOrNullSplat(Op0))
Op = Op1;
else if (isNullOrNullSplat(Op1))
Op = Op0;
return false;
Known = DAG.computeKnownBits(Op);
// True only if all bits above bit 0 are known to be zero.
return (Known.Zero | 1).isAllOnesValue();
/// Given an extending node with a pop-count operand, if the target does not
/// support a pop-count in the narrow source type but does support it in the
/// destination type, widen the pop-count to the destination type.
///
/// \p Extend must be a ZERO_EXTEND or ANY_EXTEND. Returns the widened CTPOP,
/// or an empty SDValue when the operand is not a single-use CTPOP or the
/// legality conditions are not met.
static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG) {
assert((Extend->getOpcode() == ISD::ZERO_EXTEND ||
Extend->getOpcode() == ISD::ANY_EXTEND) && "Expected extend op");
SDValue CtPop = Extend->getOperand(0);
if (CtPop.getOpcode() != ISD::CTPOP || !CtPop.hasOneUse())
return SDValue();
EVT VT = Extend->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Bail out if the narrow CTPOP is already legal/custom, or the wide one is
// not.
if (TLI.isOperationLegalOrCustom(ISD::CTPOP, CtPop.getValueType()) ||
!TLI.isOperationLegalOrCustom(ISD::CTPOP, VT))
return SDValue();
// zext (ctpop X) --> ctpop (zext X)
SDLoc DL(Extend);
SDValue NewZext = DAG.getZExtOrTrunc(CtPop.getOperand(0), DL, VT);
return DAG.getNode(ISD::CTPOP, DL, VT, NewZext);
/// Combine hook for ISD::ZERO_EXTEND nodes. Tries a sequence of folds on
/// (zext N0) in order and returns the first replacement found, SDValue(N, 0)
/// when N has already been replaced through CombineTo, or an empty SDValue
/// when no fold applies.
// NOTE(review): in this extract several statements end mid-expression and no
// closing braces are present, so the text below is not compilable as shown.
// Compare against the upstream file before editing the logic.
SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
return Res;
// fold (zext (zext x)) -> (zext x)
// fold (zext (aext x)) -> (zext x)
if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
// NOTE(review): the trailing argument(s) of this getNode call are missing here.
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT,
// fold (zext (truncate x)) -> (zext x) or
// (zext (truncate x)) -> (truncate x)
// This is valid when the truncated bits of x are already zero.
SDValue Op;
KnownBits Known;
if (isTruncateOf(DAG, N0, Op, Known)) {
// NOTE(review): the second arm of this conditional expression is missing here.
APInt TruncatedBits =
(Op.getScalarValueSizeInBits() == N0.getScalarValueSizeInBits()) ?
APInt(Op.getScalarValueSizeInBits(), 0) :
if (TruncatedBits.isSubsetOf(Known.Zero))
return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
// fold (zext (truncate x)) -> (and x, mask)
if (N0.getOpcode() == ISD::TRUNCATE) {
// fold (zext (truncate (load x))) -> (zext (smaller load x))
// fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
SDNode *oye = N0.getOperand(0).getNode();
if (NarrowLoad.getNode() != N0.getNode()) {
CombineTo(N0.getNode(), NarrowLoad);
// CombineTo deleted the truncate, if needed, but not what's under it.
return SDValue(N, 0); // Return N so it doesn't get rechecked!
EVT SrcVT = N0.getOperand(0).getValueType();
EVT MinVT = N0.getValueType();
// Try to mask before the extension to avoid having to generate a larger mask,
// possibly over several sub-vectors.
if (SrcVT.bitsLT(VT) && VT.isVector()) {
if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
SDValue Op = N0.getOperand(0);
Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);
SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
// Transfer the debug info; the new node is equivalent to N0.
DAG.transferDbgValues(N0, ZExtOrTrunc);
return ZExtOrTrunc;
// Otherwise any-extend/truncate the source to VT and mask it in the wide type.
if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);
// We may safely transfer the debug info describing the truncate node over
// to the equivalent and operation.
DAG.transferDbgValues(N0, And);
return And;
// Fold (zext (and (trunc x), cst)) -> (and x, cst),
// if either of the casts is not free.
// NOTE(review): the start of the "is free" sub-condition is missing here.
if (N0.getOpcode() == ISD::AND &&
N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
N0.getOperand(1).getOpcode() == ISD::Constant &&
N0.getValueType()) ||
!TLI.isZExtFree(N0.getValueType(), VT))) {
SDValue X = N0.getOperand(0).getOperand(0);
X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
// Zero-extend the constant mask to the width of VT.
APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
SDLoc DL(N);
return DAG.getNode(ISD::AND, DL, VT,
X, DAG.getConstant(Mask, DL, VT));
// Try to simplify (zext (load x)).
if (SDValue foldedExt =
tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
return foldedExt;
if (SDValue foldedExt =
tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::ZEXTLOAD,
return foldedExt;
// fold (zext (load x)) to multiple smaller zextloads.
// Only on illegal but splittable vectors.
if (SDValue ExtLoad = CombineExtLoad(N))
return ExtLoad;
// fold (zext (and/or/xor (load x), cst)) ->
// (and/or/xor (zextload x), (zext cst))
// Unless (and (load x) cst) will match as a zextload already and has
// additional users.
if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
N0.getOpcode() == ISD::XOR) &&
isa<LoadSDNode>(N0.getOperand(0)) &&
N0.getOperand(1).getOpcode() == ISD::Constant &&
(!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
EVT MemVT = LN00->getMemoryVT();
if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) &&
LN00->getExtensionType() != ISD::SEXTLOAD && LN00->isUnindexed()) {
bool DoXform = true;
SmallVector<SDNode*, 4> SetCCs;
if (!N0.hasOneUse()) {
if (N0.getOpcode() == ISD::AND) {
auto *AndC = cast<ConstantSDNode>(N0.getOperand(1));
EVT LoadResultTy = AndC->getValueType(0);
// NOTE(review): ExtVT is not declared in the visible text of this function.
if (isAndLoadExtLoad(AndC, LN00, LoadResultTy, ExtVT))
DoXform = false;
if (DoXform)
DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
if (DoXform) {
SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN00), VT,
LN00->getChain(), LN00->getBasePtr(),
APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
SDLoc DL(N);
SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
ExtLoad, DAG.getConstant(Mask, DL, VT));
ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
// Sample the use counts before CombineTo below rewrites the DAG.
bool NoReplaceTruncAnd = !N0.hasOneUse();
bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
CombineTo(N, And);
// If N0 has multiple uses, change other uses as well.
if (NoReplaceTruncAnd) {
SDValue TruncAnd =
DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
CombineTo(N0.getNode(), TruncAnd);
// Either just re-route the chain result, or also replace the old load's value
// with a truncate of the new extending load.
if (NoReplaceTrunc) {
DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
} else {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
LN00->getValueType(0), ExtLoad);
CombineTo(LN00, Trunc, ExtLoad.getValue(1));
return SDValue(N,0); // Return N so it doesn't get rechecked!
// fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
// (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
if (SDValue ZExtLoad = CombineZExtLogicopShiftLoad(N))
return ZExtLoad;
// Try to simplify (zext (zextload x)).
if (SDValue foldedExt = tryToFoldExtOfExtload(
DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::ZEXTLOAD))
return foldedExt;
if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
return V;
if (N0.getOpcode() == ISD::SETCC) {
// Only do this before legalize for now.
if (!LegalOperations && VT.isVector() &&
N0.getValueType().getVectorElementType() == MVT::i1) {
EVT N00VT = N0.getOperand(0).getValueType();
// Bail out when the setcc already produces the target's setcc result type.
if (getSetCCResultType(N00VT) == N0.getValueType())
return SDValue();
// We know that the # elements of the results is the same as the #
// elements of the compare (and the # elements of the compare result for
// that matter). Check to see that they are the same size. If so, we know
// that the element size of the sext'd result matches the element size of
// the compare operands.
SDLoc DL(N);
if (VT.getSizeInBits() == N00VT.getSizeInBits()) {
// zext(setcc) -> zext_in_reg(vsetcc) for vectors.
SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),
N0.getOperand(1), N0.getOperand(2));
return DAG.getZeroExtendInReg(VSetCC, DL, N0.getValueType());
// If the desired elements are smaller or larger than the source
// elements we can use a matching integer vector type and then
// truncate/any extend followed by zext_in_reg.
EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
SDValue VsetCC =
DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
N0.getOperand(1), N0.getOperand(2));
return DAG.getZeroExtendInReg(DAG.getAnyExtOrTrunc(VsetCC, DL, VT), DL,
// zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
SDLoc DL(N);
if (SDValue SCC = SimplifySelectCC(
DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
DAG.getConstant(0, DL, VT),
cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
return SCC;
// (zext (shl (zext x), cst)) -> (shl (zext x), cst)
if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
isa<ConstantSDNode>(N0.getOperand(1)) &&
N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
N0.hasOneUse()) {
SDValue ShAmt = N0.getOperand(1);
if (N0.getOpcode() == ISD::SHL) {
SDValue InnerZExt = N0.getOperand(0);
// If the original shl may be shifting out bits, do not perform this
// transformation.
// NOTE(review): the right-hand side of this subtraction is missing here.
unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() -
if (cast<ConstantSDNode>(ShAmt)->getAPIntValue().ugt(KnownZeroBits))
return SDValue();
SDLoc DL(N);
// Ensure that the shift amount is wide enough for the shifted value.
if (Log2_32_Ceil(VT.getSizeInBits()) > ShAmt.getValueSizeInBits())
ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);
return DAG.getNode(N0.getOpcode(), DL, VT,
DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)),
if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;
// Last resort: widen a pop-count operand to the destination type.
if (SDValue NewCtPop = widenCtPop(N, DAG))
return NewCtPop;
return SDValue();
/// Combine hook for ISD::ANY_EXTEND nodes. Tries a sequence of folds on
/// (aext N0) in order and returns the first replacement found, SDValue(N, 0)
/// when N has already been replaced through CombineTo, or an empty SDValue
/// when no fold applies.
// NOTE(review): in this extract several statements end mid-expression and no
// closing braces are present, so the text below is not compilable as shown.
// Compare against the upstream file before editing the logic.
SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
return Res;
// fold (aext (aext x)) -> (aext x)
// fold (aext (zext x)) -> (zext x)
// fold (aext (sext x)) -> (sext x)
// The inner extension kind is kept and re-applied directly at type VT.
if (N0.getOpcode() == ISD::ANY_EXTEND ||
N0.getOpcode() == ISD::ZERO_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND)
return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
// fold (aext (truncate (load x))) -> (aext (smaller load x))
// fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
if (N0.getOpcode() == ISD::TRUNCATE) {
if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
SDNode *oye = N0.getOperand(0).getNode();
if (NarrowLoad.getNode() != N0.getNode()) {
CombineTo(N0.getNode(), NarrowLoad);
// CombineTo deleted the truncate, if needed, but not what's under it.
return SDValue(N, 0); // Return N so it doesn't get rechecked!
// fold (aext (truncate x))
if (N0.getOpcode() == ISD::TRUNCATE)
return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
// Fold (aext (and (trunc x), cst)) -> (and x, cst)
// if the trunc is not free.
// NOTE(review): the start of the "is free" sub-condition is missing here.
if (N0.getOpcode() == ISD::AND &&
N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
N0.getOperand(1).getOpcode() == ISD::Constant &&
N0.getValueType())) {
SDLoc DL(N);
SDValue X = N0.getOperand(0).getOperand(0);
X = DAG.getAnyExtOrTrunc(X, DL, VT);
// Zero-extend the constant mask to the width of VT.
APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
return DAG.getNode(ISD::AND, DL, VT,
X, DAG.getConstant(Mask, DL, VT));
// fold (aext (load x)) -> (aext (truncate (extload x)))
// None of the supported targets knows how to perform load and any_ext
// on vectors in one instruction. We only perform this transformation on
// scalars.
if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
ISD::isUNINDEXEDLoad(N0.getNode()) &&
TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
bool DoXform = true;
SmallVector<SDNode*, 4> SetCCs;
if (!N0.hasOneUse())
DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs,
if (DoXform) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
// NOTE(review): some arguments of this getExtLoad call are missing here.
SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
LN0->getBasePtr(), N0.getValueType(),
ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND);
// If the load value is used only by N, replace it via CombineTo N.
// The use count is sampled before CombineTo rewrites the DAG.
bool NoReplaceTrunc = N0.hasOneUse();
CombineTo(N, ExtLoad);
if (NoReplaceTrunc) {
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
} else {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), ExtLoad);
CombineTo(LN0, Trunc, ExtLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
// fold (aext (zextload x)) -> (aext (truncate (zextload x)))
// fold (aext (sextload x)) -> (aext (truncate (sextload x)))
// fold (aext ( extload x)) -> (aext (truncate (extload x)))
if (N0.getOpcode() == ISD::LOAD && !ISD::isNON_EXTLoad(N0.getNode()) &&
ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
ISD::LoadExtType ExtType = LN0->getExtensionType();
EVT MemVT = LN0->getMemoryVT();
if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
// Re-create the load with the same extension kind and memory type, now
// producing VT directly.
SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
VT, LN0->getChain(), LN0->getBasePtr(),
MemVT, LN0->getMemOperand());
CombineTo(N, ExtLoad);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
if (N0.getOpcode() == ISD::SETCC) {
// For vectors:
// aext(setcc) -> vsetcc
// aext(setcc) -> truncate(vsetcc)
// aext(setcc) -> aext(vsetcc)
// Only do this before legalize for now.
if (VT.isVector() && !LegalOperations) {
EVT N00VT = N0.getOperand(0).getValueType();
// Bail out when the setcc already produces the target's setcc result type.
if (getSetCCResultType(N00VT) == N0.getValueType())
return SDValue();
// We know that the # elements of the results is the same as the
// # elements of the compare (and the # elements of the compare result
// for that matter). Check to see that they are the same size. If so,
// we know that the element size of the sext'd result matches the
// element size of the compare operands.
if (VT.getSizeInBits() == N00VT.getSizeInBits())
return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
// If the desired elements are smaller or larger than the source
// elements we can use a matching integer vector type and then
// truncate/any extend
EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
SDValue VsetCC =
DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
// aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
SDLoc DL(N);
if (SDValue SCC = SimplifySelectCC(
DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
DAG.getConstant(0, DL, VT),
cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
return SCC;
// Last resort: widen a pop-count operand to the destination type.
if (SDValue NewCtPop = widenCtPop(N, DAG))
return NewCtPop;
return SDValue();
/// Combine hook for assert-extension nodes (the code below compares against
/// ISD::AssertZext and ISD::AssertSext). Operand 0 is the asserted value and
/// operand 1 is a VTSDNode naming the asserted type. Returns a simplified
/// value, or an empty SDValue when no fold applies.
// NOTE(review): closing braces are absent from this extract; scoping of the
// blocks below must be checked against the upstream file.
SDValue DAGCombiner::visitAssertExt(SDNode *N) {
unsigned Opcode = N->getOpcode();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT AssertVT = cast<VTSDNode>(N1)->getVT();
// fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt)
if (N0.getOpcode() == Opcode &&
AssertVT == cast<VTSDNode>(N0.getOperand(1))->getVT())
return N0;
if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
N0.getOperand(0).getOpcode() == Opcode) {
// We have an assert, truncate, assert sandwich. Make one stronger assert
// by asserting on the smallest asserted type to the larger source type.
// This eliminates the later assert:
// assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN
// assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN
SDValue BigA = N0.getOperand(0);
EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
assert(BigA_AssertVT.bitsLE(N0.getValueType()) &&
"Asserting zero/sign-extended bits to a type larger than the "
"truncated destination does not provide information");
SDLoc DL(N);
// Keep whichever of the two asserted types is narrower.
EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT;
SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT);
// Re-assert on the un-truncated source, then truncate to N's result type.
SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
BigA.getOperand(0), MinAssertVTVal);
return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
// If we have (AssertZext (truncate (AssertSext X, iX)), iY) and Y is smaller
// than X. Just move the AssertZext in front of the truncate and drop the
// AssertSExt.
if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
N0.getOperand(0).getOpcode() == ISD::AssertSext &&
Opcode == ISD::AssertZext) {
SDValue BigA = N0.getOperand(0);
EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
assert(BigA_AssertVT.bitsLE(N0.getValueType()) &&
"Asserting zero/sign-extended bits to a type larger than the "
"truncated destination does not provide information");
// Only when the zext assertion is strictly narrower than the sext one.
if (AssertVT.bitsLT(BigA_AssertVT)) {
SDLoc DL(N);
SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
BigA.getOperand(0), N1);
return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
return SDValue();
/// Combine hook for AssertAlign nodes. Merges nested alignment assertions and,
/// for ADD/SUB operands, sinks the assertion onto the operand(s) not already
/// known to be sufficiently aligned. Returns an empty SDValue when no fold
/// applies.
// NOTE(review): closing braces (and possibly other switch labels or break
// statements) are absent from this extract; verify against the upstream file.
SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
SDLoc DL(N);
Align AL = cast<AssertAlignSDNode>(N)->getAlign();
SDValue N0 = N->getOperand(0);
// Fold (assertalign (assertalign x, AL0), AL1) ->
// (assertalign x, max(AL0, AL1))
if (auto *AAN = dyn_cast<AssertAlignSDNode>(N0))
return DAG.getAssertAlign(DL, N0.getOperand(0),
std::max(AL, AAN->getAlign()));
// In rare cases, there are trivial arithmetic ops in source operands. Sink
// this assert down to source operands so that those arithmetic ops could be
// exposed to the DAG combining.
switch (N0.getOpcode()) {
case ISD::ADD:
case ISD::SUB: {
// Compare the asserted alignment (as a log2 value) with the minimum number
// of trailing zero bits known for each operand.
unsigned AlignShift = Log2(AL);
SDValue LHS = N0.getOperand(0);
SDValue RHS = N0.getOperand(1);
unsigned LHSAlignShift = DAG.computeKnownBits(LHS).countMinTrailingZeros();
unsigned RHSAlignShift = DAG.computeKnownBits(RHS).countMinTrailingZeros();
// Only sink when at least one operand is already known to be aligned; the
// assertion is then attached to the other operand if it needs one.
if (LHSAlignShift >= AlignShift || RHSAlignShift >= AlignShift) {
if (LHSAlignShift < AlignShift)
LHS = DAG.getAssertAlign(DL, LHS, AL);
if (RHSAlignShift < AlignShift)
RHS = DAG.getAssertAlign(DL, RHS, AL);
return DAG.getNode(N0.getOpcode(), DL, N0.getValueType(), LHS, RHS);
return SDValue();
/// If the result of a wider load is shifted to right of N bits and then
/// truncated to a narrower type and where N is a multiple of number of bits of
/// the narrower type, transform it to a narrower load from address + N / num of
/// bits of new type. Also narrow the load if the result is masked with an AND
/// to effectively produce a smaller type. If the result is to be extended, also
/// fold the extension to form a extending load.
///
/// Returns the new (possibly shifted) load value, or an empty SDValue when the
/// transformation does not apply. For a shifted-mask AND, the uses of N are
/// replaced with the result before returning.
// NOTE(review): in this extract the leading `if`, the declaration of ExtVT,
// some `else` lines, call tails and all closing braces are missing, so the
// text below is not compilable as shown. Compare against the upstream file
// before editing the logic.
SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
unsigned Opc = N->getOpcode();
ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// This transformation isn't valid for vector loads.
if (VT.isVector())
return SDValue();
// ShAmt: right-shift amount in bits, later converted to a byte offset.
// HasShiftedOffset: set when N is an AND with a shifted mask.
unsigned ShAmt = 0;
bool HasShiftedOffset = false;
// Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
// extended to VT.
ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
} else if (Opc == ISD::SRL) {
// Another special-case: SRL is basically zero-extending a narrower value,
// or it maybe shifting a higher subword, half or byte into the lowest
// bits.
N0 = SDValue(N, 0);
auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0));
auto *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!N01 || !LN0)
return SDValue();
uint64_t ShiftAmt = N01->getZExtValue();
uint64_t MemoryWidth = LN0->getMemoryVT().getSizeInBits();
// NOTE(review): two assignments to ExtVT follow back to back; an `else`
// between them is presumably missing from this extract — confirm.
if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt)
ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt);
ExtVT = EVT::getIntegerVT(*DAG.getContext(),
VT.getSizeInBits() - ShiftAmt);
} else if (Opc == ISD::AND) {
// An AND with a constant mask is the same as a truncate + zero-extend.
auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!AndC)
return SDValue();
const APInt &Mask = AndC->getAPIntValue();
unsigned ActiveBits = 0;
// A low mask gives the narrow width directly; a shifted mask additionally
// records the bit offset of the field in ShAmt.
if (Mask.isMask()) {
ActiveBits = Mask.countTrailingOnes();
} else if (Mask.isShiftedMask()) {
ShAmt = Mask.countTrailingZeros();
APInt ShiftedMask = Mask.lshr(ShAmt);
ActiveBits = ShiftedMask.countTrailingOnes();
HasShiftedOffset = true;
} else
return SDValue();
ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
SDValue SRL = N0;
if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) {
ShAmt = ConstShift->getZExtValue();
unsigned EVTBits = ExtVT.getSizeInBits();
// Is the shift amount a multiple of size of VT?
if ((ShAmt & (EVTBits-1)) == 0) {
N0 = N0.getOperand(0);
// Is the load width a multiple of size of VT?
if ((N0.getValueSizeInBits() & (EVTBits-1)) != 0)
return SDValue();
// At this point, we must have a load or else we can't do the transform.
auto *LN0 = dyn_cast<LoadSDNode>(N0);
if (!LN0) return SDValue();
// Because a SRL must be assumed to *need* to zero-extend the high bits
// (as opposed to anyext the high bits), we can't combine the zextload
// lowering of SRL and an sextload.
if (LN0->getExtensionType() == ISD::SEXTLOAD)
return SDValue();
// If the shift amount is larger than the input type then we're not
// accessing any of the loaded bytes. If the load was a zextload/extload
// then the result of the shift+trunc is zero/undef (handled elsewhere).
if (ShAmt >= LN0->getMemoryVT().getSizeInBits())
return SDValue();
// If the SRL is only used by a masking AND, we may be able to adjust
// the ExtVT to make the AND redundant.
SDNode *Mask = *(SRL->use_begin());
if (Mask->getOpcode() == ISD::AND &&
isa<ConstantSDNode>(Mask->getOperand(1))) {
const APInt& ShiftMask = Mask->getConstantOperandAPInt(1);
if (ShiftMask.isMask()) {
// NOTE(review): the trailing argument(s) of this getIntegerVT call are
// missing here.
EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
// If the mask is smaller, recompute the type.
if ((ExtVT.getSizeInBits() > MaskedVT.getSizeInBits()) &&
TLI.isLoadExtLegal(ExtType, N0.getValueType(), MaskedVT))
ExtVT = MaskedVT;
// If the load is shifted left (and the result isn't shifted back right),
// we can fold the truncate through the shift.
unsigned ShLeftAmt = 0;
if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
ShLeftAmt = N01->getZExtValue();
N0 = N0.getOperand(0);
// If we haven't found a load, we can't narrow it.
if (!isa<LoadSDNode>(N0))
return SDValue();
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
// Reducing the width of a volatile load is illegal. For atomics, we may be
// able to reduce the width provided we never widen again. (see D66309)
if (!LN0->isSimple() ||
!isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt))
return SDValue();
// Helper: convert a bit offset from the low end into the equivalent offset
// for a big-endian layout, using the store sizes of the old and new types.
auto AdjustBigEndianShift = [&](unsigned ShAmt) {
unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits();
unsigned EVTStoreBits = ExtVT.getStoreSizeInBits();
return LVTStoreBits - EVTStoreBits - ShAmt;
// For big endian targets, we need to adjust the offset to the pointer to
// load the correct bytes.
if (DAG.getDataLayout().isBigEndian())
ShAmt = AdjustBigEndianShift(ShAmt);
// Byte offset of the narrow load, and the alignment implied at that offset.
uint64_t PtrOff = ShAmt / 8;
unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
SDLoc DL(LN0);
// The original load itself didn't wrap, so an offset within it doesn't.
SDNodeFlags Flags;
SDValue NewPtr =
DAG.getMemBasePlusOffset(LN0->getBasePtr(), PtrOff, DL, Flags);
SDValue Load;
// NOTE(review): both assignments to Load follow back to back; an `else`
// between them is presumably missing from this extract — confirm.
if (ExtType == ISD::NON_EXTLOAD)
Load = DAG.getLoad(VT, DL, LN0->getChain(), NewPtr,
LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign,
LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
Load = DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), NewPtr,
LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT,
NewAlign, LN0->getMemOperand()->getFlags(),
// Replace the old load's chain with the new load's chain.
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
// Shift the result left, if we've swallowed a left shift.
SDValue Result = Load;
if (ShLeftAmt != 0) {
// Use VT for the shift-amount constant if the preferred shift-amount type
// cannot represent ShLeftAmt.
EVT ShImmTy = getShiftAmountTy(Result.getValueType());
if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt))
ShImmTy = VT;
// If the shift amount is as large as the result size (but, presumably,
// no larger than the source) then the useful bits of the result are
// zero; we can't simply return the shortened shift, because the result
// of that operation is undefined.
// NOTE(review): an `else` before the SHL below is presumably missing from
// this extract — confirm.
if (ShLeftAmt >= VT.getSizeInBits())
Result = DAG.getConstant(0, DL, VT);
Result = DAG.getNode(ISD::SHL, DL, VT,
Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
if (HasShiftedOffset) {
// Recalculate the shift amount after it has been altered to calculate
// the offset.
if (DAG.getDataLayout().isBigEndian())
ShAmt = AdjustBigEndianShift(ShAmt);
// We're using a shifted mask, so the load now has an offset. This means
// that data has been loaded into the lower bytes than it would have been
// before, so we need to shl the loaded data into the correct position in the
// register.
SDValue ShiftC = DAG.getConstant(ShAmt, DL, VT);
Result = DAG.getNode(ISD::SHL, DL, VT, Result, ShiftC);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
// Return the new loaded value.
return Result;
/// Combine hook for ISD::SIGN_EXTEND_INREG nodes. Operand 0 is the value and
/// operand 1 is a VTSDNode naming the type (ExtVT) whose sign bit is
/// replicated into the upper bits of VT. Tries a sequence of folds in order
/// and returns the first replacement found, SDValue(N, 0) when N has already
/// been replaced through CombineTo or SimplifyDemandedBits, or an empty
/// SDValue when no fold applies.
// NOTE(review): in this extract several statements end mid-expression and no
// closing braces are present, so the text below is not compilable as shown.
// Compare against the upstream file before editing the logic.
SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT ExtVT = cast<VTSDNode>(N1)->getVT();
unsigned VTBits = VT.getScalarSizeInBits();
unsigned ExtVTBits = ExtVT.getScalarSizeInBits();
// sext_vector_inreg(undef) = 0 because the top bit will all be the same.
if (N0.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
// fold (sext_in_reg c1) -> c1
if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);
// If the input is already sign extended, just drop the extension.
if (DAG.ComputeNumSignBits(N0) >= (VTBits - ExtVTBits + 1))
return N0;
// fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
// NOTE(review): the second half of this condition and the last argument of
// the getNode call are missing from this extract.
if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0),
// fold (sext_in_reg (sext x)) -> (sext x)
// fold (sext_in_reg (aext x)) -> (sext x)
// if x is small enough or if we know that x has more than 1 sign bit and the
// sign_extend_inreg is extending from one of them.
if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
SDValue N00 = N0.getOperand(0);
unsigned N00Bits = N00.getScalarValueSizeInBits();
if ((N00Bits <= ExtVTBits ||
(N00Bits - DAG.ComputeNumSignBits(N00)) < ExtVTBits) &&
(!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
// fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x)
// NOTE(review): parts of this condition and the body of the fold are missing
// from this extract.
if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
N0.getOperand(0).getScalarValueSizeInBits() == ExtVTBits) {
if (!LegalOperations ||
// fold (sext_in_reg (zext x)) -> (sext x)
// iff we are extending the source sign bit.
if (N0.getOpcode() == ISD::ZERO_EXTEND) {
SDValue N00 = N0.getOperand(0);
if (N00.getScalarValueSizeInBits() == ExtVTBits &&
(!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
// SIGN_EXTEND is a unary node: pass only the source value. The value-type
// operand N1 belongs to SIGN_EXTEND_INREG and must not be forwarded here.
return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
// fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, ExtVTBits - 1)))
return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT);
// fold operands of sext_in_reg based on knowledge that the top bits are not
// demanded.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// fold (sext_in_reg (load x)) -> (smaller sextload x)
// fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
if (SDValue NarrowLoad = ReduceLoadWidth(N))
return NarrowLoad;
// fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
// fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
// We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
if (N0.getOpcode() == ISD::SRL) {
if (auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
if (ShAmt->getAPIntValue().ule(VTBits - ExtVTBits)) {
// We can turn this into an SRA iff the input to the SRL is already sign
// extended enough.
unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
if (((VTBits - ExtVTBits) - ShAmt->getZExtValue()) < InSignBits)
return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0),
// fold (sext_inreg (extload x)) -> (sextload x)
// If sextload is not supported by target, we can only do the combine when
// load has one use. Doing otherwise can block folding the extload with other
// extends that the target does support.
if (ISD::isEXTLoad(N0.getNode()) &&
ISD::isUNINDEXEDLoad(N0.getNode()) &&
ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() &&
N0.hasOneUse()) ||
TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
LN0->getBasePtr(), ExtVT,
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
// fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
N0.hasOneUse() &&
ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) &&
TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
LN0->getBasePtr(), ExtVT,
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
// Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
N0.getOperand(1), false))
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1);
return SDValue();
// NOTE(review): the signature line of this function is missing from this
// extract; judging by the comment below it is presumably the combine hook for
// sign-extend-vector-in-reg nodes — confirm against the upstream file.
// The body folds an undef operand to zero, tries constant folding, then
// demanded-vector-element simplification, and otherwise returns an empty
// SDValue.
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// sext_vector_inreg(undef) = 0 because the top bit will all be the same.
if (N0.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
return Res;
// Returning N itself signals that the node was simplified in place.
if (SimplifyDemandedVectorElts(SDValue(N, 0)))
return SDValue(N, 0);
return SDValue();
// NOTE(review): the signature line of this function is missing from this
// extract; judging by the comment below it is presumably the combine hook for
// zero-extend-vector-in-reg nodes — confirm against the upstream file.
// The body folds an undef operand to zero, tries constant folding, then
// demanded-vector-element simplification, and otherwise returns an empty
// SDValue.
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// zext_vector_inreg(undef) = 0 because the top bits will be zero.
if (N0.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
return Res;
// Returning N itself signals that the node was simplified in place.
if (SimplifyDemandedVectorElts(SDValue(N, 0)))
return SDValue(N, 0);
return SDValue();
SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SrcVT = N0.getValueType();
bool isLE = DAG.getDataLayout().isLittleEndian();
// noop truncate
if (SrcVT == VT)
return N0;
// fold (truncate (truncate x)) -> (truncate x)
if (N0.getOpcode() == ISD::TRUNCATE)
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
// fold (truncate c1) -> c1
if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
if (C.getNode() != N)
return C;
// fold (truncate (ext x)) -> (ext x) or (truncate x) or x
if (N0.getOpcode() == ISD::ZERO_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND ||
N0.getOpcode() == ISD::ANY_EXTEND) {
// if the source is smaller than the dest, we still need an extend.
if (N0.getOperand(0).getValueType().bitsLT(VT))
return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
// if the source is larger than the dest, than we just need the truncate.
if (N0.getOperand(0).getValueType().bitsGT(VT))
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
// if the source and dest are the same type, we can drop both the extend
// and the truncate.
return N0.getOperand(0);
// If this is anyext(trunc), don't fold it, allow ourselves to be folded.
if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND))
return SDValue();
// Fold extract-and-trunc into a narrow extract. For example:
// i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1)
// i32 y = TRUNCATE(i64 x)
// -- becomes --
// v16i8 b = BITCAST (v2i64 val)
// i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8)
// Note: We only run this optimization after type legalization (which often
// creates this pattern) and before operation legalization after which
// we need to be more careful about the vector instructions that we generate.
if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
EVT VecTy = N0.getOperand(0).getValueType();
EVT ExTy = N0.getValueType();
EVT TrTy = N->getValueType(0);
unsigned NumElem = VecTy.getVectorNumElements();
unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits();
EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, SizeRatio * NumElem);
assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size");
SDValue EltNo = N0->getOperand(1);
if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
SDLoc DL(N);
DAG.getBitcast(NVT, N0.getOperand(0)),
DAG.getVectorIdxConstant(Index, DL));
// trunc (select c, a, b) -> select c, (trunc a), (trunc b)
if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
TLI.isTruncateFree(SrcVT, VT)) {
SDLoc SL(N0);
SDValue Cond = N0.getOperand(0);
SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2));
return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1);
// trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits()
if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
(!LegalOperations || TLI.isOperationLegal(ISD::SHL, VT)) &&
TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
SDValue Amt = N0.getOperand(1);
KnownBits Known = DAG.computeKnownBits(Amt);
unsigned Size = VT.getScalarSizeInBits();
if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) {
SDLoc SL(N);
EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
if (AmtVT != Amt.getValueType()) {
Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT);
return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt);
// Attempt to pre-truncate BUILD_VECTOR sources.
if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations &&
TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) &&
// Avoid creating illegal types if running after type legalizer.
(!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) {
SDLoc DL(N);
EVT SVT = VT.getScalarType();
SmallVector<SDValue, 8> TruncOps;
for (const SDValue &Op : N0->op_values()) {
SDValue TruncOp = DAG.getNode(ISD::TRUNCATE, DL, SVT, Op);
return DAG.getBuildVector(VT, DL, TruncOps);
// Fold a series of buildvector, bitcast, and truncate if possible.
// For example fold
// (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
// (2xi32 (buildvector x, y)).
if (Level == AfterLegalizeVectorOps && VT.isVector() &&
N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
N0.getOperand(0).hasOneUse()) {
SDValue BuildVect = N0.getOperand(0);
EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType();
EVT TruncVecEltTy = VT.getVectorElementType();
// Check that the element types match.
if (BuildVectEltTy == TruncVecEltTy) {
// Now we only need to compute the offset of the truncated elements.
unsigned BuildVecNumElts = BuildVect.getNumOperands();
unsigned TruncVecNumElts = VT.getVectorNumElements();
unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts;
assert((BuildVecNumElts % TruncVecNumElts) == 0 &&
"Invalid number of elements");
SmallVector<SDValue, 8> Opnds;
for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
return DAG.getBuildVector(VT, SDLoc(N), Opnds);
// See if we can simplify the input to this truncate through knowledge that
// only the low bits are being used.
// For example "trunc (or (shl x, 8), y)" // -> trunc y
// Currently we only perform this optimization on scalars because vectors
// may have different active low bits.
if (!VT.isVector()) {
APInt Mask =
APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits());
if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask))
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
// fold (truncate (load x)) -> (smaller load x)
// fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
if (SDValue Reduced = ReduceLoadWidth(N))
return Reduced;
// Handle the case where the load remains an extending load even
// after truncation.
if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
if (LN0->isSimple() &&
LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) {
SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
VT, LN0->getChain(), LN0->getBasePtr(),
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1));
return NewLoad;
// fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
// where ... are all 'undef'.
if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
SmallVector<EVT, 8> VTs;
SDValue V;
unsigned Idx = 0;
unsigned NumDefs = 0;
for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
SDValue X = N0.getOperand(i);
if (!X.isUndef()) {
V = X;
Idx = i;
// Stop if more than one members are non-undef.
if (NumDefs > 1)
- X.getValueType().getVectorNumElements()));
+ X.getValueType().getVectorElementCount()));
if (NumDefs == 0)
return DAG.getUNDEF(VT);
if (NumDefs == 1) {
assert(V.getNode() && "The single defined operand is empty!");
SmallVector<SDValue, 8> Opnds;
for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
if (i != Idx) {
SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V);
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);
// Fold truncate of a bitcast of a vector to an extract of the low vector
// element.
// e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx
if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
SDValue VecSrc = N0.getOperand(0);
EVT VecSrcVT = VecSrc.getValueType();
if (VecSrcVT.isVector() && VecSrcVT.getScalarType() == VT &&
(!LegalOperations ||
TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecSrcVT))) {
SDLoc SL(N);
unsigned Idx = isLE ? 0 : VecSrcVT.getVectorNumElements() - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc,
DAG.getVectorIdxConstant(Idx, SL));
// Simplify the operands using demanded-bits information.
if (!VT.isVector() &&
SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
// (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry)
// When the adde's carry is not used.
if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) &&
N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) &&
// We only do for addcarry before legalize operation
((!LegalOperations && N0.getOpcode() == ISD::ADDCARRY) ||
TLI.isOperationLegal(N0.getOpcode(), VT))) {
SDLoc SL(N);
auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
auto VTs = DAG.getVTList(VT, N0->getValueType(1));
return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2));
// fold (truncate (extract_subvector(ext x))) ->
// (extract_subvector x)
// TODO: This can be generalized to cover cases where the truncate and extract
// do not fully cancel each other out.
if (!LegalTypes && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == ISD::SIGN_EXTEND ||
N00.getOpcode() == ISD::ZERO_EXTEND ||
N00.getOpcode() == ISD::ANY_EXTEND) {
if (N00.getOperand(0)->getValueType(0).getVectorElementType() ==
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N0->getOperand(0)), VT,
N00.getOperand(0), N0.getOperand(1));
if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;
// Narrow a suitable binary operation with a non-opaque constant operand by
// moving it ahead of the truncate. This is limited to pre-legalization
// because targets may prefer a wider type during later combines and invert
// this transform.
switch (N0.getOpcode()) {
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
if (!LegalOperations && N0.hasOneUse() &&
(isConstantOrConstantVector(N0.getOperand(0), true) ||
isConstantOrConstantVector(N0.getOperand(1), true))) {
// TODO: We already restricted this to pre-legalization, but for vectors
// we are extra cautious to not create an unsupported operation.
// Target-specific changes are likely needed to avoid regressions here.
if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) {
SDLoc DL(N);
SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
return SDValue();
/// Return the node that defines operand \p i of \p N. When that operand is a
/// MERGE_VALUES node, look through it to the input selected by the result
/// number being used.
static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
SDValue Elt = N->getOperand(i);
// Anything other than MERGE_VALUES is returned unchanged.
if (Elt.getOpcode() != ISD::MERGE_VALUES)
return Elt.getNode();
// The result number of the MERGE_VALUES use picks the forwarded operand.
return Elt.getOperand(Elt.getResNo()).getNode();
/// build_pair (load, load) -> load
/// if load locations are consecutive.
///
/// \p N must be a BUILD_PAIR (asserted below) and \p VT is the type of the
/// combined load. Returns the new load, or an empty SDValue when the two
/// halves cannot be merged.
SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
assert(N->getOpcode() == ISD::BUILD_PAIR);
// Either pointer is null when the corresponding operand is not a load; that
// case is rejected further down.
LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
// A BUILD_PAIR always has the least significant part in elt 0 and the
// most significant part in elt 1. So when combining into one large load, we
// need to consider the endianness.
if (DAG.getDataLayout().isBigEndian())
std::swap(LD1, LD2);
// Bail out unless both operands are loads, LD1 is a single-use non-extending
// load, and both loads live in the same address space.
if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() ||
LD1->getAddressSpace() != LD2->getAddressSpace())
return SDValue();
EVT LD1VT = LD1->getValueType(0);
unsigned LD1Bytes = LD1VT.getStoreSize();
// LD2 must also be a single-use non-extending load. The helper receives
// LD1's store size as the byte distance; presumably it verifies that LD2
// starts right after LD1 (confirm against areNonVolatileConsecutiveLoads).
if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&
DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) {
Align Alignment = LD1->getAlign();
// NOTE(review): the getABITypeAlign(...) call below is cut off in this
// extract; its argument is not visible here.
Align NewAlign = DAG.getDataLayout().getABITypeAlign(
// Only merge when the alignment computed above does not exceed LD1's
// alignment and, once operations are legalized, a LOAD of VT is legal.
if (NewAlign <= Alignment &&
(!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
LD1->getPointerInfo(), Alignment);
return SDValue();
/// Returns 1 when the data layout is big-endian and 0 otherwise. Per the
/// comment below, this accounts for the ppcf128 -> i128 bitcast swapping the
/// Hi and Lo parts on little-endian machines only.
static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) {
// On little-endian machines, bitcasting from ppcf128 to i128 does swap the Hi
// and Lo parts; on big-endian machines it doesn't.
return DAG.getDataLayout().isBigEndian() ? 1 : 0;
/// Fold a bitcast-to-FP of an integer AND/XOR/OR whose constant operand is a
/// sign mask, applied to a value that was itself bitcast from the same FP
/// type, into FABS / FNEG / FNEG(FABS) on the original FP value.
///
/// \param N   the node being combined; its result type must be floating point.
/// \param DAG the SelectionDAG used to create replacement nodes.
/// \param TLI queried for hasBitPreservingFPLogic on the result type.
/// \returns the folded FP node, or an empty SDValue if the pattern is absent.
static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI) {
// If this is not a bitcast to an FP type or if the target doesn't have
// IEEE754-compliant FP logic, we're done.
EVT VT = N->getValueType(0);
if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT))
return SDValue();
// TODO: Handle cases where the integer constant is a different scalar
// bitwidth to the FP.
SDValue N0 = N->getOperand(0);
EVT SourceVT = N0.getValueType();
if (VT.getScalarSizeInBits() != SourceVT.getScalarSizeInBits())
return SDValue();
// Pick the FP opcode and the integer mask the logic op must use:
//   AND with ~signmask -> FABS, XOR with signmask -> FNEG,
//   OR  with  signmask -> FABS (wrapped in FNEG further down).
// NOTE(review): no break/default lines are visible between the cases in this
// extract; confirm against the full file before relying on the switch shape.
unsigned FPOpcode;
APInt SignMask;
switch (N0.getOpcode()) {
case ISD::AND:
FPOpcode = ISD::FABS;
SignMask = ~APInt::getSignMask(SourceVT.getScalarSizeInBits());
case ISD::XOR:
FPOpcode = ISD::FNEG;
SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());
case ISD::OR:
FPOpcode = ISD::FABS;
SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());
return SDValue();
// Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X
// Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X
// Fold (bitcast int (or (bitcast fp X to int), 0x8000...) to fp) ->
// fneg (fabs X)
SDValue LogicOp0 = N0.getOperand(0);
ConstantSDNode *LogicOp1 = isConstOrConstSplat(N0.getOperand(1), true);
// The constant (or splat) must equal the expected mask, and the other operand
// must be a bitcast from exactly the FP type we are casting back to.
if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask &&
LogicOp0.getOpcode() == ISD::BITCAST &&
LogicOp0.getOperand(0).getValueType() == VT) {
SDValue FPOp = DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0.getOperand(0));
// OR sets the sign bit: that is fabs followed by fneg.
if (N0.getOpcode() == ISD::OR)
return DAG.getNode(ISD::FNEG, SDLoc(N), VT, FPOp);
return FPOp;
return SDValue();
/// Combine a BITCAST node \p N. The folds are tried in source order: undef
/// propagation, constant folding (BUILD_VECTOR and scalar constants),
/// collapsing nested bitcasts, retyping a load, FP-logic folds, lowering
/// bitcast(fneg/fabs) and bitcast(fcopysign) to integer logic, merging
/// consecutive loads under a BUILD_PAIR, and removing double bitcasts around a
/// vector shuffle. Returns the replacement value, or an empty SDValue when no
/// fold applies.
SDValue DAGCombiner::visitBITCAST(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// bitcast(undef) -> undef of the result type.
if (N0.isUndef())
return DAG.getUNDEF(VT);
// If the input is a BUILD_VECTOR with all constant elements, fold this now.
// Only do this before legalize types, unless both types are integer and the
// scalar type is legal. Only do this before legalize ops, since the target
// maybe depending on the bitcast.
// First check to see if this is all constant.
// TODO: Support FP bitcasts after legalize types.
if (VT.isVector() &&
(!LegalTypes ||
(!LegalOperations && VT.isInteger() && N0.getValueType().isInteger() &&
TLI.isTypeLegal(VT.getVectorElementType()))) &&
N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() &&
return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(),
// If the input is a constant, let getNode fold it.
if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) {
// If we can't allow illegal operations, we need to check that this is just
// an fp -> int or int -> fp conversion and that the resulting operation will
// be legal.
if (!LegalOperations ||
(isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
TLI.isOperationLegal(ISD::ConstantFP, VT)) ||
(isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
TLI.isOperationLegal(ISD::Constant, VT))) {
SDValue C = DAG.getBitcast(VT, N0);
// Only report a change if getBitcast produced something other than N.
if (C.getNode() != N)
return C;
// (conv (conv x, t1), t2) -> (conv x, t2)
if (N0.getOpcode() == ISD::BITCAST)
return DAG.getBitcast(VT, N0.getOperand(0));
// fold (conv (load x)) -> (load (conv*)x)
// If the resultant load doesn't need a higher alignment than the original!
if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
// Do not remove the cast if the types differ in endian layout.
TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
// If the load is volatile, we only want to change the load type if the
// resulting load is legal. Otherwise we might increase the number of
// memory accesses. We don't care if the original type was legal or not
// as we assume software couldn't rely on the number of accesses of an
// illegal type.
((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
TLI.isOperationLegal(ISD::LOAD, VT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
*LN0->getMemOperand())) {
// Re-issue the load with the bitcast's type, reusing the original chain,
// pointer, alignment, flags and AA info.
SDValue Load =
DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
LN0->getPointerInfo(), LN0->getAlignment(),
LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
// Redirect users of the old load's chain result to the new load's chain.
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
return Load;
if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
return V;
// fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
// fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
// For ppc_fp128:
// fold (bitcast (fneg x)) ->
// flipbit = signbit
// (xor (bitcast x) (build_pair flipbit, flipbit))
// fold (bitcast (fabs x)) ->
// flipbit = (and (extract_element (bitcast x), 0), signbit)
// (xor (bitcast x) (build_pair flipbit, flipbit))
// This often reduces constant pool loads.
if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) ||
(N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) &&
N0.getNode()->hasOneUse() && VT.isInteger() &&
!VT.isVector() && !N0.getValueType().isVector()) {
SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0));
SDLoc DL(N);
// ppcf128 before type legalization: work on the two i64 halves.
if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
assert(VT.getSizeInBits() == 128);
SDValue SignBit = DAG.getConstant(
APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64);
SDValue FlipBit;
if (N0.getOpcode() == ISD::FNEG) {
FlipBit = SignBit;
} else {
assert(N0.getOpcode() == ISD::FABS);
SDValue Hi =
DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv,
FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit);
// The same flip bit is applied to both halves of the 128-bit value.
SDValue FlipBits =
DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits);
// Generic scalar case: toggle (fneg) or clear (fabs) the top bit.
APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
if (N0.getOpcode() == ISD::FNEG)
return DAG.getNode(ISD::XOR, DL, VT,
NewConv, DAG.getConstant(SignBit, DL, VT));
assert(N0.getOpcode() == ISD::FABS);
return DAG.getNode(ISD::AND, DL, VT,
NewConv, DAG.getConstant(~SignBit, DL, VT));
// fold (bitconvert (fcopysign cst, x)) ->
// (or (and (bitconvert x), sign), (and cst, (not sign)))
// Note that we don't handle (copysign x, cst) because this can always be
// folded to an fneg or fabs.
// For ppc_fp128:
// fold (bitcast (fcopysign cst, x)) ->
// flipbit = (and (extract_element
// (xor (bitcast cst), (bitcast x)), 0),
// signbit)
// (xor (bitcast cst) (build_pair flipbit, flipbit))
if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() &&
isa<ConstantFPSDNode>(N0.getOperand(0)) &&
VT.isInteger() && !VT.isVector()) {
// Integer type with the same width as the sign-source operand.
unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits();
EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
if (isTypeLegal(IntXVT)) {
SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1));
// If X has a different width than the result/lhs, sext it or truncate it.
unsigned VTWidth = VT.getSizeInBits();
if (OrigXWidth < VTWidth) {
X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X);
} else if (OrigXWidth > VTWidth) {
// To get the sign bit in the right place, we have to shift it right
// before truncating.
SDLoc DL(X);
X = DAG.getNode(ISD::SRL, DL,
X.getValueType(), X,
DAG.getConstant(OrigXWidth-VTWidth, DL,
X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
// ppcf128 before type legalization: derive the flip bit from the xor of
// both bitcast operands and apply it to both i64 halves of the constant.
if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2);
SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
SDValue X = DAG.getBitcast(VT, N0.getOperand(1));
SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X);
SDValue XorResult64 = DAG.getNode(
ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult,
SDValue FlipBit =
DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64,
DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64));
SDValue FlipBits =
DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits);
// Generic case: keep only the sign bit of X and everything but the sign
// bit of the constant, then OR the two together.
APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
X = DAG.getNode(ISD::AND, SDLoc(X), VT,
X, DAG.getConstant(SignBit, SDLoc(X), VT));
SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT,
Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT));
return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst);
// bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
if (N0.getOpcode() == ISD::BUILD_PAIR)
if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT))
return CombineLD;
// Remove double bitcasts from shuffles - this is often a legacy of
// XformToShuffleWithZero being used to combine bitmaskings (of
// float vectors bitcast to integer vectors) into shuffles.
// bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1)
// Requires the result to have a whole multiple of the shuffle's element count.
if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() &&
N0->getOpcode() == ISD::VECTOR_SHUFFLE && N0.hasOneUse() &&
VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() &&
!(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) {
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0);
// If operands are a bitcast, peek through if it casts the original VT.
// If operands are a constant, just bitcast back to original VT.
// An empty SDValue from this lambda means the operand cannot be rewritten.
auto PeekThroughBitcast = [&](SDValue Op) {
if (Op.getOpcode() == ISD::BITCAST &&
Op.getOperand(0).getValueType() == VT)
return SDValue(Op.getOperand(0));
if (Op.isUndef() || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
return DAG.getBitcast(VT, Op);
return SDValue();
// FIXME: If either input vector is bitcast, try to convert the shuffle to
// the result type of this bitcast. This would eliminate at least one
// bitcast. See the transform in InstCombine.
SDValue SV0 = PeekThroughBitcast(N0->getOperand(0));
SDValue SV1 = PeekThroughBitcast(N0->getOperand(1));
if (!(SV0 && SV1))
return SDValue();
// Each original mask entry expands into MaskScale consecutive entries of the
// finer-grained result type; negative (undef) entries stay -1.
int MaskScale =
VT.getVectorNumElements() / N0.getValueType().getVectorNumElements();
SmallVector<int, 8> NewMask;
for (int M : SVN->getMask())
for (int i = 0; i != MaskScale; ++i)
NewMask.push_back(M < 0 ? -1 : M * MaskScale + i);
// Use the shuffle only if the target helper returns a non-empty value.
SDValue LegalShuffle =
TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask, DAG);
if (LegalShuffle)
return LegalShuffle;
return SDValue();
/// Combine a BUILD_PAIR node \p N by delegating to CombineConsecutiveLoads
/// with the node's own result type; returns whatever that helper returns.
SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
EVT VT = N->getValueType(0);
return CombineConsecutiveLoads(N, VT);
/// Combine a FREEZE node \p N: when its operand is already a FREEZE or is an
/// integer/FP constant, the operand itself is returned as the replacement.
/// Otherwise returns an empty SDValue (no change).
SDValue DAGCombiner::visitFREEZE(SDNode *N) {
SDValue N0 = N->getOperand(0);
// (freeze (freeze x)) -> (freeze x)
if (N0.getOpcode() == ISD::FREEZE)
return N0;
// If the input is a constant, return it.
if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0))
return N0;
return SDValue();
/// We know that BV is a build_vector node with Constant, ConstantFP or Undef
/// operands. DstEltVT indicates the destination element value type.
///
/// Three cases are handled: equal element widths (per-element bitcast),
/// growing elements (several inputs packed into one output) and shrinking
/// elements (one input split into several outputs). FP element types are
/// first routed through integers of the same width via recursive calls.
/// NOTE(review): the declarator line naming this function and its parameters
/// is missing from this extract; the name is taken from the recursive calls.
SDValue DAGCombiner::
EVT SrcEltVT = BV->getValueType(0).getVectorElementType();
// If this is already the right type, we're done.
if (SrcEltVT == DstEltVT) return SDValue(BV, 0);
unsigned SrcBitSize = SrcEltVT.getSizeInBits();
unsigned DstBitSize = DstEltVT.getSizeInBits();
// If this is a conversion of N elements of one type to N elements of another
// type, convert each element. This handles FP<->INT cases.
if (SrcBitSize == DstBitSize) {
SmallVector<SDValue, 8> Ops;
for (SDValue Op : BV->op_values()) {
// If the vector element type is not legal, the BUILD_VECTOR operands
// are promoted and implicitly truncated. Make that explicit here.
if (Op.getValueType() != SrcEltVT)
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
Ops.push_back(DAG.getBitcast(DstEltVT, Op));
EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
return DAG.getBuildVector(VT, SDLoc(BV), Ops);
// Otherwise, we're growing or shrinking the elements. To avoid having to
// handle annoying details of growing/shrinking FP values, we convert them to
// int first.
if (SrcEltVT.isFloatingPoint()) {
// Convert the input float vector to a int vector where the elements are the
// same sizes.
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
SrcEltVT = IntVT;
// Now we know the input is an integer vector. If the output is a FP type,
// convert to integer first, then to FP of the right size.
if (DstEltVT.isFloatingPoint()) {
EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();
// Next, convert to FP elements of the same size.
return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT);
// Okay, we know the src/dst types are both integers of differing types.
// Handling growing first.
assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
if (SrcBitSize < DstBitSize) {
unsigned NumInputsPerOutput = DstBitSize/SrcBitSize;
SmallVector<SDValue, 8> Ops;
// Walk the inputs one output-sized group at a time.
for (unsigned i = 0, e = BV->getNumOperands(); i != e;
i += NumInputsPerOutput) {
bool isLE = DAG.getDataLayout().isLittleEndian();
APInt NewBits = APInt(DstBitSize, 0);
bool EltIsUndef = true;
// The accumulator is shifted left before each input is read, so the first
// input visited is shifted furthest: on little-endian that is the last
// input of the group, on big-endian the first.
for (unsigned j = 0; j != NumInputsPerOutput; ++j) {
// Shift the previously computed bits over.
NewBits <<= SrcBitSize;
SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j));
// Undef inputs contribute no bits; EltIsUndef stays true only if every
// input of the group is undef.
if (Op.isUndef()) continue;
EltIsUndef = false;
NewBits |= cast<ConstantSDNode>(Op)->getAPIntValue().
if (EltIsUndef)
Ops.push_back(DAG.getConstant(NewBits, DL, DstEltVT));
EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
return DAG.getBuildVector(VT, DL, Ops);
// Finally, this must be the case where we are shrinking elements: each input
// turns into multiple outputs.
unsigned NumOutputsPerInput = SrcBitSize/DstBitSize;
EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
SmallVector<SDValue, 8> Ops;
for (const SDValue &Op : BV->op_values()) {
// An undef input becomes NumOutputsPerInput undef outputs.
if (Op.isUndef()) {
Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT));
APInt OpVal = cast<ConstantSDNode>(Op)->
// Emit one DstBitSize-wide piece per iteration, taken from the low bits of
// OpVal.
for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
APInt ThisVal = OpVal.trunc(DstBitSize);
Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT));
// For big endian targets, swap the order of the pieces of each element.
if (DAG.getDataLayout().isBigEndian())
std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
return DAG.getBuildVector(VT, DL, Ops);
/// Returns true when \p N's node flags allow contraction or allow
/// reassociation; either flag is treated as permission here.
static bool isContractable(SDNode *N) {
SDNodeFlags F = N->getFlags();
return F.hasAllowContract() || F.hasAllowReassociation();
/// Try to perform FMA combining on a given FADD node.
///
/// \p N is the FADD being visited. FMAD is used when the target reports it
/// legal (only checked once LegalOperations is set), otherwise FMA. A series
/// of (fadd (fmul ...), ...) patterns is then tried, optionally looking
/// through FP_EXTEND and, for targets that enable aggressive fusion, through
/// nested fused nodes. Returns the fused node, or an empty SDValue when
/// fusion is not permitted or no pattern matches.
SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc SL(N);
const TargetOptions &Options = DAG.getTarget().Options;
// Floating-point multiply-add with intermediate rounding.
bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
// Floating-point multiply-add without intermediate rounding.
bool HasFMA =
TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
// No valid opcode, do not combine.
if (!HasFMAD && !HasFMA)
return SDValue();
// The FADD's flags are passed to every fused node created below.
SDNodeFlags Flags = N->getFlags();
bool CanFuse = Options.UnsafeFPMath || isContractable(N);
bool CanReassociate =
Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
CanFuse || HasFMAD);
// If the addition is not contractable, do not combine.
if (!AllowFusionGlobally && !isContractable(N))
return SDValue();
// Skip the fold when the subtarget says FMAs are formed in the machine
// combiner at this optimization level.
if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
return SDValue();
// Always prefer FMAD to FMA for precision.
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
// Is the node an FMUL and contractable either due to global flags or
// SDNodeFlags.
auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
if (N.getOpcode() != ISD::FMUL)
return false;
return AllowFusionGlobally || isContractable(N.getNode());
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {
if (N0.getNode()->use_size() > N1.getNode()->use_size())
std::swap(N0, N1);
// fold (fadd (fmul x, y), z) -> (fma x, y, z)
// Unless fusing aggressively, the multiply must have no other users.
if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1), N1, Flags);
// fold (fadd x, (fmul y, z)) -> (fma y, z, x)
// Note: Commutes FADD operands.
if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N1.getOperand(0), N1.getOperand(1), N0, Flags);
// fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E)
// fadd E, (fma A, B, (fmul C, D)) --> fma A, B, (fma C, D, E)
// This requires reassociation because it changes the order of operations.
// FMA and E stay empty unless one of the two operand orders matches.
SDValue FMA, E;
if (CanReassociate && N0.getOpcode() == PreferredFusedOpcode &&
N0.getOperand(2).getOpcode() == ISD::FMUL && N0.hasOneUse() &&
N0.getOperand(2).hasOneUse()) {
FMA = N0;
E = N1;
} else if (CanReassociate && N1.getOpcode() == PreferredFusedOpcode &&
N1.getOperand(2).getOpcode() == ISD::FMUL && N1.hasOneUse() &&
N1.getOperand(2).hasOneUse()) {
FMA = N1;
E = N0;
if (FMA && E) {
SDValue A = FMA.getOperand(0);
SDValue B = FMA.getOperand(1);
SDValue C = FMA.getOperand(2).getOperand(0);
SDValue D = FMA.getOperand(2).getOperand(1);
// Build the inner fused node first, then feed it as the addend of the outer.
SDValue CDE = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, E, Flags);
return DAG.getNode(PreferredFusedOpcode, SL, VT, A, B, CDE, Flags);
// Look through FP_EXTEND nodes to do more combining.
// fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
if (isContractableFMUL(N00) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N00.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N00.getOperand(1)), N1, Flags);
// fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
// Note: Commutes FADD operands.
if (N1.getOpcode() == ISD::FP_EXTEND) {
SDValue N10 = N1.getOperand(0);
if (isContractableFMUL(N10) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N10.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N10.getOperand(1)), N0, Flags);
// More folding opportunities when target permits.
if (Aggressive) {
// fold (fadd (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y, (fma (fpext u), (fpext v), z))
auto FoldFAddFMAFPExtFMul = [&] (
SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z,
SDNodeFlags Flags) {
return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y,
DAG.getNode(PreferredFusedOpcode, SL, VT,
Z, Flags), Flags);
if (N0.getOpcode() == PreferredFusedOpcode) {
SDValue N02 = N0.getOperand(2);
if (N02.getOpcode() == ISD::FP_EXTEND) {
SDValue N020 = N02.getOperand(0);
if (isContractableFMUL(N020) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N020.getValueType())) {
return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
N020.getOperand(0), N020.getOperand(1),
N1, Flags);
// fold (fadd (fpext (fma x, y, (fmul u, v))), z)
// -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
auto FoldFAddFPExtFMAFMul = [&] (
SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z,
SDNodeFlags Flags) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(PreferredFusedOpcode, SL, VT,
Z, Flags), Flags);
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == PreferredFusedOpcode) {
SDValue N002 = N00.getOperand(2);
if (isContractableFMUL(N002) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N00.getValueType())) {
return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
N002.getOperand(0), N002.getOperand(1),
N1, Flags);
// fold (fadd x, (fma y, z, (fpext (fmul u, v)))
// -> (fma y, z, (fma (fpext u), (fpext v), x))
if (N1.getOpcode() == PreferredFusedOpcode) {
SDValue N12 = N1.getOperand(2);
if (N12.getOpcode() == ISD::FP_EXTEND) {
SDValue N120 = N12.getOperand(0);
if (isContractableFMUL(N120) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N120.getValueType())) {
return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
N120.getOperand(0), N120.getOperand(1),
N0, Flags);
// fold (fadd x, (fpext (fma y, z, (fmul u, v)))
// -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
if (N1.getOpcode() == ISD::FP_EXTEND) {
SDValue N10 = N1.getOperand(0);
if (N10.getOpcode() == PreferredFusedOpcode) {
SDValue N102 = N10.getOperand(2);
if (isContractableFMUL(N102) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N10.getValueType())) {
return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
N102.getOperand(0), N102.getOperand(1),
N0, Flags);
return SDValue();
/// Try to perform FMA combining on a given FSUB node.
SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc SL(N);
const TargetOptions &Options = DAG.getTarget().Options;
// Floating-point multiply-add with intermediate rounding.
bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
// Floating-point multiply-add without intermediate rounding.
bool HasFMA =
TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
// No valid opcode, do not combine.
if (!HasFMAD && !HasFMA)
return SDValue();
const SDNodeFlags Flags = N->getFlags();
bool CanFuse = Options.UnsafeFPMath || isContractable(N);
bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
CanFuse || HasFMAD);
// If the subtraction is not contractable, do not combine.
if (!AllowFusionGlobally && !isContractable(N))
return SDValue();
if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
return SDValue();
// Always prefer FMAD to FMA for precision.
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros();
// Is the node an FMUL and contractable either due to global flags or
// SDNodeFlags.
auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
if (N.getOpcode() != ISD::FMUL)
return false;
return AllowFusionGlobally || isContractable(N.getNode());
// fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) {
if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0),
XY.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, Z),
return SDValue();
// fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
// Note: Commutes FSUB operands.
auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) {
if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)),
YZ.getOperand(1), X, Flags);
return SDValue();
// If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
if (isContractableFMUL(N0) && isContractableFMUL(N1) &&
(N0.getNode()->use_size() > N1.getNode()->use_size())) {
// fold (fsub (fmul a, b), (fmul c, d)) -> (fma (fneg c), d, (fmul a, b))
if (SDValue V = tryToFoldXSubYZ(N0, N1))
return V;
// fold (fsub (fmul a, b), (fmul c, d)) -> (fma a, b, (fneg (fmul c, d)))
if (SDValue V = tryToFoldXYSubZ(N0, N1))
return V;
} else {
// fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
if (SDValue V = tryToFoldXYSubZ(N0, N1))
return V;
// fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
if (SDValue V = tryToFoldXSubYZ(N0, N1))
return V;
// fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&
(Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
SDValue N00 = N0.getOperand(0).getOperand(0);
SDValue N01 = N0.getOperand(0).getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, N00), N01,
DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
// Look through FP_EXTEND nodes to do more combining.
// fold (fsub (fpext (fmul x, y)), z)
// -> (fma (fpext x), (fpext y), (fneg z))
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
if (isContractableFMUL(N00) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N00.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
// fold (fsub x, (fpext (fmul y, z)))
// -> (fma (fneg (fpext y)), (fpext z), x)
// Note: Commutes FSUB operands.
if (N1.getOpcode() == ISD::FP_EXTEND) {
SDValue N10 = N1.getOperand(0);
if (isContractableFMUL(N10) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N10.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
N0, Flags);
// fold (fsub (fpext (fneg (fmul, x, y))), z)
// -> (fneg (fma (fpext x), (fpext y), z))
// Note: This could be removed with appropriate canonicalization of the
// input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the
// orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
// from implementing the canonicalization in visitFSUB.
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == ISD::FNEG) {
SDValue N000 = N00.getOperand(0);
if (isContractableFMUL(N000) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N00.getValueType())) {
return DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(PreferredFusedOpcode, SL, VT,
N1, Flags));
// fold (fsub (fneg (fpext (fmul, x, y))), z)
// -> (fneg (fma (fpext x), (fpext y), z))
// Note: This could be removed with appropriate canonicalization of the
// input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the
// orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
// from implementing the canonicalization in visitFSUB.
if (N0.getOpcode() == ISD::FNEG) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == ISD::FP_EXTEND) {
SDValue N000 = N00.getOperand(0);
if (isContractableFMUL(N000) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N000.getValueType())) {
return DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(PreferredFusedOpcode, SL, VT,
N1, Flags));
// More folding opportunities when target permits.
if (Aggressive) {
// fold (fsub (fma x, y, (fmul u, v)), z)
// -> (fma x, y, (fma u, v, (fneg z)))
if (CanFuse && N0.getOpcode() == PreferredFusedOpcode &&
isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() &&
N0.getOperand(2)->hasOneUse()) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1),
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
N1), Flags), Flags);
// fold (fsub x, (fma y, z, (fmul u, v)))
// -> (fma (fneg y), z, (fma (fneg u), v, x))
if (CanFuse && N1.getOpcode() == PreferredFusedOpcode &&
isContractableFMUL(N1.getOperand(2)) &&
N1->hasOneUse() && NoSignedZero) {
SDValue N20 = N1.getOperand(2).getOperand(0);
SDValue N21 = N1.getOperand(2).getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, N20),
N21, N0, Flags), Flags);
// fold (fsub (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y, (fma (fpext u), (fpext v), (fneg z)))
if (N0.getOpcode() == PreferredFusedOpcode &&
N0->hasOneUse()) {
SDValue N02 = N0.getOperand(2);
if (N02.getOpcode() == ISD::FP_EXTEND) {
SDValue N020 = N02.getOperand(0);
if (isContractableFMUL(N020) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N020.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1),
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
N1), Flags), Flags);
// fold (fsub (fpext (fma x, y, (fmul u, v))), z)
// -> (fma (fpext x), (fpext y),
// (fma (fpext u), (fpext v), (fneg z)))
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == PreferredFusedOpcode) {
SDValue N002 = N00.getOperand(2);
if (isContractableFMUL(N002) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N00.getValueType())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
N1), Flags), Flags);
// fold (fsub x, (fma y, z, (fpext (fmul u, v))))
// -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
if (N1.getOpcode() == PreferredFusedOpcode &&
N1.getOperand(2).getOpcode() == ISD::FP_EXTEND &&
N1->hasOneUse()) {
SDValue N120 = N1.getOperand(2).getOperand(0);
if (isContractableFMUL(N120) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
N120.getValueType())) {
SDValue N1200 = N120.getOperand(0);
SDValue N1201 = N120.getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)),
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
VT, N1200)),
N0, Flags), Flags);
// fold (fsub x, (fpext (fma y, z, (fmul u, v))))
// -> (fma (fneg (fpext y)), (fpext z),
// (fma (fneg (fpext u)), (fpext v), x))
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
if (N1.getOpcode() == ISD::FP_EXTEND &&
N1.getOperand(0).getOpcode() == PreferredFusedOpcode) {
SDValue CvtSrc = N1.getOperand(0);
SDValue N100 = CvtSrc.getOperand(0);
SDValue N101 = CvtSrc.getOperand(1);
SDValue N102 = CvtSrc.getOperand(2);
if (isContractableFMUL(N102) &&
TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
CvtSrc.getValueType())) {
SDValue N1020 = N102.getOperand(0);
SDValue N1021 = N102.getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT, N101),
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
VT, N1020)),
N0, Flags), Flags);
return SDValue();
/// Try to perform FMA combining on a given FMUL node based on the distributive
/// law x * (y + 1) = x * y + x and variants thereof (commuted versions,
/// subtraction instead of addition).
///
/// \param N the node to combine; asserted below to be an ISD::FMUL.
/// \returns the fused node (FMAD if available, otherwise FMA), or an empty
/// SDValue when no pattern matches or fusion is not permitted.
///
/// NOTE(review): this listing looks like it has lost lines (closing braces in
/// particular) -- verify the block structure against the upstream file.
SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc SL(N);
// The FMUL's own flags are passed to each fused (FMA/FMAD) node built below.
const SDNodeFlags Flags = N->getFlags();
assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");
const TargetOptions &Options = DAG.getTarget().Options;
// The transforms below are incorrect when x == 0 and y == inf, because the
// intermediate multiplication produces a nan.
if (!Options.NoInfsFPMath)
return SDValue();
// Floating-point multiply-add without intermediate rounding.
// Requires fast fusion or unsafe math globally, a target that reports FMA as
// faster than FMUL+FADD, and (once operations are legalized) a legal or
// custom FMA for this type.
bool HasFMA =
(Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
// Floating-point multiply-add with intermediate rounding. This can result
// in a less precise result due to the changed rounding order.
bool HasFMAD = Options.UnsafeFPMath &&
(LegalOperations && TLI.isFMADLegal(DAG, N));
// No valid opcode, do not combine.
if (!HasFMAD && !HasFMA)
return SDValue();
// Always prefer FMAD to FMA for precision.
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
// When the target does not ask for aggressive fusion, the add/sub operand
// must have a single use before it is folded (see the hasOneUse checks).
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
// fold (fmul (fadd x0, +1.0), y) -> (fma x0, y, y)
// fold (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y))
// X is the candidate FADD operand of the multiply, Y the other multiplicand.
// Returns an empty SDValue when X does not match.
auto FuseFADD = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
// Only a constant (or constant splat) in the FADD's second operand is
// inspected here.
if (auto *C = isConstOrConstSplatFP(X.getOperand(1), true)) {
if (C->isExactlyValue(+1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
Y, Flags);
if (C->isExactlyValue(-1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
return SDValue();
// Try the FADD pattern with either multiply operand in the FADD role.
if (SDValue FMA = FuseFADD(N0, N1, Flags))
return FMA;
if (SDValue FMA = FuseFADD(N1, N0, Flags))
return FMA;
// fold (fmul (fsub +1.0, x1), y) -> (fma (fneg x1), y, y)
// fold (fmul (fsub -1.0, x1), y) -> (fma (fneg x1), y, (fneg y))
// fold (fmul (fsub x0, +1.0), y) -> (fma x0, y, (fneg y))
// fold (fmul (fsub x0, -1.0), y) -> (fma x0, y, y)
// X is the candidate FSUB operand of the multiply, Y the other multiplicand.
auto FuseFSUB = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
// Constant on the left of the subtraction: the subtrahend is negated.
if (auto *C0 = isConstOrConstSplatFP(X.getOperand(0), true)) {
if (C0->isExactlyValue(+1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
Y, Flags);
if (C0->isExactlyValue(-1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
// Constant on the right of the subtraction.
if (auto *C1 = isConstOrConstSplatFP(X.getOperand(1), true)) {
if (C1->isExactlyValue(+1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
if (C1->isExactlyValue(-1.0))
return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
Y, Flags);
return SDValue();
// Try the FSUB pattern with either multiply operand in the FSUB role.
if (SDValue FMA = FuseFSUB(N0, N1, Flags))
return FMA;
if (SDValue FMA = FuseFSUB(N1, N0, Flags))
return FMA;
return SDValue();
/// Combine an FADD node.
///
/// Tries, in order: generic FP binop simplification, vector folds, constant
/// folding and canonicalization of a constant to the RHS, removal of an added
/// zero, folding into a select, rewriting additions of negated values as
/// FSUB, the nnan-only cancellation folds, the folds guarded by
/// unsafe-math/reassoc together with nsz, and finally the FADD -> FMA
/// combines.
///
/// \param N the FADD node being visited.
/// \returns the replacement value, or an empty SDValue if nothing applied.
///
/// NOTE(review): this listing looks like it has lost lines (closing braces in
/// particular) -- verify the block structure against the upstream file.
SDValue DAGCombiner::visitFADD(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Whether each operand is an FP constant or a build vector of FP constants.
bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0);
bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
const SDNodeFlags Flags = N->getFlags();
// Give the DAG's generic FP binop simplifier the first chance.
if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
return R;
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (fadd c1, c2) -> c1 + c2
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FADD, DL, VT, N0, N1, Flags);
// canonicalize constant to RHS
if (N0CFP && !N1CFP)
return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags);
// N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math)
ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true);
if (N1C && N1C->isZero())
if (N1C->isNegative() || Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())
return N0;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// fold (fadd A, (fneg B)) -> (fsub A, B)
// Only done when FSUB may be created: before operation legalization, or when
// FSUB is legal/custom for this type.
if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
if (SDValue NegN1 = TLI.getCheaperNegatedExpression(
N1, DAG, LegalOperations, ForCodeSize))
return DAG.getNode(ISD::FSUB, DL, VT, N0, NegN1, Flags);
// fold (fadd (fneg A), B) -> (fsub B, A)
if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
if (SDValue NegN0 = TLI.getCheaperNegatedExpression(
N0, DAG, LegalOperations, ForCodeSize))
return DAG.getNode(ISD::FSUB, DL, VT, N1, NegN0, Flags);
// True when FMul is a single-use FMUL whose second operand is the constant
// (or constant splat) -2.0.
auto isFMulNegTwo = [](SDValue FMul) {
if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL)
return false;
auto *C = isConstOrConstSplatFP(FMul.getOperand(1), true);
return C && C->isExactlyValue(-2.0);
// fadd (fmul B, -2.0), A --> fsub A, (fadd B, B)
if (isFMulNegTwo(N0)) {
SDValue B = N0.getOperand(0);
SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags);
return DAG.getNode(ISD::FSUB, DL, VT, N1, Add, Flags);
// fadd A, (fmul B, -2.0) --> fsub A, (fadd B, B)
if (isFMulNegTwo(N1)) {
SDValue B = N1.getOperand(0);
SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags);
return DAG.getNode(ISD::FSUB, DL, VT, N0, Add, Flags);
// No FP constant should be created after legalization as Instruction
// Selection pass has a hard time dealing with FP constants.
bool AllowNewConst = (Level < AfterLegalizeDAG);
// If nnan is enabled, fold lots of things.
if ((Options.NoNaNsFPMath || Flags.hasNoNaNs()) && AllowNewConst) {
// If allowed, fold (fadd (fneg x), x) -> 0.0
if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
return DAG.getConstantFP(0.0, DL, VT);
// If allowed, fold (fadd x, (fneg x)) -> 0.0
if (N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
return DAG.getConstantFP(0.0, DL, VT);
// If 'unsafe math' or reassoc and nsz, fold lots of things.
// TODO: break out portions of the transformations below for which Unsafe is
// considered and which do not require both nsz and reassoc
if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
(Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
AllowNewConst) {
// fadd (fadd x, c1), c2 -> fadd x, c1 + c2
if (N1CFP && N0.getOpcode() == ISD::FADD &&
isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
SDValue NewC = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1, Flags);
return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), NewC, Flags);
// We can fold chains of FADD's of the same value into multiplications.
// This transform is not safe in general because we are reducing the number
// of rounding steps.
if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
if (N0.getOpcode() == ISD::FMUL) {
// The fold needs the multiply in (x, constant) form: constant second
// operand, non-constant first operand.
bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
bool CFP01 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));
// (fadd (fmul x, c), x) -> (fmul x, c+1)
if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
DAG.getConstantFP(1.0, DL, VT), Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP, Flags);
// (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
N1.getOperand(0) == N1.getOperand(1) &&
N0.getOperand(0) == N1.getOperand(0)) {
SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
DAG.getConstantFP(2.0, DL, VT), Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP, Flags);
// Mirror of the two folds above with the multiply on the right-hand side.
if (N1.getOpcode() == ISD::FMUL) {
bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
bool CFP11 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));
// (fadd x, (fmul x, c)) -> (fmul x, c+1)
if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
DAG.getConstantFP(1.0, DL, VT), Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP, Flags);
// (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
N0.getOperand(0) == N0.getOperand(1) &&
N1.getOperand(0) == N0.getOperand(0)) {
SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
DAG.getConstantFP(2.0, DL, VT), Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP, Flags);
if (N0.getOpcode() == ISD::FADD) {
bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
// (fadd (fadd x, x), x) -> (fmul x, 3.0)
if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
(N0.getOperand(0) == N1)) {
return DAG.getNode(ISD::FMUL, DL, VT,
N1, DAG.getConstantFP(3.0, DL, VT), Flags);
if (N1.getOpcode() == ISD::FADD) {
bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
// (fadd x, (fadd x, x)) -> (fmul x, 3.0)
if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
N1.getOperand(0) == N0) {
return DAG.getNode(ISD::FMUL, DL, VT,
N0, DAG.getConstantFP(3.0, DL, VT), Flags);
// (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
if (N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
N0.getOperand(0) == N0.getOperand(1) &&
N1.getOperand(0) == N1.getOperand(1) &&
N0.getOperand(0) == N1.getOperand(0)) {
return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0),
DAG.getConstantFP(4.0, DL, VT), Flags);
} // enable-unsafe-fp-math
// FADD -> FMA combines:
if (SDValue Fused = visitFADDForFMACombine(N)) {
return Fused;
return SDValue();
/// Combine an FSUB node.
///
/// Tries generic FP binop simplification, vector folds, constant folding,
/// folding into a select, subtraction of zero, x - x, subtraction from zero
/// (turned into a negation), the X - (X + Y) reassociation folds, rewriting a
/// subtracted negation as FADD, and finally the FSUB -> FMA combines.
///
/// \param N the FSUB node being visited.
/// \returns the replacement value, or an empty SDValue if nothing applied.
///
/// NOTE(review): this listing looks like it has lost lines (closing braces in
/// particular) -- verify the block structure against the upstream file.
SDValue DAGCombiner::visitFSUB(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Non-null when the operand is an FP constant or a splat of one.
ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true);
ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
const SDNodeFlags Flags = N->getFlags();
// Give the DAG's generic FP binop simplifier the first chance.
if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
return R;
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (fsub c1, c2) -> c1-c2
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags);
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// (fsub A, 0) -> A
// Subtracting +0.0 always folds; subtracting -0.0 folds only under nsz.
if (N1CFP && N1CFP->isZero()) {
if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath ||
Flags.hasNoSignedZeros()) {
return N0;
if (N0 == N1) {
// (fsub x, x) -> 0.0
// Requires nnan (global option or node flag).
if (Options.NoNaNsFPMath || Flags.hasNoNaNs())
return DAG.getConstantFP(0.0f, DL, VT);
// (fsub -0.0, N1) -> -N1
// NOTE: It is safe to transform an FSUB(-0.0,X) into an FNEG(X), since the
// FSUB does not specify the sign bit of a NaN. Also note that for
// the same reason, the inverse transform is not safe, unless fast math
// flags are in play.
if (N0CFP && N0CFP->isZero()) {
if (N0CFP->isNegative() ||
(Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
// Prefer an already-negated form of N1 if the target can provide one;
// otherwise emit an explicit FNEG when that is permitted.
if (SDValue NegN1 =
TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
return NegN1;
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags);
// The two folds below need unsafe-math with nsz globally, or reassoc with
// nsz on the node.
if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
(Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
N1.getOpcode() == ISD::FADD) {
// X - (X + Y) -> -Y
if (N0 == N1->getOperand(0))
return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1), Flags);
// X - (Y + X) -> -Y
if (N0 == N1->getOperand(1))
return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(0), Flags);
// fold (fsub A, (fneg B)) -> (fadd A, B)
if (SDValue NegN1 =
TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
return DAG.getNode(ISD::FADD, DL, VT, N0, NegN1, Flags);
// FSUB -> FMA combines:
if (SDValue Fused = visitFSUBForFMACombine(N)) {
return Fused;
return SDValue();
/// Combine an FMUL node.
///
/// Tries generic FP binop simplification, vector folds, constant folding and
/// canonicalization of a constant to the RHS, folding into a select,
/// multiplication by zero (nnan + nsz), the constant reassociation folds
/// (unsafe-math or reassoc), X * 2.0 -> X + X, X * -1.0 -> -X, removal of a
/// double negation, the select(+-1.0) -> fabs patterns, and finally the
/// distributive FMUL -> FMA combine.
///
/// \param N the FMUL node being visited.
/// \returns the replacement value, or an empty SDValue if nothing applied.
///
/// NOTE(review): this listing looks like it has lost lines (closing braces,
/// and some continuation lines flagged inline) -- verify against the upstream
/// file.
SDValue DAGCombiner::visitFMUL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Non-null when the operand is an FP constant or a splat of one.
ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true);
ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
const SDNodeFlags Flags = N->getFlags();
// Give the DAG's generic FP binop simplifier the first chance.
if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
return R;
// fold vector ops
if (VT.isVector()) {
// This just handles C1 * C2 for vectors. Other vector folds are below.
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (fmul c1, c2) -> c1*c2
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FMUL, DL, VT, N0, N1, Flags);
// canonicalize constant to RHS
// NOTE(review): the second half of this condition is not visible in this
// listing -- confirm against upstream.
if (isConstantFPBuildVectorOrConstantFP(N0) &&
return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags);
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
// Multiplying by zero folds to that zero only when both nnan and nsz hold,
// either globally or on the node.
if ((Options.NoNaNsFPMath && Options.NoSignedZerosFPMath) ||
(Flags.hasNoNaNs() && Flags.hasNoSignedZeros())) {
// fold (fmul A, 0) -> 0
if (N1CFP && N1CFP->isZero())
return N1;
if (Options.UnsafeFPMath || Flags.hasAllowReassociation()) {
// fmul (fmul X, C1), C2 -> fmul X, C1 * C2
if (isConstantFPBuildVectorOrConstantFP(N1) &&
N0.getOpcode() == ISD::FMUL) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
// Avoid an infinite loop by making sure that N00 is not a constant
// (the inner multiply has not been constant folded yet).
if (isConstantFPBuildVectorOrConstantFP(N01) &&
!isConstantFPBuildVectorOrConstantFP(N00)) {
SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1, Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts, Flags);
// Match a special-case: we convert X * 2.0 into fadd.
// fmul (fadd X, X), C -> fmul X, 2.0 * C
if (N0.getOpcode() == ISD::FADD && N0.hasOneUse() &&
N0.getOperand(0) == N0.getOperand(1)) {
const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts, Flags);
// fold (fmul X, 2.0) -> (fadd X, X)
if (N1CFP && N1CFP->isExactlyValue(+2.0))
return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags);
// fold (fmul X, -1.0) -> (fneg X)
if (N1CFP && N1CFP->isExactlyValue(-1.0))
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
return DAG.getNode(ISD::FNEG, DL, VT, N0);
// -N0 * -N1 --> N0 * N1
// NOTE(review): the initializers of CostN0/CostN1 are not visible in this
// listing -- confirm against upstream.
TargetLowering::NegatibleCost CostN0 =
TargetLowering::NegatibleCost CostN1 =
SDValue NegN0 =
TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
SDValue NegN1 =
TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
// Both operands must be negatable, and at least one negation must be
// reported as cheaper, before the multiply is rebuilt.
if (NegN0 && NegN1 &&
(CostN0 == TargetLowering::NegatibleCost::Cheaper ||
CostN1 == TargetLowering::NegatibleCost::Cheaper))
return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1, Flags);
// fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
// fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X)
if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() &&
(N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) &&
TLI.isOperationLegal(ISD::FABS, VT)) {
// Normalize so that Select names the SELECT operand and X the other one.
SDValue Select = N0, X = N1;
if (Select.getOpcode() != ISD::SELECT)
std::swap(Select, X);
SDValue Cond = Select.getOperand(0);
auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1));
auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2));
// The select arms must be FP constants and the condition must be a SETCC
// comparing X against the FP constant 0.0.
if (TrueOpnd && FalseOpnd &&
Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X &&
isa<ConstantFPSDNode>(Cond.getOperand(1)) &&
cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
switch (CC) {
default: break;
// NOTE(review): no break or fallthrough marker is visible after the swap
// below; presumably less-than is handled by swapping the arms and then
// sharing the greater-than code -- confirm against upstream.
case ISD::SETLT:
case ISD::SETLE:
std::swap(TrueOpnd, FalseOpnd);
case ISD::SETGT:
case ISD::SETGE:
if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) &&
TLI.isOperationLegal(ISD::FNEG, VT))
return DAG.getNode(ISD::FNEG, DL, VT,
DAG.getNode(ISD::FABS, DL, VT, X));
if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0))
return DAG.getNode(ISD::FABS, DL, VT, X);
// FMUL -> FMA combines:
if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) {
return Fused;
return SDValue();
/// Combine an FMA node (operands N0 * N1 + N2).
///
/// Tries constant folding, removal of a double negation on the multiplicands,
/// the zero/one multiplicand folds (unsafe math or a contractable node),
/// canonicalization of a constant multiplicand, the constant-reassociation
/// folds, multiplication by +-1.0, moving a negation onto a constant, and
/// pulling a negation out of the whole FMA.
///
/// \param N the FMA node being visited.
/// \returns the replacement value, or an empty SDValue if nothing applied.
///
/// NOTE(review): this listing looks like it has lost lines (closing braces,
/// and some continuation lines flagged inline) -- verify against the upstream
/// file.
SDValue DAGCombiner::visitFMA(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
// Scalar FP constants only (plain dyn_cast, unlike the splat-aware helper
// used for the constant checks further down).
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
// FMA nodes have flags that propagate to the created nodes.
const SDNodeFlags Flags = N->getFlags();
// Local shorthand: global unsafe math, or this node is contractable.
bool UnsafeFPMath = Options.UnsafeFPMath || isContractable(N);
// Constant fold FMA.
if (isa<ConstantFPSDNode>(N0) &&
isa<ConstantFPSDNode>(N1) &&
isa<ConstantFPSDNode>(N2)) {
return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2);
// (-N0 * -N1) + N2 --> (N0 * N1) + N2
// NOTE(review): the initializers of CostN0/CostN1 are not visible in this
// listing -- confirm against upstream.
TargetLowering::NegatibleCost CostN0 =
TargetLowering::NegatibleCost CostN1 =
SDValue NegN0 =
TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
SDValue NegN1 =
TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
// Both multiplicands must be negatable, and at least one negation must be
// reported as cheaper, before the FMA is rebuilt.
if (NegN0 && NegN1 &&
(CostN0 == TargetLowering::NegatibleCost::Cheaper ||
CostN1 == TargetLowering::NegatibleCost::Cheaper))
return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2, Flags);
if (UnsafeFPMath) {
// A zero multiplicand leaves just the addend.
if (N0CFP && N0CFP->isZero())
return N2;
if (N1CFP && N1CFP->isZero())
return N2;
// TODO: The FMA node should have flags that propagate to these nodes.
// A multiplicand of exactly 1.0 turns the FMA into a plain FADD.
if (N0CFP && N0CFP->isExactlyValue(1.0))
return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2);
if (N1CFP && N1CFP->isExactlyValue(1.0))
return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2);
// Canonicalize (fma c, x, y) -> (fma x, c, y)
// NOTE(review): the second half of this condition is not visible in this
// listing -- confirm against upstream.
if (isConstantFPBuildVectorOrConstantFP(N0) &&
return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2);
if (UnsafeFPMath) {
// (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) &&
isConstantFPBuildVectorOrConstantFP(N1) &&
isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) {
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1),
Flags), Flags);
// (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
// NOTE(review): the tail of the return statement below is not visible in
// this listing -- confirm against upstream.
if (N0.getOpcode() == ISD::FMUL &&
isConstantFPBuildVectorOrConstantFP(N1) &&
isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
return DAG.getNode(ISD::FMA, DL, VT,
DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1),
// (fma x, 1, y) -> (fadd x, y)
// (fma x, -1, y) -> (fadd (fneg x), y)
if (N1CFP) {
if (N1CFP->isExactlyValue(1.0))
// TODO: The FMA node should have flags that propagate to this node.
return DAG.getNode(ISD::FADD, DL, VT, N0, N2);
if (N1CFP->isExactlyValue(-1.0) &&
(!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0);
// TODO: The FMA node should have flags that propagate to this node.
return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg);
// fma (fneg x), K, y -> fma x, -K, y
// Done when ConstantFP is legal for this type, or when K has a single use
// and is not a legal FP immediate.
if (N0.getOpcode() == ISD::FNEG &&
(TLI.isOperationLegal(ISD::ConstantFP, VT) ||
(N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT,
ForCodeSize)))) {
return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2);
if (UnsafeFPMath) {
// (fma x, c, x) -> (fmul x, (c+1))
// NOTE(review): the tails of the two return statements in this section
// are not visible in this listing -- confirm against upstream.
if (N1CFP && N0 == N2) {
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getNode(ISD::FADD, DL, VT, N1,
DAG.getConstantFP(1.0, DL, VT), Flags),
// (fma x, c, (fneg x)) -> (fmul x, (c-1))
if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getNode(ISD::FADD, DL, VT, N1,
DAG.getConstantFP(-1.0, DL, VT), Flags),
// fold ((fma (fneg X), Y, (fneg Z)) -> fneg (fma X, Y, Z))
// fold ((fma X, (fneg Y), (fneg Z)) -> fneg (fma X, Y, Z))
// Skipped on targets where FNEG is free for this type.
if (!TLI.isFNegFree(VT))
if (SDValue Neg = TLI.getCheaperNegatedExpression(
SDValue(N, 0), DAG, LegalOperations, ForCodeSize))
return DAG.getNode(ISD::FNEG, DL, VT, Neg, Flags);
return SDValue();
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal.
// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
// Notice that this is not always beneficial. One reason is different targets
// may have different costs for FDIV and FMUL, so sometimes the cost of two
// FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason
// is the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
//
// N is the FDIV being visited. Returns SDValue(N, 0) after the eligible
// users have been replaced via CombineTo, or an empty SDValue when the
// transform is not applied.
//
// NOTE(review): this listing looks like it has lost lines (closing braces,
// and one statement flagged inline) -- verify against the upstream file.
SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
// TODO: Limit this transform based on optsize/minsize - it always creates at
// least 1 extra instruction. But the perf win may be substantial enough
// that only minsize should restrict this.
bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
const SDNodeFlags Flags = N->getFlags();
// Not done once the DAG is legal, and only when reciprocals are allowed
// globally (unsafe math) or by this node's flags.
if (LegalDAG || (!UnsafeMath && !Flags.hasAllowReciprocal()))
return SDValue();
// Skip if current node is a reciprocal/fneg-reciprocal.
SDValue N0 = N->getOperand(0);
ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, /* AllowUndefs */ true);
if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
return SDValue();
// Exit early if the target does not want this transform or if there can't
// possibly be enough uses of the divisor to make the transform worthwhile.
SDValue N1 = N->getOperand(1);
// A MinUses of zero disables the transform (see the !MinUses check below).
unsigned MinUses = TLI.combineRepeatedFPDivisors();
// For splat vectors, scale the number of uses by the splat factor. If we can
// convert the division into a scalar op, that will likely be much faster.
unsigned NumElts = 1;
EVT VT = N->getValueType(0);
if (VT.isVector() && DAG.isSplatValue(N1))
NumElts = VT.getVectorNumElements();
if (!MinUses || (N1->use_size() * NumElts) < MinUses)
return SDValue();
// Find all FDIV users of the same divisor.
// Use a set because duplicates may be present in the user list.
SetVector<SDNode *> Users;
for (auto *U : N1->uses()) {
// Only FDIVs that use N1 as the divisor (operand 1) qualify.
if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) {
// This division is eligible for optimization only if global unsafe math
// is enabled or if this division allows reciprocal formation.
// NOTE(review): the statement that records U in Users is not visible in
// this listing -- confirm against upstream.
if (UnsafeMath || U->getFlags().hasAllowReciprocal())
// Now that we have the actual number of divisor uses, make sure it meets
// the minimum threshold specified by the target.
if ((Users.size() * NumElts) < MinUses)
return SDValue();
SDLoc DL(N);
SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
// The shared reciprocal 1.0 / Divisor, built with the visited node's flags.
SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags);
// Dividend / Divisor -> Dividend * Reciprocal
for (auto *U : Users) {
SDValue Dividend = U->getOperand(0);
if (Dividend != FPOne) {
SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend,
Reciprocal, Flags);
CombineTo(U, NewNode);
} else if (U != Reciprocal.getNode()) {
// In the absence of fast-math-flags, this user node is always the
// same node as Reciprocal, but with FMF they may be different nodes.
CombineTo(U, Reciprocal);
return SDValue(N, 0); // N was replaced.
SDValue DAGCombiner::visitFDIV(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
SDNodeFlags Flags = N->getFlags();
if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
return R;
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
// fold (fdiv c1, c2) -> c1/c2
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
if (SDValue V = combineRepeatedFPDivisors(N))
return V;
if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (N1CFP) {
// Compute the reciprocal 1.0 / c2.
const APFloat &N1APF = N1CFP->getValueAPF();
APFloat Recip(N1APF.getSemantics(), 1); // 1.0
APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
// Only do the transform if the reciprocal is a legal fp immediate that
// isn't too nasty (eg NaN, denormal, ...).
if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
(!LegalOperations ||
// FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
// backend)... we should handle this gracefully after Legalize.
// TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
TLI.isOperationLegal(ISD::ConstantFP, VT) ||
TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getConstantFP(Recip, DL, VT), Flags);
// If this FDIV is part of a reciprocal square root, it may be folded
// into a target-specific square root estimate instruction.
if (N1.getOpcode() == ISD::FSQRT) {
if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags))
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
} else if (N1.getOpcode() == ISD::FP_EXTEND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
Flags)) {
RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
} else if (N1.getOpcode() == ISD::FP_ROUND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
Flags)) {
RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
} else if (N1.getOpcode() == ISD::FMUL) {
// Look through an FMUL. Even though this won't remove the FDIV directly,
// it's still worthwhile to get rid of the FSQRT if possible.
SDValue Sqrt, Y;
if (N1.getOperand(0).getOpcode() == ISD::FSQRT) {
Sqrt = N1.getOperand(0);
Y = N1.getOperand(1);
} else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) {
Sqrt = N1.getOperand(1);
Y = N1.getOperand(0);
if (Sqrt.getNode()) {
// If the other multiply operand is known positive, pull it into the
// sqrt. That will eliminate the division if we convert to an estimate:
// X / (fabs(A) * sqrt(Z)) --> X / sqrt(A*A*Z) --> X * rsqrt(A*A*Z)
// TODO: Also fold the case where A == Z (fabs is missing).
if (Flags.hasAllowReassociation() && N1.hasOneUse() &&
N1->getFlags().hasAllowReassociation() && Sqrt.hasOneUse() &&
Y.getOpcode() == ISD::FABS && Y.hasOneUse()) {
SDValue AA = DAG.getNode(ISD::FMUL, DL, VT, Y.getOperand(0),
Y.getOperand(0), Flags);
SDValue AAZ =
DAG.getNode(ISD::FMUL, DL, VT, AA, Sqrt.getOperand(0), Flags);
if (SDValue Rsqrt = buildRsqrtEstimate(AAZ, Flags))
return DAG.getNode(ISD::FMUL, DL, VT, N0, Rsqrt, Flags);
// Estimate creation failed. Clean up speculatively created nodes.
// We found a FSQRT, so try to make this fold:
// X / (Y * sqrt(Z)) -> X * (rsqrt(Z) / Y)
if (SDValue Rsqrt = buildRsqrtEstimate(Sqrt.getOperand(0), Flags)) {
SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, Rsqrt, Y, Flags);
return DAG.getNode(ISD::FMUL, DL, VT, N0, Div, Flags);
// Fold into a reciprocal estimate and multiply instead of a real divide.
if (Options.NoInfsFPMath || Flags.hasNoInfs())
if (SDValue RV = BuildDivEstimate(N0, N1, Flags))
return RV;
// (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
TargetLowering::NegatibleCost CostN0 =
TargetLowering::NegatibleCost CostN1 =
SDValue NegN0 =
TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
SDValue NegN1 =
TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
if (NegN0 && NegN1 &&
(CostN0 == TargetLowering::NegatibleCost::Cheaper ||
CostN1 == TargetLowering::NegatibleCost::Cheaper))
return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1, Flags);
return SDValue();
/// Combine an FREM node.
///
/// Tries, in order: DAG.simplifyFPBinop on the two operands, re-emitting the
/// node when both operands are scalar FP constants (so getNode can fold it),
/// and foldBinOpIntoSelect.
/// \returns the replacement value, or an empty SDValue if nothing applied.
SDValue DAGCombiner::visitFREM(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
EVT VT = N->getValueType(0);
SDNodeFlags Flags = N->getFlags();
if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
return R;
// fold (frem c1, c2) -> fmod(c1,c2)
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, N->getFlags());
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
return SDValue();
/// Combine an FSQRT node into a square-root estimate.
///
/// The transform is attempted only when approximation is permitted
/// (UnsafeFPMath or the 'afn' flag) AND infinities may be ignored
/// (NoInfsFPMath or the 'ninf' flag), and the target does not report the
/// square root as cheap.
/// \returns the result of buildSqrtEstimate, or an empty SDValue on bail-out.
SDValue DAGCombiner::visitFSQRT(SDNode *N) {
SDNodeFlags Flags = N->getFlags();
const TargetOptions &Options = DAG.getTarget().Options;
// Require 'ninf' flag since sqrt(+Inf) = +Inf, but the estimation goes as:
// sqrt(+Inf) == rsqrt(+Inf) * +Inf = 0 * +Inf = NaN
if ((!Options.UnsafeFPMath && !Flags.hasApproximateFuncs()) ||
(!Options.NoInfsFPMath && !Flags.hasNoInfs()))
return SDValue();
SDValue N0 = N->getOperand(0);
// Leave the node alone when the target says a real sqrt is cheap.
if (TLI.isFsqrtCheap(N0, DAG))
return SDValue();
// FSQRT nodes have flags that propagate to the created nodes.
return buildSqrtEstimate(N0, Flags);
/// Decide whether the FP conversion feeding the sign operand of an FCOPYSIGN
/// can be looked through:
/// copysign(x, fp_extend(y)) -> copysign(x, y)
/// copysign(x, fp_round(y)) -> copysign(x, y)
///
/// \param N the FCOPYSIGN node; only operand 1 (the sign source) is inspected.
/// \returns true when operand 1 is an FP_EXTEND or FP_ROUND whose result type
/// equals its source type or whose source type is not f128; false otherwise.
static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
SDValue N1 = N->getOperand(1);
if ((N1.getOpcode() == ISD::FP_EXTEND ||
N1.getOpcode() == ISD::FP_ROUND)) {
// Do not optimize out type conversion of f128 type yet.
// For some targets like x86_64, configuration is changed to keep one f128
// value in one SSE register, but instruction selection cannot handle
// FCOPYSIGN on SSE registers yet.
EVT N1VT = N1->getValueType(0);
EVT N1Op0VT = N1->getOperand(0).getValueType();
return (N1VT == N1Op0VT || N1Op0VT != MVT::f128);
return false;
/// Combine an FCOPYSIGN node.
///
/// Handles constant folding, a constant (or constant-splat) sign operand,
/// sign-changing operations on the magnitude operand (which are irrelevant to
/// the result), and sign operands that are themselves FABS / FCOPYSIGN /
/// FP conversions.
/// \returns the replacement value, or an empty SDValue if nothing applied.
SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0);
bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
EVT VT = N->getValueType(0);
if (N0CFP && N1CFP) // Constant fold
return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1);
if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) {
const APFloat &V = N1C->getValueAPF();
// copysign(x, c1) -> fabs(x) iff ispos(c1)
// copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
if (!V.isNegative()) {
if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
} else {
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
return DAG.getNode(ISD::FNEG, SDLoc(N), VT,
DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
// The sign of the magnitude operand is overwritten, so any operation on N0
// that only changes its sign can be dropped:
// copysign(fabs(x), y) -> copysign(x, y)
// copysign(fneg(x), y) -> copysign(x, y)
// copysign(copysign(x,z), y) -> copysign(x, y)
if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
N0.getOpcode() == ISD::FCOPYSIGN)
return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0.getOperand(0), N1);
// copysign(x, abs(y)) -> abs(x)
if (N1.getOpcode() == ISD::FABS)
return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
// copysign(x, copysign(y,z)) -> copysign(x, z)
if (N1.getOpcode() == ISD::FCOPYSIGN)
return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(1));
// copysign(x, fp_extend(y)) -> copysign(x, y)
// copysign(x, fp_round(y)) -> copysign(x, y)
// NOTE(review): no guard for this fold is visible in this listing;
// presumably it is conditional on CanCombineFCOPYSIGN_EXTEND_ROUND(N) --
// confirm against the upstream file.
return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0));
return SDValue();
/// Combine an FPOW node whose exponent is a constant (or constant splat).
///
/// Two families of exponents are handled:
///  * exactly 1/3 (f32 or f64 only) -> FCBRT
///  * exactly 0.25 or 0.75          -> one or two FSQRT nodes (plus an FMUL)
/// Both are gated on fast-math flags of the node.
/// \returns the replacement value, or an empty SDValue if nothing applied.
SDValue DAGCombiner::visitFPOW(SDNode *N) {
ConstantFPSDNode *ExponentC = isConstOrConstSplatFP(N->getOperand(1));
if (!ExponentC)
return SDValue();
// Try to convert x ** (1/3) into cube root.
// TODO: Handle the various flavors of long double.
// TODO: Since we're approximating, we don't need an exact 1/3 exponent.
// Some range near 1/3 should be fine.
EVT VT = N->getValueType(0);
if ((VT == MVT::f32 && ExponentC->getValueAPF().isExactlyValue(1.0f/3.0f)) ||
(VT == MVT::f64 && ExponentC->getValueAPF().isExactlyValue(1.0/3.0))) {
// pow(-0.0, 1/3) = +0.0; cbrt(-0.0) = -0.0.
// pow(-inf, 1/3) = +inf; cbrt(-inf) = -inf.
// pow(-val, 1/3) = nan; cbrt(-val) = -num.
// For regular numbers, rounding may cause the results to differ.
// Therefore, we require { nsz ninf nnan afn } for this transform.
// TODO: We could select out the special cases if we don't have nsz/ninf.
SDNodeFlags Flags = N->getFlags();
// NOTE(review): the condition below ends in '||' in this listing; the 'afn'
// check named in the comment above is presumably on a dropped line --
// confirm against the upstream file.
if (!Flags.hasNoSignedZeros() || !Flags.hasNoInfs() || !Flags.hasNoNaNs() ||
return SDValue();
// Do not create a cbrt() libcall if the target does not have it, and do not
// turn a pow that has lowering support into a cbrt() libcall.
if (!DAG.getLibInfo().has(LibFunc_cbrt) ||
(!DAG.getTargetLoweringInfo().isOperationExpand(ISD::FPOW, VT) &&
DAG.getTargetLoweringInfo().isOperationExpand(ISD::FCBRT, VT)))
return SDValue();
return DAG.getNode(ISD::FCBRT, SDLoc(N), VT, N->getOperand(0), Flags);
// Try to convert x ** (1/4) and x ** (3/4) into square roots.
// x ** (1/2) is canonicalized to sqrt, so we do not bother with that case.
// TODO: This could be extended (using a target hook) to handle smaller
// power-of-2 fractional exponents.
bool ExponentIs025 = ExponentC->getValueAPF().isExactlyValue(0.25);
bool ExponentIs075 = ExponentC->getValueAPF().isExactlyValue(0.75);
if (ExponentIs025 || ExponentIs075) {
// pow(-0.0, 0.25) = +0.0; sqrt(sqrt(-0.0)) = -0.0.
// pow(-inf, 0.25) = +inf; sqrt(sqrt(-inf)) = NaN.
// pow(-0.0, 0.75) = +0.0; sqrt(-0.0) * sqrt(sqrt(-0.0)) = +0.0.
// pow(-inf, 0.75) = +inf; sqrt(-inf) * sqrt(sqrt(-inf)) = NaN.
// For regular numbers, rounding may cause the results to differ.
// Therefore, we require { nsz ninf afn } for this transform.
// TODO: We could select out the special cases if we don't have nsz/ninf.
SDNodeFlags Flags = N->getFlags();
// We only need no signed zeros for the 0.25 case.
// NOTE(review): this condition also ends in '||' in this listing; the 'afn'
// check is presumably on a dropped line -- confirm.
if ((!Flags.hasNoSignedZeros() && ExponentIs025) || !Flags.hasNoInfs() ||
return SDValue();
// Don't double the number of libcalls. We are trying to inline fast code.
if (!DAG.getTargetLoweringInfo().isOperationLegalOrCustom(ISD::FSQRT, VT))
return SDValue();
// Assume that libcalls are the smallest code.
// TODO: This restriction should probably be lifted for vectors.
if (ForCodeSize)
return SDValue();
// pow(X, 0.25) --> sqrt(sqrt(X))
SDLoc DL(N);
SDValue Sqrt = DAG.getNode(ISD::FSQRT, DL, VT, N->getOperand(0), Flags);
SDValue SqrtSqrt = DAG.getNode(ISD::FSQRT, DL, VT, Sqrt, Flags);
if (ExponentIs025)
return SqrtSqrt;
// pow(X, 0.75) --> sqrt(X) * sqrt(sqrt(X))
return DAG.getNode(ISD::FMUL, DL, VT, Sqrt, SqrtSqrt, Flags);
return SDValue();
/// Fold an integer-to-FP conversion of an FP-to-integer conversion into a
/// single FTRUNC: [us]itofp (fpto[us]i X) --> ftrunc X.
///
/// \param N   the SINT_TO_FP or UINT_TO_FP node being combined.
/// \param DAG the selection DAG used to query function attributes and to
///            create the new node.
/// \param TLI target lowering info, queried for FTRUNC legality.
/// \returns the FTRUNC node, or an empty SDValue when the pattern does not
/// match or the fold is disallowed.
static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI) {
// This optimization is guarded by a function attribute because it may produce
// unexpected results. Ie, programs may be relying on the platform-specific
// undefined behavior when the float-to-int conversion overflows.
const Function &F = DAG.getMachineFunction().getFunction();
Attribute StrictOverflow = F.getFnAttribute("strict-float-cast-overflow");
// Bail out when the attribute's string value is exactly "false".
if (StrictOverflow.getValueAsString().equals("false"))
return SDValue();
// We only do this if the target has legal ftrunc. Otherwise, we'd likely be
// replacing casts with a libcall. We also must be allowed to ignore -0.0
// because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer
// conversions would return +0.0.
// FIXME: We should be able to use node-level FMF here.
// TODO: If strict math, should we use FABS (+ range check for signed cast)?
EVT VT = N->getValueType(0);
// NOTE(review): the condition below ends in '||' in this listing; the
// "ignore -0.0" check described above is presumably on a dropped line --
// confirm against the upstream file.
if (!TLI.isOperationLegal(ISD::FTRUNC, VT) ||
return SDValue();
// fptosi/fptoui round towards zero, so converting from FP to integer and
// back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
// The signedness of the two conversions must match, and the innermost FP
// value must already have the result type.
SDValue N0 = N->getOperand(0);
if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
N0.getOperand(0).getValueType() == VT)
return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));
if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
N0.getOperand(0).getValueType() == VT)
return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));
return SDValue();
/// Combine a SINT_TO_FP node.
///
/// Handles an undef operand, constant integer operands, switching to
/// UINT_TO_FP when only that is available and the sign bit is known zero,
/// conversions of (zero-extended) SETCC results into selects of FP
/// constants, and the fp->int->fp round trip (foldFPToIntToFP).
/// \returns the replacement value, or an empty SDValue if nothing applied.
SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT OpVT = N0.getValueType();
// [us]itofp(undef) = 0, because the result value is bounded.
if (N0.isUndef())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
// fold (sint_to_fp c1) -> c1fp
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
// ...but only if the target supports immediate floating-point values
(!LegalOperations ||
TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
// If the input is a legal type, and SINT_TO_FP is not legal on this target,
// but UINT_TO_FP is legal on this target, try to convert.
if (!hasOperation(ISD::SINT_TO_FP, OpVT) &&
hasOperation(ISD::UINT_TO_FP, OpVT)) {
// If the sign bit is known to be zero, we can change this to UINT_TO_FP.
if (DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
// The next optimizations are desirable only if SELECT_CC can be lowered.
// fold (sint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), -1.0, 0.0)
// An i1 'true' interpreted as signed is -1, hence the -1.0 constant.
if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
!VT.isVector() &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
SDLoc DL(N);
return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(-1.0, DL, VT),
DAG.getConstantFP(0.0, DL, VT));
// fold (sint_to_fp (zext (setcc x, y, cc))) ->
// (select (setcc x, y, cc), 1.0, 0.0)
if (N0.getOpcode() == ISD::ZERO_EXTEND &&
N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
SDLoc DL(N);
return DAG.getSelect(DL, VT, N0.getOperand(0),
DAG.getConstantFP(1.0, DL, VT),
DAG.getConstantFP(0.0, DL, VT));
if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
return FTrunc;
return SDValue();
/// Combine a UINT_TO_FP node.
///
/// Mirrors the SINT_TO_FP combine: handles an undef operand, constant integer
/// operands, switching to SINT_TO_FP when only that is available and the sign
/// bit is known zero, conversion of a SETCC result into a select of FP
/// constants, and the fp->int->fp round trip (foldFPToIntToFP).
/// \returns the replacement value, or an empty SDValue if nothing applied.
SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT OpVT = N0.getValueType();
// [us]itofp(undef) = 0, because the result value is bounded.
if (N0.isUndef())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
// fold (uint_to_fp c1) -> c1fp
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
// ...but only if the target supports immediate floating-point values
(!LegalOperations ||
TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
// If the input is a legal type, and UINT_TO_FP is not legal on this target,
// but SINT_TO_FP is legal on this target, try to convert.
if (!hasOperation(ISD::UINT_TO_FP, OpVT) &&
hasOperation(ISD::SINT_TO_FP, OpVT)) {
// If the sign bit is known to be zero, we can change this to SINT_TO_FP.
if (DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
// fold (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), 1.0, 0.0)
if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
SDLoc DL(N);
return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(1.0, DL, VT),
DAG.getConstantFP(0.0, DL, VT));
if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
return FTrunc;
return SDValue();
// Fold (fp_to_{s/u}int ({s/u}int_to_fpx)) -> zext x, sext x, trunc x, or x
//
// N is the FP_TO_SINT / FP_TO_UINT node; its operand must be a SINT_TO_FP or
// UINT_TO_FP. The fold is only performed when the intermediate FP type has
// enough precision to represent every integer that can legally flow through.
// Returns an integer extend, truncate or bitcast of the original integer, or
// an empty SDValue when the fold does not apply.
static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP)
return SDValue();
SDValue Src = N0.getOperand(0);
EVT SrcVT = Src.getValueType();
bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP;
bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT;
// We can safely assume the conversion won't overflow the output range,
// because (for example) (uint8_t)18293.f is undefined behavior.
// Since we can assume the conversion won't overflow, our decision as to
// whether the input will fit in the float should depend on the minimum
// of the input range and output range.
// This means this is also safe for a signed input and unsigned output, since
// a negative input would lead to undefined behavior.
// A signed type spends one bit on the sign, so it has one fewer value bit.
unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned;
unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned;
unsigned ActualSize = std::min(InputSize, OutputSize);
const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType());
// We can only fold away the float conversion if the input range can be
// represented exactly in the float range.
if (APFloat::semanticsPrecision(sem) >= ActualSize) {
if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) {
// NOTE(review): the ternary below is truncated in this listing (no ':' arm
// is visible); presumably the alternative is a zero extension -- confirm
// against the upstream file.
unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND
return DAG.getNode(ExtOp, SDLoc(N), VT, Src);
if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits())
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src);
// Same scalar width: reinterpret the source as the result type.
return DAG.getBitcast(VT, Src);
return SDValue();
/// Combine an FP_TO_SINT node: propagate undef, re-emit the node for constant
/// FP operands so it can be folded, and otherwise try to remove an
/// int->fp->int round trip via FoldIntToFPToInt.
SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (fp_to_sint undef) -> undef
if (N0.isUndef())
return DAG.getUNDEF(VT);
// fold (fp_to_sint c1fp) -> c1
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0);
return FoldIntToFPToInt(N, DAG);
/// Combine an FP_TO_UINT node: propagate undef, re-emit the node for constant
/// FP operands so it can be folded, and otherwise try to remove an
/// int->fp->int round trip via FoldIntToFPToInt.
SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (fp_to_uint undef) -> undef
if (N0.isUndef())
return DAG.getUNDEF(VT);
// fold (fp_to_uint c1fp) -> c1
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0);
return FoldIntToFPToInt(N, DAG);
/// Combine an FP_ROUND node.
///
/// Operand 0 is the value being rounded; operand 1 is a constant that this
/// function compares against 1 (the existing comments describe a value of 1
/// as a "value preserving truncation"). Handles constant operands, rounding
/// of an FP_EXTEND back to its source type, chained FP_ROUNDs, FCOPYSIGN
/// operands, and vselect size matching.
/// \returns the replacement value, or an empty SDValue if nothing applied.
SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
EVT VT = N->getValueType(0);
// fold (fp_round c1fp) -> c1fp
if (N0CFP)
return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0, N1);
// fold (fp_round (fp_extend x)) -> x
if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType())
return N0.getOperand(0);
// fold (fp_round (fp_round x)) -> (fp_round x)
if (N0.getOpcode() == ISD::FP_ROUND) {
const bool NIsTrunc = N->getConstantOperandVal(1) == 1;
const bool N0IsTrunc = N0.getConstantOperandVal(1) == 1;
// Skip this folding if it results in an fp_round from f80 to f16.
// f80 to f16 always generates an expensive (and as yet, unimplemented)
// libcall to __truncxfhf2 instead of selecting native f16 conversion
// instructions from f32 or f64. Moreover, the first (value-preserving)
// fp_round from f80 to either f32 or f64 may become a NOP in platforms like
// x86.
if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16)
return SDValue();
// If the first fp_round isn't a value preserving truncation, it might
// introduce a tie in the second fp_round, that wouldn't occur in the
// single-step fp_round we want to fold to.
// In other words, double rounding isn't the same as rounding.
// Also, this is a value preserving truncation iff both fp_round's are.
if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) {
SDLoc DL(N);
return DAG.getNode(ISD::FP_ROUND, DL, VT, N0.getOperand(0),
DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL));
// fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
// Only done when the copysign has a single use, so it is not duplicated.
if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) {
SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT,
N0.getOperand(0), N1);
return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
Tmp, N0.getOperand(1));
if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;
return SDValue();
/// Combine an FP_EXTEND node.
///
/// Handles constant operands, an FP16_TO_FP operand, extension of a
/// value-preserving FP_ROUND, turning an extended normal load into an
/// extending load, and vselect size matching.
/// \returns the replacement value, an empty SDValue if nothing applied, or
/// SDValue(N, 0) after the load fold has already replaced N via CombineTo.
SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
if (N->hasOneUse() &&
N->use_begin()->getOpcode() == ISD::FP_ROUND)
return SDValue();
// fold (fp_extend c1fp) -> c1fp
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0);
// fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op)
if (N0.getOpcode() == ISD::FP16_TO_FP &&
TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal)
return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0));
// Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
// value of X.
if (N0.getOpcode() == ISD::FP_ROUND
&& N0.getConstantOperandVal(1) == 1) {
SDValue In = N0.getOperand(0);
if (In.getValueType() == VT) return In;
// The original value may be wider or narrower than the requested type, so
// a round or an extend of it may still be needed.
if (VT.bitsLT(In.getValueType()))
return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
In, N0.getOperand(1));
return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
// fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
// NOTE(review): the getExtLoad call and the statement wrapping the
// FP_ROUND node below are incomplete in this listing (arguments and a call
// head appear to be missing) -- restore them from the upstream file before
// relying on this fold.
SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
LN0->getBasePtr(), N0.getValueType(),
CombineTo(N, ExtLoad);
DAG.getNode(ISD::FP_ROUND, SDLoc(N0),
N0.getValueType(), ExtLoad,
DAG.getIntPtrConstant(1, SDLoc(N0))),
return SDValue(N, 0); // Return N so it doesn't get rechecked!
if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;
return SDValue();
/// Combine an FCEIL node. The only visible fold re-emits the node when the
/// operand is a constant FP value (or build vector of them) so that getNode
/// can fold it; otherwise an empty SDValue is returned.
SDValue DAGCombiner::visitFCEIL(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (fceil c1) -> fceil(c1)
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0);
return SDValue();
/// Combine an FTRUNC node: re-emit it for constant operands so it can be
/// folded, and drop it entirely when the operand is produced by an operation
/// that already yields an integral value.
SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (ftrunc c1) -> ftrunc(c1)
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0);
// fold ftrunc (known rounded int x) -> x
// ftrunc is a part of fptosi/fptoui expansion on some targets, so this is
// likely to be generated to extract integer from a rounded floating value.
// NOTE(review): only the FRINT and FCEIL cases are visible in this listing;
// the case list may be incomplete -- confirm against the upstream file.
switch (N0.getOpcode()) {
default: break;
case ISD::FRINT:
case ISD::FCEIL:
return N0;
return SDValue();
/// Combine an FFLOOR node. The only visible fold re-emits the node when the
/// operand is a constant FP value (or build vector of them) so that getNode
/// can fold it; otherwise an empty SDValue is returned.
SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (ffloor c1) -> ffloor(c1)
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0);
return SDValue();
// FIXME: FNEG and FABS have a lot in common; refactor.
/// Combine an FNEG node.
///
/// Handles constant operands, operands for which the target can produce a
/// negated expression, negation of an FSUB under no-signed-zeros, negation
/// of a bitcast integer (done as an integer XOR with the sign mask), and
/// negation of an FMUL by a constant.
/// \returns the replacement value, or an empty SDValue if nothing applied.
SDValue DAGCombiner::visitFNEG(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// Constant fold FNEG.
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);
if (SDValue NegN0 =
TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize))
return NegN0;
// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
// FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't
// know it was called from a context with a nsz flag if the input fsub does
// not.
if (N0.getOpcode() == ISD::FSUB &&
(DAG.getTarget().Options.NoSignedZerosFPMath ||
N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) {
return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1),
N0.getOperand(0), N->getFlags());
// Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
// constant pool values.
if (!TLI.isFNegFree(VT) &&
N0.getOpcode() == ISD::BITCAST &&
N0.getNode()->hasOneUse()) {
SDValue Int = N0.getOperand(0);
EVT IntVT = Int.getValueType();
// Only a scalar integer source is handled; the FP side may still be a
// vector packed into that scalar.
if (IntVT.isInteger() && !IntVT.isVector()) {
APInt SignMask;
if (N0.getValueType().isVector()) {
// For a vector, get a mask such as 0x80... per scalar element
// and splat it.
SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits());
SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
} else {
// For a scalar, just generate 0x80...
SignMask = APInt::getSignMask(IntVT.getSizeInBits());
SDLoc DL0(N0);
Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int,
DAG.getConstant(SignMask, DL0, IntVT));
return DAG.getBitcast(VT, Int);
// (fneg (fmul c, x)) -> (fmul -c, x)
if (N0.getOpcode() == ISD::FMUL &&
(N0.getNode()->hasOneUse() || !TLI.isFNegFree(VT))) {
ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
if (CFP1) {
APFloat CVal = CFP1->getValueAPF();
// NOTE(review): as listed, CVal is checked for legality without being
// negated first, and the getNode call below ends in ',' -- lines appear to
// have been dropped here; confirm against the upstream file.
if (LegalDAG && (TLI.isFPImmLegal(CVal, VT, ForCodeSize) ||
TLI.isOperationLegal(ISD::ConstantFP, VT)))
return DAG.getNode(
ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)),
return SDValue();
/// Shared combine for the FP min/max node kinds.
///
/// \param DAG the selection DAG used to create replacement nodes.
/// \param N   the min/max node being combined.
/// \param Op  the APFloat function used to fold two constant operands.
/// \returns a folded constant when both operands are constants (or constant
/// splats), the node with operands swapped when canonicalizing a constant to
/// the right-hand side, or an empty SDValue otherwise.
static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N,
APFloat (*Op)(const APFloat &, const APFloat &)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
if (N0CFP && N1CFP) {
const APFloat &C0 = N0CFP->getValueAPF();
const APFloat &C1 = N1CFP->getValueAPF();
return DAG.getConstantFP(Op(C0, C1), SDLoc(N), VT);
// Canonicalize to constant on RHS.
// NOTE(review): the condition below ends in '&&' in this listing;
// presumably the dropped half requires N1 not to be a constant -- confirm
// against the upstream file.
if (isConstantFPBuildVectorOrConstantFP(N0) &&
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
return SDValue();
/// Combine an FMINNUM node via the shared helper, folding constants with
/// minnum.
SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
return visitFMinMax(DAG, N, minnum);
/// Combine an FMAXNUM node via the shared helper, folding constants with
/// maxnum.
SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
return visitFMinMax(DAG, N, maxnum);
/// Combine an FMINIMUM node via the shared helper, folding constants with
/// minimum.
SDValue DAGCombiner::visitFMINIMUM(SDNode *N) {
return visitFMinMax(DAG, N, minimum);
/// Combine an FMAXIMUM node via the shared helper, folding constants with
/// maximum.
SDValue DAGCombiner::visitFMAXIMUM(SDNode *N) {
return visitFMinMax(DAG, N, maximum);
/// Combine an FABS node.
///
/// Handles constant operands, nested FABS, operands whose only effect is on
/// the sign (FNEG, FCOPYSIGN), and the absolute value of a bitcast integer
/// (done as an integer AND with the inverted sign mask).
/// \returns the replacement value, or an empty SDValue if nothing applied.
SDValue DAGCombiner::visitFABS(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (fabs c1) -> fabs(c1)
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
// fold (fabs (fabs x)) -> (fabs x)
if (N0.getOpcode() == ISD::FABS)
return N->getOperand(0);
// fold (fabs (fneg x)) -> (fabs x)
// fold (fabs (fcopysign x, y)) -> (fabs x)
if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0));
// fabs(bitcast(x)) -> bitcast(x & ~sign) to avoid constant pool loads.
if (!TLI.isFAbsFree(VT) && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
SDValue Int = N0.getOperand(0);
EVT IntVT = Int.getValueType();
// Only a scalar integer source is handled; the FP side may still be a
// vector packed into that scalar.
if (IntVT.isInteger() && !IntVT.isVector()) {
APInt SignMask;
if (N0.getValueType().isVector()) {
// For a vector, get a mask such as 0x7f... per scalar element
// and splat it.
SignMask = ~APInt::getSignMask(N0.getScalarValueSizeInBits());
SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
} else {
// For a scalar, just generate 0x7f...
SignMask = ~APInt::getSignMask(IntVT.getSizeInBits());
SDLoc DL(N0);
Int = DAG.getNode(ISD::AND, DL, IntVT, Int,
DAG.getConstant(SignMask, DL, IntVT));
return DAG.getBitcast(N->getValueType(0), Int);
return SDValue();
/// Combine a BRCOND node.
///
/// Operands as read here: 0 = chain, 1 = condition, 2 = the operand that is
/// forwarded unchanged as the last operand of the replacement branch.
/// Either merges a SETCC condition into a BR_CC node, or rebuilds a
/// single-use condition into a SETCC via rebuildSetCC.
/// \returns the replacement branch, or an empty SDValue if nothing applied.
SDValue DAGCombiner::visitBRCOND(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
// If N is a constant we could fold this into a fallthrough or unconditional
// branch. However that doesn't happen very often in normal code, because
// Instcombine/SimplifyCFG should have handled the available opportunities.
// If we did this folding here, it would be necessary to update the
// MachineBasicBlock CFG, which is awkward.
// fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
// on the target.
// NOTE(review): the condition below is truncated in this listing (the call
// that consumes N1.getOperand(0).getValueType() is missing); presumably a
// BR_CC legality query -- confirm against the upstream file.
if (N1.getOpcode() == ISD::SETCC &&
N1.getOperand(0).getValueType())) {
return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
Chain, N1.getOperand(2),
N1.getOperand(0), N1.getOperand(1), N2);
if (N1.hasOneUse()) {
// rebuildSetCC calls visitXor which may change the Chain when there is a
// STRICT_FSETCC/STRICT_FSETCCS involved. Use a handle to track changes.
HandleSDNode ChainHandle(Chain);
if (SDValue NewN1 = rebuildSetCC(N1))
return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other,
ChainHandle.getValue(), NewN1, N2);
return SDValue();
/// Try to rewrite a branch condition into an equivalent SETCC.
///
/// Two shapes are recognised:
///  * (srl (and x, pow2), log2(pow2)), optionally behind a TRUNCATE, which
///    becomes a SETCC of the AND against zero;
///  * an XOR (optionally wrapped in a bitwise NOT), which becomes a SETCC of
///    its two operands.
/// \param N the condition value.
/// \returns the new SETCC (or a simplified non-XOR value), or an empty
/// SDValue when no rewrite applies.
SDValue DAGCombiner::rebuildSetCC(SDValue N) {
if (N.getOpcode() == ISD::SRL ||
(N.getOpcode() == ISD::TRUNCATE &&
(N.getOperand(0).hasOneUse() &&
N.getOperand(0).getOpcode() == ISD::SRL))) {
// Look past the truncate.
if (N.getOpcode() == ISD::TRUNCATE)
N = N.getOperand(0);
// Match this pattern so that we can generate simpler code:
// %a = ...
// %b = and i32 %a, 2
// %c = srl i32 %b, 1
// brcond i32 %c ...
// into
// %a = ...
// %b = and i32 %a, 2
// %c = setcc eq %b, 0
// brcond %c ...
// This applies only when the AND constant value has one bit set and the
// SRL constant is equal to the log2 of the AND constant. The back-end is
// smart enough to convert the result into a TEST/JMP sequence.
SDValue Op0 = N.getOperand(0);
SDValue Op1 = N.getOperand(1);
if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::Constant) {
SDValue AndOp1 = Op0.getOperand(1);
if (AndOp1.getOpcode() == ISD::Constant) {
const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();
if (AndConst.isPowerOf2() &&
cast<ConstantSDNode>(Op1)->getAPIntValue() == AndConst.logBase2()) {
SDLoc DL(N);
// NOTE(review): the getSetCC call below is truncated in this listing (the
// condition-code argument is missing) -- confirm against the upstream file.
return DAG.getSetCC(DL, getSetCCResultType(Op0.getValueType()),
Op0, DAG.getConstant(0, DL, Op0.getValueType()),
// Transform (brcond (xor x, y)) -> (brcond (setcc, x, y, ne))
// Transform (brcond (xor (xor x, y), -1)) -> (brcond (setcc, x, y, eq))
if (N.getOpcode() == ISD::XOR) {
// Because we may call this on a speculatively constructed
// SimplifiedSetCC Node, we need to simplify this node first.
// Ideally this should be folded into SimplifySetCC and not
// here. For now, grab a handle to N so we don't lose it from
// replacements internal to the visit.
HandleSDNode XORHandle(N);
while (N.getOpcode() == ISD::XOR) {
SDValue Tmp = visitXOR(N.getNode());
// No simplification done.
// NOTE(review): the statement controlled by this 'if' is not visible in
// this listing; presumably it leaves the loop -- confirm.
if (!Tmp.getNode())
// Returning N is a form of in-visit replacement that may have invalidated
// N. Grab the value from the handle.
if (Tmp.getNode() == N.getNode())
N = XORHandle.getValue();
else // Node simplified. Try simplifying again.
N = Tmp;
// The simplification turned the XOR into something else; hand that back.
if (N.getOpcode() != ISD::XOR)
return N;
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
bool Equal = false;
// (brcond (xor (xor x, y), -1)) -> (brcond (setcc x, y, eq))
if (isBitwiseNot(N) && Op0.hasOneUse() && Op0.getOpcode() == ISD::XOR &&
Op0.getValueType() == MVT::i1) {
N = Op0;
Op0 = N->getOperand(0);
Op1 = N->getOperand(1);
Equal = true;
EVT SetCCVT = N.getValueType();
if (LegalTypes)
SetCCVT = getSetCCResultType(SetCCVT);
// Replace the uses of XOR with SETCC
// NOTE(review): the getSetCC call below is truncated in this listing;
// presumably the condition code is chosen from 'Equal' -- confirm.
return DAG.getSetCC(SDLoc(N), SetCCVT, Op0, Op1,
return SDValue();
// Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB.
/// Combine a BR_CC node by running its comparison through SimplifySetCC and,
/// when the result is still a SETCC, rebuilding the BR_CC around the
/// simplified comparison.
/// \returns the replacement BR_CC, or an empty SDValue if nothing applied.
SDValue DAGCombiner::visitBR_CC(SDNode *N) {
CondCodeSDNode *CC = cast<CondCodeSDNode>(N->getOperand(1));
SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3);
// If N is a constant we could fold this into a fallthrough or unconditional
// branch. However that doesn't happen very often in normal code, because
// Instcombine/SimplifyCFG should have handled the available opportunities.
// If we did this folding here, it would be necessary to update the
// MachineBasicBlock CFG, which is awkward.
// Use SimplifySetCC to simplify SETCC's.
// NOTE(review): the SimplifySetCC call below ends in ',' in this listing;
// its final argument is on a dropped line -- confirm against upstream.
SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()),
CondLHS, CondRHS, CC->get(), SDLoc(N),
if (Simp.getNode()) AddToWorklist(Simp.getNode());
// fold to a simpler setcc
// NOTE(review): the getNode call below also ends in ','; presumably the
// destination block (operand 4) follows -- confirm.
if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC)
return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
N->getOperand(0), Simp.getOperand(2),
Simp.getOperand(0), Simp.getOperand(1),
return SDValue();
/// Return true if 'Use' is a load or a store that uses N as its base pointer
/// and that N may be folded in the load / store addressing mode.
///
/// \param N   the address computation; only ISD::ADD and ISD::SUB are
///            considered, anything else yields false.
/// \param Use the candidate memory operation (plain or masked load/store).
///            It must be unindexed and use N as its base pointer.
/// \param DAG supplies the data layout and LLVM context for the query.
/// \param TLI answers whether the resulting addressing mode is legal.
static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
SelectionDAG &DAG,
const TargetLowering &TLI) {
// NOTE(review): VT is assigned below but its declaration is not visible in
// this listing -- confirm against the upstream file.
unsigned AS;
// Collect the memory type and address space from whichever kind of memory
// node 'Use' is.
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
return false;
VT = LD->getMemoryVT();
AS = LD->getAddressSpace();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
return false;
VT = ST->getMemoryVT();
AS = ST->getAddressSpace();
} else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) {
if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
return false;
VT = LD->getMemoryVT();
AS = LD->getAddressSpace();
} else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) {
if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
return false;
VT = ST->getMemoryVT();
AS = ST->getAddressSpace();
} else
return false;
// Describe N as an addressing mode: a base register plus either an
// immediate offset or a second register.
// NOTE(review): in both branches below the '[reg +/- reg]' case presumably
// sits under an 'else' that is not visible in this listing -- confirm.
TargetLowering::AddrMode AM;
if (N->getOpcode() == ISD::ADD) {
AM.HasBaseReg = true;
ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (Offset)
// [reg +/- imm]
AM.BaseOffs = Offset->getSExtValue();
// [reg +/- reg]
AM.Scale = 1;
} else if (N->getOpcode() == ISD::SUB) {
AM.HasBaseReg = true;
ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (Offset)
// [reg +/- imm]
AM.BaseOffs = -Offset->getSExtValue();
// [reg +/- reg]
AM.Scale = 1;
} else
return false;
return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
VT.getTypeForEVT(*DAG.getContext()), AS);
/// Classify a memory node as a candidate for indexed load/store formation
/// and extract its base pointer.
///
/// \param N        the node to inspect (plain or masked load/store).
/// \param Inc      the incrementing indexed mode to test for legality.
/// \param Dec      the decrementing indexed mode to test for legality.
/// \param IsLoad   set to false when N is a (masked) store; not written for
///                 loads, so the caller must initialise it.
/// \param IsMasked set to true when N is a masked load/store; not written
///                 otherwise, so the caller must initialise it.
/// \param Ptr      receives the base pointer of N on success.
/// \param TLI      queried for indexed load/store legality on the memory VT.
/// \returns false if N is not one of the four node kinds, is already
/// indexed, or neither Inc nor Dec is legal for its memory type; true
/// otherwise.
static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec,
bool &IsLoad, bool &IsMasked, SDValue &Ptr,
const TargetLowering &TLI) {
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
if (LD->isIndexed())
return false;
EVT VT = LD->getMemoryVT();
if (!TLI.isIndexedLoadLegal(Inc, VT) && !TLI.isIndexedLoadLegal(Dec, VT))
return false;
Ptr = LD->getBasePtr();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
if (ST->isIndexed())
return false;
EVT VT = ST->getMemoryVT();
if (!TLI.isIndexedStoreLegal(Inc, VT) && !TLI.isIndexedStoreLegal(Dec, VT))
return false;
Ptr = ST->getBasePtr();
IsLoad = false;
} else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
if (LD->isIndexed())
return false;
EVT VT = LD->getMemoryVT();
if (!TLI.isIndexedMaskedLoadLegal(Inc, VT) &&
!TLI.isIndexedMaskedLoadLegal(Dec, VT))
return false;
Ptr = LD->getBasePtr();
IsMasked = true;
} else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
if (ST->isIndexed())
return false;
EVT VT = ST->getMemoryVT();
if (!TLI.isIndexedMaskedStoreLegal(Inc, VT) &&
!TLI.isIndexedMaskedStoreLegal(Dec, VT))
return false;
Ptr = ST->getBasePtr();
IsLoad = false;
IsMasked = true;
} else {
return false;
return true;
/// Try turning a load/store into a pre-indexed load/store when the base
/// pointer is an add or subtract and it has other uses besides the load/store.
/// After the transformation, the new indexed load/store has effectively folded
/// the add/subtract in and all of its other uses are redirected to the
/// new load/store.
///
/// \param N The memory node to transform; getCombineLoadStoreParts decides
///          whether it is a supported plain or masked load/store.
/// \returns true if \p N was replaced by an indexed node, false if one of the
///          checks below rejected the transformation.
///
/// NOTE(review): in this copy of the file several statements are visibly
/// incomplete (conditions with no body of their own, unbalanced braces,
/// initializers cut short). Compare against the upstream DAGCombiner.cpp
/// before relying on the exact control flow shown here.
bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
// Only run once the combiner has reached the AfterLegalizeDAG level.
if (Level < AfterLegalizeDAG)
return false;
// Defaults for a plain load; getCombineLoadStoreParts clears/sets these for
// stores and masked nodes.
bool IsLoad = true;
bool IsMasked = false;
SDValue Ptr;
if (!getCombineLoadStoreParts(N, ISD::PRE_INC, ISD::PRE_DEC, IsLoad, IsMasked,
Ptr, TLI))
return false;
// If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
// out. There is no reason to make this a preinc/predec.
// NOTE(review): the right-hand operand of the `||` below is not present in
// this copy (presumably the use-count test described above) — confirm.
if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
return false;
// Ask the target to do addressing mode selection.
SDValue BasePtr;
SDValue Offset;
ISD::MemIndexedMode AM = ISD::UNINDEXED;
if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
return false;
// Backends without true r+i pre-indexed forms may need to pass a
// constant base with a variable offset so that constant coercion
// will work with the patterns in canonical form.
// Swapped records that BasePtr/Offset were exchanged so the exchange can be
// undone later in this function.
bool Swapped = false;
if (isa<ConstantSDNode>(BasePtr)) {
std::swap(BasePtr, Offset);
Swapped = true;
// Don't create an indexed load / store with zero offset.
if (isNullConstant(Offset))
return false;
// Try turning it into a pre-indexed load / store except when:
// 1) The new base ptr is a frame index.
// 2) If N is a store and the new base ptr is either the same as or is a
// predecessor of the value being stored.
// 3) Another use of old base ptr is a predecessor of N. If ptr is folded
// that would create a cycle.
// 4) All uses are load / store ops that use it as old base ptr.
// Check #1. Preinc'ing a frame index would require copying the stack pointer
// (plus the implicit offset) to a register to preinc anyway.
// A RegisterSDNode base is rejected by the same test.
if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
return false;
// Check #2.
if (!IsLoad) {
// The stored value comes from the masked or the plain store node,
// depending on what getCombineLoadStoreParts reported.
SDValue Val = IsMasked ? cast<MaskedStoreSDNode>(N)->getValue()
: cast<StoreSDNode>(N)->getValue();
// Would require a copy.
if (Val == BasePtr)
return false;
// Would create a cycle.
if (Val == Ptr || Ptr->isPredecessorOf(Val.getNode()))
return false;
// Caches for hasPredecessorHelper.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
// If the offset is a constant, there may be other adds of constants that
// can be folded with this one. We should do this to avoid having to keep
// a copy of the original base pointer.
SmallVector<SDNode *, 16> OtherUses;
if (isa<ConstantSDNode>(Offset))
for (SDNode::use_iterator UI = BasePtr.getNode()->use_begin(),
UE = BasePtr.getNode()->use_end();
UI != UE; ++UI) {
SDUse &Use = UI.getUse();
// Skip the use that is Ptr and uses of other results from BasePtr's
// node (important for nodes that return multiple results).
// NOTE(review): the next several conditions have no visible bodies of
// their own, and nothing visible appends to OtherUses; the statements they
// guarded appear to be missing — confirm.
if (Use.getUser() == Ptr.getNode() || Use != BasePtr)
if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist))
if (Use.getUser()->getOpcode() != ISD::ADD &&
Use.getUser()->getOpcode() != ISD::SUB) {
// Op1 is the operand of the binary user other than the one at
// UI.getOperandNo().
SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1);
if (!isa<ConstantSDNode>(Op1)) {
// FIXME: In some cases, we can be smarter about this.
if (Op1.getValueType() != Offset.getValueType()) {
// Undo the earlier BasePtr/Offset exchange.
if (Swapped)
std::swap(BasePtr, Offset);
// Now check for #3 and #4.
// RealUse becomes true once some use of Ptr cannot fold Ptr into its own
// addressing mode.
bool RealUse = false;
for (SDNode *Use : Ptr.getNode()->uses()) {
// NOTE(review): `if (Use == N)` has no visible body of its own — confirm.
if (Use == N)
if (SDNode::hasPredecessorHelper(Use, Visited, Worklist))
return false;
// If Ptr may be folded in addressing mode of other use, then it's
// not profitable to do this transformation.
if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI))
RealUse = true;
if (!RealUse)
return false;
// Build the indexed replacement node.
// NOTE(review): as shown, the store-creating assignments are not guarded
// by an `else`, so they would overwrite the load result — confirm.
SDValue Result;
if (!IsMasked) {
if (IsLoad)
Result = DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
Result =
DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
} else {
if (IsLoad)
Result = DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
Offset, AM);
Result = DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr,
Offset, AM);
LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: ";
Result.getNode()->dump(&DAG); dbgs() << '\n');
WorklistRemover DeadNodes(*this);
// For loads, results 0 and 1 of N are replaced by results 0 and 2 of the
// new node; for stores, result 0 of N is replaced by result 1. The remaining
// result (1 for loads, 0 for stores) is used further down as the updated
// base value.
if (IsLoad) {
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
} else {
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
// Finally, since the node is now dead, remove it from the graph.
// NOTE(review): no removal statement follows the comment above in this
// copy — confirm.
if (Swapped)
std::swap(BasePtr, Offset);
// Replace other uses of BasePtr that can be updated to use Ptr
for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) {
// Find which operand of the other use is the constant offset; the other
// operand must be BasePtr.
unsigned OffsetIdx = 1;
if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
OffsetIdx = 0;
assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() ==
BasePtr.getNode() && "Expected BasePtr operand");
// We need to replace ptr0 in the following expression:
// x0 * offset0 + y0 * ptr0 = t0
// knowing that
// x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store)
// where x0, x1, y0 and y1 in {-1, 1} are given by the types of the
// indexed load/store and the expression that needs to be re-written.
// Therefore, we have:
// t0 = (x0 * offset0 - x1 * y0 * y1 *offset1) + (y0 * y1) * t1
// NOTE(review): the initializer of CN is cut off in this copy — confirm.
ConstantSDNode *CN =
int X0, X1, Y0, Y1;
const APInt &Offset0 = CN->getAPIntValue();
APInt Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue();
// X0/Y0: signs of the offset and pointer operands in the other use (a SUB
// negates its second operand). X1/Y1: the same for the indexed node, where
// PRE_DEC negates the offset, or the pointer when operands were swapped.
X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1;
X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1;
Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1;
unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD;
// CNV = x0 * offset0 - x1 * y0 * y1 * offset1, per the derivation above.
APInt CNV = Offset0;
if (X0 < 0) CNV = -CNV;
if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1;
else CNV = CNV - Offset1;
SDLoc DL(OtherUses[i]);
// We can now generate the new expression.
SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
SDValue NewOp2 = Result.getValue(IsLoad ? 1 : 0);
SDValue NewUse = DAG.getNode(Opcode,
OtherUses[i]->getValueType(0), NewOp1, NewOp2);
DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
// Replace the uses of Ptr with uses of the updated base value.
DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(IsLoad ? 1 : 0));
return true;
/// Decide whether \p PtrUse, an add/sub user of \p Ptr, should be folded into
/// the memory node \p N as a post-indexed access.
///
/// \param N       The load/store being considered.
/// \param Ptr     Base pointer of \p N.
/// \param PtrUse  A user of \p Ptr; must be an ISD::ADD or ISD::SUB other
///                than \p N itself.
/// \param BasePtr,Offset,AM Passed to TLI.getPostIndexedAddressParts, which
///                is asked to describe the post-indexed form.
/// \returns false if any check below rejects the fold, true otherwise.
///
/// NOTE(review): some `if` statements below have no visible body of their own
/// and the braces do not balance in this copy; compare against upstream.
static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse,
SDValue &BasePtr, SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG,
const TargetLowering &TLI) {
// Only a distinct add/sub of the pointer can supply the increment.
if (PtrUse == N ||
(PtrUse->getOpcode() != ISD::ADD && PtrUse->getOpcode() != ISD::SUB))
return false;
// Let the target decide whether a post-indexed form exists.
if (!TLI.getPostIndexedAddressParts(N, PtrUse, BasePtr, Offset, AM, DAG))
return false;
// Don't create an indexed load / store with zero offset.
if (isNullConstant(Offset))
return false;
// Frame-index and register bases are rejected.
if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
return false;
// Visited is declared outside the loop and handed to every
// hasPredecessorHelper call below.
SmallPtrSet<const SDNode *, 32> Visited;
for (SDNode *Use : BasePtr.getNode()->uses()) {
// NOTE(review): this condition has no visible body of its own — confirm.
if (Use == Ptr.getNode())
// No if there's a later user which could perform the index instead.
if (isa<MemSDNode>(Use)) {
bool IsLoad = true;
bool IsMasked = false;
SDValue OtherPtr;
if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad,
IsMasked, OtherPtr, TLI)) {
SmallVector<const SDNode *, 2> Worklist;
if (SDNode::hasPredecessorHelper(N, Visited, Worklist))
return false;
// If all the uses are load / store addresses, then don't do the
// transformation.
if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {
for (SDNode *UseUse : Use->uses())
if (canFoldInAddressingMode(Use, UseUse, DAG, TLI))
return false;
return true;
/// Look for a user of \p N's base pointer that can be folded into \p N as a
/// post-increment/decrement.
///
/// \param N        The load/store being considered.
/// \param IsLoad,IsMasked,Ptr  Passed to getCombineLoadStoreParts, which
///                 classifies \p N and extracts its base pointer.
/// \param BasePtr,Offset,AM    Passed to shouldCombineToPostInc for each
///                 candidate user.
/// \returns the add/sub node to fold, or nullptr if none qualifies.
///
/// NOTE(review): the first condition ends in a dangling `||`, and the
/// `if (!shouldCombineToPostInc(...))` below has no visible body of its own —
/// compare against upstream.
static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad,
bool &IsMasked, SDValue &Ptr,
SDValue &BasePtr, SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG,
const TargetLowering &TLI) {
if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad,
IsMasked, Ptr, TLI) ||
return nullptr;
// Try turning it into a post-indexed load / store except when
// 1) All uses are load / store ops that use it as base ptr (and
// it may be folded as addressing mode).
// 2) Op must be independent of N, i.e. Op is neither a predecessor
// nor a successor of N. Otherwise, if Op is folded that would
// create a cycle.
for (SDNode *Op : Ptr->uses()) {
// Check for #1.
if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI))
// Check for #2.
// Fresh search state is created for each candidate Op.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 8> Worklist;
// Ptr is predecessor to both N and Op.
if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) &&
!SDNode::hasPredecessorHelper(Op, Visited, Worklist))
return Op;
return nullptr;
/// Try to combine a load/store with an add/sub of the base pointer node into a
/// post-indexed load/store. The transformation effectively folds the
/// add/subtract into the new indexed load/store and all of its uses are
/// redirected to the new load/store.
///
/// \param N The memory node to transform.
/// \returns true if \p N was replaced by a post-indexed node, false if the
///          level check failed or getPostIndexedLoadStoreOp found nothing.
///
/// NOTE(review): as shown, the masked assignment to Result is not guarded by
/// an `else`, and no statement follows the "remove it from the graph"
/// comment; both look truncated — compare against upstream.
bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
// Only run once the combiner has reached the AfterLegalizeDAG level.
if (Level < AfterLegalizeDAG)
return false;
// Defaults for a plain load; the helper below updates them for stores and
// masked nodes.
bool IsLoad = true;
bool IsMasked = false;
SDValue Ptr;
SDValue BasePtr;
SDValue Offset;
ISD::MemIndexedMode AM = ISD::UNINDEXED;
// Op is the add/sub of the pointer that will be folded into N.
SDNode *Op = getPostIndexedLoadStoreOp(N, IsLoad, IsMasked, Ptr, BasePtr,
Offset, AM, DAG, TLI);
if (!Op)
return false;
// Build the indexed replacement node matching N's kind.
SDValue Result;
if (!IsMasked)
Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
Offset, AM)
: DAG.getIndexedStore(SDValue(N, 0), SDLoc(N),
BasePtr, Offset, AM);
Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N),
BasePtr, Offset, AM)
: DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N),
BasePtr, Offset, AM);
LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG);
dbgs() << "\nWith: "; Result.getNode()->dump(&DAG);
dbgs() << '\n');
WorklistRemover DeadNodes(*this);
// For loads, results 0 and 1 of N are replaced by results 0 and 2 of the
// new node; for stores, result 0 of N is replaced by result 1.
if (IsLoad) {
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
} else {
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
// Finally, since the node is now dead, remove it from the graph.
// Replace the uses of Op with uses of the updated base value
// (result 1 for loads, result 0 for stores).
DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
Result.getValue(IsLoad ? 1 : 0));
return true;
/// Return the base-pointer arithmetic from an indexed \p LD.
///
/// \param LD An indexed load (asserted below); operand 1 is taken as the base
///           pointer and operand 2 as the increment.
/// \returns a new node of opcode Opc combining the base pointer and the
///          increment, typed as the base pointer's simple value type.
///
/// NOTE(review): the DAG.getConstant call and the initializer of Opc are cut
/// off in this copy — compare against upstream.
SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
ISD::MemIndexedMode AM = LD->getAddressingMode();
assert(AM != ISD::UNINDEXED);
SDValue BP = LD->getOperand(1);
SDValue Inc = LD->getOperand(2);
// Some backends use TargetConstants for load offsets, but don't expect
// TargetConstants in general ADD nodes. We can convert these constants into
// regular Constants (if the constant is not opaque).
assert((Inc.getOpcode() != ISD::TargetConstant ||
!cast<ConstantSDNode>(Inc)->isOpaque()) &&
"Cannot split out indexing using opaque target constants");
if (Inc.getOpcode() == ISD::TargetConstant) {
ConstantSDNode *ConstInc = cast<ConstantSDNode>(Inc);
Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), SDLoc(Inc),
unsigned Opc =
return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
/// Number of elements of \p T when it is a vector type, 0 otherwise.
static inline int numVectorEltsOrZero(EVT T) {
  if (!T.isVector())
    return 0;
  return T.getVectorNumElements();
}
/// Compute the value \p ST actually writes to memory, i.e. the stored operand
/// converted to the store's memory type.
///
/// \param ST  The store to inspect.
/// \param Val Always set to the store's value operand first; on success it
///            holds that value converted to the memory type.
/// \returns true if \p Val now has the memory type, false if no conversion
///          was possible (including when the memory type is legal but differs
///          from the value type).
bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
  Val = ST->getValue();
  EVT ValueVT = Val.getValueType();
  EVT MemVT = ST->getMemoryVT();

  // Non-truncating store: the operand is already what ends up in memory.
  if (ValueVT == MemVT)
    return true;

  // A differing but legal memory type is treated as failure.
  if (isTypeLegal(MemVT))
    return false; // fail.

  // Rewrites Val through a single node of the given opcode producing MemVT.
  auto ConvertWith = [&](unsigned Opcode) {
    Val = DAG.getNode(Opcode, SDLoc(ST), MemVT, Val);
    return true;
  };

  // Floating point to floating point.
  // NOTE(review): ISD::FTRUNC is normally the "round toward zero to an
  // integral value" node rather than a precision-narrowing one (FP_ROUND) —
  // confirm this is the intended opcode.
  if (ValueVT.isFloatingPoint() && MemVT.isFloatingPoint() &&
      TLI.isOperationLegal(ISD::FTRUNC, MemVT))
    return ConvertWith(ISD::FTRUNC);

  // Integer to integer with matching element counts (0 stands for scalars).
  if (numVectorEltsOrZero(ValueVT) == numVectorEltsOrZero(MemVT) &&
      ValueVT.isInteger() && MemVT.isInteger())
    return ConvertWith(ISD::TRUNCATE);

  // Same total width: reinterpret the bits.
  if (ValueVT.getSizeInBits() == MemVT.getSizeInBits()) {
    Val = DAG.getBitcast(MemVT, Val);
    return true;
  }

  return false; // fail.
}
/// Convert \p Val from the memory type of \p LD to the type \p LD produces.
///
/// \param LD  The load whose memory type, result type and extension kind are
///            consulted.
/// \param Val In: a value of LD's memory type (asserted). Out: on success,
///            the same value converted to LD's result type.
/// \returns true if the types already match or a conversion node was built,
///          false otherwise.
///
/// NOTE(review): the `case` labels of the switch are not present in this
/// copy, so which conversion belongs to which extension kind cannot be read
/// off here — compare against upstream.
bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) {
EVT LDMemType = LD->getMemoryVT();
EVT LDType = LD->getValueType(0);
assert(Val.getValueType() == LDMemType &&
"Attempting to extend value of non-matching type");
// Non-extending load: nothing to do.
if (LDType == LDMemType)
return true;
// Only integer-to-integer conversions are handled.
if (LDMemType.isInteger() && LDType.isInteger()) {
switch (LD->getExtensionType()) {
Val = DAG.getBitcast(LDType, Val);
return true;
Val = DAG.getNode(ISD::ANY_EXTEND, SDLoc(LD), LDType, Val);
return true;
Val = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(LD), LDType, Val);
return true;
Val = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(LD), LDType, Val);
return true;
return false;
/// If the chain operand of \p LD is a simple store to the same base/index,
/// try to replace the load with the stored value.
///
/// \param LD The load to examine.
/// \returns the result of CombineTo when forwarding succeeded, otherwise an
///          empty SDValue.
///
/// NOTE(review): parts of this function are truncated in this copy (the
/// closing of the ReplaceLd lambda, the initializer of Mask, and the bodies of
/// the conditions inside the do/while block) — compare against upstream.
SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
// Disabled at -O0 and for loads that are not simple.
if (OptLevel == CodeGenOpt::None || !LD->isSimple())
return SDValue();
// Only a store that is the load's direct chain predecessor is considered.
SDValue Chain = LD->getOperand(0);
StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode());
// TODO: Relax this restriction for unordered atomics (see D66309)
if (!ST || !ST->isSimple())
return SDValue();
EVT LDType = LD->getValueType(0);
EVT LDMemType = LD->getMemoryVT();
EVT STMemType = ST->getMemoryVT();
EVT STType = ST->getValue().getValueType();
// Both accesses must share base and index; Offset receives the distance
// reported by equalBaseIndex.
BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
int64_t Offset;
if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset))
return SDValue();
// Normalize for Endianness. After this Offset=0 will denote that the least
// significant bit in the loaded value maps to the least significant bit in
// the stored value. With Offset=n (for n > 0) the loaded value starts at the
// n:th least significant byte of the stored value.
if (DAG.getDataLayout().isBigEndian())
Offset = ((int64_t)STMemType.getStoreSizeInBits() -
(int64_t)LDMemType.getStoreSizeInBits()) / 8 - Offset;
// Check that the stored value covers all bits that are loaded.
bool STCoversLD =
(Offset >= 0) &&
(Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());
// Replaces LD with Val/Chain. For indexed loads the index computation is
// split out first and supplied as the middle replacement value.
auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
if (LD->isIndexed()) {
// Cannot handle opaque target constants and we must respect the user's
// request not to split indexes from loads.
if (!canSplitIdx(LD))
return SDValue();
SDValue Idx = SplitIndexingFromLoad(LD);
SDValue Ops[] = {Val, Idx, Chain};
return CombineTo(LD, Ops, 3);
return CombineTo(LD, Val, Chain);
if (!STCoversLD)
return SDValue();
// Memory as copy space (potentially masked).
if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
// Simple case: Direct non-truncating forwarding
if (LDType.getSizeInBits() == LDMemType.getSizeInBits())
return ReplaceLd(LD, ST->getValue(), Chain);
// Can we model the truncate and extension with an and mask?
// Sign-extending loads are excluded here.
if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
!LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
// Mask to size of LDMemType
auto Mask =
SDLoc(ST), STType);
auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
return ReplaceLd(LD, Val, Chain);
// TODO: Deal with nonzero offset.
if (LD->getBasePtr().isUndef() || Offset != 0)
return SDValue();
// Model necessary truncations / extensions.
SDValue Val;
// Truncate Value To Stored Memory Size.
// The do { ... } while (false) block runs at most once.
do {
if (!getTruncatedStoreValue(ST, Val))
if (!isTypeLegal(LDMemType))
if (STMemType != LDMemType) {
// TODO: Support vectors? This requires extract_subvector/bitcast.
if (!STMemType.isVector() && !LDMemType.isVector() &&
STMemType.isInteger() && LDMemType.isInteger())
Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val);
if (!extendLoadedValueToExtension(LD, Val))
return ReplaceLd(LD, Val, Chain);
} while (false);
// On failure, cleanup dead nodes we may have created.
// NOTE(review): no cleanup call is visible after this condition — confirm.
if (Val->use_empty())
return SDValue();
/// Combine entry point for load nodes.
///
/// In order, the visible code tries: deleting a simple load whose results are
/// unused, forwarding a directly preceding store's value, refining the
/// alignment, re-chaining onto a better chain, turning the load into a
/// pre/post-indexed load, and slicing the load.
///
/// \param N The load node (cast to LoadSDNode).
/// \returns SDValue(N, 0) when N was handled in place, another value when N
///          was replaced through a helper, or an empty SDValue when nothing
///          changed.
///
/// NOTE(review): braces do not balance and a few statements are cut off in
/// this copy (e.g. the DAG.getExtLoad call in the better-chain path) — compare
/// against upstream.
SDValue DAGCombiner::visitLOAD(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
// If load is not volatile and there are no uses of the loaded value (and
// the updated indexed value in case of indexed loads), change uses of the
// chain value into uses of the chain input (i.e. delete the dead load).
// TODO: Allow this for unordered atomics (see D66309)
if (LD->isSimple()) {
// Result 1 being the chain (MVT::Other) identifies an unindexed load.
if (N->getValueType(1) == MVT::Other) {
// Unindexed loads.
if (!N->hasAnyUseOfValue(0)) {
// It's not safe to use the two value CombineTo variant here. e.g.
// v1, chain2 = load chain1, loc
// v2, chain3 = load chain2, loc
// v3 = add v2, c
// Now we replace use of chain2 with chain1. This makes the second load
// isomorphic to the one we are deleting, and thus makes this load live.
LLVM_DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG);
dbgs() << "\nWith chain: "; Chain.getNode()->dump(&DAG);
dbgs() << "\n");
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
if (N->use_empty())
return SDValue(N, 0); // Return N so it doesn't get rechecked!
} else {
// Indexed loads.
assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");
// If this load has an opaque TargetConstant offset, then we cannot split
// the indexing into an add/sub directly (that TargetConstant may not be
// valid for a different type of node, and we cannot convert an opaque
// target constant into a regular constant).
bool CanSplitIdx = canSplitIdx(LD);
// The loaded value must be unused, and the updated pointer must either be
// unused or be recomputable by splitting the index out.
if (!N->hasAnyUseOfValue(0) && (CanSplitIdx || !N->hasAnyUseOfValue(1))) {
SDValue Undef = DAG.getUNDEF(N->getValueType(0));
SDValue Index;
if (N->hasAnyUseOfValue(1) && CanSplitIdx) {
Index = SplitIndexingFromLoad(LD);
// Try to fold the base pointer arithmetic into subsequent loads and
// stores.
} else
Index = DAG.getUNDEF(N->getValueType(1));
LLVM_DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG);
dbgs() << "\nWith: "; Undef.getNode()->dump(&DAG);
dbgs() << " and 2 other values\n");
WorklistRemover DeadNodes(*this);
// Loaded value -> undef, updated pointer -> Index, chain -> incoming chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
return SDValue(N, 0); // Return N so it doesn't get rechecked!
// If this load is directly stored, replace the load value with the stored
// value.
if (auto V = ForwardStoreValueToDirectLoad(LD))
return V;
// Try to infer better alignment information than the load already has.
if (OptLevel != CodeGenOpt::None && LD->isUnindexed() && !LD->isAtomic()) {
if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
// Only use the inferred alignment when it is stricter than the current
// one and the load's source value offset is a multiple of it.
if (*Alignment > LD->getAlign() &&
isAligned(*Alignment, LD->getSrcValueOffset())) {
SDValue NewLoad = DAG.getExtLoad(
LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr,
LD->getPointerInfo(), LD->getMemoryVT(), *Alignment,
LD->getMemOperand()->getFlags(), LD->getAAInfo());
// NewLoad will always be N as we are only refining the alignment
assert(NewLoad.getNode() == N);
if (LD->isUnindexed()) {
// Walk up chain skipping non-aliasing memory nodes.
SDValue BetterChain = FindBetterChain(LD, Chain);
// If there is a better chain.
if (Chain != BetterChain) {
SDValue ReplLoad;
// Replace the chain to avoid dependency.
if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD),
BetterChain, Ptr, LD->getMemOperand());
} else {
ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD),
BetterChain, Ptr, LD->getMemoryVT(),
// Create token factor to keep old chain connected.
SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N),
MVT::Other, Chain, ReplLoad.getValue(1));
// Replace uses with load result and token factor
return CombineTo(N, ReplLoad.getValue(0), Token);
// Try transforming N to an indexed load.
if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
return SDValue(N, 0);
// Try to slice up N to more direct loads if the slices are mapped to
// different register banks or pairing can take place.
if (SliceUpLoad(N))
return SDValue(N, 0);
return SDValue();
namespace {
/// Helper structure used to slice a load in smaller loads.
/// Basically a slice is obtained from the following sequence:
/// Origin = load Ty1, Base
/// Shift = srl Ty1 Origin, CstTy Amount
/// Inst = trunc Shift to Ty2
/// Then, it will be rewritten into:
/// Slice = load SliceTy, Base + SliceOffset
/// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
/// SliceTy is deduced from the number of bits that are actually used to
/// build Inst.
struct LoadedSlice {
/// Helper structure used to compute the cost of a slice.
///
/// A Cost is a set of operation counters plus the ForCodeSize flag that
/// selects how two costs are ordered (see operator<).
///
/// NOTE(review): the bodies of the conditions in addSliceGain are not present
/// in this copy, and braces do not balance — compare against upstream.
struct Cost {
/// Are we optimizing for code size.
bool ForCodeSize = false;
/// Operation counters; all start at zero.
unsigned Loads = 0;
unsigned Truncates = 0;
unsigned CrossRegisterBanksCopies = 0;
unsigned ZExts = 0;
unsigned Shift = 0;
/// Empty cost with the given ordering policy.
explicit Cost(bool ForCodeSize) : ForCodeSize(ForCodeSize) {}
/// Get the cost of one isolated slice: one load, plus one zero extension
/// when the loaded type differs from the slice's result type and the target
/// does not report that extension as free.
Cost(const LoadedSlice &LS, bool ForCodeSize)
: ForCodeSize(ForCodeSize), Loads(1) {
EVT TruncType = LS.Inst->getValueType(0);
EVT LoadedType = LS.getLoadedType();
if (TruncType != LoadedType &&
!LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
ZExts = 1;
/// Account for slicing gain in the current cost.
/// Slicing provides a few gains like removing a shift or a
/// truncate. This method allows to grow the cost of the original
/// load with the gain from this slice.
void addSliceGain(const LoadedSlice &LS) {
// Each slice saves a truncate.
const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(),
// If there is a shift amount, this slice gets rid of it.
if (LS.Shift)
// If this slice can merge a cross register bank copy, account for it.
if (LS.canMergeExpensiveCrossRegisterBankCopy())
/// Component-wise accumulation of the counters; ForCodeSize is left as is.
Cost &operator+=(const Cost &RHS) {
Loads += RHS.Loads;
Truncates += RHS.Truncates;
CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
ZExts += RHS.ZExts;
Shift += RHS.Shift;
return *this;
/// Two costs are equal when all five counters match; ForCodeSize is not
/// compared.
bool operator==(const Cost &RHS) const {
return Loads == RHS.Loads && Truncates == RHS.Truncates &&
CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
ZExts == RHS.ZExts && Shift == RHS.Shift;
bool operator!=(const Cost &RHS) const { return !(*this == RHS); }
/// Ordering: unless this cost's ForCodeSize is set, the number of expensive
/// operations (loads + cross-register-bank copies) decides first; otherwise,
/// or on a tie, the sum of all counters decides.
bool operator<(const Cost &RHS) const {
// Assume cross register banks copies are as expensive as loads.
// FIXME: Do we want some more target hooks?
unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
// Unless we are optimizing for code size, consider the
// expensive operation first.
if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
return ExpensiveOpsLHS < ExpensiveOpsRHS;
return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
(RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
// The remaining comparisons are all derived from operator<.
bool operator>(const Cost &RHS) const { return RHS < *this; }
bool operator<=(const Cost &RHS) const { return !(RHS < *this); }
bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
// The last instruction that represents the slice. This should be a
// truncate instruction.
SDNode *Inst;
// The original load instruction.
LoadSDNode *Origin;
// The right shift amount in bits from the original load.
unsigned Shift;
// The DAG that Origin came from.
// This is used to get some contextual information about legal types, etc.
SelectionDAG *DAG;
// Member-wise constructor. Every argument is defaulted, so a
// default-constructed slice has null Inst/Origin/DAG and a zero Shift.
LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr,
unsigned Shift = 0, SelectionDAG *DAG = nullptr)
: Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}
/// Get the bits of the originally loaded value that this slice uses.
/// \return A mask as wide as the value produced by Origin, with the bits
/// used by this slice set to 1 and all other bits set to 0.
APInt getUsedBits() const {
  // Reproduce the trunc(lshr) sequence:
  // - Start from the truncated value (every bit of it is used).
  // - Zero extend to the desired bit width.
  // - Shift left.
  assert(Origin && "No original load to compare against.");
  unsigned BitWidth = Origin->getValueSizeInBits(0);
  assert(Inst && "This slice is not bound to an instruction");
  assert(Inst->getValueSizeInBits(0) <= BitWidth &&
         "Extracted slice is bigger than the whole type!");
  APInt UsedBits(Inst->getValueSizeInBits(0), 0);
  // Without this the mask stays all-zero through the extension and shift,
  // so no bit would ever be reported as used.
  UsedBits.setAllBits();
  UsedBits = UsedBits.zext(BitWidth);
  UsedBits <<= Shift;
  return UsedBits;
}
/// Get the size of the slice to be loaded in bytes.
unsigned getLoadedSize() const {
unsigned SliceSize = getUsedBits().countPopulation();
assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
return SliceSize / 8;
/// Integer type whose width is exactly the number of bits this slice loads.
/// Note: This may not be the final type for the slice.
EVT getLoadedType() const {
  assert(DAG && "Missing context");
  LLVMContext &Context = *DAG->getContext();
  const unsigned SliceBits = getLoadedSize() * 8;
  return EVT::getIntegerVT(Context, SliceBits);
}
/// Alignment of the load emitted for this slice: the original load's
/// alignment when the slice starts at the base address, otherwise the
/// alignment common to the original alignment and the offset address.
Align getAlign() const {
  const Align OriginAlign = Origin->getAlign();
  const uint64_t Offset = getOffsetFromBase();
  if (Offset == 0)
    return OriginAlign;
  return commonAlignment(OriginAlign, OriginAlign.value() + Offset);
}
/// Check if this slice can be rewritten with legal operations.
bool isLegal() const {
// An invalid slice is not legal.
if (!Origin || !Inst || !DAG)
return false;
// Offsets are for indexed load only, we do not handle that.
if (!Origin->getOffset().isUndef())
return false;
const TargetLowering &TLI = DAG->getTargetLoweringInfo();
// Check that the type is legal.
EVT SliceType = getLoadedType();
if (!TLI.isTypeLegal(SliceType))
return false;
// Check that the load is legal for this type.
if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
return false;
// Check that the offset can be computed.
// 1. Check its type.
EVT PtrType = Origin->getBasePtr().getValueType();
if (PtrType == MVT::Untyped || PtrType.isExtended())
return false;
// 2. Check that it fits in the immediate.
if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
return false;
// 3. Check that the computation is legal.
if (!TLI.isOperationLegal(ISD::ADD, PtrType))
return false;
// Check that the zext is legal if it needs one.
EVT TruncateType = Inst->getValueType(0);
if (TruncateType != SliceType &&
!TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
return false;
return true;
/// Byte offset of this slice from the address of the original load.
/// On little-endian layouts this is Shift / 8; on big-endian layouts it is
/// mirrored within the original value's size.
/// \pre DAG != nullptr.
uint64_t getOffsetFromBase() const {
  assert(DAG && "Missing context.");
  bool IsBigEndian = DAG->getDataLayout().isBigEndian();
  assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
  uint64_t Offset = Shift / 8;
  unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
  assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
         "The size of the original loaded type is not a multiple of a"
         " byte.");
  // An Offset at or beyond TySizeInBytes would mean loading only zeros,
  // which should have been optimized away earlier.
  assert(TySizeInBytes > Offset &&
         "Invalid shift amount for given loaded size");
  if (!IsBigEndian)
    return Offset;
  // Big endian: count from the other end of the original value.
  return TySizeInBytes - Offset - getLoadedSize();
}
/// Generate the sequence of instructions to load the slice
/// represented by this object and redirect the uses of this slice to
/// this new sequence of instructions.
/// \pre this->Inst && this->Origin are valid Instructions and this
/// object passed the legal check: LoadedSlice::isLegal returned true.
/// \return The last instruction of the sequence used to load the slice.
///
/// NOTE(review): the argument list of the DAG->getLoad call is cut off in
/// this copy, and no redirection of uses is visible in this body — compare
/// against upstream.
SDValue loadSlice() const {
assert(Inst && Origin && "Unable to replace a non-existing slice.");
const SDValue &OldBaseAddr = Origin->getBasePtr();
SDValue BaseAddr = OldBaseAddr;
// Get the offset in that chunk of bytes w.r.t. the endianness.
int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
assert(Offset >= 0 && "Offset too big to fit in int64_t!");
// A zero offset reuses the original base address unchanged.
if (Offset) {
// BaseAddr = BaseAddr + Offset.
EVT ArithType = BaseAddr.getValueType();
SDLoc DL(Origin);
BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr,
DAG->getConstant(Offset, DL, ArithType));
// Create the type of the loaded slice according to its size.
EVT SliceType = getLoadedType();
// Create the load for the slice, on the original load's chain and with the
// pointer info advanced by Offset.
SDValue LastInst =
DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
Origin->getPointerInfo().getWithOffset(Offset), getAlign(),
// If the final type is not the same as the loaded type, this means that
// we have to pad with zero. Create a zero extend for that.
EVT FinalType = Inst->getValueType(0);
if (SliceType != FinalType)
LastInst =
DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
return LastInst;
/// Check if this slice can be merged with an expensive cross register
/// bank copy. E.g.,
/// i = load i32
/// f = bitcast i32 i to float
///
/// \return true only when the slice's single user is a bitcast whose result
/// lives in a different register class with no common subclass, and the
/// alignment, load-legality and type checks below all pass.
///
/// NOTE(review): the initializers of ArgRC and RequiredAlignment are cut off
/// in this copy — compare against upstream.
bool canMergeExpensiveCrossRegisterBankCopy() const {
// The slice must feed exactly one user, and that user must be a bitcast.
if (!Inst || !Inst->hasOneUse())
return false;
SDNode *Use = *Inst->use_begin();
if (Use->getOpcode() != ISD::BITCAST)
return false;
assert(DAG && "Missing context");
const TargetLowering &TLI = DAG->getTargetLoweringInfo();
EVT ResVT = Use->getValueType(0);
const TargetRegisterClass *ResRC =
TLI.getRegClassFor(ResVT.getSimpleVT(), Use->isDivergent());
const TargetRegisterClass *ArgRC =
// Same register class means no cross-bank copy; the result type must also
// be loadable directly.
if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
return false;
// At this point, we know that we perform a cross-register-bank copy.
// Check if it is expensive.
const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo();
// Assume bitcasts are cheap, unless both register classes do not
// explicitly share a common sub class.
if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
return false;
// Check if it will be merged with the load.
// 1. Check the alignment constraint.
Align RequiredAlignment = DAG->getDataLayout().getABITypeAlign(
if (RequiredAlignment > getAlign())
return false;
// 2. Check that the load is a legal operation for that type.
if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
return false;
// 3. Check that we do not have a zext in the way.
if (Inst->getValueType(0) != getLoadedType())
return false;
return true;
} // end anonymous namespace
/// Check that all bits set in \p UsedBits form a dense region, i.e.,
/// \p UsedBits looks like 0..0 1..1 0..0.
static bool areUsedBitsDense(const APInt &UsedBits) {
// If all the bits are one, this is dense!
if (UsedBits.isAllOnesValue())
return true;
// Get rid of the unused bits on the right.
APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
// Get rid of the unused bits on the left.
if (NarrowedUsedBits.countLeadingZeros())
NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
// Check that the chunk of bits is completely used.
return NarrowedUsedBits.isAllOnesValue();
/// Check whether \p First and \p Second are adjacent in memory, i.e. the
/// union of the bits they load leaves no hole between them. Both slices must
/// come from the same original load and must not overlap (asserted).
static bool areSlicesNextToEachOther(const LoadedSlice &First,
                                     const LoadedSlice &Second) {
  assert(First.Origin == Second.Origin && First.Origin &&
         "Unable to match different memory origins.");
  APInt UsedBits = First.getUsedBits();
  assert((UsedBits & Second.getUsedBits()) == 0 &&
         "Slices are not supposed to overlap.");
  // Adjacent slices together cover one dense run of bits.
  return areUsedBitsDense(UsedBits | Second.getUsedBits());
}
/// Adjust the \p GlobalLSCost according to the target
/// pairing capabilities and the layout of the slices.
/// \pre \p GlobalLSCost should account for at least as many loads as
/// there is in the slices in \p LoadedSlices.
///
/// \param LoadedSlices Slices of one original load; sorted in place by their
///        offset from the base address.
/// \param GlobalLSCost Cost to adjust.
///
/// NOTE(review): most conditions below have no visible body (the early
/// return, the `continue`s and the actual update of GlobalLSCost are not
/// present in this copy), and the closing of the sort lambda is missing —
/// compare against upstream.
static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
LoadedSlice::Cost &GlobalLSCost) {
unsigned NumberOfSlices = LoadedSlices.size();
// If there is less than 2 elements, no pairing is possible.
if (NumberOfSlices < 2)
// Sort the slices so that elements that are likely to be next to each
// other in memory are next to each other in the list.
llvm::sort(LoadedSlices, [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
// First (resp. Second) is the first (resp. second) potential candidate
// to be placed in a paired load.
const LoadedSlice *First = nullptr;
const LoadedSlice *Second = nullptr;
// At the end of each iteration the current Second becomes the next First.
for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
// Set the beginning of the pair.
First = Second) {
Second = &LoadedSlices[CurrSlice];
// If First is NULL, it means we start a new pair.
// Get to the next slice.
if (!First)
EVT LoadedType = First->getLoadedType();
// If the types of the slices are different, we cannot pair them.
if (LoadedType != Second->getLoadedType())
// Check if the target supplies paired loads for this type.
// RequiredAlignment is passed to hasPairedLoad and compared against the
// first slice's alignment below.
Align RequiredAlignment;
if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
// move to the next pair, this type is hopeless.
Second = nullptr;
// Check if we meet the alignment requirement.
if (First->getAlign() < RequiredAlignment)
// Check that both loads are next to each other in memory.
if (!areSlicesNextToEachOther(*First, *Second))
assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
// Move to the next pair.
Second = nullptr;
/// Check the profitability of all involved LoadedSlice.
/// Currently, it is considered profitable if there is exactly two
/// involved slices (1) which are (2) next to each other in memory, and
/// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
///
/// When StressLoadSlicing is set, any number of slices greater than one is
/// reported as profitable and checks (1)-(3) are skipped.
///
/// Note: The order of the elements in \p LoadedSlices may be modified, but not
/// the elements themselves.
///
/// FIXME: When the cost model will be mature enough, we can relax
/// constraints (1) and (2).
///
/// \param LoadedSlices the candidate slices.
/// \param UsedBits union of the bits used by all the slices.
/// \param ForCodeSize forwarded to the LoadedSlice::Cost constructors.
/// \returns true if slicing is considered profitable.
static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
                                const APInt &UsedBits, bool ForCodeSize) {
  unsigned NumberOfSlices = LoadedSlices.size();
  if (StressLoadSlicing)
    return NumberOfSlices > 1;

  // Check (1).
  if (NumberOfSlices != 2)
    return false;

  // Check (2).
  if (!areUsedBitsDense(UsedBits))
    return false;

  // Check (3).
  LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
  // The original code has one big load.
  OrigCost.Loads = 1;
  for (const LoadedSlice &LS : LoadedSlices) {
    // Accumulate the cost of all the slices.
    LoadedSlice::Cost SliceCost(LS, ForCodeSize);
    GlobalSlicingCost += SliceCost;

    // Account as cost in the original configuration the gain obtained
    // with the current slices.
    OrigCost.addSliceGain(LS);
  }

  // If the target supports paired load, adjust the cost accordingly.
  adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
  return OrigCost > GlobalSlicingCost;
}
/// If the given load, \p N, is used only by trunc or trunc(lshr)
/// operations, split it in the various pieces being extracted.
///
/// This sort of thing is introduced by SROA.
/// This slicing takes care not to insert overlapping loads.
///
/// The transformation is only attempted once the combiner level has reached
/// AfterLegalizeDAG, and only for simple, normal, integer, non-scalable loads.
///
/// \pre \p N is a LoadSDNode.
/// \returns true if the load has been sliced, false if nothing was changed.
bool DAGCombiner::SliceUpLoad(SDNode *N) {
  if (Level < AfterLegalizeDAG)
    return false;

  LoadSDNode *LD = cast<LoadSDNode>(N);
  if (!LD->isSimple() || !ISD::isNormalLoad(LD) ||
      !LD->getValueType(0).isInteger())
    return false;

  // The algorithm to split up a load of a scalable vector into individual
  // elements currently requires knowing the length of the loaded type,
  // so will need adjusting to work on scalable vectors.
  if (LD->getValueType(0).isScalableVector())
    return false;

  // Keep track of already used bits to detect overlapping values.
  // In that case, we will just abort the transformation.
  APInt UsedBits(LD->getValueSizeInBits(0), 0);

  SmallVector<LoadedSlice, 4> LoadedSlices;

  // Check if this load is used as several smaller chunks of bits.
  // Basically, look for uses in trunc or trunc(lshr) and record a new chain
  // of computation for each trunc.
  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
       UI != UIEnd; ++UI) {
    // Skip the uses of the chain.
    if (UI.getUse().getResNo() != 0)
      continue;

    SDNode *User = *UI;
    unsigned Shift = 0;

    // Check if this is a trunc(lshr).
    if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
        isa<ConstantSDNode>(User->getOperand(1))) {
      Shift = User->getConstantOperandVal(1);
      User = *User->use_begin();
    }

    // At this point, User is a Truncate, iff we encountered, trunc or
    // trunc(lshr).
    if (User->getOpcode() != ISD::TRUNCATE)
      return false;

    // The width of the type must be a power of 2 and greater than 8-bits.
    // Otherwise the load cannot be represented in LLVM IR.
    // Moreover, if we shifted with a non-8-bits multiple, the slice
    // will be across several bytes. We do not support that.
    unsigned Width = User->getValueSizeInBits(0);
    if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
      return false;

    // Build the slice for this chain of computations.
    LoadedSlice LS(User, LD, Shift, &DAG);
    APInt CurrentUsedBits = LS.getUsedBits();

    // Check if this slice overlaps with another.
    if ((CurrentUsedBits & UsedBits) != 0)
      return false;
    // Update the bits used globally.
    UsedBits |= CurrentUsedBits;

    // Check if the new slice would be legal.
    if (!LS.isLegal())
      return false;

    // Record the slice.
    LoadedSlices.push_back(LS);
  }

  // Abort slicing if it does not seem to be profitable.
  if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
    return false;

  // Rewrite each chain to use an independent load.
  // By construction, each chain can be represented by a unique load.

  // Prepare the argument for the new token factor for all the slices.
  SmallVector<SDValue, 8> ArgChains;
  for (const LoadedSlice &LS : LoadedSlices) {
    SDValue SliceInst = LS.loadSlice();
    CombineTo(LS.Inst, SliceInst, true);
    // The slice may be wrapped in one extra node; look through it to reach
    // the load itself.
    if (SliceInst.getOpcode() != ISD::LOAD)
      SliceInst = SliceInst.getOperand(0);
    assert(SliceInst->getOpcode() == ISD::LOAD &&
           "It takes more than a zext to get to the loaded slice!!");
    // Collect the chain result of the new load.
    ArgChains.push_back(SliceInst.getValue(1));
  }

  // All users of the original load's chain now depend on every sliced load.
  SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
                              ArgChains);
  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
  return true;
}
/// Check to see if V is (and load (ptr), imm), where the load is having
/// specific bytes cleared out.  If so, return the byte size being masked out
/// and the shift amount.
///
/// \param V the value to inspect.
/// \param Ptr the pointer the load must be based on.
/// \param Chain the chain of the store that would be narrowed; the load must
///        either be this chain or a direct operand of this TokenFactor.
/// \returns a pair (number of masked bytes, byte offset of the masked run).
///          The pair is (0, 0) when \p V does not match.
static std::pair<unsigned, unsigned>
CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
  std::pair<unsigned, unsigned> Result(0, 0);

  // Check for the structure we're looking for.
  if (V->getOpcode() != ISD::AND ||
      !isa<ConstantSDNode>(V->getOperand(1)) ||
      !ISD::isNormalLoad(V->getOperand(0).getNode()))
    return Result;

  // Check the chain and pointer.
  LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
  if (LD->getBasePtr() != Ptr)
    return Result; // Not from same pointer.

  // This only handles simple types.
  if (V.getValueType() != MVT::i16 &&
      V.getValueType() != MVT::i32 &&
      V.getValueType() != MVT::i64)
    return Result;

  // Check the constant mask.  Invert it so that the bits being masked out are
  // 0 and the bits being kept are 1.  Use getSExtValue so that leading bits
  // follow the sign bit for uniformity.
  uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
  unsigned NotMaskLZ = countLeadingZeros(NotMask);
  if (NotMaskLZ & 7)
    return Result; // Must be multiple of a byte.
  unsigned NotMaskTZ = countTrailingZeros(NotMask);
  if (NotMaskTZ & 7)
    return Result; // Must be multiple of a byte.
  if (NotMaskLZ == 64)
    return Result; // All zero mask.

  // See if we have a continuous run of bits.  If so, we have 0*1+0*
  if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64)
    return Result;

  // Adjust NotMaskLZ down to be from the actual size of the int instead of i64.
  if (V.getValueType() != MVT::i64 && NotMaskLZ)
    NotMaskLZ -= 64 - V.getValueSizeInBits();

  unsigned MaskedBytes = (V.getValueSizeInBits() - NotMaskLZ - NotMaskTZ) / 8;
  switch (MaskedBytes) {
  case 1:
  case 2:
  case 4:
    break;
  default:
    return Result; // All one mask, or 5-byte mask.
  }

  // Verify that the first bit starts at a multiple of mask so that the access
  // is aligned the same as the access width.
  if (NotMaskTZ && NotMaskTZ / 8 % MaskedBytes)
    return Result;

  // For narrowing to be valid, it must be the case that the load is the
  // immediately preceding memory operation before the store.
  if (LD == Chain.getNode())
    ; // ok.
  else if (Chain->getOpcode() == ISD::TokenFactor &&
           SDValue(LD, 1).hasOneUse()) {
    // LD has only 1 chain use so there are no indirect dependencies.
    if (!LD->isOperandOf(Chain.getNode()))
      return Result;
  } else
    return Result; // Fail.

  Result.first = MaskedBytes;
  Result.second = NotMaskTZ / 8;
  return Result;
}
/// Check to see if IVal is something that provides a value as specified by
/// MaskInfo. If so, replace the specified store with a narrower store of
/// truncated IVal.
///
/// \param MaskInfo pair (number of bytes, byte shift) describing the bytes
///        being replaced, as produced by CheckForMaskedLoad.
/// \param IVal the value being or'ed into the masked load.
/// \param St the wide store to narrow.
/// \param DC the combiner, used for legality queries and to reach the DAG.
/// \returns the new narrow store, or an empty SDValue if the replacement is
///          not possible.
static SDValue
ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
                                SDValue IVal, StoreSDNode *St,
                                DAGCombiner *DC) {
  unsigned NumBytes = MaskInfo.first;
  unsigned ByteShift = MaskInfo.second;
  SelectionDAG &DAG = DC->getDAG();

  // Check to see if IVal is all zeros in the part being masked in by the 'or'
  // that uses this.  If not, this is not a replacement.
  APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
                                  ByteShift * 8, (ByteShift + NumBytes) * 8);
  if (!DAG.MaskedValueIsZero(IVal, Mask))
    return SDValue();

  // Check that it is legal on the target to do this.  It is legal if the new
  // VT we're shrinking to (i8/i16/i32) is legal or we're still before type
  // legalization (and the target doesn't explicitly think this is a bad idea).
  MVT VT = MVT::getIntegerVT(NumBytes * 8);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!DC->isTypeLegal(VT))
    return SDValue();
  if (St->getMemOperand() &&
      !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                              *St->getMemOperand()))
    return SDValue();

  // Okay, we can do this!  Replace the 'St' store with a store of IVal that is
  // shifted by ByteShift and truncated down to NumBytes.
  if (ByteShift) {
    SDLoc DL(IVal);
    IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal,
                       DAG.getConstant(ByteShift * 8, DL,
                                    DC->getShiftAmountTy(IVal.getValueType())));
  }

  // Figure out the offset for the store and the alignment of the access.
  unsigned StOffset;
  unsigned NewAlign = St->getAlignment();

  // On big-endian targets the byte offset is counted from the other end of
  // the wide value.
  if (DAG.getDataLayout().isLittleEndian())
    StOffset = ByteShift;
  else
    StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;

  SDValue Ptr = St->getBasePtr();
  if (StOffset) {
    SDLoc DL(IVal);
    Ptr = DAG.getMemBasePlusOffset(Ptr, StOffset, DL);
    NewAlign = MinAlign(NewAlign, StOffset);
  }

  // Truncate down to the new size.
  IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal);

  return DAG
      .getStore(St->getChain(), SDLoc(St), IVal, Ptr,
                St->getPointerInfo().getWithOffset(StOffset), NewAlign);
}
/// Look for sequence of load / op / store where op is one of 'or', 'xor', and
/// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
/// narrowing the load and store if it would end up being a win for performance
/// or code size.
///
/// \pre \p N is a StoreSDNode.
/// \returns the replacement (narrower) store, or an empty SDValue if no
///          transformation was done.
SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
  StoreSDNode *ST = cast<StoreSDNode>(N);
  if (!ST->isSimple())
    return SDValue();

  SDValue Chain = ST->getChain();
  SDValue Value = ST->getValue();
  SDValue Ptr = ST->getBasePtr();
  EVT VT = Value.getValueType();

  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
    return SDValue();

  unsigned Opc = Value.getOpcode();

  // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
  // is a byte mask indicating a consecutive number of bytes, check to see if
  // Y is known to provide just those bytes.  If so, we try to replace the
  // load + replace + store sequence with a single (narrower) store, which makes
  // the load dead.
  if (Opc == ISD::OR && EnableShrinkLoadReplaceStoreWithStore) {
    // Or is commutative, so try X as the masked load first, then Y.
    for (unsigned MaskedIdx = 0; MaskedIdx != 2; ++MaskedIdx) {
      std::pair<unsigned, unsigned> MaskedLoad =
          CheckForMaskedLoad(Value.getOperand(MaskedIdx), Ptr, Chain);
      if (!MaskedLoad.first)
        continue;
      if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(
              MaskedLoad, Value.getOperand(1 - MaskedIdx), ST, this))
        return NewST;
    }
  }

  if (!EnableReduceLoadOpStoreWidth)
    return SDValue();

  if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
      Value.getOperand(1).getOpcode() != ISD::Constant)
    return SDValue();

  // The first operand must be a single-use normal load whose chain result is
  // exactly the chain of the store.
  SDValue N0 = Value.getOperand(0);
  if (!ISD::isNormalLoad(N0.getNode()) || !N0.hasOneUse() ||
      Chain != SDValue(N0.getNode(), 1))
    return SDValue();

  LoadSDNode *LD = cast<LoadSDNode>(N0);
  if (LD->getBasePtr() != Ptr ||
      LD->getPointerInfo().getAddrSpace() !=
          ST->getPointerInfo().getAddrSpace())
    return SDValue();

  // Find the type to narrow it the load / op / store to.
  SDValue N1 = Value.getOperand(1);
  unsigned BitWidth = N1.getValueSizeInBits();
  APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
  // For 'and', the bits that change are the ones cleared by the mask.
  if (Opc == ISD::AND)
    Imm ^= APInt::getAllOnesValue(BitWidth);
  if (Imm == 0 || Imm.isAllOnesValue())
    return SDValue();
  unsigned ShAmt = Imm.countTrailingZeros();
  unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
  unsigned NewBW = NextPowerOf2(MSB - ShAmt);
  EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
  // The narrowing should be profitable, the load/store operation should be
  // legal (or custom) and the store size should be equal to the NewVT width.
  while (NewBW < BitWidth &&
         (NewVT.getStoreSizeInBits() != NewBW ||
          !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
          !TLI.isNarrowingProfitable(VT, NewVT))) {
    NewBW = NextPowerOf2(NewBW);
    NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
  }
  if (NewBW >= BitWidth)
    return SDValue();

  // If the lsb changed does not start at the type bitwidth boundary,
  // start at the previous one.
  if (ShAmt % NewBW)
    ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
  APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
                                 std::min(BitWidth, ShAmt + NewBW));
  // All the changed bits must fall inside the narrow window.
  if ((Imm & Mask) != Imm)
    return SDValue();

  APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
  if (Opc == ISD::AND)
    NewImm ^= APInt::getAllOnesValue(NewBW);
  uint64_t PtrOff = ShAmt / 8;
  // For big endian targets, we need to adjust the offset to the pointer to
  // load the correct bytes.
  if (DAG.getDataLayout().isBigEndian())
    PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;

  Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
  Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
  if (NewAlign < DAG.getDataLayout().getABITypeAlign(NewVTTy))
    return SDValue();

  SDValue NewPtr = DAG.getMemBasePlusOffset(Ptr, PtrOff, SDLoc(LD));
  SDValue NewLD =
      DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
                  LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
                  LD->getMemOperand()->getFlags(), LD->getAAInfo());
  SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
                               DAG.getConstant(NewImm, SDLoc(Value),
                                               NewVT));
  SDValue NewST =
      DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
                   ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);

  // Redirect the users of the old load's chain to the narrow load's chain.
  WorklistRemover DeadNodes(*this);
  DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
  return NewST;
}
/// For a given floating point load / store pair, if the load value isn't used
/// by any other operations, then consider transforming the pair to integer
/// load / store operations if the target deems the transformation profitable.
///
/// \pre \p N is a StoreSDNode.
/// \returns the new integer store, or an empty SDValue if the pair was left
///          untouched.
SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
  StoreSDNode *ST = cast<StoreSDNode>(N);
  SDValue Value = ST->getValue();
  // Only a normal store of a single-use normal load is handled.
  if (!ISD::isNormalStore(ST) || !ISD::isNormalLoad(Value.getNode()) ||
      !Value.hasOneUse())
    return SDValue();

  LoadSDNode *LD = cast<LoadSDNode>(Value);
  EVT VT = LD->getMemoryVT();
  if (!VT.isFloatingPoint() ||
      VT != ST->getMemoryVT() ||
      LD->isNonTemporal() ||
      ST->isNonTemporal() ||
      LD->getPointerInfo().getAddrSpace() != 0 ||
      ST->getPointerInfo().getAddrSpace() != 0)
    return SDValue();

  TypeSize VTSize = VT.getSizeInBits();

  // We don't know the size of scalable types at compile time so we cannot
  // create an integer of the equivalent size.
  if (VTSize.isScalable())
    return SDValue();

  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VTSize.getFixedSize());
  if (!TLI.isOperationLegal(ISD::LOAD, IntVT) ||
      !TLI.isOperationLegal(ISD::STORE, IntVT) ||
      !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) ||
      !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT))
    return SDValue();

  // Both accesses must be at least ABI-aligned for the integer type.
  Align LDAlign = LD->getAlign();
  Align STAlign = ST->getAlign();
  Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext());
  Align ABIAlign = DAG.getDataLayout().getABITypeAlign(IntVTTy);
  if (LDAlign < ABIAlign || STAlign < ABIAlign)
    return SDValue();

  SDValue NewLD =
      DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
                  LD->getPointerInfo(), LDAlign);

  SDValue NewST =
      DAG.getStore(ST->getChain(), SDLoc(N), NewLD, ST->getBasePtr(),
                   ST->getPointerInfo(), STAlign);

  // Redirect the users of the old load's chain to the new load's chain.
  WorklistRemover DeadNodes(*this);
  DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
  return NewST;
}
// This is a helper function for visitMUL to check the profitability
// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
// MulNode is the original multiply, AddNode is (add x, c1),
// and ConstNode is c2.
//
// If the (add x, c1) has multiple uses, we could increase
// the number of adds if we make this transformation.
// It would only be worth doing this if we can remove a
// multiply in the process. Check for that here.
// To illustrate:
//     (A + c1) * c3
//     (A + c2) * c3
// We're checking for cases where we have common "c3 * A" expressions.
//
// Returns true if the fold is considered profitable.
bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
                                              SDValue &AddNode,
                                              SDValue &ConstNode) {
  // If the add only has one use, this would be OK to do.
  if (AddNode.getNode()->hasOneUse())
    return true;

  // The variable operand of our "add"; loop invariant.
  SDNode *MulVar = AddNode.getOperand(0).getNode();

  // Walk all the users of the constant with which we're multiplying.
  for (SDNode *Use : ConstNode->uses()) {
    if (Use == MulNode) // This use is the one we're on right now. Skip it.
      continue;

    if (Use->getOpcode() != ISD::MUL) // Only other multiplies are relevant.
      continue;

    // OtherOp is what we're multiplying against the constant.
    SDNode *OtherOp;
    if (Use->getOperand(0) == ConstNode)
      OtherOp = Use->getOperand(1).getNode();
    else
      OtherOp = Use->getOperand(0).getNode();

    // Check to see if multiply is with the same operand of our "add".
    //
    //     ConstNode  = CONST
    //     Use = ConstNode * A  <-- visiting Use. OtherOp is A.
    //     ...
    //     AddNode  = (A + c1)  <-- MulVar is A.
    //         = AddNode * ConstNode   <-- current visiting instruction.
    //
    // If we make this transformation, we will have a common
    // multiply (ConstNode * A) that we can save.
    if (OtherOp == MulVar)
      return true;

    // Now check to see if a future expansion will give us a common
    // multiply.
    //
    //     ConstNode  = CONST
    //     AddNode    = (A + c1)
    //     ...   = AddNode * ConstNode <-- current visiting instruction.
    //     ...
    //     OtherOp = (A + c2)
    //     Use     = OtherOp * ConstNode <-- visiting Use.
    //
    // If we make this transformation, we will have a common
    // multiply (CONST * A) after we also do the same transformation
    // to the "Use" instruction.
    if (OtherOp->getOpcode() == ISD::ADD &&
        DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) &&
        OtherOp->getOperand(0).getNode() == MulVar)
      return true;
  }

  // Didn't find a case where this would be profitable.
  return false;
}
/// Build the chain for a store that merges the first \p NumStores entries of
/// \p StoreNodes.
///
/// The result is a TokenFactor of the distinct chains of those stores. Chains
/// that are themselves one of the merged stores are left out.
///
/// \pre \p NumStores is at least 1 and not larger than StoreNodes.size().
SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
                                         unsigned NumStores) {
  SmallVector<SDValue, 8> Chains;
  SmallPtrSet<const SDNode *, 8> Visited;
  SDLoc StoreDL(StoreNodes[0].MemNode);

  // Mark the merged stores themselves as visited, so that a chain that is one
  // of them is rejected below.
  for (unsigned i = 0; i < NumStores; ++i)
    Visited.insert(StoreNodes[i].MemNode);

  // don't include nodes that are children or repeated nodes.
  for (unsigned i = 0; i < NumStores; ++i) {
    if (Visited.insert(StoreNodes[i].MemNode->getChain().getNode()).second)
      Chains.push_back(StoreNodes[i].MemNode->getChain());
  }

  assert(Chains.size() > 0 && "Chain should have generated a chain");
  return DAG.getTokenFactor(StoreDL, Chains);
}
/// Replace the first \p NumStores stores of \p StoreNodes with a single wide
/// store of either a vector or an integer constant.
///
/// \param StoreNodes the candidate stores; entry 0 supplies the base pointer,
///        pointer info and alignment of the merged store.
/// \param MemVT the memory type of each individual store.
/// \param NumStores the number of leading entries of \p StoreNodes to merge.
/// \param IsConstantSrc true if the stored values are constants; otherwise
///        they are expected to be extracted vector elements / subvectors.
/// \param UseVector true to emit a vector store, false to emit a store of one
///        wide integer constant (requires \p IsConstantSrc).
/// \param UseTrunc true if the merged integer must be emitted as a truncating
///        store of its legalized type.
/// \returns true if the stores were merged.
bool DAGCombiner::mergeStoresOfConstantsOrVecElts(
    SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
    bool IsConstantSrc, bool UseVector, bool UseTrunc) {
  // Make sure we have something to merge.
  if (NumStores < 2)
    return false;

  // The latest Node in the DAG.
  SDLoc DL(StoreNodes[0].MemNode);

  TypeSize ElementSizeBits = MemVT.getStoreSizeInBits();
  unsigned SizeInBits = NumStores * ElementSizeBits;
  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;

  EVT StoreTy;
  if (UseVector) {
    unsigned Elts = NumStores * NumMemElts;
    // Get the type for the merged vector store.
    StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
  } else
    StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);

  SDValue StoredVal;
  if (UseVector) {
    if (IsConstantSrc) {
      SmallVector<SDValue, 8> BuildVector;
      for (unsigned I = 0; I != NumStores; ++I) {
        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
        SDValue Val = St->getValue();
        // If constant is of the wrong type, convert it now.
        if (MemVT != Val.getValueType()) {
          Val = peekThroughBitcasts(Val);
          // Deal with constants of wrong size.
          if (ElementSizeBits != Val.getValueSizeInBits()) {
            EVT IntMemVT =
                EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
            if (isa<ConstantFPSDNode>(Val)) {
              // Not clear how to truncate FP values.
              return false;
            } else if (auto *C = dyn_cast<ConstantSDNode>(Val))
              Val = DAG.getConstant(C->getAPIntValue()
                                        .zextOrTrunc(Val.getValueSizeInBits())
                                        .zextOrTrunc(ElementSizeBits),
                                    SDLoc(C), IntMemVT);
          }
          // Make sure correctly size type is the correct type.
          Val = DAG.getBitcast(MemVT, Val);
        }
        BuildVector.push_back(Val);
      }
      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
                                               : ISD::BUILD_VECTOR,
                              DL, StoreTy, BuildVector);
    } else {
      SmallVector<SDValue, 8> Ops;
      for (unsigned i = 0; i < NumStores; ++i) {
        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
        SDValue Val = peekThroughBitcasts(St->getValue());
        // All operands of BUILD_VECTOR / CONCAT_VECTOR must be of
        // type MemVT. If the underlying value is not the correct
        // type, but it is an extraction of an appropriate vector we
        // can recast Val to be of the correct type. This may require
        // converting between EXTRACT_VECTOR_ELT and
        // EXTRACT_SUBVECTOR.
        if ((MemVT != Val.getValueType()) &&
            (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
             Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) {
          EVT MemVTScalarTy = MemVT.getScalarType();
          // We may need to add a bitcast here to get types to line up.
          if (MemVTScalarTy != Val.getValueType().getScalarType()) {
            Val = DAG.getBitcast(MemVT, Val);
          } else {
            unsigned OpC = MemVT.isVector() ? ISD::EXTRACT_SUBVECTOR
                                            : ISD::EXTRACT_VECTOR_ELT;
            SDValue Vec = Val.getOperand(0);
            SDValue Idx = Val.getOperand(1);
            Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Idx);
          }
        }
        Ops.push_back(Val);
      }

      // Build the extracted vector elements back into a vector.
      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
                                               : ISD::BUILD_VECTOR,
                              DL, StoreTy, Ops);
    }
  } else {
    // We should always use a vector store when merging extracted vector
    // elements, so this path implies a store of constants.
    assert(IsConstantSrc && "Merged vector elements should use vector store");

    APInt StoreInt(SizeInBits, 0);

    // Construct a single integer constant which is made of the smaller
    // constant inputs. Each element is shifted in from the low end, so on
    // little-endian targets the stores are visited last-to-first.
    bool IsLE = DAG.getDataLayout().isLittleEndian();
    for (unsigned i = 0; i < NumStores; ++i) {
      unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);

      SDValue Val = St->getValue();
      Val = peekThroughBitcasts(Val);
      StoreInt <<= ElementSizeBits;
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
        StoreInt |= C->getAPIntValue()
                        .zextOrTrunc(ElementSizeBits)
                        .zextOrTrunc(SizeInBits);
      } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
        StoreInt |= C->getValueAPF()
                        .bitcastToAPInt()
                        .zextOrTrunc(ElementSizeBits)
                        .zextOrTrunc(SizeInBits);
        // If fp truncation is necessary give up for now.
        if (MemVT.getSizeInBits() != ElementSizeBits)
          return false;
      } else {
        llvm_unreachable("Invalid constant element type");
      }
    }

    // Create the new Load and Store operations.
    StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
  }

  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
  SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);

  // make sure we use trunc store if it's necessary to be legal.
  SDValue NewStore;
  if (!UseTrunc) {
    NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(),
                            FirstInChain->getPointerInfo(),
                            FirstInChain->getAlignment());
  } else { // Must be realized as a trunc store
    EVT LegalizedStoredValTy =
        TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
    unsigned LegalizedStoreSize = LegalizedStoredValTy.getSizeInBits();
    ConstantSDNode *C = cast<ConstantSDNode>(StoredVal);
    SDValue ExtendedStoreVal =
        DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL,
                        LegalizedStoredValTy);
    NewStore = DAG.getTruncStore(
        NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(),
        FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/,
        FirstInChain->getAlignment(),
        FirstInChain->getMemOperand()->getFlags());
  }

  // Replace all merged stores with the new store.
  for (unsigned i = 0; i < NumStores; ++i)
    CombineTo(StoreNodes[i].MemNode, NewStore);

  return true;
}
/// Collect the stores that are candidates for being merged with \p St.
///
/// \param St the store to find merge partners for.
/// \param[out] StoreNodes receives one MemOpLink (store, offset from the base
///        of \p St) per candidate found.
/// \param[out] RootNode receives the chain node whose users were searched.
///        It is left untouched if the function bails out before the search.
void DAGCombiner::getStoreMergeCandidates(
    StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes,
    SDNode *&RootNode) {
  // This holds the base pointer, index, and the offset in bytes from the base
  // pointer.
  BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
  EVT MemVT = St->getMemoryVT();

  SDValue Val = peekThroughBitcasts(St->getValue());
  // We must have a base and an offset.
  if (!BasePtr.getBase().getNode())
    return;

  // Do not handle stores to undef base pointers.
  if (BasePtr.getBase().isUndef())
    return;

  StoreSource StoreSrc = getStoreSource(Val);
  assert(StoreSrc != StoreSource::Unknown && "Expected known source for store");
  BaseIndexOffset LBasePtr;
  // Match on loadbaseptr if relevant.
  EVT LoadVT;
  if (StoreSrc == StoreSource::Load) {
    auto *Ld = cast<LoadSDNode>(Val);
    LBasePtr = BaseIndexOffset::match(Ld, DAG);
    LoadVT = Ld->getMemoryVT();
    // Load and store should be the same type.
    if (MemVT != LoadVT)
      return;
    // Loads must only have one use.
    if (!Ld->hasNUsesOfValue(1, 0))
      return;
    // The memory operands must not be volatile/indexed/atomic.
    // TODO: May be able to relax for unordered atomics (see D66309)
    if (!Ld->isSimple() || Ld->isIndexed())
      return;
  }

  // Returns true if Other can be merged with St. On success, Ptr holds the
  // decomposed address of Other and Offset its distance from BasePtr.
  auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
                            int64_t &Offset) -> bool {
    // The memory operands must not be volatile/indexed/atomic.
    // TODO: May be able to relax for unordered atomics (see D66309)
    if (!Other->isSimple() || Other->isIndexed())
      return false;
    // Don't mix temporal stores with non-temporal stores.
    if (St->isNonTemporal() != Other->isNonTemporal())
      return false;
    SDValue OtherBC = peekThroughBitcasts(Other->getValue());
    // Allow merging constants of different types as integers.
    bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
                                           : Other->getMemoryVT() != MemVT;
    if (StoreSrc == StoreSource::Load) {
      if (NoTypeMatch)
        return false;
      // The Load's Base Ptr must also match
      if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(OtherBC)) {
        BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG);
        if (LoadVT != OtherLd->getMemoryVT())
          return false;
        // Loads must only have one use.
        if (!OtherLd->hasNUsesOfValue(1, 0))
          return false;
        // The memory operands must not be volatile/indexed/atomic.
        // TODO: May be able to relax for unordered atomics (see D66309)
        if (!OtherLd->isSimple() ||
            OtherLd->isIndexed())
          return false;
        // Don't mix temporal loads with non-temporal loads.
        if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
          return false;
        if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
          return false;
      } else
        return false;
    }
    if (StoreSrc == StoreSource::Constant) {
      if (NoTypeMatch)
        return false;
      if (!(isa<ConstantSDNode>(OtherBC) || isa<ConstantFPSDNode>(OtherBC)))
        return false;
    }
    if (StoreSrc == StoreSource::Extract) {
      // Do not merge truncated stores here.
      if (Other->isTruncatingStore())
        return false;
      if (!MemVT.bitsEq(OtherBC.getValueType()))
        return false;
      if (OtherBC.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
          OtherBC.getOpcode() != ISD::EXTRACT_SUBVECTOR)
        return false;
    }
    Ptr = BaseIndexOffset::match(Other, DAG);
    return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
  };

  // Check if the pair of StoreNode and the RootNode already bail out many
  // times which is over the limit in dependence check.
  auto OverLimitInDependenceCheck = [&](SDNode *StoreNode,
                                        SDNode *RootNode) -> bool {
    auto RootCount = StoreRootCountMap.find(StoreNode);
    return RootCount != StoreRootCountMap.end() &&
           RootCount->second.first == RootNode &&
           RootCount->second.second > StoreMergeDependenceLimit;
  };

  // Record User as a candidate if it is a store that matches St and has not
  // exceeded the dependence-check limit for the current RootNode.
  auto TryAddCandidate = [&](SDNode *User) {
    StoreSDNode *OtherST = dyn_cast<StoreSDNode>(User);
    if (!OtherST)
      return;
    BaseIndexOffset Ptr;
    int64_t PtrDiff;
    if (CandidateMatch(OtherST, Ptr, PtrDiff) &&
        !OverLimitInDependenceCheck(OtherST, RootNode))
      StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
  };

  // We are looking for a root node which is an ancestor to all mergeable
  // stores. We search up through a load, to our root and then down
  // through all children. For instance we will find Store{1,2,3} if
  // St is Store1, Store2, or Store3 where the root is not a load,
  // which is always true for nonvolatile ops. TODO: Expand
  // the search to find all valid candidates through multiple layers of loads.
  //
  // Root
  // |-------|-------|
  // Load    Load    Store3
  // |       |
  // Store1  Store2
  //
  // FIXME: We should be able to climb and
  // descend TokenFactors to find candidates as well.

  RootNode = St->getChain().getNode();

  // Bounds the number of users of the root that are inspected.
  unsigned NumNodesExplored = 0;
  if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
    RootNode = Ldn->getChain().getNode();
    for (auto I = RootNode->use_begin(), E = RootNode->use_end();
         I != E && NumNodesExplored < 1024; ++I, ++NumNodesExplored)
      if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain
        for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
          if (I2.getOperandNo() == 0)
            TryAddCandidate(*I2);
  } else
    for (auto I = RootNode->use_begin(), E = RootNode->use_end();
         I != E && NumNodesExplored < 1024; ++I, ++NumNodesExplored)
      if (I.getOperandNo() == 0)
        TryAddCandidate(*I);
}
// We need to check that merging these stores does not cause a loop in
// the DAG. Any store candidate may depend on another candidate
// indirectly through its operand (we already consider dependencies
// through the chain). Check in parallel by searching up from
// non-chain operands of candidates.
//
// Returns true if the first NumStores candidates of StoreNodes can be merged
// without creating a cycle, false if a dependency was found or the search
// gave up. When the search gives up, the (store, RootNode) pair is counted in
// StoreRootCountMap.
bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
    SDNode *RootNode) {
  // FIXME: We should be able to truncate a full search of
  // predecessors by doing a BFS and keeping tabs the originating
  // stores from which worklist nodes come from in a similar way to
  // TokenFactor simplification.

  SmallPtrSet<const SDNode *, 32> Visited;
  SmallVector<const SDNode *, 8> Worklist;

  // RootNode is a predecessor to all candidates so we need not search
  // past it. Add RootNode (peeking through TokenFactors). Do not count
  // these towards size check.

  Worklist.push_back(RootNode);
  while (!Worklist.empty()) {
    auto N = Worklist.pop_back_val();
    if (!Visited.insert(N).second)
      continue; // Already present in Visited.
    if (N->getOpcode() == ISD::TokenFactor) {
      for (SDValue Op : N->ops())
        Worklist.push_back(Op.getNode());
    }
  }

  // Don't count pruning nodes towards max.
  unsigned int Max = 1024 + Visited.size();
  // Search Ops of store candidates.
  for (unsigned i = 0; i < NumStores; ++i) {
    SDNode *N = StoreNodes[i].MemNode;
    // Of the 4 Store Operands:
    //   * Chain (Op 0) -> We have already considered these
    //                    in candidate selection and can be
    //                    safely ignored
    //   * Value (Op 1) -> Cycles may happen (e.g. through load chains)
    //   * Address (Op 2) -> Merged addresses may only vary by a fixed constant,
    //                       but aren't necessarily from the same base node, so
    //                       cycles possible (e.g. via indexed store).
    //   * (Op 3) -> Represents the pre or post-indexing offset (or undef for
    //               non-indexed stores). Not constant on all targets (e.g. ARM)
    //               and so can participate in a cycle.
    for (unsigned j = 1; j < N->getNumOperands(); ++j)
      Worklist.push_back(N->getOperand(j).getNode());
  }

  // Search through DAG. We can stop early if we find a store node.
  for (unsigned i = 0; i < NumStores; ++i)
    if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist,
                                     Max)) {
      // If the searching bail out, record the StoreNode and RootNode in the
      // StoreRootCountMap. If we have seen the pair many times over a limit,
      // we won't add the StoreNode into StoreNodes set again.
      if (Visited.size() >= Max) {
        auto &RootCount = StoreRootCountMap[StoreNodes[i].MemNode];
        if (RootCount.first == RootNode)
          RootCount.second++;
        else
          RootCount = {RootNode, 1};
      }
      return false;
    }
  return true;
}
/// Locate the next run of address-consecutive stores at the front of
/// StoreNodes, discarding leading entries that cannot begin such a run.
///
/// \param StoreNodes candidate stores; the caller in this file sorts them by
///        OffsetFromBase before calling. Entries are erased from the front as
///        the search proceeds.
/// \param ElementSizeBytes expected distance between the offsets of two
///        adjacent stores.
/// \returns the number of consecutive stores found at the front of
///          StoreNodes (always greater than 1), or 0 when fewer than two
///          candidates remain.
///
/// NOTE(review): this listing looks truncated in places (conditions with no
/// visible body, no closing braces) -- confirm the control flow against the
/// upstream source.
DAGCombiner::getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
int64_t ElementSizeBytes) const {
while (true) {
// Find a store past the width of the first store.
size_t StartIdx = 0;
// Advance while the next entry does not sit exactly ElementSizeBytes after
// the current one.
while ((StartIdx + 1 < StoreNodes.size()) &&
StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
StoreNodes[StartIdx + 1].OffsetFromBase)
// Bail if we don't have enough candidates to merge.
if (StartIdx + 1 >= StoreNodes.size())
return 0;
// Trim stores that overlapped with the first store.
if (StartIdx)
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);
// Scan the memory operations on the chain and find the first
// non-consecutive store memory address.
unsigned NumConsecutiveStores = 1;
int64_t StartAddress = StoreNodes[0].OffsetFromBase;
// Check that the addresses are consecutive starting from the second
// element in the list of stores.
for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
// Entry i must sit exactly i elements past the first entry.
if (CurrAddress - StartAddress != (ElementSizeBytes * i))
NumConsecutiveStores = i + 1;
// Only a run of at least two stores is worth reporting.
if (NumConsecutiveStores > 1)
return NumConsecutiveStores;
// There are no consecutive stores at the start of the list.
// Remove the first store and try again.
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
/// Try to merge runs of consecutive constant stores at the front of
/// StoreNodes into wider integer or vector stores.
///
/// \param StoreNodes candidate stores; processed entries are erased from the
///        front.
/// \param NumConsecutiveStores number of consecutive candidates at the front
///        of StoreNodes; decremented as entries are consumed.
/// \param MemVT memory type of each individual store; supplies the element
///        size and the per-store vector element count.
/// \param RootNode forwarded to checkMergeStoreCandidatesForDependencies.
/// \param AllowVectors when false, no vector type is considered.
/// \returns true if mergeStoresOfConstantsOrVecElts reported a change for any
///          group.
///
/// NOTE(review): this listing looks truncated in places (conditions with no
/// visible body, no closing braces) -- confirm the control flow against the
/// upstream source.
bool DAGCombiner::tryStoreMergeOfConstants(
SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
EVT MemVT, SDNode *RootNode, bool AllowVectors) {
LLVMContext &Context = *DAG.getContext();
const DataLayout &DL = DAG.getDataLayout();
int64_t ElementSizeBytes = MemVT.getStoreSize();
unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
bool MadeChange = false;
// Store the constants into memory as one consecutive store.
while (NumConsecutiveStores >= 2) {
LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
unsigned FirstStoreAS = FirstInChain->getAddressSpace();
unsigned FirstStoreAlign = FirstInChain->getAlignment();
// Largest prefix length (in stores) for which a legal integer / vector
// type was found; 1 means "nothing mergeable".
unsigned LastLegalType = 1;
unsigned LastLegalVectorType = 1;
bool LastIntegerTrunc = false;
bool NonZero = false;
// Index of the first zero element seen after a non-zero one; stays at
// NumConsecutiveStores when there is none.
unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
SDValue StoredVal = ST->getValue();
bool IsElementZero = false;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
IsElementZero = C->isNullValue();
else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
IsElementZero = C->getConstantFPValue()->isNullValue();
if (IsElementZero) {
if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
FirstZeroAfterNonZero = i;
NonZero |= !IsElementZero;
// Find a legal type for the constant store.
unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
bool IsFast = false;
// Break early when size is too large to be legal.
if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
if (TLI.isTypeLegal(StoreTy) &&
TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFast) &&
IsFast) {
LastIntegerTrunc = false;
LastLegalType = i + 1;
// Or check whether a truncstore is legal.
} else if (TLI.getTypeAction(Context, StoreTy) ==
TargetLowering::TypePromoteInteger) {
EVT LegalizedStoredValTy =
TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFast) &&
IsFast) {
LastIntegerTrunc = true;
LastLegalType = i + 1;
// We only use vectors if the constant is known to be zero or the
// target allows it and the function is not marked with the
// noimplicitfloat attribute.
if ((!NonZero ||
TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
AllowVectors) {
// Find a legal type for the vector store.
unsigned Elts = (i + 1) * NumMemElts;
EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
TLI.allowsMemoryAccess(Context, DL, Ty,
*FirstInChain->getMemOperand(), &IsFast) &&
LastLegalVectorType = i + 1;
// Prefer the vector form only when it covers strictly more stores than the
// best integer form.
bool UseVector = (LastLegalVectorType > LastLegalType) && AllowVectors;
unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
// Check if we found a legal integer type that creates a meaningful
// merge.
if (NumElem < 2) {
// We know that candidate stores are in order and of correct
// shape. While there is no mergeable sequence from the
// beginning one may start later in the sequence. The only
// reason a merge of size N could have failed where another of
// the same size would not have, is if the alignment has
// improved or we've dropped a non-zero value. Drop as many
// candidates as we can here.
unsigned NumSkip = 1;
while ((NumSkip < NumConsecutiveStores) &&
(NumSkip < FirstZeroAfterNonZero) &&
(StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
NumConsecutiveStores -= NumSkip;
// Check that we can merge these candidates without causing a cycle.
if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
RootNode)) {
// The group cannot be merged safely; drop it and move on.
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
NumConsecutiveStores -= NumElem;
MadeChange |= mergeStoresOfConstantsOrVecElts(
StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc);
// Remove merged stores for next iteration.
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
NumConsecutiveStores -= NumElem;
return MadeChange;
/// Try to merge runs of consecutive stores at the front of StoreNodes into
/// wider vector stores. mergeConsecutiveStores in this file calls this for
/// the StoreSource::Extract case.
///
/// \param StoreNodes candidate stores; processed entries are erased from the
///        front.
/// \param NumConsecutiveStores number of consecutive candidates at the front
///        of StoreNodes; decremented as entries are consumed.
/// \param MemVT memory type of each individual store.
/// \param RootNode forwarded to checkMergeStoreCandidatesForDependencies.
/// \returns true if mergeStoresOfConstantsOrVecElts reported a change for any
///          group.
///
/// NOTE(review): this listing looks truncated in places (conditions with no
/// visible body, no closing braces) -- confirm the control flow against the
/// upstream source.
bool DAGCombiner::tryStoreMergeOfExtracts(
SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
EVT MemVT, SDNode *RootNode) {
LLVMContext &Context = *DAG.getContext();
const DataLayout &DL = DAG.getDataLayout();
unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
bool MadeChange = false;
// Loop on Consecutive Stores on success.
while (NumConsecutiveStores >= 2) {
LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
unsigned FirstStoreAS = FirstInChain->getAddressSpace();
unsigned FirstStoreAlign = FirstInChain->getAlignment();
// Largest prefix length (in stores) with a legal vector type; 1 means
// "nothing mergeable".
unsigned NumStoresToMerge = 1;
for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
// Find a legal type for the vector store.
unsigned Elts = (i + 1) * NumMemElts;
EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
bool IsFast = false;
// Break early when size is too large to be legal.
if (Ty.getSizeInBits() > MaximumLegalStoreInBits)
if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
TLI.allowsMemoryAccess(Context, DL, Ty,
*FirstInChain->getMemOperand(), &IsFast) &&
NumStoresToMerge = i + 1;
// Check if we found a legal integer type creating a meaningful
// merge.
if (NumStoresToMerge < 2) {
// We know that candidate stores are in order and of correct
// shape. While there is no mergeable sequence from the
// beginning one may start later in the sequence. The only
// reason a merge of size N could have failed where another of
// the same size would not have, is if the alignment has
// improved. Drop as many candidates as we can here.
unsigned NumSkip = 1;
while ((NumSkip < NumConsecutiveStores) &&
(StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
NumConsecutiveStores -= NumSkip;
// Check that we can merge these candidates without causing a cycle.
if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumStoresToMerge,
RootNode)) {
StoreNodes.begin() + NumStoresToMerge);
NumConsecutiveStores -= NumStoresToMerge;
MadeChange |= mergeStoresOfConstantsOrVecElts(
StoreNodes, MemVT, NumStoresToMerge, false, true, false);
// Remove the stores just handled before the next iteration.
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge);
NumConsecutiveStores -= NumStoresToMerge;
return MadeChange;
/// Try to merge runs of consecutive stores whose stored values are loads
/// into a single wide load feeding a single wide store.
///
/// \param StoreNodes candidate stores; processed entries are erased from the
///        front.
/// \param NumConsecutiveStores number of consecutive candidates at the front
///        of StoreNodes; decremented as entries are consumed.
/// \param MemVT memory type of each individual store.
/// \param RootNode forwarded to checkMergeStoreCandidatesForDependencies.
/// \param AllowVectors when false, the vector form is never chosen.
/// \param IsNonTemporalStore adds MONonTemporal to the merged store's flags.
/// \param IsNonTemporalLoad adds MONonTemporal to the merged load's flags.
/// \returns true if at least one group of stores was replaced.
///
/// NOTE(review): this listing looks truncated in places (conditions with no
/// visible body, no closing braces) -- confirm the control flow against the
/// upstream source.
bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumConsecutiveStores, EVT MemVT,
SDNode *RootNode, bool AllowVectors,
bool IsNonTemporalStore,
bool IsNonTemporalLoad) {
LLVMContext &Context = *DAG.getContext();
const DataLayout &DL = DAG.getDataLayout();
int64_t ElementSizeBytes = MemVT.getStoreSize();
unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
bool MadeChange = false;
int64_t StartAddress = StoreNodes[0].OffsetFromBase;
// Look for load nodes which are used by the stored values.
SmallVector<MemOpLink, 8> LoadNodes;
// Find acceptable loads. Loads need to have the same chain (token factor),
// must not be zext, volatile, indexed, and they must be consecutive.
BaseIndexOffset LdBasePtr;
for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
SDValue Val = peekThroughBitcasts(St->getValue());
// cast<> asserts: the stored value is expected to be a load here.
LoadSDNode *Ld = cast<LoadSDNode>(Val);
BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
// If this is not the first ptr that we check.
int64_t LdOffset = 0;
if (LdBasePtr.getBase().getNode()) {
// The base ptr must be the same.
if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset))
} else {
// Check that all other base pointers are the same as this one.
LdBasePtr = LdPtr;
// We found a potential memory operand to merge.
LoadNodes.push_back(MemOpLink(Ld, LdOffset));
while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) {
Align RequiredAlignment;
bool NeedRotate = false;
if (LoadNodes.size() == 2) {
// If we have load/store pair instructions and we only have two values,
// don't bother merging.
if (TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
StoreNodes[0].MemNode->getAlign() >= RequiredAlignment) {
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2);
// If the loads are reversed, see if we can rotate the halves into place.
int64_t Offset0 = LoadNodes[0].OffsetFromBase;
int64_t Offset1 = LoadNodes[1].OffsetFromBase;
EVT PairVT = EVT::getIntegerVT(Context, ElementSizeBytes * 8 * 2);
if (Offset0 - Offset1 == ElementSizeBytes &&
(hasOperation(ISD::ROTL, PairVT) ||
hasOperation(ISD::ROTR, PairVT))) {
std::swap(LoadNodes[0], LoadNodes[1]);
NeedRotate = true;
LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
unsigned FirstStoreAS = FirstInChain->getAddressSpace();
unsigned FirstStoreAlign = FirstInChain->getAlignment();
LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
// Scan the memory operations on the chain and find the first
// non-consecutive load memory address. These variables hold the index in
// the store node array.
unsigned LastConsecutiveLoad = 1;
// This variable refers to the size and not index in the array.
unsigned LastLegalVectorType = 1;
unsigned LastLegalIntegerType = 1;
bool isDereferenceable = true;
bool DoIntegerTruncate = false;
StartAddress = LoadNodes[0].OffsetFromBase;
SDValue LoadChain = FirstLoad->getChain();
for (unsigned i = 1; i < LoadNodes.size(); ++i) {
// All loads must share the same chain.
if (LoadNodes[i].MemNode->getChain() != LoadChain)
int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
if (CurrAddress - StartAddress != (ElementSizeBytes * i))
LastConsecutiveLoad = i;
// The merged load is dereferenceable only if every constituent load is.
if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
isDereferenceable = false;
// Find a legal type for the vector store.
unsigned Elts = (i + 1) * NumMemElts;
EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
// Break early when size is too large to be legal.
if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
bool IsFastSt = false;
bool IsFastLd = false;
// The candidate type must be legal and fast for both the store side and
// the load side.
if (TLI.isTypeLegal(StoreTy) &&
TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFastSt) &&
IsFastSt &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstLoad->getMemOperand(), &IsFastLd) &&
IsFastLd) {
LastLegalVectorType = i + 1;
// Find a legal type for the integer store.
unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
StoreTy = EVT::getIntegerVT(Context, SizeInBits);
if (TLI.isTypeLegal(StoreTy) &&
TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFastSt) &&
IsFastSt &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstLoad->getMemOperand(), &IsFastLd) &&
IsFastLd) {
LastLegalIntegerType = i + 1;
DoIntegerTruncate = false;
// Or check whether a truncstore and extload is legal.
} else if (TLI.getTypeAction(Context, StoreTy) ==
TargetLowering::TypePromoteInteger) {
EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy);
if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, StoreTy) &&
TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy) &&
TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFastSt) &&
IsFastSt &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstLoad->getMemOperand(), &IsFastLd) &&
IsFastLd) {
LastLegalIntegerType = i + 1;
DoIntegerTruncate = true;
// Only use vector types if the vector type is larger than the integer
// type. If they are the same, use integers.
bool UseVectorTy =
LastLegalVectorType > LastLegalIntegerType && AllowVectors;
unsigned LastLegalType =
std::max(LastLegalVectorType, LastLegalIntegerType);
// We add +1 here because the LastXXX variables refer to location while
// the NumElem refers to array/index size.
unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
NumElem = std::min(LastLegalType, NumElem);
unsigned FirstLoadAlign = FirstLoad->getAlignment();
if (NumElem < 2) {
// We know that candidate stores are in order and of correct
// shape. While there is no mergeable sequence from the
// beginning one may start later in the sequence. The only
// reason a merge of size N could have failed where another of
// the same size would not have is if the alignment or either
// the load or store has improved. Drop as many candidates as we
// can here.
unsigned NumSkip = 1;
while ((NumSkip < LoadNodes.size()) &&
(LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) &&
(StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
// Keep StoreNodes and LoadNodes in step: drop the same prefix from both.
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip);
NumConsecutiveStores -= NumSkip;
// Check that we can merge these candidates without causing a cycle.
if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
RootNode)) {
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
NumConsecutiveStores -= NumElem;
// Find if it is better to use vectors or integers to load and store
// to memory.
EVT JointMemOpVT;
if (UseVectorTy) {
// Find a legal type for the vector store.
unsigned Elts = NumElem * NumMemElts;
JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
} else {
unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
SDLoc LoadDL(LoadNodes[0].MemNode);
SDLoc StoreDL(StoreNodes[0].MemNode);
// The merged loads are required to have the same incoming chain, so
// using the first's chain is acceptable.
SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
MachineMemOperand::Flags LdMMOFlags =
isDereferenceable ? MachineMemOperand::MODereferenceable
: MachineMemOperand::MONone;
if (IsNonTemporalLoad)
LdMMOFlags |= MachineMemOperand::MONonTemporal;
MachineMemOperand::Flags StMMOFlags = IsNonTemporalStore
? MachineMemOperand::MONonTemporal
: MachineMemOperand::MONone;
SDValue NewLoad, NewStore;
if (UseVectorTy || !DoIntegerTruncate) {
// Plain wide load + wide store of JointMemOpVT.
NewLoad = DAG.getLoad(
JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(),
FirstLoad->getPointerInfo(), FirstLoadAlign, LdMMOFlags);
SDValue StoreOp = NewLoad;
if (NeedRotate) {
// The two loads were in reversed order; rotating the merged value by half
// its width swaps the halves into place.
unsigned LoadWidth = ElementSizeBytes * 8 * 2;
assert(JointMemOpVT == EVT::getIntegerVT(Context, LoadWidth) &&
"Unexpected type for rotate-able load pair");
SDValue RotAmt =
DAG.getShiftAmountConstant(LoadWidth / 2, JointMemOpVT, LoadDL);
// Target can convert to the identical ROTR if it does not have ROTL.
StoreOp = DAG.getNode(ISD::ROTL, LoadDL, JointMemOpVT, NewLoad, RotAmt);
NewStore = DAG.getStore(
NewStoreChain, StoreDL, StoreOp, FirstInChain->getBasePtr(),
FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags);
} else { // This must be the truncstore/extload case
EVT ExtendedTy =
TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy,
FirstLoad->getChain(), FirstLoad->getBasePtr(),
FirstLoad->getPointerInfo(), JointMemOpVT,
FirstLoadAlign, LdMMOFlags);
NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad,
FirstInChain->getPointerInfo(), JointMemOpVT,
// Transfer chain users from old loads to the new load.
for (unsigned i = 0; i < NumElem; ++i) {
LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
// Result 1 of a load is used here as its chain output.
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
SDValue(NewLoad.getNode(), 1));
// Replace all stores with the new store. Recursively remove corresponding
// values if they are no longer used.
for (unsigned i = 0; i < NumElem; ++i) {
SDValue Val = StoreNodes[i].MemNode->getOperand(1);
CombineTo(StoreNodes[i].MemNode, NewStore);
if (Val.getNode()->use_empty())
MadeChange = true;
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
NumConsecutiveStores -= NumElem;
return MadeChange;
/// Entry point for store merging: collect stores that could be merged with
/// \p St, sort them by offset, and hand each run of consecutive stores to the
/// helper that matches the kind of value being stored (constant, extracted
/// vector element, or load).
///
/// \param St the store that seeds the candidate search.
/// \returns true if any helper reported a change; false when merging is
///          disabled, the memory type is unsupported, the stored value's
///          source is unknown, or fewer than two candidates were found.
///
/// NOTE(review): this listing looks truncated in places (the sort lambda is
/// not closed, the attribute argument and the switch `break`s are not
/// visible) -- confirm against the upstream source.
bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) {
if (OptLevel == CodeGenOpt::None || !EnableStoreMerging)
return false;
// TODO: Extend this function to merge stores of scalable vectors.
// (i.e. two <vscale x 8 x i8> stores can be merged to one <vscale x 16 x i8>
// store since we know <vscale x 16 x i8> is exactly twice as large as
// <vscale x 8 x i8>). Until then, bail out for scalable vectors.
EVT MemVT = St->getMemoryVT();
if (MemVT.isScalableVector())
return false;
// Two such stores must together fit within the maximum legal store width.
if (!MemVT.isSimple() || MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
return false;
// This function cannot currently deal with non-byte-sized memory sizes.
int64_t ElementSizeBytes = MemVT.getStoreSize();
if (ElementSizeBytes * 8 != (int64_t)MemVT.getSizeInBits())
return false;
// Do not bother looking at stored values that are not constants, loads, or
// extracted vector elements.
SDValue StoredVal = peekThroughBitcasts(St->getValue());
const StoreSource StoreSrc = getStoreSource(StoredVal);
if (StoreSrc == StoreSource::Unknown)
return false;
SmallVector<MemOpLink, 8> StoreNodes;
SDNode *RootNode;
// Find potential store merge candidates by searching through chain sub-DAG
getStoreMergeCandidates(St, StoreNodes, RootNode);
// Check if there is anything to merge.
if (StoreNodes.size() < 2)
return false;
// Sort the memory operands according to their distance from the
// base pointer.
llvm::sort(StoreNodes, [](MemOpLink LHS, MemOpLink RHS) {
return LHS.OffsetFromBase < RHS.OffsetFromBase;
bool AllowVectors = !DAG.getMachineFunction().getFunction().hasFnAttribute(
bool IsNonTemporalStore = St->isNonTemporal();
bool IsNonTemporalLoad = StoreSrc == StoreSource::Load &&
// Store Merge attempts to merge the lowest stores. This generally
// works out as if successful, as the remaining stores are checked
// after the first collection of stores is merged. However, in the
// case that a non-mergeable store is found first, e.g., {p[-2],
// p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent
// mergeable cases. To prevent this, we prune such stores from the
// front of StoreNodes here.
bool MadeChange = false;
while (StoreNodes.size() > 1) {
unsigned NumConsecutiveStores =
getConsecutiveStores(StoreNodes, ElementSizeBytes);
// There are no more stores in the list to examine.
if (NumConsecutiveStores == 0)
return MadeChange;
// We have at least 2 consecutive stores. Try to merge them.
assert(NumConsecutiveStores >= 2 && "Expected at least 2 stores");
// Dispatch on the kind of value being stored.
switch (StoreSrc) {
case StoreSource::Constant:
MadeChange |= tryStoreMergeOfConstants(StoreNodes, NumConsecutiveStores,
MemVT, RootNode, AllowVectors);
case StoreSource::Extract:
MadeChange |= tryStoreMergeOfExtracts(StoreNodes, NumConsecutiveStores,
MemVT, RootNode);
case StoreSource::Load:
MadeChange |= tryStoreMergeOfLoads(StoreNodes, NumConsecutiveStores,
MemVT, RootNode, AllowVectors,
IsNonTemporalStore, IsNonTemporalLoad);
llvm_unreachable("Unhandled store source type");
return MadeChange;
/// Re-create store \p ST on top of \p BetterChain and replace \p ST with a
/// TokenFactor joining the old chain and the new store.
///
/// \param ST the store whose chain operand is being replaced.
/// \param BetterChain the chain the replacement store is built on.
/// \returns the result of CombineTo(ST, Token, false).
///
/// NOTE(review): the declaration of `SL` and the tails of the getTruncStore /
/// getStore calls are not visible in this listing -- confirm against the
/// upstream source.
SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
SDValue ReplStore;
// Replace the chain to avoid dependency.
if (ST->isTruncatingStore()) {
ReplStore = DAG.getTruncStore(BetterChain, SL, ST->getValue(),
ST->getBasePtr(), ST->getMemoryVT(),
} else {
ReplStore = DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(),
// Create token to keep both nodes around.
SDValue Token = DAG.getNode(ISD::TokenFactor, SL,
MVT::Other, ST->getChain(), ReplStore);
// Make sure the new and old chains are cleaned up.
// Don't add users to work list.
return CombineTo(ST, Token, false);
/// Replace a store of a floating-point constant with a store of the integer
/// constant that has the same bit pattern (f32 -> i32, f64 -> i64, or f64 ->
/// two i32 stores joined by a TokenFactor).
///
/// \param ST the store to rewrite; its value is cast<> to ConstantFPSDNode,
///        so it is expected to be an FP constant (the caller in this file
///        checks isa<ConstantFPSDNode> first).
/// \returns the replacement node, or an empty SDValue when the value is a
///          TargetConstantFP, the store is not a normal store, the FP type
///          is one of f16/f80/f128/ppcf128, or no suitable integer store is
///          available.
///
/// NOTE(review): the declaration of `DL`, the `default:` label and the tails
/// of some calls are not visible in this listing -- confirm against the
/// upstream source.
SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
SDValue Value = ST->getValue();
if (Value.getOpcode() == ISD::TargetConstantFP)
return SDValue();
if (!ISD::isNormalStore(ST))
return SDValue();
SDValue Chain = ST->getChain();
SDValue Ptr = ST->getBasePtr();
const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Value);
// NOTE: If the original store is volatile, this transform must not increase
// the number of stores. For example, on x86-32 an f64 can be stored in one
// processor operation but an i64 (which is not legal) requires two. So the
// transform should not be done in this case.
SDValue Tmp;
switch (CFP->getSimpleValueType(0).SimpleTy) {
llvm_unreachable("Unknown FP type");
case MVT::f16: // We don't do this for these yet.
case MVT::f80:
case MVT::f128:
case MVT::ppcf128:
return SDValue();
case MVT::f32:
if ((isTypeLegal(MVT::i32) && !LegalOperations && ST->isSimple()) ||
TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
// Store the 32-bit pattern of the float as an integer constant.
Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
bitcastToAPInt().getZExtValue(), SDLoc(CFP),
return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand());
return SDValue();
case MVT::f64:
if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
ST->isSimple()) ||
TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) {
Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
getZExtValue(), SDLoc(CFP), MVT::i64);
return DAG.getStore(Chain, DL, Tmp,
Ptr, ST->getMemOperand());
if (ST->isSimple() &&
TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
// Many FP stores are not made apparent until after legalize, e.g. for
// argument passing. Since this is so common, custom legalize the
// 64-bit integer store into two 32-bit stores.
uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32);
SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32);
// On big-endian targets the high half goes to the lower address.
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
unsigned Alignment = ST->getAlignment();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
ST->getAlignment(), MMOFlags, AAInfo);
// The second half lives 4 bytes further on; its alignment is limited
// accordingly.
Ptr = DAG.getMemBasePlusOffset(Ptr, 4, DL);
Alignment = MinAlign(Alignment, 4U);
SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr,
Alignment, MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
St0, St1);
return SDValue();
/// DAG combine for a store node. The combines below are tried in order:
/// storing through a bitcast, dropping stores of undef, refining alignment,
/// FP load/store pair conversion, store-of-bswap matching, chain improvement,
/// truncating-store simplification, dead store elimination, folding
/// FP_ROUND/TRUNCATE into a truncating store, consecutive store merging,
/// indexed store formation, FP-constant store replacement, merged-value store
/// splitting and load-op-store width reduction.
///
/// \param N the node being visited; it is cast<> to StoreSDNode.
/// \returns the replacement value, SDValue(N, 0) when N was updated in place
///          or handled through CombineTo, or whatever ReduceLoadOpStoreWidth
///          returns when nothing else applied.
///
/// NOTE(review): this listing looks truncated in places (several call tails,
/// conditions and closing braces are not visible) -- confirm the control flow
/// against the upstream source.
SDValue DAGCombiner::visitSTORE(SDNode *N) {
StoreSDNode *ST = cast<StoreSDNode>(N);
SDValue Chain = ST->getChain();
SDValue Value = ST->getValue();
SDValue Ptr = ST->getBasePtr();
// If this is a store of a bit convert, store the input value if the
// resultant store does not need a higher alignment than the original.
if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
ST->isUnindexed()) {
EVT SVT = Value.getOperand(0).getValueType();
// If the store is volatile, we only want to change the store type if the
// resulting store is legal. Otherwise we might increase the number of
// memory accesses. We don't care if the original type was legal or not
// as we assume software couldn't rely on the number of accesses of an
// illegal type.
// TODO: May be able to relax for unordered atomics (see D66309)
if (((!LegalOperations && ST->isSimple()) ||
TLI.isOperationLegal(ISD::STORE, SVT)) &&
TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
DAG, *ST->getMemOperand())) {
return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
// Turn 'store undef, Ptr' -> nothing.
if (Value.isUndef() && ST->isUnindexed())
return Chain;
// Try to infer better alignment information than the store already has.
if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && !ST->isAtomic()) {
if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
if (*Alignment > ST->getAlign() &&
isAligned(*Alignment, ST->getSrcValueOffset())) {
SDValue NewStore =
DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
ST->getMemoryVT(), *Alignment,
ST->getMemOperand()->getFlags(), ST->getAAInfo());
// NewStore will always be N as we are only refining the alignment
assert(NewStore.getNode() == N);
// Try transforming a pair floating point load / store ops to integer
// load / store ops.
if (SDValue NewST = TransformFPLoadStorePair(N))
return NewST;
// Try transforming several stores into STORE (BSWAP).
if (SDValue Store = MatchStoreCombine(ST))
return Store;
if (ST->isUnindexed()) {
// Walk up chain skipping non-aliasing memory nodes, on this store and any
// adjacent stores.
if (findBetterNeighborChains(ST)) {
// replaceStoreChain uses CombineTo, which handled all of the worklist
// manipulation. Return the original node to not do anything else.
return SDValue(ST, 0);
// Re-read the chain operand from the store.
Chain = ST->getChain();
// FIXME: is there such a thing as a truncating indexed store?
if (ST->isTruncatingStore() && ST->isUnindexed() &&
Value.getValueType().isInteger() &&
(!isa<ConstantSDNode>(Value) ||
!cast<ConstantSDNode>(Value)->isOpaque())) {
APInt TruncDemandedBits =
// See if we can simplify the input to this truncstore with knowledge that
// only the low bits are being used. For example:
// "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
if (SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits))
return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(),
// Otherwise, see if we can simplify the operation with
// SimplifyDemandedBits, which only works if the value has a single use.
if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
// Re-visit the store if anything changed and the store hasn't been merged
// with another node (N is deleted) SimplifyDemandedBits will add Value's
// node back to the worklist if necessary, but we also need to re-visit
// the Store node itself.
if (N->getOpcode() != ISD::DELETED_NODE)
return SDValue(N, 0);
// If this is a load followed by a store to the same location, then the store
// is dead/noop.
// TODO: Can relax for unordered atomics (see D66309)
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
ST->isUnindexed() && ST->isSimple() &&
// There can't be any side effects between the load and store, such as
// a call or store.
Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
// The store is dead, remove it.
return Chain;
// TODO: Can relax for unordered atomics (see D66309)
if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
if (ST->isUnindexed() && ST->isSimple() &&
ST1->isUnindexed() && ST1->isSimple()) {
if (ST1->getBasePtr() == Ptr && ST1->getValue() == Value &&
ST->getMemoryVT() == ST1->getMemoryVT()) {
// If this is a store followed by a store with the same value to the
// same location, then the store is dead/noop.
return Chain;
if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
!ST1->getBasePtr().isUndef() &&
// BaseIndexOffset and the code below requires knowing the size
// of a vector, so bail out if MemoryVT is scalable.
!ST1->getMemoryVT().isScalableVector()) {
const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
unsigned STBitSize = ST->getMemoryVT().getSizeInBits();
unsigned ChainBitSize = ST1->getMemoryVT().getSizeInBits();
// If the preceding store writes to a subset of the current store's
// location and no other node is chained to that store, we can
// effectively drop the preceding store. Do not remove stores to undef as
// they may be used as data sinks.
if (STBase.contains(DAG, STBitSize, ChainBase, ChainBitSize)) {
CombineTo(ST1, ST1->getChain());
return SDValue();
// If this is an FP_ROUND or TRUNC followed by a store, fold this into a
// truncating store. We can do this even if this is already a truncstore.
if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
&& Value.getNode()->hasOneUse() && ST->isUnindexed() &&
ST->getMemoryVT())) {
return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
Ptr, ST->getMemoryVT(), ST->getMemOperand());
// Always perform this optimization before types are legal. If the target
// prefers, also try this after legalization to catch stores that were created
// by intrinsics or other nodes.
if (!LegalTypes || (TLI.mergeStoresAfterLegalization(ST->getMemoryVT()))) {
while (true) {
// There can be multiple store sequences on the same chain.
// Keep trying to merge store sequences until we are unable to do so
// or until we merge the last store on the chain.
bool Changed = mergeConsecutiveStores(ST);
if (!Changed) break;
// Return N as merge only uses CombineTo and no worklist clean
// up is necessary.
if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
return SDValue(N, 0);
// Try transforming N to an indexed store.
if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
return SDValue(N, 0);
// Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
// Make sure to do this only after attempting to merge stores in order to
// avoid changing the types of some subset of stores due to visit order,
// preventing their merging.
if (isa<ConstantFPSDNode>(ST->getValue())) {
if (SDValue NewSt = replaceStoreOfFPConstant(ST))
return NewSt;
if (SDValue NewSt = splitMergedValStore(ST))
return NewSt;
// Last resort: try to narrow a load-op-store sequence.
return ReduceLoadOpStoreWidth(N);
/// DAG combine for a lifetime-end node: walk up the chain from operand 0 and
/// remove a simple, unindexed store that writes entirely within the object
/// whose lifetime ends here.
///
/// \param N the node being visited; it is cast<> to LifetimeSDNode.
/// \returns SDValue(N, 0) after removing a store, otherwise an empty SDValue
///          (including when the node carries no offset).
///
/// NOTE(review): this listing looks truncated in places (several case labels
/// and the `continue`/`break` statements are not visible) -- confirm the
/// control flow against the upstream source.
SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
const auto *LifetimeEnd = cast<LifetimeSDNode>(N);
if (!LifetimeEnd->hasOffset())
return SDValue();
// Describe the object's location as base (operand 1) plus the recorded
// offset.
const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(),
LifetimeEnd->getOffset(), false);
// We walk up the chains to find stores.
SmallVector<SDValue, 8> Chains = {N->getOperand(0)};
while (!Chains.empty()) {
SDValue Chain = Chains.back();
if (!Chain.hasOneUse())
switch (Chain.getOpcode()) {
case ISD::TokenFactor:
for (unsigned Nops = Chain.getNumOperands(); Nops;)
// We can forward past any lifetime start/end that can be proven not to
// alias the node.
if (!isAlias(Chain.getNode(), N))
case ISD::STORE: {
StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain);
// TODO: Can relax for unordered atomics (see D66309)
if (!ST->isSimple() || ST->isIndexed())
const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
// If we store purely within object bounds just before its lifetime ends,
// we can remove the store.
if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase,
ST->getMemoryVT().getStoreSizeInBits())) {
LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump();
dbgs() << "\nwithin LIFETIME_END of : ";
LifetimeEndBase.dump(); dbgs() << "\n");
// Replace the store with its own chain operand.
CombineTo(ST, ST->getChain());
return SDValue(N, 0);
return SDValue();
/// For the instruction sequence of store below, F and I values
/// are bundled together as an i64 value before being stored into memory.
/// Sometimes it is more efficient to generate separate stores for F and I,
/// which can remove the bitwise instructions or sink them to colder places.
/// (store (or (zext (bitcast F to i32) to i64),
/// (shl (zext I to i64), 32)), addr) -->
/// (store F, addr) and (store I, addr+4)
/// Similarly, splitting for other merged store can also be beneficial, like:
/// For pair of {i32, i32}, i64 store --> two i32 stores.
/// For pair of {i32, i16}, i64 store --> two i32 stores.
/// For pair of {i16, i16}, i32 store --> two i16 stores.
/// For pair of {i16, i8}, i32 store --> two i16 stores.
/// For pair of {i8, i8}, i16 store --> two i8 stores.
/// We allow each target to determine specifically which kind of splitting is
/// supported.
/// The store patterns are commonly seen from the simple code snippet below
/// if only std::make_pair(...) is sroa transformed before inlined into hoo.
/// void goo(const std::pair<int, float> &);
/// hoo() {
/// ...
/// goo(std::make_pair(tmp, ftmp));
/// ...
/// }
SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
if (OptLevel == CodeGenOpt::None)
return SDValue();
// Can't change the number of memory accesses for a volatile store or break
// atomicity for an atomic one.
if (!ST->isSimple())
return SDValue();
SDValue Val = ST->getValue();
// Match OR operand.
if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
return SDValue();
// Match SHL operand and get Lower and Higher parts of Val.
SDValue Op1 = Val.getOperand(0);
SDValue Op2 = Val.getOperand(1);
SDValue Lo, Hi;
if (Op1.getOpcode() != ISD::SHL) {
std::swap(Op1, Op2);
if (Op1.getOpcode() != ISD::SHL)
return SDValue();
Lo = Op2;
Hi = Op1.getOperand(0);
if (!Op1.hasOneUse())
return SDValue();
// Match shift amount to HalfValBitSize.
unsigned HalfValBitSize = Val.getValueSizeInBits() / 2;
ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
return SDValue();
// Lo and Hi are zero-extended from int with size less equal than 32
// to i64.
if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
!Lo.getOperand(0).getValueType().isScalarInteger() ||
Lo.getOperand(0).getValueSizeInBits() > HalfValBitSize ||
Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
!Hi.getOperand(0).getValueType().isScalarInteger() ||
Hi.getOperand(0).getValueSizeInBits() > HalfValBitSize)
return SDValue();
// Use the EVT of low and high parts before bitcast as the input
// of target query.
EVT LowTy = (Lo.getOperand(0).getOpcode() == ISD::BITCAST)
? Lo.getOperand(0).getValueType()
: Lo.getValueType();
EVT HighTy = (Hi.getOperand(0).getOpcode() == ISD::BITCAST)
? Hi.getOperand(0).getValueType()
: Hi.getValueType();
if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
return SDValue();
// Start to split store.
unsigned Alignment = ST->getAlignment();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
// Change the sizes of Lo and Hi's value types to HalfValBitSize.
EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));
SDValue Chain = ST->getChain();
SDValue Ptr = ST->getBasePtr();
// Lower value store.
SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
ST->getAlignment(), MMOFlags, AAInfo);
Ptr = DAG.getMemBasePlusOffset(Ptr, HalfValBitSize / 8, DL);
// Higher value store.
SDValue St1 =
DAG.getStore(St0, DL, Hi, Ptr,
ST->getPointerInfo().getWithOffset(HalfValBitSize / 8),
Alignment / 2, MMOFlags, AAInfo);
return St1;
/// Convert a disguised subvector insertion into a shuffle:
///
/// Two patterns are handled. First, an insert of an element that was extracted
/// (with a constant index) from one of the inputs of a single-use shuffle is
/// folded into that shuffle's mask. Second, an insert of a bitcast-from-vector
/// value is rewritten as a shuffle of the bitcast destination with the source
/// vector padded by undef.
///
/// \param N the INSERT_VECTOR_ELT node.
/// \param InsIndex the constant element index being inserted at.
/// \returns the replacement value, or an empty SDValue if neither pattern
/// applies or the target cannot handle the resulting shuffle.
///
/// NOTE(review): this copy of the function appears to be missing lines (for
/// example the end of the condition before the second pattern, and closing
/// braces); confirm against the complete file.
SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
// NOTE(review): the assert message says "extract_vector_elt" although the
// opcode being checked is INSERT_VECTOR_ELT.
assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT &&
"Expected extract_vector_elt");
SDValue InsertVal = N->getOperand(1);
SDValue Vec = N->getOperand(0);
// (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N),
// InsIndex)
// --> (vector_shuffle X, Y) and variations where shuffle operands may be
// CONCAT_VECTORS (which are looked through below).
if (Vec.getOpcode() == ISD::VECTOR_SHUFFLE && Vec.hasOneUse() &&
InsertVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(InsertVal.getOperand(1))) {
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Vec.getNode());
ArrayRef<int> Mask = SVN->getMask();
SDValue X = Vec.getOperand(0);
SDValue Y = Vec.getOperand(1);
// Vec's operand 0 is using indices from 0 to N-1 and
// operand 1 from N to 2N - 1, where N is the number of
// elements in the vectors.
SDValue InsertVal0 = InsertVal.getOperand(0);
// Mask-index offset of the vector the element was extracted from; -1 while
// no matching shuffle input has been found.
int ElementOffset = -1;
// We explore the inputs of the shuffle in order to see if we find the
// source of the extract_vector_elt. If so, we can use it to modify the
// shuffle rather than perform an insert_vector_elt.
// Each worklist entry pairs a value with the mask index of its element 0.
SmallVector<std::pair<int, SDValue>, 8> ArgWorkList;
ArgWorkList.emplace_back(Mask.size(), Y);
ArgWorkList.emplace_back(0, X);
while (!ArgWorkList.empty()) {
int ArgOffset;
SDValue ArgVal;
std::tie(ArgOffset, ArgVal) = ArgWorkList.pop_back_val();
if (ArgVal == InsertVal0) {
ElementOffset = ArgOffset;
// Peek through concat_vector.
if (ArgVal.getOpcode() == ISD::CONCAT_VECTORS) {
// Start one past the last element and step backwards, because the operands
// are visited in reverse.
int CurrentArgOffset =
ArgOffset + ArgVal.getValueType().getVectorNumElements();
int Step = ArgVal.getOperand(0).getValueType().getVectorNumElements();
for (SDValue Op : reverse(ArgVal->ops())) {
CurrentArgOffset -= Step;
ArgWorkList.emplace_back(CurrentArgOffset, Op);
// Make sure we went through all the elements and did not screw up index
// computation.
assert(CurrentArgOffset == ArgOffset);
if (ElementOffset != -1) {
// Copy the mask and redirect lane InsIndex to the extracted element.
SmallVector<int, 16> NewMask(Mask.begin(), Mask.end());
auto *ExtrIndex = cast<ConstantSDNode>(InsertVal.getOperand(1));
NewMask[InsIndex] = ElementOffset + ExtrIndex->getZExtValue();
assert(NewMask[InsIndex] <
(int)(2 * Vec.getValueType().getVectorNumElements()) &&
NewMask[InsIndex] >= 0 && "NewMask[InsIndex] is out of bound");
SDValue LegalShuffle =
TLI.buildLegalVectorShuffle(Vec.getValueType(), SDLoc(N), X,
Y, NewMask, DAG);
if (LegalShuffle)
return LegalShuffle;
// insert_vector_elt V, (bitcast X from vector type), IdxC -->
// bitcast(shuffle (bitcast V), (extended X), Mask)
// Note: We do not use an insert_subvector node because that requires a
// legal subvector type.
if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
return SDValue();
SDValue SubVec = InsertVal.getOperand(0);
SDValue DestVec = N->getOperand(0);
EVT SubVecVT = SubVec.getValueType();
EVT VT = DestVec.getValueType();
unsigned NumSrcElts = SubVecVT.getVectorNumElements();
// If the source only has a single vector element, the cost of adding
// it to a vector is likely to exceed the cost of an insert_vector_elt.
if (NumSrcElts == 1)
return SDValue();
// Number of SubVecVT-sized pieces in the destination, and the element count
// of the shuffle type built from them.
unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
unsigned NumMaskVals = ExtendRatio * NumSrcElts;
// Step 1: Create a shuffle mask that implements this insert operation. The
// vector that we are inserting into will be operand 0 of the shuffle, so
// those elements are just 'i'. The inserted subvector is in the first
// positions of operand 1 of the shuffle. Example:
// insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
SmallVector<int, 16> Mask(NumMaskVals);
for (unsigned i = 0; i != NumMaskVals; ++i) {
if (i / NumSrcElts == InsIndex)
Mask[i] = (i % NumSrcElts) + NumMaskVals;
Mask[i] = i;
// Bail out if the target can not handle the shuffle we want to create.
EVT SubVecEltVT = SubVecVT.getVectorElementType();
EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
return SDValue();
// Step 2: Create a wide vector from the inserted source vector by appending
// undefined elements. This is the same size as our destination vector.
SDLoc DL(N);
SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
ConcatOps[0] = SubVec;
SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);
// Step 3: Shuffle in the padded subvector.
SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
// Cast back to the type of the original destination vector.
return DAG.getBitcast(VT, Shuf);
/// Combine for INSERT_VECTOR_ELT nodes.
///
/// Folds visible here, in order: out-of-bounds constant index to undef;
/// re-inserting an element extracted from the same vector at the same index;
/// variable-index insert into undef to a splat; conversion to a shuffle via
/// combineInsertEltToShuffle; reordering of nested constant-index inserts;
/// and folding the insert into a BUILD_VECTOR (or undef) operand.
///
/// \param N the INSERT_VECTOR_ELT node (operands: vector, value, index).
/// \returns the replacement value, or an empty SDValue if no fold applies.
///
/// NOTE(review): this copy of the function appears to be missing lines (for
/// example the declaration of `NewOp`, the body of the BUILD_VECTOR branch
/// and closing braces); confirm against the complete file.
SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
SDValue InVec = N->getOperand(0);
SDValue InVal = N->getOperand(1);
SDValue EltNo = N->getOperand(2);
SDLoc DL(N);
EVT VT = InVec.getValueType();
// Null when the insertion index is not a constant.
auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
// Insert into out-of-bounds element is undefined.
if (IndexC && VT.isFixedLengthVector() &&
IndexC->getZExtValue() >= VT.getVectorNumElements())
return DAG.getUNDEF(VT);
// Remove redundant insertions:
// (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
return InVec;
if (!IndexC) {
// If this is variable insert to undef vector, it might be better to splat:
// inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) {
// Scalable vectors get a splat node; fixed-length ones a BUILD_VECTOR with
// InVal in every lane.
if (VT.isScalableVector())
return DAG.getSplatVector(VT, DL, InVal);
else {
SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal);
return DAG.getBuildVector(VT, DL, Ops);
return SDValue();
// The remaining folds need a fixed element count.
if (VT.isScalableVector())
return SDValue();
unsigned NumElts = VT.getVectorNumElements();
// We must know which element is being inserted for folds below here.
unsigned Elt = IndexC->getZExtValue();
if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
return Shuf;
// Canonicalize insert_vector_elt dag nodes.
// Example:
// (insert_vector_elt (insert_vector_elt A, Idx0), Idx1)
// -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0)
// Do this only if the child insert_vector node has one use; also
// do this only if indices are both constants and Idx1 < Idx0.
if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse()
&& isa<ConstantSDNode>(InVec.getOperand(2))) {
unsigned OtherElt = InVec.getConstantOperandVal(2);
if (Elt < OtherElt) {
// Swap nodes.
InVec.getOperand(0), InVal, EltNo);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()),
VT, NewOp, InVec.getOperand(1), InVec.getOperand(2));
// If we can't generate a legal BUILD_VECTOR, exit
if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
return SDValue();
// Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
// be converted to a BUILD_VECTOR). Fill in the Ops vector with the
// vector elements.
SmallVector<SDValue, 8> Ops;
// Do not combine these two vectors if the output vector will not replace
// the input vector.
if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) {
} else if (InVec.isUndef()) {
Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType()));
} else {
return SDValue();
assert(Ops.size() == NumElts && "Unexpected vector size");
// Insert the element
if (Elt < Ops.size()) {
// All the operands of BUILD_VECTOR must have the same type;
// we enforce that here.
EVT OpVT = Ops[0].getValueType();
Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal;
// Return the new vector
return DAG.getBuildVector(VT, DL, Ops);
/// Replace an extract_vector_elt of a vector load with a scalar load of just
/// the requested element.
///
/// \param EVE the extract-element node being replaced.
/// \param InVecVT the vector type the element is extracted from.
/// \param EltNo the element index (constant or variable).
/// \param OriginalLoad the vector load feeding the extract.
/// \returns SDValue(EVE, 0) after the uses of EVE and of the original load's
/// value 1 have been replaced, or an empty SDValue if the transform is
/// rejected.
///
/// NOTE(review): this copy of the function appears to be missing lines (for
/// example the declaration of `DL`, the tails of several expressions and
/// closing braces); confirm against the complete file.
SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
SDValue EltNo,
LoadSDNode *OriginalLoad) {
EVT ResultVT = EVE->getValueType(0);
EVT VecEltVT = InVecVT.getVectorElementType();
Align Alignment = OriginalLoad->getAlign();
Align NewAlign = DAG.getDataLayout().getABITypeAlign(
// Give up if the new alignment exceeds the original load's alignment or a
// load of the element type is neither legal nor custom.
if (NewAlign > Alignment ||
!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
return SDValue();
ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ?
// Let the target veto narrowing this load.
if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
return SDValue();
Alignment = NewAlign;
SDValue NewPtr = OriginalLoad->getBasePtr();
SDValue Offset;
EVT PtrType = NewPtr.getValueType();
MachinePointerInfo MPI;
if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
// Constant index: the byte offset is known, so the pointer info can be
// offset accordingly.
int Elt = ConstEltNo->getZExtValue();
unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
Offset = DAG.getConstant(PtrOff, DL, PtrType);
MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
} else {
// Variable index: compute index * element store size at run time.
Offset = DAG.getZExtOrTrunc(EltNo, DL, PtrType);
Offset = DAG.getNode(
ISD::MUL, DL, PtrType, Offset,
DAG.getConstant(VecEltVT.getStoreSize(), DL, PtrType));
// Discard the pointer info except the address space because the memory
// operand can't represent this new access since the offset is variable.
MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
NewPtr = DAG.getMemBasePlusOffset(NewPtr, Offset, DL);
// The replacement we need to do here is a little tricky: we need to
// replace an extractelement of a load with a load.
// Use ReplaceAllUsesOfValuesWith to do the replacement.
// Note that this replacement assumes that the extractvalue is the only
// use of the load; that's okay because we don't want to perform this
// transformation in other cases anyway.
SDValue Load;
SDValue Chain;
if (ResultVT.bitsGT(VecEltVT)) {
// If the result type of vextract is wider than the load, then issue an
// extending load instead.
ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT,
Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT,
OriginalLoad->getChain(), NewPtr, MPI, VecEltVT,
Alignment, OriginalLoad->getMemOperand()->getFlags(),
Chain = Load.getValue(1);
} else {
// Same-width or narrower result: load the element type, then truncate or
// bitcast to the result type.
Load = DAG.getLoad(
VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI, Alignment,
OriginalLoad->getMemOperand()->getFlags(), OriginalLoad->getAAInfo());
Chain = Load.getValue(1);
if (ResultVT.bitsLT(VecEltVT))
Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load);
Load = DAG.getBitcast(ResultVT, Load);
WorklistRemover DeadNodes(*this);
// Replace the extract's result with the new load and value 1 of the
// original load with the new load's value 1 (stored in Chain).
SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) };
SDValue To[] = { Load, Chain };
DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
// Make sure to revisit this node to clean it up; it will usually be dead.
// Since we're explicitly calling ReplaceAllUses, add the new node to the
// worklist explicitly as well.
return SDValue(EVE, 0);
/// Transform a vector binary operation into a scalar binary operation by moving
/// the math/logic after an extract element of a vector.
///
/// \param ExtElt the extract-element node (operands: vector, index).
/// \param DAG the selection DAG used to build the new nodes.
/// \param LegalOperations not referenced in the visible body.
/// \returns the scalar binop on the two extracted operands, or an empty
/// SDValue if the pattern does not match or the target declines.
///
/// NOTE(review): closing braces appear to be missing from this copy of the
/// function; confirm against the complete file.
static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
bool LegalOperations) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Vec = ExtElt->getOperand(0);
SDValue Index = ExtElt->getOperand(1);
auto *IndexC = dyn_cast<ConstantSDNode>(Index);
// Require a constant index and a single-use, single-result binary op.
if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
Vec.getNode()->getNumValues() != 1)
return SDValue();
// Targets may want to avoid this to prevent an expensive register transfer.
if (!TLI.shouldScalarizeBinop(Vec))
return SDValue();
// Extracting an element of a vector constant is constant-folded, so this
// transform is just replacing a vector op with a scalar op while moving the
// extract.
SDValue Op0 = Vec.getOperand(0);
SDValue Op1 = Vec.getOperand(1);
if (isAnyConstantBuildVector(Op0, true) ||
isAnyConstantBuildVector(Op1, true)) {
// extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
// extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
SDLoc DL(ExtElt);
EVT VT = ExtElt->getValueType(0);
// Extract the same lane from both operands, then redo the op on scalars.
SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
return SDValue();
/// Combine for EXTRACT_VECTOR_ELT nodes.
///
/// Folds visible here, roughly in order: extract from undef; extract of the
/// element just inserted at the same index; extract from SCALAR_TO_VECTOR;
/// out-of-bounds constant index; extract from BUILD_VECTOR / SPLAT_VECTOR;
/// extract through an integer BITCAST; scalarizing a binop; extract through a
/// VECTOR_SHUFFLE; demanded-elements / demanded-bits simplification of the
/// source; and finally turning an extract of a loaded vector into a scalar
/// load via scalarizeExtractedVectorLoad.
///
/// \param N the EXTRACT_VECTOR_ELT node (operands: vector, index).
/// \returns the replacement value, SDValue(N, 0) when the operand was
/// simplified in place, or an empty SDValue if no fold applies.
///
/// NOTE(review): this copy of the function appears to be missing lines (for
/// example the declaration of `LVT`, the tails of several conditions and all
/// closing braces); confirm the control flow against the complete file.
SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
SDValue VecOp = N->getOperand(0);
SDValue Index = N->getOperand(1);
EVT ScalarVT = N->getValueType(0);
EVT VecVT = VecOp.getValueType();
// Extracting from undef yields undef.
if (VecOp.isUndef())
return DAG.getUNDEF(ScalarVT);
// extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
// This only really matters if the index is non-constant since other combines
// on the constant elements already work.
SDLoc DL(N);
if (VecOp.getOpcode() == ISD::INSERT_VECTOR_ELT &&
Index == VecOp.getOperand(2)) {
SDValue Elt = VecOp.getOperand(1);
return VecVT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, DL, ScalarVT) : Elt;
// (vextract (scalar_to_vector val, 0) -> val
if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) {
// Only 0'th element of SCALAR_TO_VECTOR is defined.
if (DAG.isKnownNeverZero(Index))
return DAG.getUNDEF(ScalarVT);
// Check if the result type doesn't match the inserted element type. A
// SCALAR_TO_VECTOR may truncate the inserted element and the
// EXTRACT_VECTOR_ELT may widen the extracted vector.
SDValue InOp = VecOp.getOperand(0);
if (InOp.getValueType() != ScalarVT) {
assert(InOp.getValueType().isInteger() && ScalarVT.isInteger());
return DAG.getSExtOrTrunc(InOp, DL, ScalarVT);
return InOp;
// extract_vector_elt of out-of-bounds element -> UNDEF
// IndexC is null when the index is not a constant.
auto *IndexC = dyn_cast<ConstantSDNode>(Index);
if (IndexC && VecVT.isFixedLengthVector() &&
return DAG.getUNDEF(ScalarVT);
// extract_vector_elt (build_vector x, y), 1 -> y
if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&
TLI.isTypeLegal(VecVT) &&
(VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) {
assert((VecOp.getOpcode() != ISD::BUILD_VECTOR ||
VecVT.isFixedLengthVector()) &&
"BUILD_VECTOR used for scalable vectors");
// Operand 0 is used for SPLAT_VECTOR; BUILD_VECTOR uses the constant index.
unsigned IndexVal =
VecOp.getOpcode() == ISD::BUILD_VECTOR ? IndexC->getZExtValue() : 0;
SDValue Elt = VecOp.getOperand(IndexVal);
EVT InEltVT = Elt.getValueType();
// Sometimes build_vector's scalar input types do not match result type.
if (ScalarVT == InEltVT)
return Elt;
// TODO: It may be useful to truncate if free if the build_vector implicitly
// converts.
if (VecVT.isScalableVector())
return SDValue();
// All the code from this point onwards assumes fixed width vectors, but it's
// possible that some of the combinations could be made to work for scalable
// vectors too.
unsigned NumElts = VecVT.getVectorNumElements();
unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
// TODO: These transforms should not require the 'hasOneUse' restriction, but
// there are regressions on multiple targets without it. We can end up with a
// mess of scalar and vector code if we reduce only part of the DAG to scalar.
if (IndexC && VecOp.getOpcode() == ISD::BITCAST && VecVT.isInteger() &&
VecOp.hasOneUse()) {
// The vector index of the LSBs of the source depend on the endian-ness.
bool IsLE = DAG.getDataLayout().isLittleEndian();
unsigned ExtractIndex = IndexC->getZExtValue();
// extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x)
unsigned BCTruncElt = IsLE ? 0 : NumElts - 1;
SDValue BCSrc = VecOp.getOperand(0);
if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger())
return DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, BCSrc);
if (LegalTypes && BCSrc.getValueType().isInteger() &&
BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) {
// ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
// trunc i64 X to i32
SDValue X = BCSrc.getOperand(0);
assert(X.getValueType().isScalarInteger() && ScalarVT.isScalarInteger() &&
"Extract element and scalar to vector can't change element type "
"from FP to integer.");
unsigned XBitWidth = X.getValueSizeInBits();
// Recompute the element index that holds the low bits of X.
BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;
// An extract element return value type can be wider than its vector
// operand element type. In that case, the high bits are undefined, so
// it's possible that we may need to extend rather than truncate.
if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) {
assert(XBitWidth % VecEltBitWidth == 0 &&
"Scalar bitwidth must be a multiple of vector element bitwidth");
return DAG.getAnyExtOrTrunc(X, DL, ScalarVT);
if (SDValue BO = scalarizeExtractedBinop(N, DAG, LegalOperations))
return BO;
// We only perform this optimization before the op legalization phase because
// we may introduce new vector instructions which are not backed by TD
// patterns. For example on AVX, extracting elements from a wide vector
// without using extract_subvector. However, if we can find an underlying
// scalar value, then we can always use that.
if (IndexC && VecOp.getOpcode() == ISD::VECTOR_SHUFFLE) {
auto *Shuf = cast<ShuffleVectorSDNode>(VecOp);
// Find the new index to extract from.
int OrigElt = Shuf->getMaskElt(IndexC->getZExtValue());
// Extracting an undef index is undef.
if (OrigElt == -1)
return DAG.getUNDEF(ScalarVT);
// Select the right vector half to extract from.
SDValue SVInVec;
if (OrigElt < (int)NumElts) {
SVInVec = VecOp.getOperand(0);
} else {
SVInVec = VecOp.getOperand(1);
OrigElt -= NumElts;
// If the selected shuffle input is a BUILD_VECTOR, take its scalar operand
// directly.
if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) {
SDValue InOp = SVInVec.getOperand(OrigElt);
if (InOp.getValueType() != ScalarVT) {
assert(InOp.getValueType().isInteger() && ScalarVT.isInteger());
InOp = DAG.getSExtOrTrunc(InOp, DL, ScalarVT);
return InOp;
// FIXME: We should handle recursing on other vector shuffles and
// scalar_to_vector here as well.
if (!LegalOperations ||
// FIXME: Should really be just isOperationLegalOrCustom.
TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) {
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec,
DAG.getVectorIdxConstant(OrigElt, DL));
// If only EXTRACT_VECTOR_ELT nodes use the source vector we can
// simplify it based on the (valid) extraction indices.
if (llvm::all_of(VecOp->uses(), [&](SDNode *Use) {
return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Use->getOperand(0) == VecOp &&
})) {
// Start from an empty demanded-elements mask; the loop below looks at each
// use's constant index.
APInt DemandedElts = APInt::getNullValue(NumElts);
for (SDNode *Use : VecOp->uses()) {
auto *CstElt = cast<ConstantSDNode>(Use->getOperand(1));
if (CstElt->getAPIntValue().ult(NumElts))
if (SimplifyDemandedVectorElts(VecOp, DemandedElts, true)) {
// We simplified the vector operand of this extract element. If this
// extract is not dead, visit it again so it is folded properly.
if (N->getOpcode() != ISD::DELETED_NODE)
return SDValue(N, 0);
APInt DemandedBits = APInt::getAllOnesValue(VecEltBitWidth);
if (SimplifyDemandedBits(VecOp, DemandedBits, DemandedElts, true)) {
// We simplified the vector operand of this extract element. If this
// extract is not dead, visit it again so it is folded properly.
if (N->getOpcode() != ISD::DELETED_NODE)
return SDValue(N, 0);
// Everything under here is trying to match an extract of a loaded value.
// If the result of load has to be truncated, then it's not necessarily
// profitable.
// Set when a looked-through bitcast changed the element count.
bool BCNumEltsChanged = false;
EVT ExtVT = VecVT.getVectorElementType();
if (ScalarVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, ScalarVT))
return SDValue();
if (VecOp.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
if (!VecOp.hasOneUse())
return SDValue();
EVT BCVT = VecOp.getOperand(0).getValueType();
if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
return SDValue();
if (NumElts != BCVT.getVectorNumElements())
BCNumEltsChanged = true;
// Continue matching on the bitcast's source and its element type.
VecOp = VecOp.getOperand(0);
ExtVT = BCVT.getVectorElementType();
// extract (vector load $addr), i --> load $addr + i * size
if (!LegalOperations && !IndexC && VecOp.hasOneUse() &&
ISD::isNormalLoad(VecOp.getNode()) &&
!Index->hasPredecessor(VecOp.getNode())) {
auto *VecLoad = dyn_cast<LoadSDNode>(VecOp);
if (VecLoad && VecLoad->isSimple())
return scalarizeExtractedVectorLoad(N, VecVT, Index, VecLoad);
// Perform only after legalization to ensure build_vector / vector_shuffle
// optimizations have already been done.
if (!LegalOperations || !IndexC)
return SDValue();
// (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
// (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
// (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)
int Elt = IndexC->getZExtValue();
// The load to scalarize, if one of the patterns below finds it.
LoadSDNode *LN0 = nullptr;
if (ISD::isNormalLoad(VecOp.getNode())) {
LN0 = cast<LoadSDNode>(VecOp);
} else if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
VecOp.getOperand(0).getValueType() == ExtVT &&
ISD::isNormalLoad(VecOp.getOperand(0).getNode())) {
// Don't duplicate a load with other uses.
if (!VecOp.hasOneUse())
return SDValue();
LN0 = cast<LoadSDNode>(VecOp.getOperand(0));
if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(VecOp)) {
// (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
// =>
// (load $addr+1*size)
// Don't duplicate a load with other uses.
if (!VecOp.hasOneUse())
return SDValue();
// If the bit convert changed the number of elements, it is unsafe
// to examine the mask.
if (BCNumEltsChanged)
return SDValue();
// Select the input vector, guarding against out of range extract vector.
// NOTE(review): `>` lets Elt == NumElts reach getMaskElt; presumably the
// earlier out-of-bounds check already excludes that value -- confirm.
int Idx = (Elt > (int)NumElts) ? -1 : Shuf->getMaskElt(Elt);
VecOp = (Idx < (int)NumElts) ? VecOp.getOperand(0) : VecOp.getOperand(1);
if (VecOp.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
if (!VecOp.hasOneUse())
return SDValue();
VecOp = VecOp.getOperand(0);
if (ISD::isNormalLoad(VecOp.getNode())) {
LN0 = cast<LoadSDNode>(VecOp);
// Rebase the mask index onto whichever shuffle input was selected.
Elt = (Idx < (int)NumElts) ? Idx : Idx - (int)NumElts;
Index = DAG.getConstant(Elt, DL, Index.getValueType());
} else if (VecOp.getOpcode() == ISD::CONCAT_VECTORS && !BCNumEltsChanged &&
VecVT.getVectorElementType() == ScalarVT &&
(!LegalTypes ||
VecOp.getOperand(0).getValueType().getVectorElementType()))) {
// extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 0
// -> extract_vector_elt a, 0
// extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 1
// -> extract_vector_elt a, 1
// extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 2
// -> extract_vector_elt b, 0
// extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 3
// -> extract_vector_elt b, 1
SDLoc SL(N);
EVT ConcatVT = VecOp.getOperand(0).getValueType();
unsigned ConcatNumElts = ConcatVT.getVectorNumElements();
// Elt / ConcatNumElts picks the concat operand; Elt % ConcatNumElts is the
// index within it.
SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, SL,
SDValue ConcatOp = VecOp.getOperand(Elt / ConcatNumElts);
ConcatOp, NewIdx);
// NOTE(review): `Elt` here is presumably an SDValue declared on a line not
// visible in this copy, shadowing the int Elt above -- confirm.
return DAG.getNode(ISD::BITCAST, SL, ScalarVT, Elt);
// Make sure we found a non-volatile load and the extractelement is
// the only use.
if (!LN0 || !LN0->hasNUsesOfValue(1,0) || !LN0->isSimple())
return SDValue();
// If Idx was -1 above, Elt is going to be -1, so just return undef.
if (Elt == -1)
return DAG.getUNDEF(LVT);
return scalarizeExtractedVectorLoad(N, VecVT, Index, LN0);
// Simplify (build_vec (ext )) to (bitcast (build_vec ))
//
// N is a build-vector node whose operands are checked below for being
// any_extend / zero_extend (or undef) of a single common source type. On
// success a wider build vector of the narrow source type is created, padded
// with undef (all any_extend) or zero, and bitcast back to N's type. Returns
// an empty SDValue when the transform does not apply.
//
// NOTE(review): this copy of the function appears to be missing lines (for
// example loop exits, the tail of the ValidTypes expression and closing
// braces); confirm against the complete file.
SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
// We perform this optimization post type-legalization because
// the type-legalizer often scalarizes integer-promoted vectors.
// Performing this optimization before may create bit-casts which
// will be type-legalized to complex code sequences.
// We perform this optimization only before the operation legalizer because we
// may introduce illegal operations.
if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes)
return SDValue();
unsigned NumInScalars = N->getNumOperands();
SDLoc DL(N);
EVT VT = N->getValueType(0);
// Check to see if this is a BUILD_VECTOR of a bunch of values
// which come from any_extend or zero_extend nodes. If so, we can create
// a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR
// optimizations. We do not handle sign-extend because we can't fill the sign
// using shuffles.
// MVT::Other serves as the "no common source type (yet)" marker.
EVT SourceType = MVT::Other;
bool AllAnyExt = true;
for (unsigned i = 0; i != NumInScalars; ++i) {
SDValue In = N->getOperand(i);
// Ignore undef inputs.
if (In.isUndef()) continue;
bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND;
bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND;
// Abort if the element is not an extension.
if (!ZeroExt && !AnyExt) {
SourceType = MVT::Other;
// The input is a ZeroExt or AnyExt. Check the original type.
EVT InTy = In.getOperand(0).getValueType();
// Check that all of the widened source types are the same.
if (SourceType == MVT::Other)
// First time.
SourceType = InTy;
else if (InTy != SourceType) {
// Multiple income types. Abort.
SourceType = MVT::Other;
// Check if all of the extends are ANY_EXTENDs.
AllAnyExt &= AnyExt;
// In order to have valid types, all of the inputs must be extended from the
// same source type and all of the inputs must be any or zero extend.
// Scalar sizes must be a power of two.
EVT OutScalarTy = VT.getScalarType();
bool ValidTypes = SourceType != MVT::Other &&
isPowerOf2_32(OutScalarTy.getSizeInBits()) &&
// Create a new simpler BUILD_VECTOR sequence which other optimizations can
// turn into a single shuffle instruction.
if (!ValidTypes)
return SDValue();
// If we already have a splat buildvector, then don't fold it if it means
// introducing zeros.
if (!AllAnyExt && DAG.isSplatValue(SDValue(N, 0), /*AllowUndefs*/ true))
return SDValue();
bool isLE = DAG.getDataLayout().isLittleEndian();
// Number of source-typed elements that make up one output element.
unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
assert(ElemRatio > 1 && "Invalid element size ratio");
// Padding lanes are undef when every extend is ANY_EXTEND, otherwise zero.
SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType):
DAG.getConstant(0, DL, SourceType);
unsigned NewBVElems = ElemRatio * VT.getVectorNumElements();
SmallVector<SDValue, 8> Ops(NewBVElems, Filler);
// Populate the new build_vector
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
SDValue Cast = N->getOperand(i);
assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
Cast.getOpcode() == ISD::ZERO_EXTEND ||
Cast.isUndef()) && "Invalid cast opcode");
SDValue In;
if (Cast.isUndef())
In = DAG.getUNDEF(SourceType);
In = Cast->getOperand(0);
// The narrow value goes in the first lane of its group on little-endian
// targets and in the last lane on big-endian ones.
unsigned Index = isLE ? (i * ElemRatio) :
(i * ElemRatio + (ElemRatio - 1));
assert(Index < Ops.size() && "Invalid index");
Ops[Index] = In;
// The type of the new BUILD_VECTOR node.
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems);
assert(VecVT.getSizeInBits() == VT.getSizeInBits() &&
"Invalid vector size");
// Check if the new vector type is legal.
// Also bail out if BUILD_VECTOR is legal for VT but not for the new type.
if (!isTypeLegal(VecVT) ||
(!TLI.isOperationLegal(ISD::BUILD_VECTOR, VecVT) &&
TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)))
return SDValue();
// Make the new BUILD_VECTOR.
SDValue BV = DAG.getBuildVector(VecVT, DL, Ops);
// The new BUILD_VECTOR node has the potential to be further optimized.
// Bitcast to the desired type.
return DAG.getBitcast(VT, BV);
// Simplify (build_vec (trunc $1)
// (trunc (srl $1 half-width))
// (trunc (srl $1 (2 * half-width))) …)
// to (bitcast $1)
//
// N must be a BUILD_VECTOR (asserted). Every non-undef operand i has to be a
// truncate of the same source value shifted right by i * element-bit-width
// (operand 0 may be the unshifted source). Returns the source bitcast to N's
// type, or an empty SDValue when the pattern does not match.
//
// NOTE(review): this copy of the function appears to be missing lines (for
// example the lambda and loop terminators and closing braces); confirm
// against the complete file.
SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) {
assert(N->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
// Only for little endian
if (!DAG.getDataLayout().isLittleEndian())
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
EVT OutScalarTy = VT.getScalarType();
uint64_t ScalarTypeBitsize = OutScalarTy.getSizeInBits();
// Only for power of two types to be sure that bitcast works well
if (!isPowerOf2_64(ScalarTypeBitsize))
return SDValue();
unsigned NumInScalars = N->getNumOperands();
// Look through bitcasts
// Returns the operand of a BITCAST, or the value itself otherwise.
auto PeekThroughBitcast = [](SDValue Op) {
if (Op.getOpcode() == ISD::BITCAST)
return Op.getOperand(0);
return Op;
// The source value where all the parts are extracted.
SDValue Src;
for (unsigned i = 0; i != NumInScalars; ++i) {
SDValue In = PeekThroughBitcast(N->getOperand(i));
// Ignore undef inputs.
if (In.isUndef()) continue;
if (In.getOpcode() != ISD::TRUNCATE)
return SDValue();
In = PeekThroughBitcast(In.getOperand(0));
if (In.getOpcode() != ISD::SRL) {
// For now only build_vec without shuffling, handle shifts here in the
// future.
// An unshifted source is only accepted for element 0.
if (i != 0)
return SDValue();
Src = In;
} else {
// In is SRL
SDValue part = PeekThroughBitcast(In.getOperand(0));
if (!Src) {
Src = part;
} else if (Src != part) {
// Vector parts do not stem from the same variable
return SDValue();
// The shift amount must be a constant.
SDValue ShiftAmtVal = In.getOperand(1);
if (!isa<ConstantSDNode>(ShiftAmtVal))
return SDValue();
uint64_t ShiftAmt = In.getNode()->getConstantOperandVal(1);
// The extracted value is not extracted at the right position
if (ShiftAmt != i * ScalarTypeBitsize)
return SDValue();
// Only cast if the size is the same
if (Src.getValueType().getSizeInBits() != VT.getSizeInBits())
return SDValue();
return DAG.getBitcast(VT, Src);
/// Builds the VECTOR_SHUFFLE that places the elements taken from one pair of
/// BUILD_VECTOR input vectors into their result lanes.
///
/// \param DL          Location used for every node created here.
/// \param N           Node whose operands are inspected; operand 1 of each
///                    inspected operand is read as a constant element index.
/// \param VectorMask  For each operand of \p N, the number of the input vector
///                    it is taken from. Entries <= 0 are tested separately in
///                    the mask loop below.
/// \param VecIn1      Input vector numbered \p LeftIdx.
/// \param VecIn2      Input vector numbered \p LeftIdx + 1; may be empty.
/// \param LeftIdx     Vector number of \p VecIn1 as used in \p VectorMask.
/// \param DidSplitVec When true, the initial offset applied to indices that
///                    select \p VecIn2 is 0 rather than VecIn1's element count.
/// \returns the shuffle (or the low subvector extracted from a wider shuffle),
///          or an empty SDValue when the input and output types cannot be
///          reconciled.
///
/// NOTE(review): this listing appears to have lost lines in extraction
/// (closing braces and the beginning of some statements) - compare against the
/// upstream file before relying on the control flow shown here.
SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
ArrayRef<int> VectorMask,
SDValue VecIn1, SDValue VecIn2,
unsigned LeftIdx, bool DidSplitVec) {
SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
EVT VT = N->getValueType(0);
EVT InVT1 = VecIn1.getValueType();
// With no second input, treat its type as identical to the first input's.
EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
unsigned NumElems = VT.getVectorNumElements();
unsigned ShuffleNumElems = NumElems;
// If we artificially split a vector in two already, then the offsets in the
// operands will all be based off of VecIn1, even those in VecIn2.
unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements();
// We can't generate a shuffle node with mismatched input and output types.
// Try to make the types match the type of the output.
if (InVT1 != VT || InVT2 != VT) {
if ((VT.getSizeInBits() % InVT1.getSizeInBits() == 0) && InVT1 == InVT2) {
// If the output vector length is a multiple of both input lengths,
// we can concatenate them and pad the rest with undefs.
unsigned NumConcats = VT.getSizeInBits() / InVT1.getSizeInBits();
assert(NumConcats >= 2 && "Concat needs at least two inputs!");
SmallVector<SDValue, 2> ConcatOps(NumConcats, DAG.getUNDEF(InVT1));
ConcatOps[0] = VecIn1;
ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1);
VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
// Both inputs now live in the concatenated VecIn1.
VecIn2 = SDValue();
} else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) {
if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems))
return SDValue();
if (!VecIn2.getNode()) {
// If we only have one input vector, and it's twice the size of the
// output, split it in two.
// NOTE(review): the head of the next statement is not visible here;
// presumably it assigns VecIn2 the upper half of VecIn1 via
// EXTRACT_SUBVECTOR at index NumElems - confirm against upstream.
DAG.getVectorIdxConstant(NumElems, DL));
VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx);
// Since we now have shorter input vectors, adjust the offset of the
// second vector's start.
Vec2Offset = NumElems;
} else if (InVT2.getSizeInBits() <= InVT1.getSizeInBits()) {
// VecIn1 is wider than the output, and we have another, possibly
// smaller input. Pad the smaller input with undefs, shuffle at the
// input vector width, and extract the output.
// The shuffle type is different than VT, so check legality again.
if (LegalOperations &&
!TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1))
return SDValue();
// Legalizing INSERT_SUBVECTOR is tricky - you basically have to
// lower it back into a BUILD_VECTOR. So if the inserted type is
// illegal, don't even try.
if (InVT1 != InVT2) {
if (!TLI.isTypeLegal(InVT2))
return SDValue();
// NOTE(review): the head of the next statement is not visible here;
// presumably VecIn2 is widened to InVT1 with an INSERT_SUBVECTOR into
// undef at index 0 - confirm against upstream.
DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
// The shuffle is performed at the (twice as wide) input width.
ShuffleNumElems = NumElems * 2;
} else {
// Both VecIn1 and VecIn2 are wider than the output, and VecIn2 is wider
// than VecIn1. We can't handle this for now - this case will disappear
// when we start sorting the vectors by type.
return SDValue();
} else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() &&
InVT1.getSizeInBits() == VT.getSizeInBits()) {
// VecIn2 is half the output width: widen it by concatenating with undef.
SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2));
ConcatOps[0] = VecIn2;
VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
} else {
// TODO: Support cases where the length mismatch isn't exactly by a
// factor of 2.
// TODO: Move this check upwards, so that if we have bad type
// mismatches, we don't create any DAG nodes.
return SDValue();
// Initialize mask to undef.
SmallVector<int, 8> Mask(ShuffleNumElems, -1);
// Only need to run up to the number of elements actually used, not the
// total number of elements in the shuffle - if we are shuffling a wider
// vector, the high lanes should be set to undef.
for (unsigned i = 0; i != NumElems; ++i) {
// NOTE(review): the statement guarded by this test is not visible here;
// presumably it is a `continue;` that skips such elements - confirm.
if (VectorMask[i] <= 0)
// Constant index operand of the i-th operand of N.
unsigned ExtIndex = N->getOperand(i).getConstantOperandVal(1);
if (VectorMask[i] == (int)LeftIdx) {
Mask[i] = ExtIndex;
} else if (VectorMask[i] == (int)LeftIdx + 1) {
// Elements of the second input are addressed past Vec2Offset.
Mask[i] = Vec2Offset + ExtIndex;
// The type of the input vectors may have changed above.
InVT1 = VecIn1.getValueType();
// If we already have a VecIn2, it should have the same type as VecIn1.
// If we don't, get an undef/zero vector of the appropriate type.
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(InVT1);
assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type.");
SDValue Shuffle = DAG.getVectorShuffle(InVT1, DL, VecIn1, VecIn2, Mask);
// A shuffle done at a wider type is narrowed back to VT by taking the
// subvector at index 0.
if (ShuffleNumElems > NumElems)
Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffle, ZeroIdx);
return Shuffle;
/// Tries to rewrite a BUILD_VECTOR whose only non-undef operand is a
/// zero-extended extract_vector_elt (with constant index) as a bitcast of a
/// shuffle between the extract's source vector and a zero vector.
///
/// \param BV  The node to transform; asserted to be an ISD::BUILD_VECTOR.
/// \param DAG The DAG used to create the replacement nodes.
/// \returns the replacement value, or an empty SDValue if the pattern does
///          not match or buildLegalVectorShuffle() produces no shuffle.
///
/// NOTE(review): this listing appears to have lost lines in extraction
/// (closing braces, `continue`/`else`, and the declaration of `DL`) - compare
/// against the upstream file before relying on the control flow shown here.
static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) {
assert(BV->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
// First, determine where the build vector is not undef.
// TODO: We could extend this to handle zero elements as well as undefs.
int NumBVOps = BV->getNumOperands();
// Index of the single non-undef operand, or -1 while none has been found.
int ZextElt = -1;
for (int i = 0; i != NumBVOps; ++i) {
SDValue Op = BV->getOperand(i);
// NOTE(review): presumably undef operands are skipped and a second
// non-undef operand causes the bail-out below; the connecting
// `continue`/`else` lines are not visible here - confirm.
if (Op.isUndef())
if (ZextElt == -1)
ZextElt = i;
return SDValue();
// Bail out if there's no non-undef element.
if (ZextElt == -1)
return SDValue();
// The build vector contains some number of undef elements and exactly
// one other element. That other element must be a zero-extended scalar
// extracted from a vector at a constant index to turn this into a shuffle.
// Also, require that the build vector does not implicitly truncate/extend
// its elements.
// TODO: This could be enhanced to allow ANY_EXTEND as well as ZERO_EXTEND.
EVT VT = BV->getValueType(0);
SDValue Zext = BV->getOperand(ZextElt);
if (Zext.getOpcode() != ISD::ZERO_EXTEND || !Zext.hasOneUse() ||
Zext.getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Zext.getOperand(0).getOperand(1)) ||
Zext.getValueSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
// The zero-extend must be a multiple of the source size, and we must be
// building a vector of the same size as the source of the extract element.
SDValue Extract = Zext.getOperand(0);
unsigned DestSize = Zext.getValueSizeInBits();
unsigned SrcSize = Extract.getValueSizeInBits();
if (DestSize % SrcSize != 0 ||
Extract.getOperand(0).getValueSizeInBits() != VT.getSizeInBits())
return SDValue();
// Create a shuffle mask that will combine the extracted element with zeros
// and undefs.
// Each build-vector element covers ZextRatio consecutive shuffle lanes.
int ZextRatio = DestSize / SrcSize;
int NumMaskElts = NumBVOps * ZextRatio;
SmallVector<int, 32> ShufMask(NumMaskElts, -1);
for (int i = 0; i != NumMaskElts; ++i) {
if (i / ZextRatio == ZextElt) {
// The low bits of the (potentially translated) extracted element map to
// the source vector. The high bits map to zero. We will use a zero vector
// as the 2nd source operand of the shuffle, so use the 1st element of
// that vector (mask value is number-of-elements) for the high bits.
// NOTE(review): per the comment above, the second assignment presumably
// belongs to an `else` that is not visible here - confirm.
if (i % ZextRatio == 0)
ShufMask[i] = Extract.getConstantOperandVal(1);
ShufMask[i] = NumMaskElts;
// Undef elements of the build vector remain undef because we initialize
// the shuffle mask with -1.
// buildvec undef, ..., (zext (extractelt V, IndexC)), undef... -->
// bitcast (shuffle V, ZeroVec, VectorMask)
EVT VecVT = Extract.getOperand(0).getValueType();
// NOTE(review): `DL` has no visible declaration in this function;
// presumably an SDLoc for BV is declared on a line lost from this listing.
SDValue ZeroVec = DAG.getConstant(0, DL, VecVT);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Shuf = TLI.buildLegalVectorShuffle(VecVT, DL, Extract.getOperand(0),
ZeroVec, ShufMask, DAG);
if (!Shuf)
return SDValue();
// The shuffle is built in the source vector type; cast it to the result type.
return DAG.getBitcast(VT, Shuf);
// Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
// operations. If the types of the vectors we're extracting from allow it,
// turn this into a vector_shuffle node.
//
// Returns the replacement value, or an empty SDValue when the node is left
// unchanged. The work is done in three steps: classify every operand by the
// input vector it comes from, build one shuffle per pair of input vectors
// (createBuildVecShuffle), then blend those shuffles pairwise until a single
// value remains.
//
// NOTE(review): this listing appears to have lost lines in extraction
// (closing braces, `continue`/`else`, `push_back` calls and the tails of some
// conditions) - compare against the upstream file before relying on the
// control flow shown here.
SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
// Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes.
if (!isTypeLegal(VT))
return SDValue();
// Try the single-element zext-with-zero-vector form first.
if (SDValue V = reduceBuildVecToShuffleWithZero(N, DAG))
return V;
// May only combine to shuffle after legalize if shuffle is legal.
if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT))
return SDValue();
bool UsesZeroVector = false;
unsigned NumElems = N->getNumOperands();
// Record, for each element of the newly built vector, which input vector
// that element comes from. -1 stands for undef, 0 for the zero vector,
// and positive values for the input vectors.
// VectorMask maps each element to its vector number, and VecIn maps vector
// numbers to their initial SDValues.
SmallVector<int, 8> VectorMask(NumElems, -1);
SmallVector<SDValue, 8> VecIn;
// NOTE(review): since vector number 0 denotes the zero vector and the check
// after this loop tests `VecIn.size() < 2`, VecIn is presumably seeded with a
// placeholder entry on a line not visible here - confirm.
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Op = N->getOperand(i);
// NOTE(review): presumably followed by `continue;` (undef keeps mask -1).
if (Op.isUndef())
// See if we can use a blend with a zero vector.
// TODO: Should we generalize this to a blend with an arbitrary constant
// vector?
if (isNullConstant(Op) || isNullFPConstant(Op)) {
UsesZeroVector = true;
VectorMask[i] = 0;
// Not an undef or zero. If the input is something other than an
// EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
// NOTE(review): the remainder of this condition is not visible here.
if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
return SDValue();
SDValue ExtractedFromVec = Op.getOperand(0);
// Scalable source vectors are not handled.
if (ExtractedFromVec.getValueType().isScalableVector())
return SDValue();
const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
// Reject indices at or beyond the source vector's element count.
if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
return SDValue();
// All inputs must have the same element type as the output.
// NOTE(review): the right-hand side of this comparison is not visible here.
if (VT.getVectorElementType() !=
return SDValue();
// Have we seen this input vector before?
// The vectors are expected to be tiny (usually 1 or 2 elements), so using
// a map back from SDValues to numbers isn't worth it.
unsigned Idx = std::distance(
VecIn.begin(), std::find(VecIn.begin(), VecIn.end(), ExtractedFromVec));
// NOTE(review): Idx == VecIn.size() means "not found"; presumably the vector
// is appended to VecIn here on a line not visible in this listing.
if (Idx == VecIn.size())
VectorMask[i] = Idx;
// If we didn't find at least one input vector, bail out.
if (VecIn.size() < 2)
return SDValue();
// If all the Operands of BUILD_VECTOR extract from same
// vector, then split the vector efficiently based on the maximum
// vector access index and adjust the VectorMask and
// VecIn accordingly.
bool DidSplitVec = false;
if (VecIn.size() == 2) {
unsigned MaxIndex = 0;
unsigned NearestPow2 = 0;
SDValue Vec = VecIn.back();
EVT InVT = Vec.getValueType();
// Extract index used by each build-vector element (0 where not recorded).
SmallVector<unsigned, 8> IndexVec(NumElems, 0);
for (unsigned i = 0; i < NumElems; i++) {
// NOTE(review): presumably followed by `continue;`.
if (VectorMask[i] <= 0)
unsigned Index = N->getOperand(i).getConstantOperandVal(1);
IndexVec[i] = Index;
MaxIndex = std::max(MaxIndex, Index);
NearestPow2 = PowerOf2Ceil(MaxIndex);
// Only split when the accessed range, rounded up to a power of two, is still
// more than twice the number of result elements.
if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 &&
NumElems * 2 < NearestPow2) {
unsigned SplitSize = NearestPow2 / 2;
EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
InVT.getVectorElementType(), SplitSize);
if (TLI.isTypeLegal(SplitVT)) {
// Upper half [SplitSize, 2*SplitSize) and lower half [0, SplitSize).
SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
DAG.getVectorIdxConstant(SplitSize, DL));
SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
DAG.getVectorIdxConstant(0, DL));
DidSplitVec = true;
// NOTE(review): VecIn1/VecIn2 are not used in the visible lines; presumably
// they replace the single entry in VecIn on lines not visible here.
for (unsigned i = 0; i < NumElems; i++) {
// NOTE(review): presumably followed by `continue;`.
if (VectorMask[i] <= 0)
// Re-number each element by the half its index falls into.
VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
// TODO: We want to sort the vectors by descending length, so that adjacent
// pairs have similar length, and the longer vector is always first in the
// pair.
// TODO: Should this fire if some of the input vectors has illegal type (like
// it does now), or should we let legalization run its course first?
// Shuffle phase:
// Take pairs of vectors, and shuffle them so that the result has elements
// from these vectors in the correct places.
// For example, given:
// t10: i32 = extract_vector_elt t1, Constant:i64<0>
// t11: i32 = extract_vector_elt t2, Constant:i64<0>
// t12: i32 = extract_vector_elt t3, Constant:i64<0>
// t13: i32 = extract_vector_elt t1, Constant:i64<1>
// t14: v4i32 = BUILD_VECTOR t10, t11, t12, t13
// We will generate:
// t20: v4i32 = vector_shuffle<0,4,u,1> t1, t2
// t21: v4i32 = vector_shuffle<u,u,0,u> t3, undef
SmallVector<SDValue, 4> Shuffles;
for (unsigned In = 0, Len = (VecIn.size() / 2); In < Len; ++In) {
// Input vectors are numbered from 1, so pair In covers 2*In+1 and 2*In+2.
unsigned LeftIdx = 2 * In + 1;
SDValue VecLeft = VecIn[LeftIdx];
// The last pair may have no right-hand vector.
SDValue VecRight =
(LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue();
// NOTE(review): presumably a successful shuffle is appended to Shuffles and
// the bail-out below sits in an `else`; those lines are not visible here.
if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft,
VecRight, LeftIdx, DidSplitVec))
return SDValue();
// If we need the zero vector as an "ingredient" in the blend tree, add it
// to the list of shuffles.
if (UsesZeroVector)
Shuffles.push_back(VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT));
// If we only have one shuffle, we're done.
if (Shuffles.size() == 1)
return Shuffles[0];
// Update the vector mask to point to the post-shuffle vectors.
// NOTE(review): the zero vector (mask 0) maps to the last entry of Shuffles;
// the pair mapping `(Vec - 1) / 2` presumably sits in an `else if (Vec > 0)`
// that is not visible here - confirm.
for (int &Vec : VectorMask)
if (Vec == 0)
Vec = Shuffles.size() - 1;
Vec = (Vec - 1) / 2;
// More than one shuffle. Generate a binary tree of blends, e.g. if from
// the previous step we got the set of shuffles t10, t11, t12, t13, we will
// generate:
// t10: v8i32 = vector_shuffle<0,8,u,u,u,u,u,u> t1, t2
// t11: v8i32 = vector_shuffle<u,u,0,8,u,u,u,u> t3, t4
// t12: v8i32 = vector_shuffle<u,u,u,u,0,8,u,u> t5, t6
// t13: v8i32 = vector_shuffle<u,u,u,u,u,u,0,8> t7, t8
// t20: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t10, t11
// t21: v8i32 = vector_shuffle<u,u,u,u,4,5,14,15> t12, t13
// t30: v8i32 = vector_shuffle<0,1,2,3,12,13,14,15> t20, t21
// Make sure the initial size of the shuffle list is even.
// NOTE(review): the statement guarded by this test (presumably appending an
// undef vector) is not visible here.
if (Shuffles.size() % 2)
// Each pass halves the number of live shuffles.
for (unsigned CurSize = Shuffles.size(); CurSize > 1; CurSize /= 2) {
if (CurSize % 2) {
Shuffles[CurSize] = DAG.getUNDEF(VT);
for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) {
int Left = 2 * In;
int Right = 2 * In + 1;
SmallVector<int, 8> Mask(NumElems, -1);
for (unsigned i = 0; i != NumElems; ++i) {
// Lane i keeps its position; it is taken from the left or the right
// operand of the blend, and afterwards belongs to blended vector In.
if (VectorMask[i] == Left) {
Mask[i] = i;
VectorMask[i] = In;
} else if (VectorMask[i] == Right) {
Mask[i] = i + NumElems;
VectorMask[i] = In;
Shuffles[In] =
DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask);
return Shuffles[0];
// Try to turn a build vector of zero extends of extract vector elts into
// a vector zero extend and possibly an extract subvector.
// Elements may also be ANY_EXTENDs; the result is a ZERO_EXTEND if at least
// one element was a ZERO_EXTEND and an ANY_EXTEND otherwise.
// Returns the replacement value, or an empty SDValue if the pattern does not
// match.
// TODO: Support sign extend?
// TODO: Allow undef elements?
// NOTE(review): this listing appears to have lost closing braces / `};` lines
// in extraction.
SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
// Not attempted once operations have been legalized.
if (LegalOperations)
return SDValue();
EVT VT = N->getValueType(0);
bool FoundZeroExtend = false;
SDValue Op0 = N->getOperand(0);
// Returns the constant extract index of Op when Op is
// (zext|aext (extract_vector_elt X, C)) with the same X as the first
// operand, and -1 otherwise. Also records whether any inspected operand was
// a ZERO_EXTEND.
auto checkElem = [&](SDValue Op) -> int64_t {
unsigned Opc = Op.getOpcode();
FoundZeroExtend |= (Opc == ISD::ZERO_EXTEND);
if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) &&
Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0))
if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(0).getOperand(1)))
return C->getZExtValue();
return -1;
// Make sure the first element matches
// (zext (extract_vector_elt X, C))
int64_t Offset = checkElem(Op0);
if (Offset < 0)
return SDValue();
unsigned NumElems = N->getNumOperands();
// The vector all elements are extracted from.
SDValue In = Op0.getOperand(0).getOperand(0);
EVT InSVT = In.getValueType().getScalarType();
// Vector type with the source element type and the result element count.
EVT InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumElems);
// Don't create an illegal input type after type legalization.
if (LegalTypes && !TLI.isTypeLegal(InVT))
return SDValue();
// Ensure all the elements come from the same vector and are adjacent.
for (unsigned i = 1; i != NumElems; ++i) {
if ((Offset + i) != checkElem(N->getOperand(i)))
return SDValue();
SDLoc DL(N);
return DAG.getNode(FoundZeroExtend ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, DL,
VT, In);
/// Combine entry point for BUILD_VECTOR nodes.
///
/// Tries, in order: all-undef folding, splat-of-bitcast to concat_vectors,
/// splat to SPLAT_VECTOR, expressing the node as (a subvector of) an existing
/// vector, and then the helper combines convertBuildVecZextToZext,
/// reduceBuildVecExtToExtBuildVec, reduceBuildVecTruncToBitCast and
/// reduceBuildVecToShuffle. Returns the first replacement found, or an empty
/// SDValue if none applies.
///
/// NOTE(review): this listing appears to have lost closing braces, `};` and
/// `break;` lines in extraction.
SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
EVT VT = N->getValueType(0);
// A vector built entirely of undefs is undef.
if (ISD::allOperandsUndef(N))
return DAG.getUNDEF(VT);
// If this is a splat of a bitcast from another vector, change to a
// concat_vector.
// For example:
// (build_vector (i64 (bitcast (v2i32 X))), (i64 (bitcast (v2i32 X)))) ->
// (v2i64 (bitcast (concat_vectors (v2i32 X), (v2i32 X))))
// If X is a build_vector itself, the concat can become a larger build_vector.
// TODO: Maybe this is useful for non-splat too?
if (!LegalOperations) {
if (SDValue Splat = cast<BuildVectorSDNode>(N)->getSplatValue()) {
Splat = peekThroughBitcasts(Splat);
EVT SrcVT = Splat.getValueType();
if (SrcVT.isVector()) {
// One copy of the source vector per build-vector operand.
unsigned NumElts = N->getNumOperands() * SrcVT.getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(),
SrcVT.getVectorElementType(), NumElts);
if (!LegalTypes || TLI.isTypeLegal(NewVT)) {
SmallVector<SDValue, 8> Ops(N->getNumOperands(), Splat);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N),
NewVT, Ops);
return DAG.getBitcast(VT, Concat);
// A splat of a single element is a SPLAT_VECTOR if supported on the target.
if (TLI.getOperationAction(ISD::SPLAT_VECTOR, VT) != TargetLowering::Expand)
if (SDValue V = cast<BuildVectorSDNode>(N)->getSplatValue()) {
assert(!V.isUndef() && "Splat of undef should have been handled earlier");
return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V);
// Check if we can express BUILD VECTOR via subvector extract.
if (!LegalTypes && (N->getNumOperands() > 1)) {
SDValue Op0 = N->getOperand(0);
// Returns the constant extract index of Op when Op is an
// EXTRACT_VECTOR_ELT from the same vector as the first operand; otherwise
// returns -1 converted to uint64_t (i.e. the maximum value).
auto checkElem = [&](SDValue Op) -> uint64_t {
if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
(Op0.getOperand(0) == Op.getOperand(0)))
if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
return CNode->getZExtValue();
return -1;
// Note the narrowing from uint64_t to int: the "no match" value becomes -1.
int Offset = checkElem(Op0);
// Require operand i to extract element Offset + i; on any mismatch Offset
// is reset to the -1 sentinel.
for (unsigned i = 0; i < N->getNumOperands(); ++i) {
if (Offset + i != checkElem(N->getOperand(i))) {
Offset = -1;
// All elements of the source, in order and of the same type: the build
// vector is the source vector itself.
if ((Offset == 0) &&
(Op0.getOperand(0).getValueType() == N->getValueType(0)))
return Op0.getOperand(0);
if ((Offset != -1) &&
((Offset % N->getValueType(0).getVectorNumElements()) ==
0)) // IDX must be multiple of output size.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0),
Op0.getOperand(0), Op0.getOperand(1));
// Remaining pattern-specific combines; the first one that succeeds wins.
if (SDValue V = convertBuildVecZextToZext(N))
return V;
if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
return V;
if (SDValue V = reduceBuildVecTruncToBitCast(N))
return V;
if (SDValue V = reduceBuildVecToShuffle(N))
return V;
return SDValue();
/// Tries to fold a CONCAT_VECTORS whose operands are all bitcast scalars (or
/// undef) into a bitcast of a BUILD_VECTOR of those scalars.
///
/// Only applies when the operand vector type is not legal. If any collected
/// scalar is floating point, a floating-point scalar type of the operand's bit
/// width is used for the build vector.
/// \returns the replacement value, or an empty SDValue if the fold does not
///          apply.
///
/// NOTE(review): this listing appears to have lost lines in extraction
/// (closing braces, `else`/`continue`, and parts of the operand-collection
/// loop) - compare against the upstream file.
static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT OpVT = N->getOperand(0).getValueType();
// If the operands are legal vectors, leave them alone.
if (TLI.isTypeLegal(OpVT))
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
SmallVector<SDValue, 8> Ops;
// Start with an integer scalar type as wide as one operand vector.
EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
// Keep track of what we encounter.
bool AnyInteger = false;
bool AnyFP = false;
for (const SDValue &Op : N->ops()) {
// NOTE(review): the rest of this condition and the statements that append
// to Ops are not visible here; presumably bitcasts contribute their scalar
// source, undefs contribute ScalarUndef, and anything else bails out.
if (ISD::BITCAST == Op.getOpcode() &&
else if (ISD::UNDEF == Op.getOpcode())
return SDValue();
// Note whether we encounter an integer or floating point scalar.
// If it's neither, bail out, it could be something weird like x86mmx.
EVT LastOpVT = Ops.back().getValueType();
if (LastOpVT.isFloatingPoint())
AnyFP = true;
else if (LastOpVT.isInteger())
AnyInteger = true;
return SDValue();
// If any of the operands is a floating point scalar bitcast to a vector,
// use floating point types throughout, and bitcast everything.
// Replace UNDEFs by another scalar UNDEF node, of the final desired type.
if (AnyFP) {
SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits());
ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
if (AnyInteger) {
for (SDValue &Op : Ops) {
// NOTE(review): presumably operands already of type SVT are skipped
// (`continue;`), undefs are replaced, and the bitcast sits in an `else`;
// those connecting lines are not visible here.
if (Op.getValueType() == SVT)
if (Op.isUndef())
Op = ScalarUndef;
Op = DAG.getBitcast(SVT, Op);
// Build a vector of the scalars that is as wide as the result, then cast it
// to the result type.
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT,
VT.getSizeInBits() / SVT.getSizeInBits());
return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops));
// Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR
// operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at
// most two distinct vectors the same size as the result, attempt to turn this
// into a legal shuffle.
//
// Returns the shuffle produced by buildLegalVectorShuffle(), or an empty
// SDValue when the pattern does not match.
//
// NOTE(review): lines prefixed with '+' below are unified-diff markers from
// the patch this listing was taken from, not C++ operators. The listing also
// appears to have lost closing braces and `continue`/`else` lines.
static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
EVT OpVT = N->getOperand(0).getValueType();
+ // We currently can't generate an appropriate shuffle for a scalable vector.
+ if (VT.isScalableVector())
+ return SDValue();
int NumElts = VT.getVectorNumElements();
int NumOpElts = OpVT.getVectorNumElements();
// The (at most two) shuffle sources; undef means "not yet assigned".
SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT);
SmallVector<int, 8> Mask;
for (SDValue Op : N->ops()) {
Op = peekThroughBitcasts(Op);
// UNDEF nodes convert to UNDEF shuffle mask values.
// NOTE(review): presumably followed by `continue;`.
if (Op.isUndef()) {
Mask.append((unsigned)NumOpElts, -1);
if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return SDValue();
// What vector are we extracting the subvector from and at what index?
SDValue ExtVec = Op.getOperand(0);
int ExtIdx = Op.getConstantOperandVal(1);
// We want the EVT of the original extraction to correctly scale the
// extraction index.
EVT ExtVT = ExtVec.getValueType();
ExtVec = peekThroughBitcasts(ExtVec);
// UNDEF nodes convert to UNDEF shuffle mask values.
// NOTE(review): presumably followed by `continue;`.
if (ExtVec.isUndef()) {
Mask.append((unsigned)NumOpElts, -1);
// Ensure that we are extracting a subvector from a vector the same
// size as the result.
if (ExtVT.getSizeInBits() != VT.getSizeInBits())
return SDValue();
// Scale the subvector index to account for any bitcast.
// NOTE(review): the trailing bail-out presumably belongs to an `else` that
// is not visible here.
int NumExtElts = ExtVT.getVectorNumElements();
if (0 == (NumExtElts % NumElts))
ExtIdx /= (NumExtElts / NumElts);
else if (0 == (NumElts % NumExtElts))
ExtIdx *= (NumElts / NumExtElts);
return SDValue();
// At most we can reference 2 inputs in the final shuffle.
if (SV0.isUndef() || SV0 == ExtVec) {
SV0 = ExtVec;
for (int i = 0; i != NumOpElts; ++i)
Mask.push_back(i + ExtIdx);
} else if (SV1.isUndef() || SV1 == ExtVec) {
SV1 = ExtVec;
// Lanes of the second shuffle source are numbered from NumElts.
for (int i = 0; i != NumOpElts; ++i)
Mask.push_back(i + ExtIdx + NumElts);
} else {
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// The sources were looked through bitcasts, so cast them to the result type.
return TLI.buildLegalVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0),
DAG.getBitcast(VT, SV1), Mask, DAG);
/// Tries to hoist a cast out of a CONCAT_VECTORS:
///   concat (cast X), (cast Y)... -> cast (concat X, Y...)
///
/// All operands must be the same kind of cast, each with a single use, from
/// the same vector source type, and the target must support the wider cast.
/// \returns the replacement value, or an empty SDValue if the fold does not
///          apply.
///
/// NOTE(review): this listing appears to have lost lines in extraction - the
/// `case` labels of both switches, the tails of the legality conditions, the
/// statement that fills SrcOps, and closing braces are not visible.
static SDValue combineConcatVectorOfCasts(SDNode *N, SelectionDAG &DAG) {
unsigned CastOpcode = N->getOperand(0).getOpcode();
// Filter on the opcode of the first operand; unsupported opcodes bail out.
switch (CastOpcode) {
// TODO: Allow more opcodes?
// case ISD::BITCAST:
// case ISD::TRUNCATE:
// case ISD::FP_EXTEND:
return SDValue();
EVT SrcVT = N->getOperand(0).getOperand(0).getValueType();
if (!SrcVT.isVector())
return SDValue();
// All operands of the concat must be the same kind of cast from the same
// source type.
SmallVector<SDValue, 4> SrcOps;
for (SDValue Op : N->ops()) {
if (Op.getOpcode() != CastOpcode || !Op.hasOneUse() ||
Op.getOperand(0).getValueType() != SrcVT)
return SDValue();
// The wider cast must be supported by the target. This is unusual because
// the operation support type parameter depends on the opcode. In addition,
// check the other type in the cast to make sure this is really legal.
EVT VT = N->getValueType(0);
EVT SrcEltVT = SrcVT.getVectorElementType();
// Element count of the concatenated source (minimum count for scalable types).
unsigned NumElts = SrcVT.getVectorElementCount().Min * N->getNumOperands();
EVT ConcatSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumElts);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// NOTE(review): one check queries legality on the source type and the other
// on the result type; which opcodes select which check is not visible here.
switch (CastOpcode) {
if (!TLI.isOperationLegalOrCustom(CastOpcode, ConcatSrcVT) ||
return SDValue();
if (!TLI.isOperationLegalOrCustom(CastOpcode, VT) ||
return SDValue();
llvm_unreachable("Unexpected cast opcode");
// concat (cast X), (cast Y)... -> cast (concat X, Y...)
SDLoc DL(N);
SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatSrcVT, SrcOps);
return DAG.getNode(CastOpcode, DL, VT, NewConcat);
/// Combine entry point for CONCAT_VECTORS nodes.
///
/// Tries, in order: single-operand and all-undef folding, folds for a concat
/// whose operands after the first are all undef, merging BUILD_VECTOR/UNDEF
/// operands into one BUILD_VECTOR, the helper combines
/// combineConcatVectorOfScalars / combineConcatVectorOfExtracts /
/// combineConcatVectorOfCasts, and finally recognising a concat of
/// EXTRACT_SUBVECTORs that simply reassembles its single source vector.
/// Returns the replacement value, or an empty SDValue if nothing applies.
///
/// NOTE(review): lines prefixed with '+' or '-' below are unified-diff markers
/// from the patch this listing was taken from, not C++ operators. The listing
/// also appears to have lost closing braces, `};`, `continue`/`else` lines and
/// parts of some statements.
SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
// If we only have one input vector, we don't need to do any concatenation.
if (N->getNumOperands() == 1)
return N->getOperand(0);
// Check if all of the operands are undefs.
EVT VT = N->getValueType(0);
if (ISD::allOperandsUndef(N))
return DAG.getUNDEF(VT);
// Optimize concat_vectors where all but the first of the vectors are undef.
if (std::all_of(std::next(N->op_begin()), N->op_end(), [](const SDValue &Op) {
return Op.isUndef();
})) {
SDValue In = N->getOperand(0);
assert(In.getValueType().isVector() && "Must concat vectors");
// If the input is a concat_vectors, just make a larger concat by padding
// with smaller undefs.
if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse()) {
unsigned NumOps = N->getNumOperands() * In.getNumOperands();
SmallVector<SDValue, 4> Ops(In->op_begin(), In->op_end());
Ops.resize(NumOps, DAG.getUNDEF(Ops[0].getValueType()));
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
SDValue Scalar = peekThroughOneUseBitcasts(In);
// concat_vectors(scalar_to_vector(scalar), undef) ->
// scalar_to_vector(scalar)
if (!LegalOperations && Scalar.getOpcode() == ISD::SCALAR_TO_VECTOR &&
Scalar.hasOneUse()) {
EVT SVT = Scalar.getValueType().getVectorElementType();
// Only look through when the scalar is not implicitly truncated.
if (SVT == Scalar.getOperand(0).getValueType())
Scalar = Scalar.getOperand(0);
// concat_vectors(scalar, undef) -> scalar_to_vector(scalar)
if (!Scalar.getValueType().isVector()) {
// If the bitcast type isn't legal, it might be a trunc of a legal type;
// look through the trunc so we can still do the transform:
// concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar)
// NOTE(review): the last clause of this condition is not visible here.
if (Scalar->getOpcode() == ISD::TRUNCATE &&
!TLI.isTypeLegal(Scalar.getValueType()) &&
Scalar = Scalar->getOperand(0);
EVT SclTy = Scalar.getValueType();
if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
return SDValue();
// Bail out if the vector size is not a multiple of the scalar size.
if (VT.getSizeInBits() % SclTy.getSizeInBits())
return SDValue();
unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
if (VNTNumElms < 2)
return SDValue();
EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
return SDValue();
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), NVT, Scalar);
return DAG.getBitcast(VT, Res);
// Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR.
// We have already tested above for an UNDEF only concatenation.
// fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
// -> (BUILD_VECTOR A, B, ..., C, D, ...)
auto IsBuildVectorOrUndef = [](const SDValue &Op) {
return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) {
SmallVector<SDValue, 8> Opnds;
EVT SVT = VT.getScalarType();
// NOTE(review): MinVT is used below without a visible declaration;
// presumably it is declared just before this point on a line lost from
// this listing.
if (!SVT.isFloatingPoint()) {
// If the BUILD_VECTORs are built from integers, they may have different
// operand types. Get the smallest type and truncate all operands to it.
bool FoundMinVT = false;
for (const SDValue &Op : N->ops())
if (ISD::BUILD_VECTOR == Op.getOpcode()) {
EVT OpSVT = Op.getOperand(0).getValueType();
MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
FoundMinVT = true;
assert(FoundMinVT && "Concat vector type mismatch");
for (const SDValue &Op : N->ops()) {
EVT OpVT = Op.getValueType();
unsigned NumElts = OpVT.getVectorNumElements();
// An undef operand contributes NumElts undef scalars.
if (ISD::UNDEF == Op.getOpcode())
Opnds.append(NumElts, DAG.getUNDEF(MinVT));
if (ISD::BUILD_VECTOR == Op.getOpcode()) {
if (SVT.isFloatingPoint()) {
assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch");
Opnds.append(Op->op_begin(), Op->op_begin() + NumElts);
} else {
// NOTE(review): the head of the statement in this loop is not visible;
// presumably each truncated operand is appended to Opnds.
for (unsigned i = 0; i != NumElts; ++i)
DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i)));
assert(VT.getVectorNumElements() == Opnds.size() &&
"Concat vector type mismatch");
return DAG.getBuildVector(VT, SDLoc(N), Opnds);
// Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
if (SDValue V = combineConcatVectorOfScalars(N, DAG))
return V;
// The extract-to-shuffle combine is only tried before vector-op
// legalization and for a legal result type.
if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
return V;
if (SDValue V = combineConcatVectorOfCasts(N, DAG))
return V;
// Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
- // nodes often generate nop CONCAT_VECTOR nodes.
- // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that
- // place the incoming vectors at the exact same location.
+ // nodes often generate nop CONCAT_VECTOR nodes. Scan the CONCAT_VECTOR
+ // operands and look for a CONCAT operations that place the incoming vectors
+ // at the exact same location.
+ //
+ // For scalable vectors, EXTRACT_SUBVECTOR indexes are implicitly scaled.
SDValue SingleSource = SDValue();
- unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements();
+ unsigned PartNumElem =
+ N->getOperand(0).getValueType().getVectorMinNumElements();
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
SDValue Op = N->getOperand(i);
// NOTE(review): presumably followed by `continue;`.
if (Op.isUndef())
// Check if this is the identity extract:
if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return SDValue();
// Find the single incoming vector for the extract_subvector.
if (SingleSource.getNode()) {
if (Op.getOperand(0) != SingleSource)
return SDValue();
} else {
SingleSource = Op.getOperand(0);
// Check the source type is the same as the type of the result.
// If not, this concat may extend the vector, so we can not
// optimize it away.
if (SingleSource.getValueType() != N->getValueType(0))
return SDValue();
// Check that we are reading from the identity index.
// Operand i must extract the subvector that starts at element i * PartNumElem.
unsigned IdentityIndex = i * PartNumElem;
if (Op.getConstantOperandAPInt(1) != IdentityIndex)
return SDValue();
// Every non-undef operand was an identity extract of the same source.
if (SingleSource.getNode())
return SingleSource;
return SDValue();
// Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find
// if the subvector can be sourced for free.
//
// V is the wide vector, Index the subvector index operand and SubVT the
// subvector type being asked for. Returns the value that already holds that
// subvector, or an empty SDValue if none is found.
//
// NOTE(review): this listing appears to have lost closing braces in
// extraction.
static SDValue getSubVectorSrc(SDValue V, SDValue Index, EVT SubVT) {
// A subvector of type SubVT inserted at exactly this index is the answer.
if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
V.getOperand(1).getValueType() == SubVT && V.getOperand(2) == Index) {
return V.getOperand(1);
// For a concat of SubVT pieces, a constant index that is a multiple of the
// piece length selects one concat operand directly.
auto *IndexC = dyn_cast<ConstantSDNode>(Index);
if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS &&
V.getOperand(0).getValueType() == SubVT &&
(IndexC->getZExtValue() % SubVT.getVectorNumElements()) == 0) {
uint64_t SubIdx = IndexC->getZExtValue() / SubVT.getVectorNumElements();
return V.getOperand(SubIdx);
return SDValue();
/// Tries to replace an extract of a wide binary operator whose operands both
/// provide the wanted subvector for free (see getSubVectorSrc) with a narrow
/// binary operator on those subvectors:
///   ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y
///
/// \param Extract The extracting node; operand 0 is the wide value and
///                operand 1 the index.
/// \param DAG     The DAG used to create the narrow operation.
/// \returns the narrow binop, or an empty SDValue if the fold does not apply.
///
/// NOTE(review): this listing appears to have lost lines in extraction; in
/// particular the final `getNode` call is cut off after `Sub0, Sub1,`, so its
/// remaining argument(s) are not visible here.
static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract,
SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue BinOp = Extract->getOperand(0);
unsigned BinOpcode = BinOp.getOpcode();
// Only single-result binary operators are handled.
if (!TLI.isBinOp(BinOpcode) || BinOp.getNode()->getNumValues() != 1)
return SDValue();
EVT VecVT = BinOp.getValueType();
SDValue Bop0 = BinOp.getOperand(0), Bop1 = BinOp.getOperand(1);
// Both operands must have the same type as the result.
if (VecVT != Bop0.getValueType() || VecVT != Bop1.getValueType())
return SDValue();
SDValue Index = Extract->getOperand(1);
EVT SubVT = Extract->getValueType(0);
// The narrow operation must be supported by the target.
if (!TLI.isOperationLegalOrCustom(BinOpcode, SubVT))
return SDValue();
SDValue Sub0 = getSubVectorSrc(Bop0, Index, SubVT);
SDValue Sub1 = getSubVectorSrc(Bop1, Index, SubVT);
// TODO: We could handle the case where only 1 operand is being inserted by
// creating an extract of the other operand, but that requires checking
// number of uses and/or costs.
if (!Sub0 || !Sub1)
return SDValue();
// We are inserting both operands of the wide binop only to extract back
// to the narrow vector size. Eliminate all of the insert/extract:
// ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y
return DAG.getNode(BinOpcode, SDLoc(Extract), SubVT, Sub0, Sub1,
/// If we are extracting a subvector produced by a wide binary operator try
/// to use a narrow binary operator and/or avoid concatenation and extraction.
///
/// \param Extract the extracting node; operand 0 is the (optionally bitcasted)
///        wide binop and operand 1 is the extraction index.
/// \param DAG used to query the target lowering and to build new nodes.
/// \returns the narrowed replacement value, or an empty SDValue() when any of
///          the bailout checks below rejects the pattern.
///
/// NOTE(review): this text comes from a patch listing in which closing braces
/// and a few statement heads/tails appear to have been dropped; confirm the
/// exact structure against the upstream file.
static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
// TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
// some of these bailouts with other transforms.
// Give the insert/extract-specific fold the first chance at this node.
if (SDValue V = narrowInsertExtractVectorBinOp(Extract, DAG))
return V;
// The extract index must be a constant, so we can map it to a concat operand.
auto *ExtractIndexC = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
if (!ExtractIndexC)
return SDValue();
// We are looking for an optionally bitcasted wide vector binary operator
// feeding an extract subvector.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue BinOp = peekThroughBitcasts(Extract->getOperand(0));
unsigned BOpcode = BinOp.getOpcode();
// Only single-result nodes that the target classifies as binops qualify.
if (!TLI.isBinOp(BOpcode) || BinOp.getNode()->getNumValues() != 1)
return SDValue();
// Exclude the fake form of fneg (fsub -0.0, x) because that is likely to be
// reduced to the unary fneg when it is visited, and we probably want to deal
// with fneg in a target-specific way.
if (BOpcode == ISD::FSUB) {
auto *C = isConstOrConstSplatFP(BinOp.getOperand(0), /*AllowUndefs*/ true);
if (C && C->getValueAPF().isNegZero())
return SDValue();
// The binop must be a vector type, so we can extract some fraction of it.
EVT WideBVT = BinOp.getValueType();
// NOTE(review): the lines below that start with '-' / '+' carry unified-diff
// markers from the patch this text was taken from: the '-' line is the old
// check and the '+' lines are its replacement.
- if (!WideBVT.isVector())
+ // The optimisations below currently assume we are dealing with fixed length
+ // vectors. It is possible to add support for scalable vectors, but at the
+ // moment we've done no analysis to prove whether they are profitable or not.
+ if (!WideBVT.isFixedLengthVector())
return SDValue();
EVT VT = Extract->getValueType(0);
unsigned ExtractIndex = ExtractIndexC->getZExtValue();
assert(ExtractIndex % VT.getVectorNumElements() == 0 &&
"Extract index is not a multiple of the vector length.");
// Bail out if this is not a proper multiple width extraction.
unsigned WideWidth = WideBVT.getSizeInBits();
unsigned NarrowWidth = VT.getSizeInBits();
if (WideWidth % NarrowWidth != 0)
return SDValue();
// Bail out if we are extracting a fraction of a single operation. This can
// occur because we potentially looked through a bitcast of the binop.
unsigned NarrowingRatio = WideWidth / NarrowWidth;
unsigned WideNumElts = WideBVT.getVectorNumElements();
if (WideNumElts % NarrowingRatio != 0)
return SDValue();
// Bail out if the target does not support a narrower version of the binop.
// NarrowBVT keeps the binop's scalar type and divides its element count by
// the narrowing ratio.
EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
WideNumElts / NarrowingRatio);
if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
return SDValue();
// If extraction is cheap, we don't need to look at the binop operands
// for concat ops. The narrow binop alone makes this transform profitable.
// We can't just reuse the original extract index operand because we may have
// bitcasted.
// ConcatOpNum: which VT-sized chunk of the wide value is being extracted.
// ExtBOIdx: the same position expressed in NarrowBVT elements.
unsigned ConcatOpNum = ExtractIndex / VT.getVectorNumElements();
unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) &&
BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) {
// extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
SDLoc DL(Extract);
SDValue NewExtIndex = DAG.getVectorIdxConstant(ExtBOIdx, DL);
// NOTE(review): the two lines below are only the trailing arguments of the
// statements that define X and Y (used by the getNode call that follows);
// the heads of those statements, and the tail of the getNode call, are not
// present in this extract.
BinOp.getOperand(0), NewExtIndex);
BinOp.getOperand(1), NewExtIndex);
SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y,
return DAG.getBitcast(VT, NarrowBinOp);
// Only handle the case where we are doubling and then halving. A larger ratio
// may require more than two narrow binops to replace the wide binop.
if (NarrowingRatio != 2)
return SDValue();
// TODO: The motivating case for this transform is an x86 AVX1 target. That
// target has temptingly almost legal versions of bitwise logic ops in 256-bit
// flavors, but no other 256-bit integer support. This could be extended to
// handle any binop, but that may require fixing/adding other folds to avoid
// codegen regressions.
if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR)
return SDValue();
// We need at least one concatenation operation of a binop operand to make
// this transform worthwhile. The concat must double the input vector sizes.
// GetSubVector yields the ConcatOpNum'th operand of a two-operand
// CONCAT_VECTORS, or an empty SDValue for anything else.
auto GetSubVector = [ConcatOpNum](SDValue V) -> SDValue {
if (V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2)
return V.getOperand(ConcatOpNum);
return SDValue();
// NOTE(review): the lambda's closing "};" is not present in this extract.
SDValue SubVecL = GetSubVector(peekThroughBitcasts(BinOp.getOperand(0)));
SDValue SubVecR = GetSubVector(peekThroughBitcasts(BinOp.getOperand(1)));
if (SubVecL || SubVecR) {
// If a binop operand was not the result of a concat, we must extract a
// half-sized operand for our new narrow binop:
// extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
// extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, IndexC)
// extract (binop X, (concat Y1, Y2)), N --> binop (extract X, IndexC), YN
SDLoc DL(Extract);
SDValue IndexC = DAG.getVectorIdxConstant(ExtBOIdx, DL);
// NOTE(review): the head of each ':' alternative below (per the comment
// above, an extract of the operand at IndexC) is missing from this extract;
// only its trailing arguments remain.
SDValue X = SubVecL ? DAG.getBitcast(NarrowBVT, SubVecL)
BinOp.getOperand(0), IndexC);
SDValue Y = SubVecR ? DAG.getBitcast(NarrowBVT, SubVecR)
BinOp.getOperand(1), IndexC);
SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y);
return DAG.getBitcast(VT, NarrowBinOp);
// No profitable narrowing was found.
return SDValue();
/// If we are extracting a subvector from a wide vector load, convert to a
/// narrow load to eliminate the extraction:
/// (extract_subvector (load wide vector)) --> (load narrow vector)
///
/// \param Extract the extracting node; operand 0 is expected to be the load
///        and operand 1 the constant extraction index.
/// \param DAG used to build the narrow load and its memory operand.
/// \returns the new narrow load, or an empty SDValue() when the pattern does
///          not match or one of the checks below rejects it.
static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
// TODO: Add support for big-endian. The offset calculation must be adjusted.
if (DAG.getDataLayout().isBigEndian())
return SDValue();
auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
// Require a non-extending, simple load.
// NOTE(review): the final operand of this '||' chain is missing from this
// extract; ExtIdx is dereferenced below, so presumably it is null-checked
// here -- confirm against upstream.
if (!Ld || Ld->getExtensionType() || !Ld->isSimple() ||
return SDValue();
// Allow targets to opt-out.
EVT VT = Extract->getValueType(0);
// We can only create byte sized loads.
if (!VT.isByteSized())
return SDValue();
unsigned Index = ExtIdx->getZExtValue();
unsigned NumElts = VT.getVectorNumElements();
// If the index is a multiple of the extract element count, we can offset the
// address by the store size multiplied by the subvector index. Otherwise if
// the scalar type is byte sized, we can just use the index multiplied by
// the element size in bytes as the offset.
unsigned Offset;
if (Index % NumElts == 0)
Offset = (Index / NumElts) * VT.getStoreSize();
else if (VT.getScalarType().isByteSized())
Offset = Index * VT.getScalarType().getStoreSize();
// NOTE(review): an "else" introducing the bailout below appears to be missing
// from this extract; as written the return would be unconditional.
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Ask the target whether narrowing this load to VT is acceptable.
if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT))
return SDValue();
// The narrow load will be offset from the base address of the old load if
// we are extracting from something besides index 0 (little-endian).
SDLoc DL(Extract);
SDValue BaseAddr = Ld->getBasePtr();
// TODO: Use "BaseIndexOffset" to make this more effective.
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
MachineFunction &MF = DAG.getMachineFunction();
// Derive the new memory operand from the original load's, at the same byte
// offset.
// NOTE(review): the remaining argument(s) of this call are missing from this
// extract.
MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset,
SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO);
// Tie the new load into the original load's memory ordering before returning.
DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
return NewLd;
/// Combine a node N that extracts a subvector of type NVT from source vector V
/// (operand 0) at constant index ExtIdx (operand 1).
///
/// The folds are tried in this order: undef source, narrowing a wide load,
/// extract-of-extract, moving a bitcast after the extract, CONCAT_VECTORS
/// source, BUILD_VECTOR source, INSERT_SUBVECTOR source, narrowing a wide
/// binop, and finally demanded-elements simplification.
///
/// \returns the replacement value, SDValue(N, 0) when
///          SimplifyDemandedVectorElts reports a change, or an empty SDValue()
///          when nothing applies.
///
/// NOTE(review): this text comes from a patch listing; closing braces and the
/// heads/tails of several multi-line calls are missing, as marked below.
SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
EVT NVT = N->getValueType(0);
SDValue V = N->getOperand(0);
uint64_t ExtIdx = N->getConstantOperandVal(1);
// Extract from UNDEF is UNDEF.
if (V.isUndef())
return DAG.getUNDEF(NVT);
// Only try to narrow a load if a load of the narrow type is supported.
if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT))
if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG))
return NarrowLoad;
// Combine an extract of an extract into a single extract_subvector.
// ext (ext X, C), 0 --> ext X, C
if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) {
if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),
V.getConstantOperandVal(1)) &&
TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) {
// NOTE(review): the last argument of this call (per the comment above, the
// inner extract's index) is missing from this extract.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, V.getOperand(0),
// Try to move vector bitcast after extract_subv by scaling extraction index:
// extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
if (V.getOpcode() == ISD::BITCAST &&
V.getOperand(0).getValueType().isVector()) {
SDValue SrcOp = V.getOperand(0);
EVT SrcVT = SrcOp.getValueType();
unsigned SrcNumElts = SrcVT.getVectorMinNumElements();
unsigned DestNumElts = V.getValueType().getVectorMinNumElements();
// Case 1: the bitcast source has a whole multiple of the destination's
// element count, so both the index and the extracted element count scale up.
if ((SrcNumElts % DestNumElts) == 0) {
unsigned SrcDestRatio = SrcNumElts / DestNumElts;
ElementCount NewExtEC = NVT.getVectorElementCount() * SrcDestRatio;
// NOTE(review): the final argument of this getVectorVT call is missing from
// this extract (presumably NewExtEC).
EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
SDLoc DL(N);
SDValue NewIndex = DAG.getVectorIdxConstant(ExtIdx * SrcDestRatio, DL);
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
V.getOperand(0), NewIndex);
return DAG.getBitcast(NVT, NewExtract);
// Case 2: the destination has a whole multiple of the source's element count,
// so the index and element count scale down; both must divide evenly.
if ((DestNumElts % SrcNumElts) == 0) {
unsigned DestSrcRatio = DestNumElts / SrcNumElts;
if ((NVT.getVectorMinNumElements() % DestSrcRatio) == 0) {
ElementCount NewExtEC = NVT.getVectorElementCount() / DestSrcRatio;
EVT ScalarVT = SrcVT.getScalarType();
if ((ExtIdx % DestSrcRatio) == 0) {
SDLoc DL(N);
unsigned IndexValScaled = ExtIdx / DestSrcRatio;
EVT NewExtVT =
EVT::getVectorVT(*DAG.getContext(), ScalarVT, NewExtEC);
if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
// NOTE(review): the head of the initializer (the node-creation call) is
// missing from this extract; only its trailing arguments remain.
SDValue NewExtract =
V.getOperand(0), NewIndex);
return DAG.getBitcast(NVT, NewExtract);
// A single remaining element can be fetched with a scalar element extract.
if (NewExtEC == 1 &&
TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) {
SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
// NOTE(review): as above, the head of this initializer is missing.
SDValue NewExtract =
V.getOperand(0), NewIndex);
return DAG.getBitcast(NVT, NewExtract);
// Extracting from a concatenation: map the index onto one concat operand.
if (V.getOpcode() == ISD::CONCAT_VECTORS) {
unsigned ExtNumElts = NVT.getVectorMinNumElements();
EVT ConcatSrcVT = V.getOperand(0).getValueType();
assert(ConcatSrcVT.getVectorElementType() == NVT.getVectorElementType() &&
"Concat and extract subvector do not change element type");
assert((ExtIdx % ExtNumElts) == 0 &&
"Extract index is not a multiple of the input vector length.");
unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorMinNumElements();
unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;
// If the concatenated source types match this extract, it's a direct
// simplification:
// extract_subvec (concat V1, V2, ...), i --> Vi
if (ConcatSrcNumElts == ExtNumElts)
return V.getOperand(ConcatOpIdx);
// If the concatenated source vectors are a multiple length of this extract,
// then extract a fraction of one of those source vectors directly from a
// concat operand. Example:
// v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y), 14 -->
// v2i8 extract_subvec v8i8 Y, 6
if (NVT.isFixedLengthVector() && ConcatSrcNumElts % ExtNumElts == 0) {
SDLoc DL(N);
// Index of the extracted piece relative to the start of the chosen operand.
unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
"Trying to extract from >1 concat operand?");
assert(NewExtIdx % ExtNumElts == 0 &&
"Extract index is not a multiple of the input vector length.");
SDValue NewIndexC = DAG.getVectorIdxConstant(NewExtIdx, DL);
// NOTE(review): the head of the statement that uses these arguments (per the
// example above, a return of the narrower extract) is missing from this
// extract.
V.getOperand(ConcatOpIdx), NewIndexC);
// The remaining folds look through bitcasts of the source.
V = peekThroughBitcasts(V);
// If the input is a build vector, try to make a smaller build vector.
if (V.getOpcode() == ISD::BUILD_VECTOR) {
EVT InVT = V.getValueType();
unsigned ExtractSize = NVT.getSizeInBits();
unsigned EltSize = InVT.getScalarSizeInBits();
// Only do this if we won't split any elements.
if (ExtractSize % EltSize == 0) {
unsigned NumElems = ExtractSize / EltSize;
EVT EltVT = InVT.getVectorElementType();
// A one-element result is represented by the scalar element type itself.
EVT ExtractVT =
NumElems == 1 ? EltVT
: EVT::getVectorVT(*DAG.getContext(), EltVT, NumElems);
// Before DAG legalization any BUILD_VECTOR may be created; afterwards it
// must be legal (or the result is a single element). Once types are
// legalized the new type must be legal as well.
if ((Level < AfterLegalizeDAG ||
(NumElems == 1 ||
TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) &&
(!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
// Convert the extract index (in NVT elements) to build_vector operands.
unsigned IdxVal = (ExtIdx * NVT.getScalarSizeInBits()) / EltSize;
if (NumElems == 1) {
SDValue Src = V->getOperand(IdxVal);
// NOTE(review): Src is a single build_vector operand and the guard compares
// its type against EltVT, yet the TRUNCATE is given InVT (the whole
// vector type) as its result type. EltVT looks like the intended type --
// confirm against upstream.
if (EltVT != Src.getValueType())
Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src);
return DAG.getBitcast(NVT, Src);
// Extract the pieces from the original build_vector.
SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N),
V->ops().slice(IdxVal, NumElems));
return DAG.getBitcast(NVT, BuildVec);
if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
// Handle only simple case where vector being inserted and vector
// being extracted are of same size.
EVT SmallVT = V.getOperand(1).getValueType();
if (!NVT.bitsEq(SmallVT))
return SDValue();
// Combine:
// (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
// Into:
// indices are equal or bit offsets are equal => V1
// otherwise => (extract_subvec V1, ExtIdx)
// NOTE(review): in the equal-offset case the code below returns a bitcast of
// V.getOperand(1), i.e. the inserted subvector (V2 in the notation above),
// not V1 as the comment above reads.
uint64_t InsIdx = V.getConstantOperandVal(2);
if (InsIdx * SmallVT.getScalarSizeInBits() ==
ExtIdx * NVT.getScalarSizeInBits())
return DAG.getBitcast(NVT, V.getOperand(1));
// NOTE(review): the first and last arguments of this call are missing from
// this extract; only the bitcast of the insert's base vector remains.
return DAG.getNode(
DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG))
return NarrowBOp;
// Last resort: simplify based on which vector elements are demanded.
if (SimplifyDemandedVectorElts(SDValue(N, 0)))
return SDValue(N, 0);
return SDValue();
/// Try to convert a wide shuffle of concatenated vectors into 2 narrow shuffles
/// followed by concatenation. Narrow vector ops may have better performance
/// than wide ops, and this can unlock further narrowing of other vector ops.
/// Targets can invert this transform later if it is not profitable.
///
/// \param Shuf the wide shuffle; both operands must be two-operand
///        CONCAT_VECTORS whose second operand is undef.
/// \param DAG used to query the target and to build the new nodes.
/// \returns concat(shuffle X, Y, Mask0), (shuffle X, Y, Mask1), or an empty
///          SDValue() if the pattern does not match or the target rejects
///          either half mask.
static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf,
SelectionDAG &DAG) {
SDValue N0 = Shuf->getOperand(0), N1 = Shuf->getOperand(1);
if (N0.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
N1.getOpcode() != ISD::CONCAT_VECTORS || N1.getNumOperands() != 2 ||
!N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef())
return SDValue();
// Split the wide shuffle mask into halves. Any mask element that is accessing
// operand 1 is offset down to account for narrowing of the vectors.
ArrayRef<int> Mask = Shuf->getMask();
EVT VT = Shuf->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
// Both half masks start out as all-undef (-1).
SmallVector<int, 16> Mask0(HalfNumElts, -1);
SmallVector<int, 16> Mask1(HalfNumElts, -1);
for (unsigned i = 0; i != NumElts; ++i) {
// NOTE(review): the statement guarded by this 'if' (presumably a 'continue'
// that leaves undef lanes as -1) is missing from this extract.
if (Mask[i] == -1)
// Indices into operand 1 (>= NumElts) are rebased by HalfNumElts.
int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts;
// NOTE(review): an 'else' between the two assignments below appears to be
// missing; the low half of the result feeds Mask0, the high half Mask1.
if (i < HalfNumElts)
Mask0[i] = M;
Mask1[i - HalfNumElts] = M;
// Ask the target if this is a valid transform.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// NOTE(review): the final argument of this getVectorVT call (the element
// count of the half-width type) is missing from this extract.
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
if (!TLI.isShuffleMaskLegal(Mask0, HalfVT) ||
!TLI.isShuffleMaskLegal(Mask1, HalfVT))
return SDValue();
// shuffle (concat X, undef), (concat Y, undef), Mask -->
// concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1)
SDValue X = N0.getOperand(0), Y = N1.getOperand(0);
SDLoc DL(Shuf);
SDValue Shuf0 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask0);
SDValue Shuf1 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask1);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Shuf0, Shuf1);
// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
// or turn a shuffle of a single concat into simpler shuffle then concat.
//
// N is cast to ShuffleVectorSDNode unconditionally, and N0.getOperand(0) is
// read without checking N0's opcode, so the caller is presumably expected to
// have verified that N is a shuffle whose first operand is a concat -- confirm
// at the call site.
// Returns the new CONCAT_VECTORS node, or an empty SDValue() when a mask
// chunk is not an exact copy of a single concat operand.
//
// NOTE(review): this text comes from a patch listing; closing braces and some
// statements ('continue', 'else' branches, call arguments) are missing, as
// marked below.
static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
ArrayRef<int> Mask = SVN->getMask();
// Operands of the concat being assembled, one per mask chunk.
SmallVector<SDValue, 4> Ops;
EVT ConcatVT = N0.getOperand(0).getValueType();
unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements();
unsigned NumConcats = NumElts / NumElemsPerConcat;
auto IsUndefMaskElt = [](int i) { return i == -1; };
// Special case: shuffle(concat(A,B)) can be more efficiently represented
// as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high
// half vector elements.
if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() &&
llvm::all_of(Mask.slice(NumElemsPerConcat, NumElemsPerConcat),
IsUndefMaskElt)) {
// NOTE(review): the second shuffle operand of this call is missing from this
// extract (per the comment above, B, i.e. the other concat operand).
N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0),
Mask.slice(0, NumElemsPerConcat));
N1 = DAG.getUNDEF(ConcatVT);
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1);
// Look at every vector that's inserted. We're looking for exact
// subvector-sized copies from a concatenated vector
for (unsigned I = 0; I != NumConcats; ++I) {
unsigned Begin = I * NumElemsPerConcat;
ArrayRef<int> SubMask = Mask.slice(Begin, NumElemsPerConcat);
// Make sure we're dealing with a copy.
// NOTE(review): the body of this all-undef case (what is pushed onto Ops and
// the 'continue' that follows) is missing from this extract.
if (llvm::all_of(SubMask, IsUndefMaskElt)) {
// OpIdx: the single concat operand this chunk copies from; -1 until known.
int OpIdx = -1;
for (int i = 0; i != (int)NumElemsPerConcat; ++i) {
// NOTE(review): the statement guarded by this 'if' (presumably 'continue')
// is missing from this extract.
if (IsUndefMaskElt(SubMask[i]))
// Each defined lane must keep its position within the operand...
if ((SubMask[i] % (int)NumElemsPerConcat) != i)
return SDValue();
// ...and every lane of the chunk must come from the same operand.
int EltOpIdx = SubMask[i] / NumElemsPerConcat;
if (0 <= OpIdx && EltOpIdx != OpIdx)
return SDValue();
OpIdx = EltOpIdx;
assert(0 <= OpIdx && "Unknown concat_vectors op");
// Operand indices past N0's operand count refer to N1's operands.
// NOTE(review): the N0 branch (the push for OpIdx within N0) and the 'else'
// are missing from this extract; only the N1 push remains.
if (OpIdx < (int)N0.getNumOperands())
Ops.push_back(N1.getOperand(OpIdx - N0.getNumOperands()));
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
// a simplification in some sense, but it isn't appropriate in general: some
// BUILD_VECTORs are substantially cheaper than others. The general case
// of a BUILD_VECTOR requires inserting each element individually (or
// performing the equivalent in a temporary stack variable). A BUILD_VECTOR of
// all constants is a single constant pool load. A BUILD_VECTOR where each
// element is identical is a splat. A BUILD_VECTOR where most of the operands
// are undef lowers to a small number of element insertions.
// To deal with this, we currently use a bunch of mostly arbitrary heuristics.
// We don't fold shuffles where one side is a non-zero constant, and we don't
// fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate
// non-constant operands. This seems to work out reasonably well in practice.
//
// Each shuffle input must be a BUILD_VECTOR or SCALAR_TO_VECTOR for every
// lane the mask reads from it. Returns the merged BUILD_VECTOR of type VT, or
// an empty SDValue() when an input cannot be combined or a heuristic rejects
// the fold.
static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG,
const TargetLowering &TLI) {
EVT VT = SVN->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
SDValue N0 = SVN->getOperand(0);
SDValue N1 = SVN->getOperand(1);
// Both inputs must have no other users.
if (!N0->hasOneUse())
return SDValue();
// If only one of N0,N1 is constant, bail out if it is not ALL_ZEROS as
// discussed above.
if (!N1.isUndef()) {
if (!N1->hasOneUse())
return SDValue();
bool N0AnyConst = isAnyConstantBuildVector(N0);
bool N1AnyConst = isAnyConstantBuildVector(N1);
if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode()))
return SDValue();
if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode()))
return SDValue();
// If both inputs are splats of the same value then we can safely merge this
// to a single BUILD_VECTOR with undef elements based on the shuffle mask.
bool IsSplat = false;
auto *BV0 = dyn_cast<BuildVectorSDNode>(N0);
auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
if (BV0 && BV1)
if (SDValue Splat0 = BV0->getSplatValue())
IsSplat = (Splat0 == BV1->getSplatValue());
// Ops: the scalar chosen for each result lane.
// DuplicateOps: non-constant scalars seen so far, to detect reuse.
SmallVector<SDValue, 8> Ops;
SmallSet<SDValue, 16> DuplicateOps;
for (int M : SVN->getMask()) {
// Lanes with a negative mask value stay undef.
SDValue Op = DAG.getUNDEF(VT.getScalarType());
if (M >= 0) {
// Mask values >= NumElts select from N1; rebase them to a lane index.
int Idx = M < (int)NumElts ? M : M - NumElts;
SDValue &S = (M < (int)NumElts ? N0 : N1);
if (S.getOpcode() == ISD::BUILD_VECTOR) {
Op = S.getOperand(Idx);
} else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) {
// Only lane 0 of a SCALAR_TO_VECTOR is taken as defined.
SDValue Op0 = S.getOperand(0);
Op = Idx == 0 ? Op0 : DAG.getUNDEF(Op0.getValueType());
} else {
// Operand can't be combined - bail out.
return SDValue();
// Don't duplicate a non-constant BUILD_VECTOR operand unless we're
// generating a splat; semantically, this is fine, but it's likely to
// generate low-quality code if the target can't reconstruct an appropriate
// shuffle.
if (!Op.isUndef() && !isa<ConstantSDNode>(Op) && !isa<ConstantFPSDNode>(Op))
if (!IsSplat && !DuplicateOps.insert(Op).second)
return SDValue();
// NOTE(review): the statement that appends Op to Ops, and the loop's closing
// brace, are not visible in this extract.
// BUILD_VECTOR requires all inputs to be of the same type, find the
// maximum type and extend them all.
EVT SVT = VT.getScalarType();
if (SVT.isInteger())
for (SDValue &Op : Ops)
SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
// Widen with zero-extension where the target says it is free, otherwise
// sign-extend.
if (SVT != VT.getScalarType())
for (SDValue &Op : Ops)
Op = TLI.isZExtFree(Op.getValueType(), SVT)
? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT)
: DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT);
return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
// Match shuffles that can be converted to any_vector_extend_in_reg.
// This is often generated during legalization.
// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src))
// TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case.
//
// Only integer vectors on little-endian targets are handled. When
// LegalOperations is set, the new node is only created if the target reports
// ANY_EXTEND_VECTOR_INREG as legal or custom for the widened type.
// Returns the replacement bitcast back to VT, or an empty SDValue() if no
// power-of-2 scale matches.
static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG,
const TargetLowering &TLI,
bool LegalOperations) {
EVT VT = SVN->getValueType(0);
bool IsBigEndian = DAG.getDataLayout().isBigEndian();
// TODO Add support for big-endian when we have a test case.
if (!VT.isInteger() || IsBigEndian)
return SDValue();
unsigned NumElts = VT.getVectorNumElements();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
ArrayRef<int> Mask = SVN->getMask();
SDValue N0 = SVN->getOperand(0);
// shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32))
// NOTE(review): the 'continue' statements after the two inner 'if's and the
// lambda's closing "};" are missing from this extract. Going by the example
// above, undef lanes are accepted and lane i (for i a multiple of Scale)
// must read source element i / Scale; any other defined lane fails.
auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) {
for (unsigned i = 0; i != NumElts; ++i) {
if (Mask[i] < 0)
if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
return false;
return true;
// Attempt to match a '*_extend_vector_inreg' shuffle, we just search for
// power-of-2 extensions as they are the most likely.
for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
// Check for non power of 2 vector sizes
// NOTE(review): the 'continue' statements guarded by the next two 'if's are
// missing from this extract.
if (NumElts % Scale != 0)
if (!isAnyExtend(Scale))
// The widened type has Scale-times wider elements and NumElts / Scale lanes.
EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
// Never create an illegal type. Only create unsupported operations if we
// are pre-legalization.
if (TLI.isTypeLegal(OutVT))
if (!LegalOperations ||
TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
// NOTE(review): the head of the inner node-creation call is missing from
// this extract; only its trailing arguments remain on the second line.
return DAG.getBitcast(VT,
SDLoc(SVN), OutVT, N0));
return SDValue();
// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
// each source element of a large type into the lowest elements of a smaller
// destination type. This is often generated during legalization.
// If the source node itself was a '*_extend_vector_inreg' node then we should
// then be able to remove it.
//
// Only integer vectors on little-endian targets are handled. Returns a
// bitcast of the extension's source to VT when the shuffle undoes the
// extension at the same scale, otherwise an empty SDValue().
static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG) {
EVT VT = SVN->getValueType(0);
bool IsBigEndian = DAG.getDataLayout().isBigEndian();
// TODO Add support for big-endian when we have a test case.
if (!VT.isInteger() || IsBigEndian)
return SDValue();
SDValue N0 = peekThroughBitcasts(SVN->getOperand(0));
unsigned Opcode = N0.getOpcode();
// NOTE(review): the condition guarding this bailout is missing from this
// extract; per the header comment it presumably rejects any Opcode that is
// not a '*_extend_vector_inreg' node -- confirm against upstream.
return SDValue();
// N00 is the value that was extended.
SDValue N00 = N0.getOperand(0);
ArrayRef<int> Mask = SVN->getMask();
unsigned NumElts = VT.getVectorNumElements();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits();
unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits();
if (ExtDstSizeInBits % ExtSrcSizeInBits != 0)
return SDValue();
// How many source-sized elements fit into one extended element.
unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits;
// (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2,-1,-1>
// (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1>
// (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1>
// NOTE(review): the 'continue' statements after the two inner 'if's and the
// lambda's closing "};" are missing from this extract. Going by the examples
// above, undef lanes are accepted and lane i must read element i * Scale
// while that stays below NumElts; any other defined lane fails.
auto isTruncate = [&Mask, &NumElts](unsigned Scale) {
for (unsigned i = 0; i != NumElts; ++i) {
if (Mask[i] < 0)
if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
return false;
return true;
// At the moment we just handle the case where we've truncated back to the
// same size as before the extension.
// TODO: handle more extension/truncation cases as cases arise.
if (EltSizeInBits != ExtSrcSizeInBits)
return SDValue();
// We can remove *extend_vector_inreg only if the truncation happens at
// the same scale as the extension.
if (isTruncate(ExtScale))
return DAG.getBitcast(VT, N00);
return SDValue();
// Combine shuffles of splat-shuffles of the form:
// shuffle (shuffle V, undef, splat-mask), undef, M
// If splat-mask contains undef elements, we need to be careful about
// introducing undef's in the folded mask which are not the result of composing
// the masks of the shuffles.
//
// Returns the inner splat-shuffle itself when that is safe, otherwise a new
// shuffle of the splat's operands with the composed mask; returns an empty
// SDValue() when Shuf's second operand is not undef or its first operand is
// not a splat shuffle.
static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf,
SelectionDAG &DAG) {
if (!Shuf->getOperand(1).isUndef())
return SDValue();
auto *Splat = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
if (!Splat || !Splat->isSplat())
return SDValue();
ArrayRef<int> ShufMask = Shuf->getMask();
ArrayRef<int> SplatMask = Splat->getMask();
assert(ShufMask.size() == SplatMask.size() && "Mask length mismatch");
// Prefer simplifying to the splat-shuffle, if possible. This is legal if
// every undef mask element in the splat-shuffle has a corresponding undef
// element in the user-shuffle's mask or if the composition of mask elements
// would result in undef.
// Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
// * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
// In this case it is not legal to simplify to the splat-shuffle because we
// may be exposing the users of the shuffle an undef element at index 1
// which was not there before the combine.
// * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
// In this case the composition of masks yields SplatMask, so it's ok to
// simplify to the splat-shuffle.
// * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
// In this case the composed mask includes all undef elements of SplatMask
// and in addition sets element zero to undef. It is safe to simplify to
// the splat-shuffle.
// The check fails for any lane i where the user mask is defined, the splat
// mask is undef at i, and the composed element SplatMask[UserMask[i]] is
// defined.
// NOTE(review): the lambda's closing "};" is missing from this extract.
auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask,
ArrayRef<int> SplatMask) {
for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
if (UserMask[i] != -1 && SplatMask[i] == -1 &&
SplatMask[UserMask[i]] != -1)
return false;
return true;
if (CanSimplifyToExistingSplat(ShufMask, SplatMask))
return Shuf->getOperand(0);
// Create a new shuffle with a mask that is composed of the two shuffles'
// masks.
SmallVector<int, 32> NewMask;
for (int Idx : ShufMask)
NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);
// NOTE(review): the final argument of this call (presumably NewMask) is
// missing from this extract.
return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
Splat->getOperand(0), Splat->getOperand(1),
/// Combine shuffle of shuffle of the form:
/// shuf (shuf X, undef, InnerMask), undef, OuterMask --> splat X
///
/// \param OuterShuf the outer shuffle; its second operand, and the second
///        operand of the inner shuffle feeding it, must both be undef.
/// \param DAG used to query mask legality and to build the new shuffle.
/// \returns a single shuffle of the inner shuffle's operands using the
///          composed mask, or an empty SDValue() if the composition is not a
///          splat or the target rejects the composed mask.
static SDValue formSplatFromShuffles(ShuffleVectorSDNode *OuterShuf,
SelectionDAG &DAG) {
if (!OuterShuf->getOperand(1).isUndef())
return SDValue();
auto *InnerShuf = dyn_cast<ShuffleVectorSDNode>(OuterShuf->getOperand(0));
if (!InnerShuf || !InnerShuf->getOperand(1).isUndef())
return SDValue();
ArrayRef<int> OuterMask = OuterShuf->getMask();
ArrayRef<int> InnerMask = InnerShuf->getMask();
unsigned NumElts = OuterMask.size();
assert(NumElts == InnerMask.size() && "Mask length mismatch");
// The composed mask starts all-undef; SplatIndex is the one source element
// every defined lane must agree on (-1 until the first defined lane).
SmallVector<int, 32> CombinedMask(NumElts, -1);
int SplatIndex = -1;
for (unsigned i = 0; i != NumElts; ++i) {
// Undef lanes remain undef.
// NOTE(review): the 'continue' statements guarded by the two '== -1' checks
// in this loop are missing from this extract.
int OuterMaskElt = OuterMask[i];
if (OuterMaskElt == -1)
// Peek through the shuffle masks to get the underlying source element.
int InnerMaskElt = InnerMask[OuterMaskElt];
if (InnerMaskElt == -1)
// Initialize the splatted element.
if (SplatIndex == -1)
SplatIndex = InnerMaskElt;
// Non-matching index - this is not a splat.
if (SplatIndex != InnerMaskElt)
return SDValue();
CombinedMask[i] = InnerMaskElt;
// After the loop the mask is either entirely undef or a splat of one index.
assert((all_of(CombinedMask, [](int M) { return M == -1; }) ||
getSplatIndex(CombinedMask) != -1) &&
"Expected a splat mask");
// TODO: The transform may be a win even if the mask is not legal.
EVT VT = OuterShuf->getValueType(0);
assert(VT == InnerShuf->getValueType(0) && "Expected matching shuffle types");
if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(CombinedMask, VT))
return SDValue();
return DAG.getVectorShuffle(VT, SDLoc(OuterShuf), InnerShuf->getOperand(0),
InnerShuf->getOperand(1), CombinedMask);
/// If the shuffle mask is taking exactly one element from the first vector
/// operand and passing through all other elements from the second vector
/// operand, return the index of the mask element that is choosing an element
/// from the first operand. Otherwise, return -1.
///
/// \param Mask the shuffle mask; values in [0, size) select from operand 0 and
///        a value of i + size at position i passes lane i of operand 1
///        through unchanged.
/// \returns the position in Mask of the single operand-0 element, or -1. An
///          undef (-1) mask element also makes the result -1, because it is
///          neither in [0, size) nor equal to i + size.
static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
int MaskSize = Mask.size();
// Position of the one lane taken from operand 0; -1 until it is found.
int EltFromOp0 = -1;
// TODO: This does not match if there are undef elements in the shuffle mask.
// Should we ignore undefs in the shuffle mask instead? The trade-off is
// removing an instruction (a shuffle), but losing the knowledge that some
// vector lanes are not needed.
for (int i = 0; i != MaskSize; ++i) {
if (Mask[i] >= 0 && Mask[i] < MaskSize) {
// We're looking for a shuffle of exactly one element from operand 0.
if (EltFromOp0 != -1)
return -1;
EltFromOp0 = i;
} else if (Mask[i] != i + MaskSize) {
// Nothing from operand 1 can change lanes.
return -1;
// -1 here means no lane was taken from operand 0 at all.
return EltFromOp0;
/// If a shuffle inserts exactly one element from a source vector operand into
/// another vector operand and we can access the specified element as a scalar,
/// then we can eliminate the shuffle.
///
/// \param Shuf the shuffle to examine; both operand orders are tried.
/// \param DAG used to build the replacement INSERT_VECTOR_ELT.
/// \returns an INSERT_VECTOR_ELT of the scalar into the pass-through operand,
///          or an empty SDValue() if the mask does not have the required
///          shape or the single element does not come from an
///          INSERT_VECTOR_ELT with a matching constant index.
static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
SelectionDAG &DAG) {
// First, check if we are taking one element of a vector and shuffling that
// element into another vector.
ArrayRef<int> Mask = Shuf->getMask();
SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end());
SDValue Op0 = Shuf->getOperand(0);
SDValue Op1 = Shuf->getOperand(1);
int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask);
if (ShufOp0Index == -1) {
// Commute mask and check again.
// NOTE(review): as shown, CommutedMask is still an unmodified copy of Mask at
// this point; the statement that actually commutes it is not visible in this
// extract -- confirm against upstream.
ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask);
if (ShufOp0Index == -1)
return SDValue();
// Commute operands to match the commuted shuffle mask.
std::swap(Op0, Op1);
Mask = CommutedMask;
// The shuffle inserts exactly one element from operand 0 into operand 1.
// Now see if we can access that element as a scalar via a real insert element
// instruction.
// TODO: We can try harder to locate the element as a scalar. Examples: it
// could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant.
assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() &&
"Shuffle mask value must be from operand 0");
if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT)
return SDValue();
// The insert must use a constant index equal to the element the shuffle
// reads from operand 0.
auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2));
if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index])
return SDValue();
// There's an existing insertelement with constant insertion index, so we
// don't need to check the legality/profitability of a replacement operation
// that differs at most in the constant value. The target should be able to
// lower any of those in a similar way. If not, legalization will expand this
// to a scalar-to-vector plus shuffle.
// Note that the shuffle may move the scalar from the position that the insert
// element used. Therefore, our new insert element occurs at the shuffle's
// mask index value, not the insert's index value.
// shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf));
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
Op1, Op0.getOperand(1), NewInsIndex);
/// If we have a unary shuffle of a shuffle, see if it can be folded away
/// completely. This has the potential to lose undef knowledge because the first
/// shuffle may not have an undef mask element where the second one does. So
/// only call this after doing simplifications based on demanded elements.
///
/// \param Shuf the outer shuffle; its first operand must itself be a shuffle
///        and its second operand must be undef.
/// \returns the inner shuffle (operand 0 of Shuf) when the outer shuffle
///          selects, lane for lane, the same elements the inner shuffle
///          already produces; otherwise an empty SDValue().
static SDValue simplifyShuffleOfShuffle(ShuffleVectorSDNode *Shuf) {
// shuf (shuf0 X, Y, Mask0), undef, Mask
auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
if (!Shuf0 || !Shuf->getOperand(1).isUndef())
return SDValue();
ArrayRef<int> Mask = Shuf->getMask();
ArrayRef<int> Mask0 = Shuf0->getMask();
for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
// Ignore undef elements.
// NOTE(review): the statement guarded by this 'if' (presumably 'continue') is
// missing from this extract.
if (Mask[i] == -1)
assert(Mask[i] >= 0 && Mask[i] < e && "Unexpected shuffle mask value");
// Is the element of the shuffle operand chosen by this shuffle the same as
// the element chosen by the shuffle operand itself?
if (Mask0[Mask[i]] != Mask0[i])
return SDValue();
// Every element of this shuffle is identical to the result of the previous
// shuffle, so we can replace this value.
return Shuf->getOperand(0);
/// Try to simplify or canonicalize a VECTOR_SHUFFLE node.
///
/// The folds are attempted in order: undef/duplicate-operand
/// canonicalization, shuffle-of-insert and splat folds, demanded-elements
/// simplification, extend/truncate style shuffles, shuffles of concatenations,
/// shuffles of scalar sources, merging with a bitcasted inner shuffle, and
/// finally merging a shuffle-of-shuffle pair into a single shuffle.
///
/// \param N The VECTOR_SHUFFLE node being visited.
/// \returns The replacement value, SDValue(N, 0) if \p N was updated in place,
///          or an empty SDValue if no fold applied.
SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
  EVT VT = N->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");

  // Canonicalize shuffle undef, undef -> undef
  if (N0.isUndef() && N1.isUndef())
    return DAG.getUNDEF(VT);

  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);

  // Canonicalize shuffle v, v -> v, undef
  if (N0 == N1) {
    SmallVector<int, 8> NewMask;
    for (unsigned i = 0; i != NumElts; ++i) {
      int Idx = SVN->getMaskElt(i);
      // Redirect references to the (identical) second operand to the first.
      if (Idx >= (int)NumElts) Idx -= NumElts;
      NewMask.push_back(Idx);
    }
    return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), NewMask);
  }

  // Canonicalize shuffle undef, v -> v, undef.  Commute the shuffle mask.
  if (N0.isUndef())
    return DAG.getCommutedVectorShuffle(*SVN);

  // Remove references to rhs if it is undef
  if (N1.isUndef()) {
    bool Changed = false;
    SmallVector<int, 8> NewMask;
    for (unsigned i = 0; i != NumElts; ++i) {
      int Idx = SVN->getMaskElt(i);
      if (Idx >= (int)NumElts) {
        Idx = -1;
        Changed = true;
      }
      NewMask.push_back(Idx);
    }
    if (Changed)
      return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
  }

  if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
    return InsElt;

  // A shuffle of a single vector that is a splatted value can always be folded.
  if (SDValue V = combineShuffleOfSplatVal(SVN, DAG))
    return V;

  if (SDValue V = formSplatFromShuffles(SVN, DAG))
    return V;

  // If it is a splat, check if the argument vector is another splat or a
  // build_vector.
  if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
    int SplatIndex = SVN->getSplatIndex();
    if (N0.hasOneUse() && TLI.isExtractVecEltCheap(VT, SplatIndex) &&
        TLI.isBinOp(N0.getOpcode()) && N0.getNode()->getNumValues() == 1) {
      // splat (vector_bo L, R), Index -->
      // splat (scalar_bo (extelt L, Index), (extelt R, Index))
      SDValue L = N0.getOperand(0), R = N0.getOperand(1);
      SDLoc DL(N);
      EVT EltVT = VT.getScalarType();
      SDValue Index = DAG.getVectorIdxConstant(SplatIndex, DL);
      SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index);
      SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index);
      // Carry the vector binop's flags over to the scalar binop.
      SDValue NewBO = DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR,
                                  N0.getNode()->getFlags());
      SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO);
      SmallVector<int, 16> ZeroMask(VT.getVectorNumElements(), 0);
      return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask);
    }

    // If this is a bit convert that changes the element type of the vector but
    // not the number of vector elements, look through it.  Be careful not to
    // look through conversions that change things like v4f32 to v2f64.
    SDNode *V = N0.getNode();
    if (V->getOpcode() == ISD::BITCAST) {
      SDValue ConvInput = V->getOperand(0);
      if (ConvInput.getValueType().isVector() &&
          ConvInput.getValueType().getVectorNumElements() == NumElts)
        V = ConvInput.getNode();
    }

    if (V->getOpcode() == ISD::BUILD_VECTOR) {
      assert(V->getNumOperands() == NumElts &&
             "BUILD_VECTOR has wrong number of operands");
      SDValue Base;
      bool AllSame = true;
      // Find the first non-undef operand.
      for (unsigned i = 0; i != NumElts; ++i) {
        if (!V->getOperand(i).isUndef()) {
          Base = V->getOperand(i);
          break;
        }
      }
      // Splat of <u, u, u, u>, return <u, u, u, u>
      if (!Base.getNode())
        return N0;
      for (unsigned i = 0; i != NumElts; ++i) {
        if (V->getOperand(i) != Base) {
          AllSame = false;
          break;
        }
      }
      // Splat of <x, x, x, x>, return <x, x, x, x>
      if (AllSame)
        return N0;

      // Canonicalize any other splat as a build_vector.
      SDValue Splatted = V->getOperand(SplatIndex);
      SmallVector<SDValue, 8> Ops(NumElts, Splatted);
      SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);

      // We may have jumped through bitcasts, so the type of the
      // BUILD_VECTOR may not match the type of the shuffle.
      if (V->getValueType(0) != VT)
        NewBV = DAG.getBitcast(VT, NewBV);
      return NewBV;
    }
  }

  // Simplify source operands based on shuffle mask.
  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
    return SDValue(N, 0);

  // This is intentionally placed after demanded elements simplification because
  // it could eliminate knowledge of undef elements created by this shuffle.
  if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN))
    return ShufOp;

  // Match shuffles that can be converted to any_vector_extend_in_reg.
  if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))
    return V;

  // Combine "truncate_vector_in_reg" style shuffles.
  if (SDValue V = combineTruncationShuffle(SVN, DAG))
    return V;

  if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
      Level < AfterLegalizeVectorOps &&
      (N1.isUndef() ||
       (N1.getOpcode() == ISD::CONCAT_VECTORS &&
        N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {
    if (SDValue V = partitionShuffleOfConcats(N, DAG))
      return V;
  }

  // A shuffle of a concat of the same narrow vector can be reduced to use
  // only low-half elements of a concat with undef:
  // shuf (concat X, X), undef, Mask --> shuf (concat X, undef), undef, Mask'
  if (N0.getOpcode() == ISD::CONCAT_VECTORS && N1.isUndef() &&
      N0.getNumOperands() == 2 &&
      N0.getOperand(0) == N0.getOperand(1)) {
    int HalfNumElts = (int)NumElts / 2;
    SmallVector<int, 8> NewMask;
    for (unsigned i = 0; i != NumElts; ++i) {
      int Idx = SVN->getMaskElt(i);
      if (Idx >= HalfNumElts) {
        assert(Idx < (int)NumElts && "Shuffle mask chooses undef op");
        Idx -= HalfNumElts;
      }
      NewMask.push_back(Idx);
    }
    if (TLI.isShuffleMaskLegal(NewMask, VT)) {
      SDValue UndefVec = DAG.getUNDEF(N0.getOperand(0).getValueType());
      SDValue NewCat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
                                   N0.getOperand(0), UndefVec);
      return DAG.getVectorShuffle(VT, SDLoc(N), NewCat, N1, NewMask);
    }
  }

  // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
  // see combineShuffleOfScalars for the patterns handled.
  if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
    if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI))
      return Res;

  // If this shuffle only has a single input that is a bitcasted shuffle,
  // attempt to merge the 2 shuffles and suitably bitcast the inputs/output
  // back to their original types.
  if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
      N1.isUndef() && Level < AfterLegalizeVectorOps &&
      TLI.isTypeLegal(VT)) {
    SDValue BC0 = peekThroughOneUseBitcasts(N0);
    if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
      EVT SVT = VT.getScalarType();
      EVT InnerVT = BC0->getValueType(0);
      EVT InnerSVT = InnerVT.getScalarType();

      // Determine which shuffle works with the smaller scalar type.
      EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
      EVT ScaleSVT = ScaleVT.getScalarType();

      if (TLI.isTypeLegal(ScaleVT) &&
          0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
          0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
        int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
        int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();

        // Scale the shuffle masks to the smaller scalar type.
        ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
        SmallVector<int, 8> InnerMask;
        SmallVector<int, 8> OuterMask;
        narrowShuffleMaskElts(InnerScale, InnerSVN->getMask(), InnerMask);
        narrowShuffleMaskElts(OuterScale, SVN->getMask(), OuterMask);

        // Merge the shuffle masks.
        SmallVector<int, 8> NewMask;
        for (int M : OuterMask)
          NewMask.push_back(M < 0 ? -1 : InnerMask[M]);

        // Test for shuffle mask legality over both commutations.
        SDValue SV0 = BC0->getOperand(0);
        SDValue SV1 = BC0->getOperand(1);
        bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
        if (!LegalMask) {
          // Swapping the operands requires commuting the mask to match.
          std::swap(SV0, SV1);
          ShuffleVectorSDNode::commuteMask(NewMask);
          LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
        }

        if (LegalMask) {
          SV0 = DAG.getBitcast(ScaleVT, SV0);
          SV1 = DAG.getBitcast(ScaleVT, SV1);
          return DAG.getBitcast(
              VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
        }
      }
    }
  }

  // Canonicalize shuffles according to rules:
  //  shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
  //  shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
  //  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
  if (N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
      N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
      TLI.isTypeLegal(VT)) {
    // The incoming shuffle must be of the same type as the result of the
    // current shuffle.
    assert(N1->getOperand(0).getValueType() == VT &&
           "Shuffle types don't match");

    SDValue SV0 = N1->getOperand(0);
    SDValue SV1 = N1->getOperand(1);
    bool HasSameOp0 = N0 == SV0;
    bool IsSV1Undef = SV1.isUndef();
    if (HasSameOp0 || IsSV1Undef || N0 == SV1)
      // Commute the operands of this shuffle so that next rule
      // will trigger.
      return DAG.getCommutedVectorShuffle(*SVN);
  }

  // Try to fold according to rules:
  //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
  //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
  //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
  // Don't try to fold shuffles with illegal type.
  // Only fold if this shuffle is the only user of the other shuffle.
  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) &&
      Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
    ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);

    // Don't try to fold splats; they're likely to simplify somehow, or they
    // might be free.
    if (OtherSV->isSplat())
      return SDValue();

    // The incoming shuffle must be of the same type as the result of the
    // current shuffle.
    assert(OtherSV->getOperand(0).getValueType() == VT &&
           "Shuffle types don't match");

    SDValue SV0, SV1;
    SmallVector<int, 4> Mask;
    // Compute the combined shuffle mask for a shuffle with SV0 as the first
    // operand, and SV1 as the second operand.
    for (unsigned i = 0; i != NumElts; ++i) {
      int Idx = SVN->getMaskElt(i);
      if (Idx < 0) {
        // Propagate Undef.
        Mask.push_back(Idx);
        continue;
      }

      SDValue CurrentVec;
      if (Idx < (int)NumElts) {
        // This shuffle index refers to the inner shuffle N0. Lookup the inner
        // shuffle mask to identify which vector is actually referenced.
        Idx = OtherSV->getMaskElt(Idx);
        if (Idx < 0) {
          // Propagate Undef.
          Mask.push_back(Idx);
          continue;
        }

        CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0)
                                           : OtherSV->getOperand(1);
      } else {
        // This shuffle index references an element within N1.
        CurrentVec = N1;
      }

      // Simple case where 'CurrentVec' is UNDEF.
      if (CurrentVec.isUndef()) {
        Mask.push_back(-1);
        continue;
      }

      // Canonicalize the shuffle index. We don't know yet if CurrentVec
      // will be the first or second operand of the combined shuffle.
      Idx = Idx % NumElts;
      if (!SV0.getNode() || SV0 == CurrentVec) {
        // Ok. CurrentVec is the left hand side.
        // Update the mask accordingly.
        SV0 = CurrentVec;
        Mask.push_back(Idx);
        continue;
      }

      // Bail out if we cannot convert the shuffle pair into a single shuffle.
      if (SV1.getNode() && SV1 != CurrentVec)
        return SDValue();

      // Ok. CurrentVec is the right hand side.
      // Update the mask accordingly.
      SV1 = CurrentVec;
      Mask.push_back(Idx + NumElts);
    }

    // Check if all indices in Mask are Undef. In case, propagate Undef.
    bool isUndefMask = true;
    for (unsigned i = 0; i != NumElts && isUndefMask; ++i)
      isUndefMask &= Mask[i] < 0;

    if (isUndefMask)
      return DAG.getUNDEF(VT);

    // Any operand slot that was never assigned is unused by Mask.
    if (!SV0.getNode())
      SV0 = DAG.getUNDEF(VT);
    if (!SV1.getNode())
      SV1 = DAG.getUNDEF(VT);

    // Avoid introducing shuffles with illegal mask.
    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
    return TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask, DAG);
  }

  if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
    return V;

  return SDValue();
}
/// Try to simplify a SCALAR_TO_VECTOR node whose scalar operand is an
/// EXTRACT_VECTOR_ELT with a constant index.
///
/// \param N The SCALAR_TO_VECTOR node being visited.
/// \returns A replacement value (a truncating SCALAR_TO_VECTOR, a shuffle of
///          the source vector, or an EXTRACT_SUBVECTOR of that shuffle), or an
///          empty SDValue if no fold applied.
SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
  SDValue InVal = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V, C0)) pattern
  // with a VECTOR_SHUFFLE and possible truncate.
  if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      VT.isFixedLengthVector() &&
      InVal->getOperand(0).getValueType().isFixedLengthVector()) {
    SDValue InVec = InVal->getOperand(0);
    SDValue EltNo = InVal->getOperand(1);
    auto InVecT = InVec.getValueType();
    if (ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo)) {
      // Lane 0 takes the extracted element; every other lane is undef.
      SmallVector<int, 8> NewMask(InVecT.getVectorNumElements(), -1);
      int Elt = C0->getZExtValue();
      NewMask[0] = Elt;
      // If we have an implicit truncate do truncate here as long as it's legal.
      // When it is not legal this fold is skipped.
      if (VT.getScalarType() != InVal.getValueType() &&
          InVal.getValueType().isScalarInteger() &&
          isTypeLegal(VT.getScalarType())) {
        SDValue Val =
            DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal);
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val);
      }
      if (VT.getScalarType() == InVecT.getScalarType() &&
          VT.getVectorNumElements() <= InVecT.getVectorNumElements()) {
        SDValue LegalShuffle =
            TLI.buildLegalVectorShuffle(InVecT, SDLoc(N), InVec,
                                        DAG.getUNDEF(InVecT), NewMask, DAG);
        if (LegalShuffle) {
          // If the initial vector is the correct size this shuffle is a
          // valid result.
          if (VT == InVecT)
            return LegalShuffle;
          // If not we must truncate the vector.
          if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) {
            SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(N));
            EVT SubVT = EVT::getVectorVT(*DAG.getContext(),
                                         InVecT.getVectorElementType(),
                                         VT.getVectorNumElements());
            return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT,
                               LegalShuffle, ZeroIdx);
          }
        }
      }
    }
  }

  return SDValue();
}
/// Try to simplify an INSERT_SUBVECTOR node.
///
/// Operand 0 is the vector being inserted into, operand 1 the subvector and
/// operand 2 the (constant) insertion index.
///
/// \param N The INSERT_SUBVECTOR node being visited.
/// \returns The replacement value, SDValue(N, 0) if \p N was updated in place,
///          or an empty SDValue if no fold applied.
SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  uint64_t InsIdx = N->getConstantOperandVal(2);

  // If inserting an UNDEF, just return the original vector.
  if (N1.isUndef())
    return N0;

  // If this is an insert of an extracted vector into an undef vector, we can
  // just use the input to the extract.
  if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
    return N1.getOperand(0);

  // If we are inserting a bitcast value into an undef, with the same
  // number of elements, just use the bitcast input of the extract.
  if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST &&
      N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      N1.getOperand(0).getOperand(1) == N2 &&
      N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() ==
          VT.getVectorNumElements() &&
      N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() ==
          VT.getSizeInBits()) {
    return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0));
  }

  // If both N0 and N1 are bitcast values on which insert_subvector
  // would make sense, pull the bitcast through.
  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
    SDValue CN0 = N0.getOperand(0);
    SDValue CN1 = N1.getOperand(0);
    EVT CN0VT = CN0.getValueType();
    EVT CN1VT = CN1.getValueType();
    if (CN0VT.isVector() && CN1VT.isVector() &&
        CN0VT.getVectorElementType() == CN1VT.getVectorElementType() &&
        CN0VT.getVectorNumElements() == VT.getVectorNumElements()) {
      SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
                                      CN0.getValueType(), CN0, CN1, N2);
      return DAG.getBitcast(VT, NewINSERT);
    }
  }

  // Combine INSERT_SUBVECTORs where we are inserting to the same index.
  // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
  // --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
  if (N0.getOpcode() == ISD::INSERT_SUBVECTOR &&
      N0.getOperand(1).getValueType() == N1.getValueType() &&
      N0.getOperand(2) == N2)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
                       N1, N2);

  // Eliminate an intermediate insert into an undef vector:
  // insert_subvector undef, (insert_subvector undef, X, 0), N2 -->
  // insert_subvector undef, X, N2
  if (N0.isUndef() && N1.getOpcode() == ISD::INSERT_SUBVECTOR &&
      N1.getOperand(0).isUndef() && isNullConstant(N1.getOperand(2)))
    return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0,
                       N1.getOperand(1), N2);

  // Push subvector bitcasts to the output, adjusting the index as we go.
  // insert_subvector(bitcast(v), bitcast(s), c1)
  // -> bitcast(insert_subvector(v, s, c2))
  if ((N0.isUndef() || N0.getOpcode() == ISD::BITCAST) &&
      N1.getOpcode() == ISD::BITCAST) {
    SDValue N0Src = peekThroughBitcasts(N0);
    SDValue N1Src = peekThroughBitcasts(N1);
    EVT N0SrcSVT = N0Src.getValueType().getScalarType();
    EVT N1SrcSVT = N1Src.getValueType().getScalarType();
    if ((N0.isUndef() || N0SrcSVT == N1SrcSVT) &&
        N0Src.getValueType().isVector() && N1Src.getValueType().isVector()) {
      EVT NewVT;
      SDLoc DL(N);
      // Stays empty unless the index can be rescaled exactly.
      SDValue NewIdx;
      LLVMContext &Ctx = *DAG.getContext();
      unsigned NumElts = VT.getVectorNumElements();
      unsigned EltSizeInBits = VT.getScalarSizeInBits();
      if ((EltSizeInBits % N1SrcSVT.getSizeInBits()) == 0) {
        // Source elements are narrower: scale count and index up.
        unsigned Scale = EltSizeInBits / N1SrcSVT.getSizeInBits();
        NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts * Scale);
        NewIdx = DAG.getVectorIdxConstant(InsIdx * Scale, DL);
      } else if ((N1SrcSVT.getSizeInBits() % EltSizeInBits) == 0) {
        // Source elements are wider: scale down, only if it divides evenly.
        unsigned Scale = N1SrcSVT.getSizeInBits() / EltSizeInBits;
        if ((NumElts % Scale) == 0 && (InsIdx % Scale) == 0) {
          NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts / Scale);
          NewIdx = DAG.getVectorIdxConstant(InsIdx / Scale, DL);
        }
      }
      if (NewIdx && hasOperation(ISD::INSERT_SUBVECTOR, NewVT)) {
        SDValue Res = DAG.getBitcast(NewVT, N0Src);
        Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, Res, N1Src, NewIdx);
        return DAG.getBitcast(VT, Res);
      }
    }
  }

  // Canonicalize insert_subvector dag nodes.
  // Example:
  // (insert_subvector (insert_subvector A, Idx0), Idx1)
  // -> (insert_subvector (insert_subvector A, Idx1), Idx0)
  if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() &&
      N1.getValueType() == N0.getOperand(1).getValueType()) {
    unsigned OtherIdx = N0.getConstantOperandVal(2);
    if (InsIdx < OtherIdx) {
      // Swap nodes.
      SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT,
                                  N0.getOperand(0), N1, N2);
      AddToWorklist(NewOp.getNode());
      return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()),
                         VT, NewOp, N0.getOperand(1), N0.getOperand(2));
    }
  }

  // If the input vector is a concatenation, and the insert replaces
  // one of the pieces, we can optimize into a single concat_vectors.
  if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
      N0.getOperand(0).getValueType() == N1.getValueType()) {
    unsigned Factor = N1.getValueType().getVectorNumElements();
    SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end());
    Ops[InsIdx / Factor] = N1;
    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
  }

  // Simplify source operands based on insertion.
  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
    return SDValue(N, 0);

  return SDValue();
}
/// Try to simplify an FP_TO_FP16 node.
///
/// \param N The FP_TO_FP16 node being visited.
/// \returns The original half-precision operand when \p N converts the result
///          of an FP16_TO_FP back again; otherwise an empty SDValue.
SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
  SDValue N0 = N->getOperand(0);

  // fold (fp_to_fp16 (fp16_to_fp op)) -> op
  if (N0->getOpcode() == ISD::FP16_TO_FP)
    return N0->getOperand(0);

  return SDValue();
}
/// Try to simplify an FP16_TO_FP node.
///
/// \param N The FP16_TO_FP node being visited.
/// \returns A new FP16_TO_FP that reads the unmasked value when the operand is
///          an AND with the constant 0xffff; otherwise an empty SDValue.
SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
  SDValue N0 = N->getOperand(0);

  // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op)
  if (N0->getOpcode() == ISD::AND) {
    ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1));
    if (AndConst && AndConst->getAPIntValue() == 0xffff) {
      return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0),
                         N0.getOperand(0));
    }
  }

  return SDValue();
}
/// Try to simplify a VECREDUCE_* node.
///
/// \param N The vector-reduction node being visited; operand 0 is the vector
///          being reduced.
/// \returns A replacement value, or an empty SDValue if no fold applied.
SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N0.getValueType();
  unsigned Opcode = N->getOpcode();

  // VECREDUCE over 1-element vector is just an extract.
  if (VT.getVectorNumElements() == 1) {
    SDLoc dl(N);
    SDValue Res =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
                    DAG.getVectorIdxConstant(0, dl));
    // The reduction's result type may differ from the element type.
    if (Res.getValueType() != N->getValueType(0))
      Res = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Res);
    return Res;
  }

  // On a boolean vector an and/or reduction is the same as a umin/umax
  // reduction. Convert them if the latter is legal while the former isn't.
  if (Opcode == ISD::VECREDUCE_AND || Opcode == ISD::VECREDUCE_OR) {
    unsigned NewOpcode = Opcode == ISD::VECREDUCE_AND
        ? ISD::VECREDUCE_UMIN : ISD::VECREDUCE_UMAX;
    // "Boolean" here means every element is all sign bits (0 or all-ones).
    if (!TLI.isOperationLegalOrCustom(Opcode, VT) &&
        TLI.isOperationLegalOrCustom(NewOpcode, VT) &&
        DAG.ComputeNumSignBits(N0) == VT.getScalarSizeInBits())
      return DAG.getNode(NewOpcode, SDLoc(N), N->getValueType(0), N0);
  }

  return SDValue();
}
/// Returns a vector_shuffle if it is able to transform an AND to a
/// vector_shuffle with the destination vector and a zero vector.
/// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
///      vector_shuffle V, Zero, <0, 4, 2, 4>
///
/// \param N The AND node to transform; its second operand (looking through
///          bitcasts) must be a BUILD_VECTOR of constants or undefs.
/// \returns The bitcasted shuffle on success, otherwise an empty SDValue.
SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
  assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");

  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = peekThroughBitcasts(N->getOperand(1));
  SDLoc DL(N);

  // Make sure we're not running after operation legalization where it
  // may have custom lowered the vector shuffles.
  if (LegalOperations)
    return SDValue();

  if (RHS.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  EVT RVT = RHS.getValueType();
  unsigned NumElts = RHS.getNumOperands();

  // Attempt to create a valid clear mask, splitting the mask into
  // sub elements and checking to see if each is
  // all zeros or all ones - suitable for shuffle masking.
  auto BuildClearMask = [&](int Split) {
    int NumSubElts = NumElts * Split;
    int NumSubBits = RVT.getScalarSizeInBits() / Split;

    SmallVector<int, 8> Indices;
    for (int i = 0; i != NumSubElts; ++i) {
      int EltIdx = i / Split;
      int SubIdx = i % Split;
      SDValue Elt = RHS.getOperand(EltIdx);
      // X & undef --> 0 (not undef). So this lane must be converted to choose
      // from the zero constant vector (same as if the element had all 0-bits).
      if (Elt.isUndef()) {
        Indices.push_back(i + NumSubElts);
        continue;
      }

      APInt Bits;
      if (isa<ConstantSDNode>(Elt))
        Bits = cast<ConstantSDNode>(Elt)->getAPIntValue();
      else if (isa<ConstantFPSDNode>(Elt))
        Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt();
      else
        return SDValue();

      // Extract the sub element from the constant bit mask.
      if (DAG.getDataLayout().isBigEndian())
        Bits = Bits.extractBits(NumSubBits, (Split - SubIdx - 1) * NumSubBits);
      else
        Bits = Bits.extractBits(NumSubBits, SubIdx * NumSubBits);

      // All-ones keeps the lane from LHS; zero takes it from the zero vector;
      // any other pattern cannot be expressed as a shuffle.
      if (Bits.isAllOnesValue())
        Indices.push_back(i);
      else if (Bits == 0)
        Indices.push_back(i + NumSubElts);
      else
        return SDValue();
    }

    // Let's see if the target supports this vector_shuffle.
    EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits);
    EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts);
    if (!TLI.isVectorClearMaskLegal(Indices, ClearVT))
      return SDValue();

    SDValue Zero = DAG.getConstant(0, DL, ClearVT);
    return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, DL,
                                                   DAG.getBitcast(ClearVT, LHS),
                                                   Zero, Indices));
  };

  // Determine maximum split level (byte level masking).
  int MaxSplit = 1;
  if (RVT.getScalarSizeInBits() % 8 == 0)
    MaxSplit = RVT.getScalarSizeInBits() / 8;

  // Try the coarsest split first and refine until one succeeds.
  for (int Split = 1; Split <= MaxSplit; ++Split)
    if (RVT.getScalarSizeInBits() % Split == 0)
      if (SDValue S = BuildClearMask(Split))
        return S;

  return SDValue();
}
/// If a vector binop is performed on splat values, it may be profitable to
/// extract, scalarize, and insert/splat.
///
/// \param N   The vector binary operation.
/// \param DAG The DAG used to create replacement nodes.
/// \returns A BUILD_VECTOR holding the scalarized result, or an empty SDValue
///          if the operands are not matching splats or the scalar form is not
///          considered cheap/legal by the target.
static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned Opcode = N->getOpcode();
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // TODO: Remove/replace the extract cost check? If the elements are available
  //       as scalars, then there may be no extract cost. Should we ask if
  //       inserting a scalar back into a vector is cheap instead?
  int Index0, Index1;
  SDValue Src0 = DAG.getSplatSourceVector(N0, Index0);
  SDValue Src1 = DAG.getSplatSourceVector(N1, Index1);
  if (!Src0 || !Src1 || Index0 != Index1 ||
      Src0.getValueType().getVectorElementType() != EltVT ||
      Src1.getValueType().getVectorElementType() != EltVT ||
      !TLI.isExtractVecEltCheap(VT, Index0) ||
      !TLI.isOperationLegalOrCustom(Opcode, EltVT))
    return SDValue();

  SDLoc DL(N);
  SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
  SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC);
  SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC);
  SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());

  // If all lanes but 1 are undefined, no need to splat the scalar result.
  // TODO: Keep track of undefs and use that info in the general case.
  if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() &&
      count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 &&
      count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) {
    // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
    // build_vec ..undef, (bo X, Y), undef...
    SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT));
    Ops[Index0] = ScalarBO;
    return DAG.getBuildVector(VT, DL, Ops);
  }

  // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index
  SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), ScalarBO);
  return DAG.getBuildVector(VT, DL, Ops);
}
/// Visit a binary vector operation, like ADD.
///
/// Tries, in order: constant folding, moving shuffles/splats after the binop,
/// narrowing a binop of INSERT_SUBVECTORs or CONCAT_VECTORS, and scalarizing a
/// binop of splats.
///
/// \param N A two-operand node with a vector result type.
/// \returns The replacement value, or an empty SDValue if no fold applied.
SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
  assert(N->getValueType(0).isVector() &&
         "SimplifyVBinOp only works on vectors!");

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Ops[] = {LHS, RHS};
  EVT VT = N->getValueType(0);
  unsigned Opcode = N->getOpcode();
  SDNodeFlags Flags = N->getFlags();

  // See if we can constant fold the vector operation.
  if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
          Opcode, SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags()))
    return Fold;

  // Move unary shuffles with identical masks after a vector binop:
  // VBinOp (shuffle A, Undef, Mask), (shuffle B, Undef, Mask))
  //   --> shuffle (VBinOp A, B), Undef, Mask
  // This does not require type legality checks because we are creating the
  // same types of operations that are in the original sequence. We do have to
  // restrict ops like integer div that have immediate UB (eg, div-by-zero)
  // though. This code is adapted from the identical transform in instcombine.
  if (Opcode != ISD::UDIV && Opcode != ISD::SDIV &&
      Opcode != ISD::UREM && Opcode != ISD::SREM &&
      Opcode != ISD::UDIVREM && Opcode != ISD::SDIVREM) {
    auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
    auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
    if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
        LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
        (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
      SDLoc DL(N);
      SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0),
                                     RHS.getOperand(0), Flags);
      SDValue UndefV = LHS.getOperand(1);
      return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
    }

    // Try to sink a splat shuffle after a binop with a uniform constant.
    // This is limited to cases where neither the shuffle nor the constant have
    // undefined elements because that could be poison-unsafe or inhibit
    // demanded elements analysis. It is further limited to not change a splat
    // of an inserted scalar because that may be optimized better by
    // load-folding or other target-specific behaviors.
    if (isConstOrConstSplat(RHS) && Shuf0 && is_splat(Shuf0->getMask()) &&
        Shuf0->hasOneUse() && Shuf0->getOperand(1).isUndef() &&
        Shuf0->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
      // binop (splat X), (splat C) --> splat (binop X, C)
      SDLoc DL(N);
      SDValue X = Shuf0->getOperand(0);
      SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, X, RHS, Flags);
      return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
                                  Shuf0->getMask());
    }
    if (isConstOrConstSplat(LHS) && Shuf1 && is_splat(Shuf1->getMask()) &&
        Shuf1->hasOneUse() && Shuf1->getOperand(1).isUndef() &&
        Shuf1->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
      // binop (splat C), (splat X) --> splat (binop C, X)
      SDLoc DL(N);
      SDValue X = Shuf1->getOperand(0);
      SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS, X, Flags);
      return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
                                  Shuf1->getMask());
    }
  }

  // The following pattern is likely to emerge with vector reduction ops. Moving
  // the binary operation ahead of insertion may allow using a narrower vector
  // instruction that has better performance than the wide version of the op:
  // VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z
  if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() &&
      RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() &&
      LHS.getOperand(2) == RHS.getOperand(2) &&
      (LHS.hasOneUse() || RHS.hasOneUse())) {
    SDValue X = LHS.getOperand(1);
    SDValue Y = RHS.getOperand(1);
    SDValue Z = LHS.getOperand(2);
    EVT NarrowVT = X.getValueType();
    if (NarrowVT == Y.getValueType() &&
        TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
      // (binop undef, undef) may not return undef, so compute that result.
      SDLoc DL(N);
      SDValue VecC =
          DAG.getNode(Opcode, DL, VT, DAG.getUNDEF(VT), DAG.getUNDEF(VT));
      SDValue NarrowBO = DAG.getNode(Opcode, DL, NarrowVT, X, Y);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
    }
  }

  // Make sure all but the first op are undef or constant.
  auto ConcatWithConstantOrUndef = [](SDValue Concat) {
    return Concat.getOpcode() == ISD::CONCAT_VECTORS &&
           std::all_of(std::next(Concat->op_begin()), Concat->op_end(),
                       [](const SDValue &Op) {
                         return Op.isUndef() ||
                                ISD::isBuildVectorOfConstantSDNodes(
                                    Op.getNode());
                       });
  };

  // The following pattern is likely to emerge with vector reduction ops. Moving
  // the binary operation ahead of the concat may allow using a narrower vector
  // instruction that has better performance than the wide version of the op:
  // VBinOp (concat X, undef/constant), (concat Y, undef/constant) -->
  //   concat (VBinOp X, Y), VecC
  if (ConcatWithConstantOrUndef(LHS) && ConcatWithConstantOrUndef(RHS) &&
      (LHS.hasOneUse() || RHS.hasOneUse())) {
    EVT NarrowVT = LHS.getOperand(0).getValueType();
    if (NarrowVT == RHS.getOperand(0).getValueType() &&
        TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
      SDLoc DL(N);
      unsigned NumOperands = LHS.getNumOperands();
      SmallVector<SDValue, 4> ConcatOps;
      for (unsigned i = 0; i != NumOperands; ++i) {
        // This constant fold for operands 1 and up.
        ConcatOps.push_back(DAG.getNode(Opcode, DL, NarrowVT, LHS.getOperand(i),
                                        RHS.getOperand(i)));
      }

      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
    }
  }

  if (SDValue V = scalarizeBinOpOfSplats(N, DAG))
    return V;

  return SDValue();
}
/// Try to simplify a SELECT whose condition is a SETCC by delegating to
/// SimplifySelectCC.
///
/// \param DL Debug location for any nodes created.
/// \param N0 The select condition; must be a SETCC node.
/// \param N1 The value selected when the condition is true.
/// \param N2 The value selected when the condition is false.
/// \returns The simplified value. If SimplifySelectCC produced a SELECT_CC it
///          is split back into a SETCC plus SELECT; any other result is
///          returned unchanged. Returns an empty SDValue if nothing simplified.
SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
                                    SDValue N2) {
  assert(N0.getOpcode() ==ISD::SETCC && "First argument must be a SetCC node!");

  SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2,
                                 cast<CondCodeSDNode>(N0.getOperand(2))->get());

  // If we got a simplified select_cc node back from SimplifySelectCC, then
  // break it down into a new SETCC node, and a new SELECT node, and then return
  // the SELECT node, since we were called with a SELECT node.
  if (SCC.getNode()) {
    // Check to see if we got a select_cc back (to turn into setcc/select).
    // Otherwise, just return whatever node we got back, like fabs.
    if (SCC.getOpcode() == ISD::SELECT_CC) {
      // Reuse the flags of the original SETCC for both new nodes.
      const SDNodeFlags Flags = N0.getNode()->getFlags();
      SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0),
                                  N0.getValueType(),
                                  SCC.getOperand(0), SCC.getOperand(1),
                                  SCC.getOperand(4), Flags);
      AddToWorklist(SETCC.getNode());
      SDValue SelectNode = DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC,
                                         SCC.getOperand(2), SCC.getOperand(3));
      SelectNode->setFlags(Flags);
      return SelectNode;
    }

    return SCC;
  }
  return SDValue();
}
/// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values
/// being selected between, see if we can simplify the select. Callers of this
/// should assume that TheSelect is deleted if this returns true. As such, they
/// should return the appropriate thing (e.g. the node) back to the top-level of
/// the DAG combiner loop to avoid it being looked at.
bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
SDValue RHS) {
// TheSelect is handled below both as ISD::SELECT and as ISD::SELECT_CC; LHS
// and RHS are the two values it chooses between. Every `return true` in this
// function follows a CombineTo(TheSelect, ...) replacement.
// NOTE(review): this copy of the function has no closing braces and several
// expressions stop mid-line (for instance a condition ending in "||" right
// before a return) - compare with the upstream file before changing logic.
// fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
// The select + setcc is redundant, because fsqrt returns NaN for X < 0.
if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) {
// We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?))
SDValue Sqrt = RHS;
ISD::CondCode CC;
SDValue CmpLHS;
const ConstantFPSDNode *Zero = nullptr;
// SELECT_CC carries the compare operands and the condition code itself
// (operands 0, 1 and 4); a plain select must have a SETCC as operand 0.
if (TheSelect->getOpcode() == ISD::SELECT_CC) {
CC = cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
CmpLHS = TheSelect->getOperand(0);
Zero = isConstOrConstSplatFP(TheSelect->getOperand(1));
} else {
SDValue Cmp = TheSelect->getOperand(0);
if (Cmp.getOpcode() == ISD::SETCC) {
CC = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
CmpLHS = Cmp.getOperand(0);
Zero = isConstOrConstSplatFP(Cmp.getOperand(1));
// The compare must be against zero and must test the same value that is
// fed to the fsqrt.
if (Zero && Zero->isZero() &&
Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT ||
// We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
CombineTo(TheSelect, Sqrt);
return true;
// Cannot simplify select with vector condition
if (TheSelect->getOperand(0).getValueType().isVector()) return false;
// If this is a select from two identical things, try to pull the operation
// through the select.
if (LHS.getOpcode() != RHS.getOpcode() ||
!LHS.hasOneUse() || !RHS.hasOneUse())
return false;
// If this is a load and the token chain is identical, replace the select
// of two loads with a load through a select of the address to load from.
// This triggers in things like "select bool X, 10.0, 123.0" after the FP
// constants have been dropped into the constant pool.
if (LHS.getOpcode() == ISD::LOAD) {
LoadSDNode *LLD = cast<LoadSDNode>(LHS);
LoadSDNode *RLD = cast<LoadSDNode>(RHS);
// Token chains must be identical.
if (LHS.getOperand(0) != RHS.getOperand(0) ||
// Do not let this transformation reduce the number of volatile loads.
// Be conservative for atomics for the moment
// TODO: This does appear to be legal for unordered atomics (see D66309)
!LLD->isSimple() || !RLD->isSimple() ||
// FIXME: If either is a pre/post inc/dec load,
// we'd need to split out the address adjustment.
LLD->isIndexed() || RLD->isIndexed() ||
// If this is an EXTLOAD, the VT's must match.
LLD->getMemoryVT() != RLD->getMemoryVT() ||
// If this is an EXTLOAD, the kind of extension must match.
(LLD->getExtensionType() != RLD->getExtensionType() &&
// The only exception is if one of the extensions is anyext.
LLD->getExtensionType() != ISD::EXTLOAD &&
RLD->getExtensionType() != ISD::EXTLOAD) ||
// FIXME: this discards src value information. This is
// over-conservative. It would be beneficial to be able to remember
// both potential memory locations. Since we are discarding
// src value info, don't do the transformation if the memory
// locations are not in the default address space.
LLD->getPointerInfo().getAddrSpace() != 0 ||
RLD->getPointerInfo().getAddrSpace() != 0 ||
// We can't produce a CMOV of a TargetFrameIndex since we won't
// generate the address generation required.
LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
RLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
return false;
// The loads must not depend on one another.
if (LLD->isPredecessorOf(RLD) || RLD->isPredecessorOf(LLD))
return false;
// Check that the select condition doesn't reach either load. If so,
// folding this will induce a cycle into the DAG. If not, this is safe to
// xform, so create a select of the addresses.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
// Always fail if LLD and RLD are not independent. TheSelect is a
// predecessor to all Nodes in question so we need not search past it.
// NOTE(review): nothing visible here seeds Visited or Worklist before the
// first hasPredecessorHelper call - confirm against upstream.
if (SDNode::hasPredecessorHelper(LLD, Visited, Worklist) ||
SDNode::hasPredecessorHelper(RLD, Visited, Worklist))
return false;
// Addr becomes the select (or select_cc) over the two load addresses.
SDValue Addr;
if (TheSelect->getOpcode() == ISD::SELECT) {
// We cannot do this optimization if any pair of {RLD, LLD} is a
// predecessor to {RLD, LLD, CondNode}. As we've already compared the
// Loads, we only need to check if CondNode is a successor to one of the
// loads. We can further avoid this if there's no use of their chain
// value.
// NOTE(review): CondNode is not referenced again in the visible text.
SDNode *CondNode = TheSelect->getOperand(0).getNode();
if ((LLD->hasAnyUseOfValue(1) &&
SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
(RLD->hasAnyUseOfValue(1) &&
SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
return false;
Addr = DAG.getSelect(SDLoc(TheSelect),
TheSelect->getOperand(0), LLD->getBasePtr(),
} else { // Otherwise SELECT_CC
// We cannot do this optimization if any pair of {RLD, LLD} is a
// predecessor to {RLD, LLD, CondLHS, CondRHS}. As we've already compared
// the Loads, we only need to check if CondLHS/CondRHS is a successor to
// one of the loads. We can further avoid this if there's no use of their
// chain value.
// NOTE(review): CondLHS and CondRHS are not referenced again in the
// visible text.
SDNode *CondLHS = TheSelect->getOperand(0).getNode();
SDNode *CondRHS = TheSelect->getOperand(1).getNode();
if ((LLD->hasAnyUseOfValue(1) &&
SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
(RLD->hasAnyUseOfValue(1) &&
SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
return false;
Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect),
LLD->getBasePtr(), RLD->getBasePtr(),
SDValue Load;
// It is safe to replace the two loads if they have different alignments,
// but the new load must be the minimum (most restrictive) alignment of the
// inputs.
unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment());
// Start from the left load's flags and drop the invariant and
// dereferenceable bits unless the right load has them too.
MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
if (!RLD->isInvariant())
MMOFlags &= ~MachineMemOperand::MOInvariant;
if (!RLD->isDereferenceable())
MMOFlags &= ~MachineMemOperand::MODereferenceable;
if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
// FIXME: Discards pointer and AA info.
Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
} else {
// FIXME: Discards pointer and AA info.
// When the left load is an anyext, the right load's extension kind is
// used instead.
Load = DAG.getExtLoad(
LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
: LLD->getExtensionType(),
SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
// Users of the select now use the result of the load.
CombineTo(TheSelect, Load);
// Users of the old loads now use the new load's chain. We know the
// old-load value is dead now.
CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
return true;
return false;
/// Try to fold an expression of the form (N0 cond N1) ? N2 : N3 to a shift and
/// bitwise 'and'.
/// Returns an empty SDValue whenever one of the guards below rejects the
/// pattern.
SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0,
SDValue N1, SDValue N2, SDValue N3,
ISD::CondCode CC) {
// If this is a select where the false operand is zero and the compare is a
// check of the sign bit, see if we can perform the "gzip trick":
// select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A
// select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A
EVT XType = N0.getValueType();
EVT AType = N2.getValueType();
// The false value must be constant zero and X must be at least as wide as A.
if (!isNullConstant(N3) || !XType.bitsGE(AType))
return SDValue();
// If the comparison is testing for a positive value, we have to invert
// the sign bit mask, so only do that transform if the target has a bitwise
// 'and not' instruction (the invert is free).
if (CC == ISD::SETGT && TLI.hasAndNot(N2)) {
// (X > -1) ? A : 0
// (X > 0) ? X : 0 <-- This is canonical signed max.
if (!(isAllOnesConstant(N1) || (isNullConstant(N1) && N0 == N2)))
return SDValue();
} else if (CC == ISD::SETLT) {
// (X < 0) ? A : 0
// (X < 1) ? X : 0 <-- This is un-canonicalized signed min.
if (!(isNullConstant(N1) || (isOneConstant(N1) && N0 == N2)))
return SDValue();
} else {
return SDValue();
// and (sra X, size(X)-1), A -> "and (srl X, C2), A" iff A is a single-bit
// constant.
EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
// (v & (v - 1)) == 0 holds when v has at most one bit set.
if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) {
// Logical shift that moves the sign bit of X down to the position of the
// single bit set in A.
unsigned ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1;
if (!TLI.shouldAvoidTransformToShift(XType, ShCt)) {
SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
SDValue Shift = DAG.getNode(ISD::SRL, DL, XType, N0, ShiftAmt);
if (XType.bitsGT(AType)) {
Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
if (CC == ISD::SETGT)
Shift = DAG.getNOT(DL, Shift, AType);
return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
// General case: arithmetic shift by size(X)-1, truncated to A's type when X
// is wider, and inverted for SETGT before the final AND with A.
unsigned ShCt = XType.getSizeInBits() - 1;
if (TLI.shouldAvoidTransformToShift(XType, ShCt))
return SDValue();
SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, ShiftAmt);
if (XType.bitsGT(AType)) {
Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
if (CC == ISD::SETGT)
Shift = DAG.getNOT(DL, Shift, AType);
return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
/// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)"
/// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
/// in it. This may be a win when the constant is not otherwise available
/// because it replaces two constant pool loads with one.
/// Returns an empty SDValue when any of the profitability or legality checks
/// below fails.
SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset(
const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
ISD::CondCode CC) {
if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType()))
return SDValue();
// If we are before legalize types, we want the other legalization to happen
// first (for example, to avoid messing with soft float).
auto *TV = dyn_cast<ConstantFPSDNode>(N2);
auto *FV = dyn_cast<ConstantFPSDNode>(N3);
EVT VT = N2.getValueType();
// Both selected values must be FP constants of a legal type.
if (!TV || !FV || !TLI.isTypeLegal(VT))
return SDValue();
// If a constant can be materialized without loads, this does not make sense.
if (TLI.getOperationAction(ISD::ConstantFP, VT) == TargetLowering::Legal ||
TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0), ForCodeSize) ||
TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0), ForCodeSize))
return SDValue();
// If both constants have multiple uses, then we won't need to do an extra
// load. The values are likely around in registers for other users.
if (!TV->hasOneUse() && !FV->hasOneUse())
return SDValue();
// Array layout: element 0 is the false value, element 1 the true value.
Constant *Elts[] = { const_cast<ConstantFP*>(FV->getConstantFPValue()),
const_cast<ConstantFP*>(TV->getConstantFPValue()) };
Type *FPTy = Elts[0]->getType();
const DataLayout &TD = DAG.getDataLayout();
// Create a ConstantArray of the two constants.
Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
// NOTE(review): the getConstantPool call below stops after its second
// argument in this copy - confirm the remaining arguments upstream.
SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();
// Get offsets to the 0 and 1 elements of the array, so we can select between
// them.
SDValue Zero = DAG.getIntPtrConstant(0, DL);
unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
// Despite its name, One holds the byte offset of element 1 (EltSize).
SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV));
SDValue Cond =
DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC);
// A true condition selects offset EltSize (the true value), a false one
// selects offset 0 (the false value).
SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero);
CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset);
// NOTE(review): the pointer-info argument of this getLoad call is cut off in
// this copy.
return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
DAG.getMachineFunction()), Alignment);
/// Simplify an expression of the form (N0 cond N1) ? N2 : N3
/// where 'cond' is the comparison specified by CC.
/// Returns the simplified value, or an empty SDValue when no fold applies.
/// When \p NotExtCompare is set, the fold that would yield a plain zero-extended
/// compare (true value equal to one) is skipped.
SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
SDValue N2, SDValue N3, ISD::CondCode CC,
bool NotExtCompare) {
// NOTE(review): closing braces and a few continuation lines are missing from
// this copy; nesting described in the comments below is inferred from the
// statement order only.
// (x ? y : y) -> y.
if (N2 == N3) return N2;
EVT CmpOpVT = N0.getValueType();
EVT CmpResVT = getSetCCResultType(CmpOpVT);
EVT VT = N2.getValueType();
// Constant views of the compare RHS and of both selected values (null when
// the operand is not a ConstantSDNode).
auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
auto *N3C = dyn_cast<ConstantSDNode>(N3.getNode());
// Determine if the condition we're dealing with is constant.
if (SDValue SCC = DAG.FoldSetCC(CmpResVT, N0, N1, CC, DL)) {
if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC)) {
// fold select_cc true, x, y -> x
// fold select_cc false, x, y -> y
return !(SCCC->isNullValue()) ? N2 : N3;
// Delegate to the two specialised folds before trying the local patterns.
if (SDValue V =
convertSelectOfFPConstantsToLoadOffset(DL, N0, N1, N2, N3, CC))
return V;
if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC))
return V;
// fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A)
// where y has a single bit set.
// A plaintext description would be, we can turn the SELECT_CC into an AND
// when the condition can be materialized as an all-ones register. Any
// single bit-test can be materialized as an all-ones register with
// shift-left and shift-right-arith.
if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
SDValue AndLHS = N0->getOperand(0);
auto *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
// Shift the tested bit over the sign bit.
const APInt &AndMask = ConstAndRHS->getAPIntValue();
unsigned ShCt = AndMask.getBitWidth() - 1;
if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
// NOTE(review): both getConstant calls in this fold are missing their
// trailing type argument in this copy.
SDValue ShlAmt =
DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS),
SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);
// Now arithmetic right shift it all the way over, so the result is
// either all-ones, or zero.
SDValue ShrAmt =
DAG.getConstant(ShCt, SDLoc(Shl),
SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);
return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
// fold select C, 16, 0 -> shl C, 4
// Fold: power-of-two true value with zero false value.
// Swap: the mirrored case, handled by inverting the condition code.
bool Fold = N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2();
bool Swap = N3C && isNullConstant(N2) && N3C->getAPIntValue().isPowerOf2();
if ((Fold || Swap) &&
TLI.getBooleanContents(CmpOpVT) ==
TargetLowering::ZeroOrOneBooleanContent &&
(!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT))) {
if (Swap) {
CC = ISD::getSetCCInverse(CC, CmpOpVT);
std::swap(N2C, N3C);
// If the caller doesn't want us to simplify this into a zext of a compare,
// don't do it.
if (NotExtCompare && N2C->isOne())
return SDValue();
SDValue Temp, SCC;
// zext (setcc n0, n1)
if (LegalTypes) {
SCC = DAG.getSetCC(DL, CmpResVT, N0, N1, CC);
// NOTE(review): no `else` is visible between the next two assignments to
// Temp, so as written the second always overwrites the first - confirm
// against upstream.
if (VT.bitsLT(SCC.getValueType()))
Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), VT);
Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
} else {
SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
// A true value of one needs no shift: the extended compare is the result.
if (N2C->isOne())
return Temp;
unsigned ShCt = N2C->getAPIntValue().logBase2();
if (TLI.shouldAvoidTransformToShift(VT, ShCt))
return SDValue();
// shl setcc result by log2 n2c
return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
DAG.getConstant(ShCt, SDLoc(Temp),
// select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X)
// select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X)
// select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X)
// select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X)
// select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X)
// select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X)
// select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X)
// select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X)
if (N1C && N1C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
SDValue ValueOnZero = N2;
SDValue Count = N3;
// If the condition is NE instead of E, swap the operands.
if (CC == ISD::SETNE)
std::swap(ValueOnZero, Count);
// Check if the value on zero is a constant equal to the bits in the type.
if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) {
if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) {
// If the other operand is cttz/cttz_zero_undef of N0, and cttz is
// legal, combine to just cttz.
if ((Count.getOpcode() == ISD::CTTZ ||
Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
N0 == Count.getOperand(0) &&
(!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT)))
return DAG.getNode(ISD::CTTZ, DL, VT, N0);
// If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is
// legal, combine to just ctlz.
if ((Count.getOpcode() == ISD::CTLZ ||
Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) &&
N0 == Count.getOperand(0) &&
(!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT)))
return DAG.getNode(ISD::CTLZ, DL, VT, N0);
return SDValue();
/// This is a stub for TargetLowering::SimplifySetCC.
///
/// Builds a combiner-info object from this combiner's DAG and current Level
/// and forwards all arguments, together with that object, to the target
/// lowering hook. The hook's result is returned unchanged.
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
                                   ISD::CondCode Cond, const SDLoc &DL,
                                   bool foldBooleans) {
  // The info object has to be a declared local: it is passed by name to the
  // target hook on the next line.
  TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, Level, false, this);
  return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
}
/// Given an ISD::SDIV node expressing a divide by constant, return
/// a DAG expression to select that will generate the same value by multiplying
/// by a magic number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
///
/// Returns an empty SDValue when the function is optimised for minimum size
/// or when the target hook does not produce a replacement.
SDValue DAGCombiner::BuildSDIV(SDNode *N) {
  // when optimising for minimum size, we don't want to expand a div to a mul
  // and a shift.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  SmallVector<SDNode *, 8> Built;
  if (SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, Built)) {
    // Queue every node the target hook created so the combiner visits it.
    // The loop variable deliberately does not reuse the parameter name N.
    for (SDNode *NewNode : Built)
      AddToWorklist(NewNode);
    return S;
  }

  return SDValue();
}
/// Given an ISD::SDIV node expressing a divide by constant power of 2, return a
/// DAG expression that will generate the same value by right shifting.
///
/// Returns an empty SDValue when the divisor is not a constant (or constant
/// splat), when it is zero, or when the target hook produces no replacement.
SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
  ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
  if (!C)
    return SDValue();

  // Avoid division by zero.
  if (C->isNullValue())
    return SDValue();

  SmallVector<SDNode *, 8> Built;
  if (SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, Built)) {
    // Queue every node the target hook created so the combiner visits it.
    // The loop variable deliberately does not reuse the parameter name N.
    for (SDNode *NewNode : Built)
      AddToWorklist(NewNode);
    return S;
  }

  return SDValue();
}
/// Given an ISD::UDIV node expressing a divide by constant, return a DAG
/// expression that will generate the same value by multiplying by a magic
/// number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
///
/// Returns an empty SDValue when the function is optimised for minimum size
/// or when the target hook does not produce a replacement.
SDValue DAGCombiner::BuildUDIV(SDNode *N) {
  // when optimising for minimum size, we don't want to expand a div to a mul
  // and a shift.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  SmallVector<SDNode *, 8> Built;
  if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, Built)) {
    // Queue every node the target hook created so the combiner visits it.
    // The loop variable deliberately does not reuse the parameter name N.
    for (SDNode *NewNode : Built)
      AddToWorklist(NewNode);
    return S;
  }

  return SDValue();
}
/// Determines the LogBase2 value for a non-null input value using the
/// transform: LogBase2(V) = (EltBits - 1) - ctlz(V).
///
/// All nodes are created with the value type of \p V and the location \p DL.
SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
  const EVT Ty = V.getValueType();

  // Count of leading zero bits of the input.
  SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, DL, Ty, V);

  // Index of the top bit of one scalar element, as a constant of type Ty.
  SDValue TopBitIndex = DAG.getConstant(Ty.getScalarSizeInBits() - 1, DL, Ty);

  return DAG.getNode(ISD::SUB, DL, Ty, TopBitIndex, LeadingZeros);
}
/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal, we need to find the zero of the function:
/// F(X) = A X - 1 [which has a zero at X = 1/A]
/// =>
/// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
/// does not require additional intermediate precision]
/// For the last iteration, put numerator N into it to gain more precision:
/// Result = N X_i + X_i (N - N A X_i)
///
/// \p N is the numerator and \p Op the divisor. Returns an empty SDValue when
/// the DAG is already legal, the scalar type is neither f32 nor f64, estimates
/// are disabled, or the target provides no reciprocal estimate.
SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op,
SDNodeFlags Flags) {
if (LegalDAG)
return SDValue();
// TODO: Handle half and/or extended types?
EVT VT = Op.getValueType();
if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64)
return SDValue();
// If estimates are explicitly disabled for this function, we're done.
MachineFunction &MF = DAG.getMachineFunction();
int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF);
if (Enabled == TLI.ReciprocalEstimate::Disabled)
return SDValue();
// Estimates may be explicitly enabled for this type with a custom number of
// refinement steps.
int Iterations = TLI.getDivRefinementSteps(VT, MF);
if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) {
SDLoc DL(Op);
if (Iterations) {
SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
// Newton iterations: Est = Est + Est (N - Arg * Est)
// If this is the last iteration, also multiply by the numerator.
for (int i = 0; i < Iterations; ++i) {
// MulEst is Est on ordinary steps and N * Est on the final step.
SDValue MulEst = Est;
if (i == Iterations - 1) {
MulEst = DAG.getNode(ISD::FMUL, DL, VT, N, Est, Flags);
// NOTE(review): the closing brace of the `if` above is not visible in this
// copy; the statements below are presumably executed on every iteration.
SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, MulEst, Flags);
// Subtract from N on the final step and from 1.0 otherwise.
NewEst = DAG.getNode(ISD::FSUB, DL, VT,
(i == Iterations - 1 ? N : FPOne), NewEst, Flags);
NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
Est = DAG.getNode(ISD::FADD, DL, VT, MulEst, NewEst, Flags);
} else {
// If no iterations are available, multiply with N.
Est = DAG.getNode(ISD::FMUL, DL, VT, Est, N, Flags);
return Est;
return SDValue();
/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal sqrt, we need to find the zero of the function:
///   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
///     =>
///   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
/// As a result, we precompute A/2 prior to the iteration loop.
///
/// \p Est is the starting estimate and is refined \p Iterations times. When
/// \p Reciprocal is false the refined value is multiplied by \p Arg before it
/// is returned.
SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
                                         unsigned Iterations,
                                         SDNodeFlags Flags, bool Reciprocal) {
  const EVT Ty = Arg.getValueType();
  SDLoc Loc(Arg);
  SDValue OnePointFive = DAG.getConstantFP(1.5, Loc, Ty);

  // Form 0.5 * Arg as (1.5 * Arg) - Arg, so that 1.5 is the only FP constant
  // the whole sequence needs.
  SDValue HalfArg = DAG.getNode(ISD::FMUL, Loc, Ty, OnePointFive, Arg, Flags);
  HalfArg = DAG.getNode(ISD::FSUB, Loc, Ty, HalfArg, Arg, Flags);

  // Each round computes Est * (1.5 - HalfArg * Est * Est).
  for (unsigned Round = 0; Round != Iterations; ++Round) {
    SDValue EstSquared = DAG.getNode(ISD::FMUL, Loc, Ty, Est, Est, Flags);
    SDValue Scaled =
        DAG.getNode(ISD::FMUL, Loc, Ty, HalfArg, EstSquared, Flags);
    SDValue Correction =
        DAG.getNode(ISD::FSUB, Loc, Ty, OnePointFive, Scaled, Flags);
    Est = DAG.getNode(ISD::FMUL, Loc, Ty, Est, Correction, Flags);
  }

  if (Reciprocal)
    return Est;

  // A non-reciprocal square root was requested: scale the estimate by Arg.
  return DAG.getNode(ISD::FMUL, Loc, Ty, Est, Arg, Flags);
}
/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal sqrt, we need to find the zero of the function:
///   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
///     =>
///   X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
///
/// \p Est is the starting estimate and is refined \p Iterations times (which
/// must be non-zero). When \p Reciprocal is false, the final round folds the
/// multiplication by \p Arg into its left-hand factor.
SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
                                         unsigned Iterations,
                                         SDNodeFlags Flags, bool Reciprocal) {
  const EVT Ty = Arg.getValueType();
  SDLoc Loc(Arg);
  SDValue NegThree = DAG.getConstantFP(-3.0, Loc, Ty);
  SDValue NegHalf = DAG.getConstantFP(-0.5, Loc, Ty);

  // The non-reciprocal result is only produced inside the loop, so at least
  // one round is required.
  assert(Iterations > 0);

  // Reciprocal square root round: E = (E * -0.5) * ((A * E) * E + -3.0)
  for (unsigned Round = 0; Round != Iterations; ++Round) {
    SDValue ArgEst = DAG.getNode(ISD::FMUL, Loc, Ty, Arg, Est, Flags);
    SDValue ArgEstEst = DAG.getNode(ISD::FMUL, Loc, Ty, ArgEst, Est, Flags);
    SDValue Residual =
        DAG.getNode(ISD::FADD, Loc, Ty, ArgEstEst, NegThree, Flags);

    // On the last round of a plain square root the left factor is
    // (A * E) * -0.5, reusing the A * E product; otherwise it is E * -0.5.
    const bool IsFinalRound = (Round + 1) >= Iterations;
    SDValue Scale = (!Reciprocal && IsFinalRound) ? ArgEst : Est;
    SDValue Left = DAG.getNode(ISD::FMUL, Loc, Ty, Scale, NegHalf, Flags);

    Est = DAG.getNode(ISD::FMUL, Loc, Ty, Left, Residual, Flags);
  }

  return Est;
}
/// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case
/// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if
/// Op can be zero.
/// Returns an empty SDValue when the DAG is already legal, the scalar type is
/// neither f32 nor f64, estimates are disabled, or the target provides no
/// estimate.
SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
bool Reciprocal) {
if (LegalDAG)
return SDValue();
// TODO: Handle half and/or extended types?
EVT VT = Op.getValueType();
if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64)
return SDValue();
// If estimates are explicitly disabled for this function, we're done.
MachineFunction &MF = DAG.getMachineFunction();
int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF);
if (Enabled == TLI.ReciprocalEstimate::Disabled)
return SDValue();
// Estimates may be explicitly enabled for this type with a custom number of
// refinement steps.
int Iterations = TLI.getSqrtRefinementSteps(VT, MF);
// UseOneConstNR is handed to the target hook below and then read to pick the
// refinement routine.
bool UseOneConstNR = false;
if (SDValue Est =
TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR,
Reciprocal)) {
if (Iterations) {
// Refine the target's estimate with the one- or two-constant Newton scheme.
Est = UseOneConstNR
? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
: buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
if (!Reciprocal) {
// The estimate is now completely wrong if the input was exactly 0.0 or
// possibly a denormal. Force the answer to 0.0 for those cases.
SDLoc DL(Op);
EVT CCVT = getSetCCResultType(VT);
// Vector types need VSELECT; scalars use SELECT.
ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
DenormalMode DenormMode = DAG.getDenormalMode(VT);
if (DenormMode.Input == DenormalMode::IEEE) {
// This is specifically a check for the handling of denormal inputs,
// not the result.
// fabs(X) < SmallestNormal ? 0.0 : Est
const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est);
} else {
// X == 0.0 ? 0.0 : Est
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est);
// NOTE(review): closing braces are missing in this copy, so it cannot be
// told from here which of the enclosing `if`s the returns below belong to.
return Est;
return SDValue();
/// Reciprocal-square-root entry point: forwards to buildSqrtEstimateImpl with
/// the Reciprocal flag set.
SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) {
  const bool Reciprocal = true;
  return buildSqrtEstimateImpl(Op, Flags, Reciprocal);
}
/// Square-root entry point: forwards to buildSqrtEstimateImpl with the
/// Reciprocal flag cleared.
SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {
  const bool Reciprocal = false;
  return buildSqrtEstimateImpl(Op, Flags, Reciprocal);
}
/// Return true if there is any possibility that the two addresses overlap.
/// The answer is conservative: every path that cannot prove the two nodes
/// independent ends in `return true`.
bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {
// Summary of one memory-touching node, as filled in by getCharacteristics.
struct MemUseCharacteristics {
bool IsVolatile;
bool IsAtomic;
SDValue BasePtr;
int64_t Offset;
Optional<int64_t> NumBytes;
MachineMemOperand *MMO;
// NOTE(review): the closing "};" of the struct above and of the lambda below
// are not visible in this copy.
auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics {
if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) {
// Only pre-increment / pre-decrement addressing with a constant offset
// contributes a non-zero Offset.
int64_t Offset = 0;
if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset()))
Offset = (LSN->getAddressingMode() == ISD::PRE_INC)
? C->getSExtValue()
: (LSN->getAddressingMode() == ISD::PRE_DEC)
? -1 * C->getSExtValue()
: 0;
// NOTE(review): the initialiser of Size and the tail of the return
// statement below are cut off in this copy.
uint64_t Size =
return {LSN->isVolatile(), LSN->isAtomic(), LSN->getBasePtr(),
Offset /*base offset*/,
// NOTE(review): LLVM's cast<> asserts on a type mismatch instead of yielding
// null, so the "Default" return further down looks unreachable; dyn_cast<>
// may have been intended - confirm before relying on the fallback.
if (const auto *LN = cast<LifetimeSDNode>(N))
return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1),
(LN->hasOffset()) ? LN->getOffset() : 0,
(LN->hasOffset()) ? Optional<int64_t>(LN->getSize())
: Optional<int64_t>(),
(MachineMemOperand *)nullptr};
// Default.
return {false /*isvolatile*/, /*isAtomic*/ false, SDValue(),
(int64_t)0 /*offset*/,
Optional<int64_t>() /*size*/, (MachineMemOperand *)nullptr};
MemUseCharacteristics MUC0 = getCharacteristics(Op0),
MUC1 = getCharacteristics(Op1);
// If they are to the same address, then they must be aliases.
if (MUC0.BasePtr.getNode() && MUC0.BasePtr == MUC1.BasePtr &&
MUC0.Offset == MUC1.Offset)
return true;
// If they are both volatile then they cannot be reordered.
if (MUC0.IsVolatile && MUC1.IsVolatile)
return true;
// Be conservative about atomics for the moment
// TODO: This is way overconservative for unordered atomics (see D66309)
if (MUC0.IsAtomic && MUC1.IsAtomic)
return true;
// Early exit: an invariant access on one side and a store on the other are
// treated as non-aliasing. The same test appears again further down, after
// the null-MMO guard.
if (MUC0.MMO && MUC1.MMO) {
if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
(MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
return false;
// Try to prove that there is aliasing, or that there is no aliasing. Either
// way, we can return now. If nothing can be proved, proceed with more tests.
bool IsAlias;
if (BaseIndexOffset::computeAliasing(Op0, MUC0.NumBytes, Op1, MUC1.NumBytes,
DAG, IsAlias))
return IsAlias;
// The following all rely on MMO0 and MMO1 being valid. Fail conservatively if
// either are not known.
if (!MUC0.MMO || !MUC1.MMO)
return true;
// If one operation reads from invariant memory, and the other may store, they
// cannot alias. These should really be checking the equivalent of mayWrite,
// but it only matters for memory nodes other than load /store.
if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
(MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
return false;
// If we know required SrcValue1 and SrcValue2 have relatively large
// alignment compared to the size and offset of the access, we may be able
// to prove they do not alias. This check is conservative for now to catch
// cases created by splitting vector types, it only works when the offsets are
// multiples of the size of the data.
int64_t SrcValOffset0 = MUC0.MMO->getOffset();
int64_t SrcValOffset1 = MUC1.MMO->getOffset();
Align OrigAlignment0 = MUC0.MMO->getBaseAlign();
Align OrigAlignment1 = MUC1.MMO->getBaseAlign();
auto &Size0 = MUC0.NumBytes;
auto &Size1 = MUC1.NumBytes;
if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
Size0.hasValue() && Size1.hasValue() && *Size0 == *Size1 &&
OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 &&
SrcValOffset1 % *Size1 == 0) {
// Positions of the two accesses inside one alignment-sized window.
int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value();
int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value();
// There is no overlap between these relatively aligned accesses of
// similar size. Return no alias.
if ((OffAlign0 + *Size0) <= OffAlign1 || (OffAlign1 + *Size1) <= OffAlign0)
return false;
// An explicit CombinerGlobalAA command-line setting overrides the subtarget
// default.
bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0
? CombinerGlobalAA
: DAG.getSubtarget().useAA();
// NOTE(review): no matching #endif for the #ifndef below is visible in this
// copy.
#ifndef NDEBUG
if (CombinerAAOnlyFunc.getNumOccurrences() &&
CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
UseAA = false;
if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() &&
Size0.hasValue() && Size1.hasValue()) {
// Use alias analysis information.
// Both locations are measured from the smaller of the two offsets, so each
// queried size is the access size plus its distance from that minimum.
int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
int64_t Overlap0 = *Size0 + SrcValOffset0 - MinOffset;
int64_t Overlap1 = *Size1 + SrcValOffset1 - MinOffset;
AliasResult AAResult = AA->alias(
MemoryLocation(MUC0.MMO->getValue(), Overlap0,
UseTBAA ? MUC0.MMO->getAAInfo() : AAMDNodes()),
MemoryLocation(MUC1.MMO->getValue(), Overlap1,
UseTBAA ? MUC1.MMO->getAAInfo() : AAMDNodes()));
if (AAResult == NoAlias)
return false;
// Otherwise we have to assume they alias.
return true;
/// Walk up chain skipping non-aliasing memory nodes,
/// looking for aliasing nodes and adding them to the Aliases vector.
/// NOTE(review): many statements of this function (seeding of the work list,
/// the bodies of several branches, closing braces) are missing from this
/// copy; the comments below only describe what is visible.
void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
SmallVectorImpl<SDValue> &Aliases) {
SmallVector<SDValue, 8> Chains; // List of chains to visit.
SmallPtrSet<SDNode *, 16> Visited; // Visited node set.
// Get alias information for node.
// TODO: relax aliasing for unordered atomics (see D66309)
// IsLoad is true only when N is a load for which isSimple() holds.
const bool IsLoad = isa<LoadSDNode>(N) && cast<LoadSDNode>(N)->isSimple();
// Starting off.
unsigned Depth = 0;
// Attempt to improve chain by a single step
// The lambda returns true when it could step C past the current node (C may
// become an empty SDValue at the entry token) and false when C must be
// treated as an alias.
std::function<bool(SDValue &)> ImproveChain = [&](SDValue &C) -> bool {
switch (C.getOpcode()) {
case ISD::EntryToken:
// No need to mark EntryToken.
C = SDValue();
return true;
case ISD::LOAD:
case ISD::STORE: {
// Get alias information for C.
// TODO: Relax aliasing for unordered atomics (see D66309)
// NOTE(review): the second operand of the && below is cut off in this copy.
bool IsOpLoad = isa<LoadSDNode>(C.getNode()) &&
// Two loads never block each other; otherwise ask isAlias.
if ((IsLoad && IsOpLoad) || !isAlias(N, C.getNode())) {
// Look further up the chain.
C = C.getOperand(0);
return true;
// Alias, so stop here.
return false;
case ISD::CopyFromReg:
// Always forward past CopyFromReg.
C = C.getOperand(0);
return true;
// We can forward past any lifetime start/end that can be proven not to
// alias the memory access.
if (!isAlias(N, C.getNode())) {
// Look further up the chain.
C = C.getOperand(0);
return true;
return false;
return false;
// Look at each chain and determine if it is an alias. If so, add it to the
// aliases list. If not, then continue up the chain looking for the next
// candidate.
while (!Chains.empty()) {
SDValue Chain = Chains.pop_back_val();
// Don't bother if we've seen Chain before.
if (!Visited.insert(Chain.getNode()).second)
// For TokenFactor nodes, look at each operand and only continue up the
// chain until we reach the depth limit.
// FIXME: The depth check could be made to return the last non-aliasing
// chain we found before we hit a tokenfactor rather than the original
// chain.
if (Depth > TLI.getGatherAllAliasesMaxDepth()) {
if (Chain.getOpcode() == ISD::TokenFactor) {
// We have to check each of the operands of the token factor for "small"
// token factors, so we queue them up. Adding the operands to the queue
// (stack) in reverse order maintains the original order and increases the
// likelihood that getNode will find a matching token factor (CSE.)
if (Chain.getNumOperands() > 16) {
for (unsigned n = Chain.getNumOperands(); n;)
// Everything else
if (ImproveChain(Chain)) {
// Updated Chain Found, Consider new chain if one exists.
if (Chain.getNode())
// No Improved Chain Possible, treat as Alias.
/// Walk up chain skipping non-aliasing memory nodes, looking for a better chain
/// (aliasing node.)
///
/// Returns \p OldChain unchanged when optimisation is disabled; otherwise the
/// result depends on how many aliasing chains GatherAllAliases reports for
/// \p N.
SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
  if (OptLevel == CodeGenOpt::None)
    return OldChain;

  // Collect every chain that may alias N; these become the operands of the
  // replacement token factor.
  SmallVector<SDValue, 8> AliasChains;
  GatherAllAliases(N, OldChain, AliasChains);

  switch (AliasChains.size()) {
  case 0:
    // Nothing aliases: chain straight to the entry token.
    return DAG.getEntryNode();
  case 1:
    // Exactly one alias: chain to it directly, no token factor needed.
    return AliasChains[0];
  default:
    // Several aliases: build a token factor tailored to them.
    return DAG.getTokenFactor(SDLoc(N), AliasChains);
  }
}
namespace {
// TODO: Replace with std::monostate when we move to C++17.
/// Empty, stateless tag type. Every value of this type compares equal to
/// every other value, so it carries no information of its own.
struct UnitT {};

/// The single shared instance that callers pass around.
UnitT Unit;

/// Two UnitT values are always equal.
bool operator==(const UnitT &LHS, const UnitT &RHS) {
  (void)LHS;
  (void)RHS;
  return true;
}

/// Two UnitT values are never different.
bool operator!=(const UnitT &LHS, const UnitT &RHS) { return !(LHS == RHS); }
} // namespace
// This function tries to collect a bunch of potentially interesting
// nodes to improve the chains of, all at once. This might seem
// redundant, as this function gets called when visiting every store
// node, so why not let the work be done on each store as it's visited?
// I believe this is mainly important because mergeConsecutiveStores
// is unable to deal with merging stores of different sizes, so unless
// we improve the chains of all the potential candidates up-front
// before running mergeConsecutiveStores, it might only see some of
// the nodes that will eventually be candidates, and then not be able
// to go from a partially-merged state to the desired final
// fully-merged state.
//
// Returns false without changing anything when St has no base pointer, the
// base pointer is undef, St stores a scalable vector, or no chained store was
// collected. Otherwise the collected stores and St are re-chained through
// FindBetterChain, St is replaced (via CombineTo) by a TokenFactor over the
// results, and true is returned.
//
// NOTE(review): in this listing the statements controlled by several `if`
// conditions inside the while loop, the place where ChainedStores is filled,
// and a few other lines (e.g. the tail of the getTruncStore call) are not
// visible. Check the complete file before relying on the loop's exact exits.
bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
SmallVector<StoreSDNode *, 8> ChainedStores;
StoreSDNode *STChain = St;
// Intervals records which offsets from BaseIndex have been covered. In
// the common case, every store writes to the immediately previous address
// space and thus merged with the previous interval at insertion time.
using IMap =
llvm::IntervalMap<int64_t, UnitT, 8, IntervalMapHalfOpenInfo<int64_t>>;
IMap::Allocator A;
IMap Intervals(A);
// This holds the base pointer, index, and the offset in bytes from the base
// pointer.
const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
// We must have a base and an offset.
if (!BasePtr.getBase().getNode())
return false;
// Do not handle stores to undef base pointers.
if (BasePtr.getBase().isUndef())
return false;
// BaseIndexOffset assumes that offsets are fixed-size, which
// is not valid for scalable vectors where the offsets are
// scaled by `vscale`, so bail out early.
if (St->getMemoryVT().isScalableVector())
return false;
// Add ST's interval.
// The length is the memory type's bit size rounded up to whole bytes.
Intervals.insert(0, (St->getMemoryVT().getSizeInBits() + 7) / 8, Unit);
while (StoreSDNode *Chain = dyn_cast<StoreSDNode>(STChain->getChain())) {
// If the chain has more than one use, then we can't reorder the mem ops.
if (!SDValue(Chain, 0)->hasOneUse())
// TODO: Relax for unordered atomics (see D66309)
if (!Chain->isSimple() || Chain->isIndexed())
// Find the base pointer and offset for this memory node.
const BaseIndexOffset Ptr = BaseIndexOffset::match(Chain, DAG);
// Check that the base pointer is the same as the original one.
int64_t Offset;
if (!BasePtr.equalBaseIndex(Ptr, DAG, Offset))
int64_t Length = (Chain->getMemoryVT().getSizeInBits() + 7) / 8;
// Make sure we don't overlap with other intervals by checking the ones to
// the left or right before inserting.
auto I = Intervals.find(Offset);
// If there's a next interval, we should end before it.
if (I != Intervals.end() && I.start() < (Offset + Length))
// If there's a previous interval, we should start after it.
if (I != Intervals.begin() && (--I).stop() <= Offset)
Intervals.insert(Offset, Offset + Length, Unit);
STChain = Chain;
// If we didn't find a chained store, exit.
if (ChainedStores.size() == 0)
return false;
// Improve all chained stores (St and ChainedStores members) starting from
// where the store chain ended and return single TokenFactor.
SDValue NewChain = STChain->getChain();
SmallVector<SDValue, 8> TFOps;
// Walk ChainedStores from the last collected element back to the first.
for (unsigned I = ChainedStores.size(); I;) {
StoreSDNode *S = ChainedStores[--I];
SDValue BetterChain = FindBetterChain(S, NewChain);
// Only operand 0 (the chain) is swapped; operands 1-3 are reused as-is.
S = cast<StoreSDNode>(DAG.UpdateNodeOperands(
S, BetterChain, S->getOperand(1), S->getOperand(2), S->getOperand(3)));
TFOps.push_back(SDValue(S, 0));
ChainedStores[I] = S;
// Improve St's chain. Use a new node to avoid creating a loop from CombineTo.
SDValue BetterChain = FindBetterChain(St, NewChain);
SDValue NewST;
if (St->isTruncatingStore())
NewST = DAG.getTruncStore(BetterChain, SDLoc(St), St->getValue(),
St->getBasePtr(), St->getMemoryVT(),
NewST = DAG.getStore(BetterChain, SDLoc(St), St->getValue(),
St->getBasePtr(), St->getMemOperand());
// If we improved every element of TFOps, then we've lost the dependence on
// NewChain to successors of St and we need to add it back to TFOps. Do so at
// the beginning to keep relative order consistent with FindBetterChains.
auto hasImprovedChain = [&](SDValue ST) -> bool {
return ST->getOperand(0) != NewChain;
bool AddNewChain = llvm::all_of(TFOps, hasImprovedChain);
if (AddNewChain)
TFOps.insert(TFOps.begin(), NewChain);
SDValue TF = DAG.getTokenFactor(SDLoc(STChain), TFOps);
CombineTo(St, TF);
// Add TF and its operands to the worklist.
for (const SDValue &Op : TF->ops())
return true;
/// Try to give \p St a better (less constraining) chain.
///
/// Bails out with false when optimization is disabled or when St's address
/// has no base / an undef base. It first tries parallelizeChainedStores on
/// the whole run of stores starting at St; if that reports no change, it
/// asks FindBetterChain for a replacement of St's own chain and installs it
/// with replaceStoreChain when it differs from the current one.
///
/// \returns true if any chain was changed, false otherwise.
bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
if (OptLevel == CodeGenOpt::None)
return false;
const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
// We must have a base and an offset.
if (!BasePtr.getBase().getNode())
return false;
// Do not handle stores to undef base pointers.
if (BasePtr.getBase().isUndef())
return false;
// Directly improve a chain of disjoint stores starting at St.
if (parallelizeChainedStores(St))
return true;
// Improve St's Chain..
SDValue BetterChain = FindBetterChain(St, St->getChain());
if (St->getChain() != BetterChain) {
replaceStoreChain(St, BetterChain);
return true;
return false;
/// This is the entry point for the file.
///
/// Builds a temporary DAGCombiner over this SelectionDAG, configured with
/// \p AA and \p OptLevel, and runs it for the given combine \p Level.
void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
CodeGenOpt::Level OptLevel) {
/// This is the main entry point to this class.
DAGCombiner(*this, AA, OptLevel).Run(Level);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 414ba25ffd5f..c81d03cac81b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1,5247 +1,5258 @@
//===------- LegalizeVectorTypes.cpp - Legalization of vector types -------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file performs vector type splitting and scalarization for LegalizeTypes.
// Scalarization is the act of changing a computation in an illegal one-element
// vector type to be a computation in its scalar element type. For example,
// implementing <1 x f32> arithmetic in a scalar f32 register. This is needed
// as a base case when scalarizing vector arithmetic like <4 x f32>, which
// eventually decomposes to scalars if the target doesn't support v4f32 or v2f32
// types.
// Splitting is the act of changing a computation in an invalid vector type to
// be a computation in two vectors of half the size. For example, implementing
// <128 x f32> operations in terms of two <64 x f32> operations.
#include "LegalizeTypes.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TypeSize.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "legalize-types"
// Result Vector Scalarization: <1 x ty> -> ty.
/// Scalarize result number \p ResNo of node \p N.
///
/// The switch picks a ScalarizeVecRes_* helper based on N's opcode and stores
/// its return value in R. If R ends up non-null it is registered as the
/// scalarized replacement for (N, ResNo); a null R means the helper already
/// registered its own result. An opcode with no handler is reported through
/// report_fatal_error.
///
/// NOTE(review): a number of `case` labels, the `default:` label and the
/// `break;` statements are not visible in this listing, so the case grouping
/// below must be checked against the complete file.
void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
LLVM_DEBUG(dbgs() << "Scalarize node result " << ResNo << ": "; N->dump(&DAG);
dbgs() << "\n");
SDValue R = SDValue();
switch (N->getOpcode()) {
#ifndef NDEBUG
dbgs() << "ScalarizeVectorResult #" << ResNo << ": ";
dbgs() << "\n";
report_fatal_error("Do not know how to scalarize the result of this "
case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break;
case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break;
case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break;
case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break;
case ISD::FPOWI: R = ScalarizeVecRes_FPOWI(N); break;
case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break;
case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break;
case ISD::VSELECT: R = ScalarizeVecRes_VSELECT(N); break;
case ISD::SELECT: R = ScalarizeVecRes_SELECT(N); break;
case ISD::SELECT_CC: R = ScalarizeVecRes_SELECT_CC(N); break;
case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break;
case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break;
case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break;
R = ScalarizeVecRes_VecInregOp(N);
case ISD::ABS:
case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTPOP:
case ISD::CTTZ:
case ISD::FABS:
case ISD::FCEIL:
case ISD::FCOS:
case ISD::FEXP:
case ISD::FEXP2:
case ISD::FLOG:
case ISD::FLOG10:
case ISD::FLOG2:
case ISD::FNEG:
case ISD::FRINT:
case ISD::FSIN:
case ISD::FSQRT:
R = ScalarizeVecRes_UnaryOp(N);
case ISD::ADD:
case ISD::AND:
case ISD::FADD:
case ISD::FDIV:
case ISD::FMUL:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
case ISD::FPOW:
case ISD::FREM:
case ISD::FSUB:
case ISD::MUL:
case ISD::OR:
case ISD::SDIV:
case ISD::SREM:
case ISD::SUB:
case ISD::UDIV:
case ISD::UREM:
case ISD::XOR:
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
R = ScalarizeVecRes_BinOp(N);
case ISD::FMA:
R = ScalarizeVecRes_TernaryOp(N);
#include "llvm/IR/ConstrainedOps.def"
R = ScalarizeVecRes_StrictFPOp(N);
case ISD::UADDO:
case ISD::SADDO:
case ISD::USUBO:
case ISD::SSUBO:
case ISD::UMULO:
case ISD::SMULO:
R = ScalarizeVecRes_OverflowOp(N, ResNo);
R = ScalarizeVecRes_FIX(N);
// If R is null, the sub-method took care of registering the result.
if (R.getNode())
SetScalarizedVector(SDValue(N, ResNo), R);
/// Scalarize a two-operand node: look up the scalarized form of both
/// operands and rebuild the node with the same opcode and flags. The result
/// type is the type of the scalarized left operand.
SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) {
SDValue LHS = GetScalarizedVector(N->getOperand(0));
SDValue RHS = GetScalarizedVector(N->getOperand(1));
return DAG.getNode(N->getOpcode(), SDLoc(N),
LHS.getValueType(), LHS, RHS, N->getFlags());
/// Scalarize a three-operand node: look up the scalarized form of all three
/// operands and rebuild the node with the same opcode and flags. The result
/// type is the type of the scalarized first operand.
SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) {
SDValue Op0 = GetScalarizedVector(N->getOperand(0));
SDValue Op1 = GetScalarizedVector(N->getOperand(1));
SDValue Op2 = GetScalarizedVector(N->getOperand(2));
return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1,
Op2, N->getFlags());
/// Scalarize a three-operand node whose first two operands are vectors and
/// whose third operand is forwarded unchanged.
/// NOTE(review): the third operand is presumably the scalar scale of a
/// fixed-point operation — confirm against the opcodes dispatched here.
SDValue DAGTypeLegalizer::ScalarizeVecRes_FIX(SDNode *N) {
SDValue Op0 = GetScalarizedVector(N->getOperand(0));
SDValue Op1 = GetScalarizedVector(N->getOperand(1));
// Not scalarized: passed through as-is.
SDValue Op2 = N->getOperand(2);
return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1,
Op2, N->getFlags());
/// Scalarize a chained (strict FP) node.
///
/// Operand 0 is kept as the chain. Every other operand is scalarized if it
/// has a vector type and forwarded unchanged otherwise. The new node
/// produces {element type of N's result 0, MVT::Other}; users of N's old
/// chain result (value 1) are redirected to the new node's chain.
///
/// \returns the new node's value 0.
SDValue DAGTypeLegalizer::ScalarizeVecRes_StrictFPOp(SDNode *N) {
EVT VT = N->getValueType(0).getVectorElementType();
unsigned NumOpers = N->getNumOperands();
SDValue Chain = N->getOperand(0);
EVT ValueVTs[] = {VT, MVT::Other};
SDLoc dl(N);
SmallVector<SDValue, 4> Opers(NumOpers);
// The Chain is the first operand.
Opers[0] = Chain;
// Now process the remaining operands.
for (unsigned i = 1; i < NumOpers; ++i) {
SDValue Oper = N->getOperand(i);
if (Oper.getValueType().isVector())
Oper = GetScalarizedVector(Oper);
Opers[i] = Oper;
SDValue Result = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(ValueVTs),
Opers, N->getFlags());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
return Result;
/// Scalarize one result of a two-result (value, overflow) node.
///
/// The scalar operands are taken from GetScalarizedVector when the value
/// result type is itself being scalarized; otherwise element 0 is extracted
/// from each vector operand. A scalar node with both scalar result types is
/// built, and the result that is NOT \p ResNo is handled here too: it is
/// either registered as scalarized, or wrapped in SCALAR_TO_VECTOR and used
/// to replace the old value.
///
/// \returns result \p ResNo of the new scalar node.
SDValue DAGTypeLegalizer::ScalarizeVecRes_OverflowOp(SDNode *N,
unsigned ResNo) {
SDLoc DL(N);
EVT ResVT = N->getValueType(0);
EVT OvVT = N->getValueType(1);
SDValue ScalarLHS, ScalarRHS;
if (getTypeAction(ResVT) == TargetLowering::TypeScalarizeVector) {
ScalarLHS = GetScalarizedVector(N->getOperand(0));
ScalarRHS = GetScalarizedVector(N->getOperand(1));
} else {
SmallVector<SDValue, 1> ElemsLHS, ElemsRHS;
DAG.ExtractVectorElements(N->getOperand(0), ElemsLHS);
DAG.ExtractVectorElements(N->getOperand(1), ElemsRHS);
ScalarLHS = ElemsLHS[0];
ScalarRHS = ElemsRHS[0];
SDVTList ScalarVTs = DAG.getVTList(
ResVT.getVectorElementType(), OvVT.getVectorElementType());
SDNode *ScalarNode = DAG.getNode(
N->getOpcode(), DL, ScalarVTs, ScalarLHS, ScalarRHS).getNode();
// Replace the other vector result not being explicitly scalarized here.
// With two results, 1 - ResNo is the index of the other one.
unsigned OtherNo = 1 - ResNo;
EVT OtherVT = N->getValueType(OtherNo);
if (getTypeAction(OtherVT) == TargetLowering::TypeScalarizeVector) {
SetScalarizedVector(SDValue(N, OtherNo), SDValue(ScalarNode, OtherNo));
} else {
SDValue OtherVal = DAG.getNode(
ISD::SCALAR_TO_VECTOR, DL, OtherVT, SDValue(ScalarNode, OtherNo));
ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
return SDValue(ScalarNode, ResNo);
/// Scalarize result \p ResNo of a MERGE_VALUES node: obtain the value that
/// DisintegrateMERGE_VALUES returns for that result and return its
/// scalarized form.
SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
unsigned ResNo) {
SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
return GetScalarizedVector(Op);
/// Scalarize a BITCAST result: bitcast the operand to the element type of
/// N's result. The operand is first replaced by its scalarized form, but only
/// when it is a one-element vector whose type is not a simple legal type.
SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) {
SDValue Op = N->getOperand(0);
if (Op.getValueType().isVector()
&& Op.getValueType().getVectorNumElements() == 1
&& !isSimpleLegalType(Op.getValueType()))
Op = GetScalarizedVector(Op);
EVT NewVT = N->getValueType(0).getVectorElementType();
return DAG.getNode(ISD::BITCAST, SDLoc(N),
NewVT, Op);
/// Scalarize a BUILD_VECTOR result by returning its first operand. For an
/// integer element type the operand is wrapped in a TRUNCATE to that type.
SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) {
EVT EltVT = N->getValueType(0).getVectorElementType();
SDValue InOp = N->getOperand(0);
// The BUILD_VECTOR operands may be of wider element types and
// we may need to truncate them back to the requested return type.
if (EltVT.isInteger())
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp);
return InOp;
/// Scalarize an EXTRACT_SUBVECTOR result.
/// NOTE(review): only the final argument line of the body is visible in this
/// listing (N's operands 0 and 1 are passed to some call); the call itself
/// must be checked against the complete file.
SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
N->getOperand(0), N->getOperand(1));
/// Scalarize an FP_ROUND result: round the scalarized operand 0 to the
/// element type of N's result, forwarding operand 1 unchanged.
SDValue DAGTypeLegalizer::ScalarizeVecRes_FP_ROUND(SDNode *N) {
EVT NewVT = N->getValueType(0).getVectorElementType();
SDValue Op = GetScalarizedVector(N->getOperand(0));
return DAG.getNode(ISD::FP_ROUND, SDLoc(N),
NewVT, Op, N->getOperand(1));
/// Scalarize an FPOWI result: apply FPOWI to the scalarized operand 0,
/// forwarding operand 1 unchanged. The result has the scalarized operand's
/// type.
SDValue DAGTypeLegalizer::ScalarizeVecRes_FPOWI(SDNode *N) {
SDValue Op = GetScalarizedVector(N->getOperand(0));
return DAG.getNode(ISD::FPOWI, SDLoc(N),
Op.getValueType(), Op, N->getOperand(1));
/// Scalarize an INSERT_VECTOR_ELT result: the scalar is simply the inserted
/// value (operand 1), truncated to the result's element type when the two
/// types differ. Operands 0 and 2 are not consulted.
SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
// The value to insert may have a wider type than the vector element type,
// so be sure to truncate it to the element type if necessary.
SDValue Op = N->getOperand(1);
EVT EltVT = N->getValueType(0).getVectorElementType();
if (Op.getValueType() != EltVT)
// FIXME: Can this happen for floating point types?
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, Op);
return Op;
/// Scalarize a vector load: emit an unindexed load of the element type that
/// reuses N's extension type, chain, base pointer, pointer info, original
/// alignment, memory-operand flags and AA info. The memory type is the
/// element type of N's memory type. Users of N's chain result are redirected
/// to the new load's chain.
///
/// \returns the new load (its value 0).
SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
assert(N->isUnindexed() && "Indexed vector load?");
SDValue Result = DAG.getLoad(
ISD::UNINDEXED, N->getExtensionType(),
N->getValueType(0).getVectorElementType(), SDLoc(N), N->getChain(),
N->getBasePtr(), DAG.getUNDEF(N->getBasePtr().getValueType()),
N->getPointerInfo(), N->getMemoryVT().getVectorElementType(),
N->getOriginalAlign(), N->getMemOperand()->getFlags(), N->getAAInfo());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
return Result;
/// Scalarize a one-operand node: rebuild it with the same opcode and flags on
/// a scalar operand, producing the element type of N's result.
///
/// NOTE(review): in the else branch only the tail of a call (ending in a
/// constant vector index 0) is visible in this listing; the statement that
/// assigns Op there must be checked against the complete file.
SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) {
// Get the dest type - it doesn't always match the input type, e.g. int_to_fp.
EVT DestVT = N->getValueType(0).getVectorElementType();
SDValue Op = N->getOperand(0);
EVT OpVT = Op.getValueType();
SDLoc DL(N);
// The result needs scalarizing, but it's not a given that the source does.
// This is a workaround for targets where it's impossible to scalarize the
// result of a conversion, because the source type is legal.
// For instance, this happens on AArch64: v1i1 is illegal but v1i{8,16,32}
// are widened to v8i8, v4i16, and v2i32, which is legal, because v1i64 is
// legal and was not scalarized.
// See the similar logic in ScalarizeVecRes_SETCC
if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
Op = GetScalarizedVector(Op);
} else {
EVT VT = OpVT.getVectorElementType();
DAG.getVectorIdxConstant(0, DL));
return DAG.getNode(N->getOpcode(), SDLoc(N), DestVT, Op, N->getFlags());
/// Scalarize an "in-register" node whose operand 1 is a VTSDNode: rebuild it
/// on the scalarized operand 0, replacing the type carried by operand 1 with
/// that type's element type. The result type is N's result element type.
SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) {
EVT EltVT = N->getValueType(0).getVectorElementType();
EVT ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT().getVectorElementType();
SDValue LHS = GetScalarizedVector(N->getOperand(0));
return DAG.getNode(N->getOpcode(), SDLoc(N), EltVT,
LHS, DAG.getValueType(ExtVT));
/// Scalarize an extend-vector-in-register node by turning it into a scalar
/// ANY_EXTEND, SIGN_EXTEND or ZERO_EXTEND to N's result element type. Any
/// other opcode is unreachable.
///
/// NOTE(review): the `case` labels of the switch and the statement in the
/// else branch (only its trailing index argument is visible) are missing from
/// this listing; check the complete file for which opcode maps to which
/// extend.
SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) {
SDLoc DL(N);
SDValue Op = N->getOperand(0);
EVT OpVT = Op.getValueType();
EVT OpEltVT = OpVT.getVectorElementType();
EVT EltVT = N->getValueType(0).getVectorElementType();
if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
Op = GetScalarizedVector(Op);
} else {
DAG.getVectorIdxConstant(0, DL));
switch (N->getOpcode()) {
return DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Op);
return DAG.getNode(ISD::SIGN_EXTEND, DL, EltVT, Op);
return DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Op);
llvm_unreachable("Illegal extend_vector_inreg opcode");
/// Scalarize a SCALAR_TO_VECTOR result: return operand 0, wrapped in a
/// TRUNCATE to the result's element type when the types differ.
SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
// If the operand is wider than the vector element type then it is implicitly
// truncated. Make that explicit here.
EVT EltVT = N->getValueType(0).getVectorElementType();
SDValue InOp = N->getOperand(0);
if (InOp.getValueType() != EltVT)
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp);
return InOp;
/// Scalarize a VSELECT result into a scalar select.
///
/// The condition is brought to scalar form. It is then adjusted when the
/// scalar and vector boolean representations reported by the target differ:
/// masked with 1 when the scalar side expects 0/1 (or is undefined), or
/// sign-extended from i1 when it expects 0/-1. Finally it is truncated if
/// the setcc result type is narrower than the condition type.
///
/// NOTE(review): several lines are not visible in this listing — the else
/// branch statement that extracts the condition element, the `break;`s of the
/// inner switch, and the last argument(s) of the final getSelect call. Check
/// the complete file before relying on details.
SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
SDValue Cond = N->getOperand(0);
EVT OpVT = Cond.getValueType();
SDLoc DL(N);
// The vselect result and true/value operands needs scalarizing, but it's
// not a given that the Cond does. For instance, in AVX512 v1i1 is legal.
// See the similar logic in ScalarizeVecRes_SETCC
if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
Cond = GetScalarizedVector(Cond);
} else {
EVT VT = OpVT.getVectorElementType();
DAG.getVectorIdxConstant(0, DL));
SDValue LHS = GetScalarizedVector(N->getOperand(1));
TargetLowering::BooleanContent ScalarBool =
TLI.getBooleanContents(false, false);
TargetLowering::BooleanContent VecBool = TLI.getBooleanContents(true, false);
// If integer and float booleans have different contents then we can't
// reliably optimize in all cases. There is a full explanation for this in
// DAGCombiner::visitSELECT() where the same issue affects folding
// (select C, 0, 1) to (xor C, 1).
if (TLI.getBooleanContents(false, false) !=
TLI.getBooleanContents(false, true)) {
// At least try the common case where the boolean is generated by a
// comparison.
if (Cond->getOpcode() == ISD::SETCC) {
EVT OpVT = Cond->getOperand(0).getValueType();
ScalarBool = TLI.getBooleanContents(OpVT.getScalarType());
VecBool = TLI.getBooleanContents(OpVT);
} else
ScalarBool = TargetLowering::UndefinedBooleanContent;
EVT CondVT = Cond.getValueType();
if (ScalarBool != VecBool) {
switch (ScalarBool) {
case TargetLowering::UndefinedBooleanContent:
case TargetLowering::ZeroOrOneBooleanContent:
assert(VecBool == TargetLowering::UndefinedBooleanContent ||
VecBool == TargetLowering::ZeroOrNegativeOneBooleanContent);
// Vector read from all ones, scalar expects a single 1 so mask.
Cond = DAG.getNode(ISD::AND, SDLoc(N), CondVT,
Cond, DAG.getConstant(1, SDLoc(N), CondVT));
case TargetLowering::ZeroOrNegativeOneBooleanContent:
assert(VecBool == TargetLowering::UndefinedBooleanContent ||
VecBool == TargetLowering::ZeroOrOneBooleanContent);
// Vector reads from a one, scalar from all ones so sign extend.
Cond = DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), CondVT,
Cond, DAG.getValueType(MVT::i1));
// Truncate the condition if needed
auto BoolVT = getSetCCResultType(CondVT);
if (BoolVT.bitsLT(CondVT))
Cond = DAG.getNode(ISD::TRUNCATE, SDLoc(N), BoolVT, Cond);
return DAG.getSelect(SDLoc(N),
LHS.getValueType(), Cond, LHS,
/// Scalarize a SELECT result: build a scalar select that keeps N's condition
/// (operand 0) and uses the scalarized operand 1 as the first value.
/// NOTE(review): the final argument of the getSelect call is not visible in
/// this listing.
SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) {
SDValue LHS = GetScalarizedVector(N->getOperand(1));
return DAG.getSelect(SDLoc(N),
LHS.getValueType(), N->getOperand(0), LHS,
/// Scalarize a SELECT_CC result: operands 0 and 1 are forwarded unchanged,
/// operands 2 and 3 are replaced by their scalarized forms. The result type
/// is that of the scalarized operand 2.
/// NOTE(review): the final argument of the getNode call is not visible in
/// this listing.
SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) {
SDValue LHS = GetScalarizedVector(N->getOperand(2));
return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(),
N->getOperand(0), N->getOperand(1),
LHS, GetScalarizedVector(N->getOperand(3)),
/// Scalarize an UNDEF result: an UNDEF of the result's element type.
SDValue DAGTypeLegalizer::ScalarizeVecRes_UNDEF(SDNode *N) {
return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
/// Scalarize a VECTOR_SHUFFLE result by inspecting the first operand of N's
/// operand 2 (presumably the first shuffle-mask element — confirm): an undef
/// selector yields UNDEF of the element type, a zero selector yields the
/// scalarized operand 0, and any other constant yields the scalarized
/// operand 1.
SDValue DAGTypeLegalizer::ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N) {
// Figure out if the scalar is the LHS or RHS and return it.
SDValue Arg = N->getOperand(2).getOperand(0);
if (Arg.isUndef())
return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
// 0 when the constant is zero, 1 otherwise.
unsigned Op = !cast<ConstantSDNode>(Arg)->isNullValue();
return GetScalarizedVector(N->getOperand(Op));
/// Scalarize a vector SETCC result: compare scalar forms of the two operands
/// with an i1 SETCC, then extend that i1 to the result's element type using
/// an extension opcode stored in ExtendCode.
///
/// NOTE(review): several lines are not visible in this listing — the two
/// element-extraction statements in the else branch, the last argument of the
/// SETCC getNode call, and the initializer of ExtendCode. Check the complete
/// file for how the extension is chosen.
SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operand types must be vectors");
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
EVT OpVT = LHS.getValueType();
EVT NVT = N->getValueType(0).getVectorElementType();
SDLoc DL(N);
// The result needs scalarizing, but it's not a given that the source does.
if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
LHS = GetScalarizedVector(LHS);
RHS = GetScalarizedVector(RHS);
} else {
EVT VT = OpVT.getVectorElementType();
DAG.getVectorIdxConstant(0, DL));
DAG.getVectorIdxConstant(0, DL));
// Turn it into a scalar SETCC.
SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS,
// Vectors may have a different boolean contents to scalars. Promote the
// value appropriately.
ISD::NodeType ExtendCode =
return DAG.getNode(ExtendCode, DL, NVT, Res);
// Operand Vector Scalarization <1 x ty> -> ty.
/// Scalarize operand number \p OpNo of node \p N.
///
/// The switch picks a ScalarizeVecOp_* helper based on N's opcode. An opcode
/// with no handler is reported through report_fatal_error.
///
/// \returns false when the helper returned a null value (it registered its
///          own results), true when the helper returned N itself (N was
///          updated in place), and false after replacing N's single result
///          with the helper's value.
///
/// NOTE(review): most `case` labels, the `default:` label and the `break;`
/// statements are not visible in this listing; check the complete file for
/// the opcode-to-helper mapping.
bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
LLVM_DEBUG(dbgs() << "Scalarize node operand " << OpNo << ": "; N->dump(&DAG);
dbgs() << "\n");
SDValue Res = SDValue();
if (!Res.getNode()) {
switch (N->getOpcode()) {
#ifndef NDEBUG
dbgs() << "ScalarizeVectorOperand Op #" << OpNo << ": ";
dbgs() << "\n";
report_fatal_error("Do not know how to scalarize this operator's "
Res = ScalarizeVecOp_BITCAST(N);
Res = ScalarizeVecOp_UnaryOp(N);
Res = ScalarizeVecOp_UnaryOp_StrictFP(N);
Res = ScalarizeVecOp_CONCAT_VECTORS(N);
Res = ScalarizeVecOp_EXTRACT_VECTOR_ELT(N);
Res = ScalarizeVecOp_VSELECT(N);
case ISD::SETCC:
Res = ScalarizeVecOp_VSETCC(N);
case ISD::STORE:
Res = ScalarizeVecOp_STORE(cast<StoreSDNode>(N), OpNo);
Res = ScalarizeVecOp_STRICT_FP_ROUND(N, OpNo);
Res = ScalarizeVecOp_FP_ROUND(N, OpNo);
Res = ScalarizeVecOp_VECREDUCE(N);
// If the result is null, the sub-method took care of registering results etc.
if (!Res.getNode()) return false;
// If the result is N, the sub-method updated N in place. Tell the legalizer
// core about this.
if (Res.getNode() == N)
return true;
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
"Invalid operand expansion");
ReplaceValueWith(SDValue(N, 0), Res);
return false;
/// If the value to convert is a vector that needs to be scalarized, it must be
/// <1 x ty>. Convert the element instead.
///
/// \returns a BITCAST of the scalarized operand 0 to N's unchanged result
///          type.
SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) {
SDValue Elt = GetScalarizedVector(N->getOperand(0));
return DAG.getNode(ISD::BITCAST, SDLoc(N),
N->getValueType(0), Elt);
/// If the input is a vector that needs to be scalarized, it must be <1 x ty>.
/// Do the operation on the element instead.
///
/// The scalar node uses N's opcode and the scalar type of N's result; it is
/// then wrapped in SCALAR_TO_VECTOR so the returned value has N's original
/// (one-element vector) result type.
SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
assert(N->getValueType(0).getVectorNumElements() == 1 &&
"Unexpected vector type!");
SDValue Elt = GetScalarizedVector(N->getOperand(0));
SDValue Op = DAG.getNode(N->getOpcode(), SDLoc(N),
N->getValueType(0).getScalarType(), Elt);
// Revectorize the result so the types line up with what the uses of this
// expression expect.
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Op);
/// If the input is a vector that needs to be scalarized, it must be <1 x ty>.
/// Do the strict FP operation on the element instead.
///
/// Operand 0 of N is forwarded as the chain and operand 1 is the value that
/// gets scalarized. Both of N's results (value 0 and chain 1) are replaced
/// here, so the function always returns a null SDValue.
SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N) {
assert(N->getValueType(0).getVectorNumElements() == 1 &&
"Unexpected vector type!");
SDValue Elt = GetScalarizedVector(N->getOperand(1));
SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N),
{ N->getValueType(0).getScalarType(), MVT::Other },
{ N->getOperand(0), Elt });
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
// Revectorize the result so the types line up with what the uses of this
// expression expect.
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
// Do our own replacement and return SDValue() to tell the caller that we
// handled all replacements since caller can only handle a single result.
ReplaceValueWith(SDValue(N, 0), Res);
return SDValue();
/// The vectors to concatenate have length one - use a BUILD_VECTOR instead.
///
/// Every operand of N is replaced by its scalarized form, in order, and the
/// resulting scalars are assembled into a build-vector of N's result type.
SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
SmallVector<SDValue, 8> Ops(N->getNumOperands());
for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
Ops[i] = GetScalarizedVector(N->getOperand(i));
return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Ops);
/// If the input is a vector that needs to be scalarized, it must be <1 x ty>,
/// so just return the element, ignoring the index.
///
/// When the scalarized element's type differs from N's result type it is
/// converted: FP_EXTEND for a floating-point result, ANY_EXTEND otherwise.
SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue Res = GetScalarizedVector(N->getOperand(0));
if (Res.getValueType() != VT)
Res = VT.isFloatingPoint()
? DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Res)
: DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Res);
return Res;
/// If the input condition is a vector that needs to be scalarized, it must be
/// <1 x i1>, so just convert to a normal ISD::SELECT
/// (still with vector output type since that was acceptable if we got here).
///
/// NOTE(review): the last argument of the getNode call is not visible in
/// this listing.
SDValue DAGTypeLegalizer::ScalarizeVecOp_VSELECT(SDNode *N) {
SDValue ScalarCond = GetScalarizedVector(N->getOperand(0));
EVT VT = N->getValueType(0);
return DAG.getNode(ISD::SELECT, SDLoc(N), VT, ScalarCond, N->getOperand(1),
/// If the operand is a vector that needs to be scalarized then the
/// result must be v1i1, so just convert to a scalar SETCC and wrap
/// with a scalar_to_vector since the res type is legal if we got here
///
/// NOTE(review): the last argument of the SETCC getNode call and the
/// initializer of ExtendCode are not visible in this listing; check the
/// complete file for how the extension opcode is chosen.
SDValue DAGTypeLegalizer::ScalarizeVecOp_VSETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operand types must be vectors");
assert(N->getValueType(0) == MVT::v1i1 && "Expected v1i1 type");
EVT VT = N->getValueType(0);
SDValue LHS = GetScalarizedVector(N->getOperand(0));
SDValue RHS = GetScalarizedVector(N->getOperand(1));
EVT OpVT = N->getOperand(0).getValueType();
EVT NVT = VT.getVectorElementType();
SDLoc DL(N);
// Turn it into a scalar SETCC.
SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS,
// Vectors may have a different boolean contents to scalars. Promote the
// value appropriately.
ISD::NodeType ExtendCode =
Res = DAG.getNode(ExtendCode, DL, NVT, Res);
return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Res);
/// If the value to store is a vector that needs to be scalarized, it must be
/// <1 x ty>. Just store the element.
///
/// Only operand 1 (the stored value) may be scalarized and the store must be
/// unindexed; both are asserted. A truncating store stays truncating, with
/// the element type of the original memory type. Chain, base pointer,
/// pointer info, original alignment and memory-operand flags are reused.
///
/// NOTE(review): the last argument of the non-truncating getStore call is
/// not visible in this listing.
SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){
assert(N->isUnindexed() && "Indexed store of one-element vector?");
assert(OpNo == 1 && "Do not know how to scalarize this operand!");
SDLoc dl(N);
if (N->isTruncatingStore())
return DAG.getTruncStore(
N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
N->getBasePtr(), N->getPointerInfo(),
N->getMemoryVT().getVectorElementType(), N->getOriginalAlign(),
N->getMemOperand()->getFlags(), N->getAAInfo());
return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
N->getBasePtr(), N->getPointerInfo(),
N->getOriginalAlign(), N->getMemOperand()->getFlags(),
/// If the value to round is a vector that needs to be scalarized, it must be
/// <1 x ty>. Convert the element instead.
///
/// The scalar FP_ROUND produces the element type of N's result and is then
/// wrapped in SCALAR_TO_VECTOR to restore N's result type. \p OpNo is not
/// used by the visible code.
///
/// NOTE(review): the last argument of the FP_ROUND getNode call is not
/// visible in this listing.
SDValue DAGTypeLegalizer::ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo) {
SDValue Elt = GetScalarizedVector(N->getOperand(0));
SDValue Res = DAG.getNode(ISD::FP_ROUND, SDLoc(N),
N->getValueType(0).getVectorElementType(), Elt,
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
/// Scalarize the vector operand (operand 1; asserted) of a STRICT_FP_ROUND.
///
/// Operand 0 is forwarded as the chain and operand 2 is forwarded unchanged.
/// The scalar node yields {element type of N's result, MVT::Other}. Both of
/// N's results are replaced here — the chain directly, the value after
/// wrapping in SCALAR_TO_VECTOR — so the function always returns a null
/// SDValue.
SDValue DAGTypeLegalizer::ScalarizeVecOp_STRICT_FP_ROUND(SDNode *N,
unsigned OpNo) {
assert(OpNo == 1 && "Wrong operand for scalarization!");
SDValue Elt = GetScalarizedVector(N->getOperand(1));
SDValue Res = DAG.getNode(ISD::STRICT_FP_ROUND, SDLoc(N),
{ N->getValueType(0).getVectorElementType(),
MVT::Other },
{ N->getOperand(0), Elt, N->getOperand(2) });
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
// Do our own replacement and return SDValue() to tell the caller that we
// handled all replacements since caller can only handle a single result.
ReplaceValueWith(SDValue(N, 0), Res);
return SDValue();
/// Scalarize the operand of a vector reduction: the result is the scalarized
/// operand 0 itself, any-extended to N's result type when the types differ.
SDValue DAGTypeLegalizer::ScalarizeVecOp_VECREDUCE(SDNode *N) {
SDValue Res = GetScalarizedVector(N->getOperand(0));
// Result type may be wider than element type.
if (Res.getValueType() != N->getValueType(0))
Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Res);
return Res;
// Result Vector Splitting
/// This method is called when the specified result of the specified node is
/// found to need vector splitting. At this point, the node may also have
/// invalid operands or may have other results that need legalization, we just
/// know that (at least) one result needs vector splitting.
///
/// After giving the target a chance to custom-lower the node, the switch
/// picks a Split*Res_* helper based on N's opcode; helpers write the two
/// halves into Lo and Hi. If Lo is non-null afterwards the pair is
/// registered as the split of (N, ResNo); a null Lo means the helper already
/// registered its results. An opcode with no handler is reported through
/// report_fatal_error.
///
/// NOTE(review): the statement guarded by the CustomLowerNode check, many
/// `case` labels, the `default:` label and the `break;` statements are not
/// visible in this listing; check the complete file for the exact grouping.
void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
LLVM_DEBUG(dbgs() << "Split node result: "; N->dump(&DAG); dbgs() << "\n");
SDValue Lo, Hi;
// See if the target wants to custom expand this node.
if (CustomLowerNode(N, N->getValueType(ResNo), true))
switch (N->getOpcode()) {
#ifndef NDEBUG
dbgs() << "SplitVectorResult #" << ResNo << ": ";
dbgs() << "\n";
report_fatal_error("Do not know how to split the result of this "
case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
case ISD::BITCAST: SplitVecRes_BITCAST(N, Lo, Hi); break;
case ISD::BUILD_VECTOR: SplitVecRes_BUILD_VECTOR(N, Lo, Hi); break;
case ISD::CONCAT_VECTORS: SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break;
case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break;
case ISD::FCOPYSIGN: SplitVecRes_FCOPYSIGN(N, Lo, Hi); break;
case ISD::SCALAR_TO_VECTOR: SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break;
case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
case ISD::LOAD:
SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
case ISD::MLOAD:
SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi);
SplitVecRes_MGATHER(cast<MaskedGatherSDNode>(N), Lo, Hi);
case ISD::SETCC:
SplitVecRes_SETCC(N, Lo, Hi);
SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
case ISD::VAARG:
SplitVecRes_VAARG(N, Lo, Hi);
SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
case ISD::ABS:
case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTTZ:
case ISD::CTPOP:
case ISD::FABS:
case ISD::FCEIL:
case ISD::FCOS:
case ISD::FEXP:
case ISD::FEXP2:
case ISD::FLOG:
case ISD::FLOG10:
case ISD::FLOG2:
case ISD::FNEG:
case ISD::FRINT:
case ISD::FSIN:
case ISD::FSQRT:
SplitVecRes_UnaryOp(N, Lo, Hi);
SplitVecRes_ExtendOp(N, Lo, Hi);
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::MULHS:
case ISD::MULHU:
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
case ISD::SDIV:
case ISD::UDIV:
case ISD::FDIV:
case ISD::FPOW:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::UREM:
case ISD::SREM:
case ISD::FREM:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
SplitVecRes_BinOp(N, Lo, Hi);
case ISD::FMA:
SplitVecRes_TernaryOp(N, Lo, Hi);
#include "llvm/IR/ConstrainedOps.def"
SplitVecRes_StrictFPOp(N, Lo, Hi);
case ISD::UADDO:
case ISD::SADDO:
case ISD::USUBO:
case ISD::SSUBO:
case ISD::UMULO:
case ISD::SMULO:
SplitVecRes_OverflowOp(N, ResNo, Lo, Hi);
SplitVecRes_FIX(N, Lo, Hi);
// If Lo/Hi is null, the sub-method took care of registering results etc.
if (Lo.getNode())
SetSplitVector(SDValue(N, ResNo), Lo, Hi);
/// Advance \p Ptr past one memory value of type \p MemVT and update \p MPI to
/// describe the new location. Both \p MPI and \p Ptr are in/out parameters.
///
/// \param N     Memory node whose pointer info is used as the starting point.
/// \param MemVT Memory type whose (known minimum) size gives the increment.
/// \param MPI   Receives the pointer info for the incremented address.
/// \param Ptr   Pointer value that is replaced by the incremented pointer.
///
/// NOTE(review): the closing braces of this function are not visible in this
/// listing - confirm against the upstream file.
void DAGTypeLegalizer::IncrementPointer(MemSDNode *N, EVT MemVT,
MachinePointerInfo &MPI,
SDValue &Ptr) {
SDLoc DL(N);
// Byte count derived from the known minimum bit size of MemVT.
unsigned IncrementSize = MemVT.getSizeInBits().getKnownMinSize() / 8;
if (MemVT.isScalableVector()) {
// Scalable case: the byte increment is built with getVScale, and the
// pointer info is rebuilt from the address space alone.
SDValue BytesIncrement = DAG.getVScale(
DL, Ptr.getValueType(),
APInt(Ptr.getValueSizeInBits().getFixedSize(), IncrementSize));
MPI = MachinePointerInfo(N->getPointerInfo().getAddrSpace());
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement);
} else {
// Fixed-size case: a constant byte offset is applied to both the pointer
// info and the pointer.
MPI = N->getPointerInfo().getWithOffset(IncrementSize);
// Increment the pointer to the other half.
Ptr = DAG.getObjectPtrOffset(DL, Ptr, IncrementSize);
/// Split the result of a two-operand vector node: both operands are fetched as
/// already-split halves, and the same opcode (with the node's flags) is
/// re-created once for the low halves and once for the high halves.
///
/// \param N  Node whose result is being split.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the declarations of LHSLo/LHSHi/RHSLo/RHSHi and the closing
/// brace are not visible in this listing - confirm against the upstream file.
void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
SDLoc dl(N);
// The node flags are forwarded unchanged to both halves.
const SDNodeFlags Flags = N->getFlags();
unsigned Opcode = N->getOpcode();
Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags);
Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags);
/// Split the result of a three-operand vector node. All three operands are
/// fetched as already-split halves and the node's opcode and flags are reused
/// to build one node from the low halves and one from the high halves.
///
/// \param N  Node whose result is being split.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the closing brace is not visible in this listing.
void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Op0Lo, Op0Hi;
GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi);
SDValue Op1Lo, Op1Hi;
GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi);
SDValue Op2Lo, Op2Hi;
GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi);
SDLoc dl(N);
// The result type of each half is taken from the matching half of operand 0.
Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(), Op0Lo, Op1Lo,
Op2Lo, N->getFlags());
Hi = DAG.getNode(N->getOpcode(), dl, Op0Hi.getValueType(), Op0Hi, Op1Hi,
Op2Hi, N->getFlags());
/// Split the result of a node with two vector operands plus a third operand
/// that is passed through unsplit: operands 0 and 1 are fetched as split
/// halves, while operand 2 is handed unchanged to both new nodes.
///
/// \param N  Node whose result is being split.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the declarations of LHSLo/LHSHi/RHSLo/RHSHi, the trailing
/// argument(s) of both getNode calls and the closing brace are not visible in
/// this listing - confirm against the upstream file.
void DAGTypeLegalizer::SplitVecRes_FIX(SDNode *N, SDValue &Lo, SDValue &Hi) {
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
SDLoc dl(N);
// Operand 2 is not split; the same value feeds both halves.
SDValue Op2 = N->getOperand(2);
unsigned Opcode = N->getOpcode();
Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Op2,
Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Op2,
/// Split the result of a BITCAST whose result is a vector. The strategy is
/// chosen from the legalization action of the input type: expanded scalars and
/// split vectors are bitcast piecewise; otherwise the input is converted to an
/// integer, split by hand, and each piece is bitcast to the destination half.
///
/// \param N  The BITCAST node.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the LoVT/HiVT declarations, the break/return statements
/// between the switch cases and several closing braces are not visible in
/// this listing, so the exact control flow between cases cannot be confirmed
/// from here - check against the upstream file.
void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
SDValue &Hi) {
// We know the result is a vector. The input may be either a vector or a
// scalar value.
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
SDLoc dl(N);
SDValue InOp = N->getOperand(0);
EVT InVT = InOp.getValueType();
// Handle some special cases efficiently.
switch (getTypeAction(InVT)) {
case TargetLowering::TypeLegal:
case TargetLowering::TypePromoteInteger:
case TargetLowering::TypePromoteFloat:
case TargetLowering::TypeSoftPromoteHalf:
case TargetLowering::TypeSoftenFloat:
case TargetLowering::TypeScalarizeVector:
case TargetLowering::TypeWidenVector:
case TargetLowering::TypeExpandInteger:
case TargetLowering::TypeExpandFloat:
// A scalar to vector conversion, where the scalar needs expansion.
// If the vector is being split in two then we can just convert the
// expanded pieces.
if (LoVT == HiVT) {
GetExpandedOp(InOp, Lo, Hi);
// On big-endian targets the expanded pieces are swapped before the bitcast.
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
case TargetLowering::TypeSplitVector:
// If the input is a vector that needs to be split, convert each split
// piece of the input now.
GetSplitVector(InOp, Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
case TargetLowering::TypeScalarizeScalableVector:
report_fatal_error("Scalarization of scalable vectors is not supported.");
// In the general case, convert the input to an integer and split it by hand.
EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
// Big-endian: swap the integer piece types before splitting, then swap the
// resulting pieces back afterwards.
if (DAG.getDataLayout().isBigEndian())
std::swap(LoIntVT, HiIntVT);
SplitInteger(BitConvertToInteger(InOp), LoIntVT, HiIntVT, Lo, Hi);
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
/// Split a BUILD_VECTOR result by partitioning its operand list: the first
/// LoNumElts operands form the low build vector and the remaining operands
/// form the high one.
///
/// \param N  The BUILD_VECTOR node.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the LoVT/HiVT declaration and the closing brace are not
/// visible in this listing.
void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
// The element count of the low destination type decides the split point.
unsigned LoNumElts = LoVT.getVectorNumElements();
SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts);
Lo = DAG.getBuildVector(LoVT, dl, LoOps);
SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end());
Hi = DAG.getBuildVector(HiVT, dl, HiOps);
/// Split a CONCAT_VECTORS result. The operand count must be even (asserted).
/// With exactly two operands they are used directly as the two halves;
/// otherwise the first half of the operands is concatenated into Lo and the
/// second half into Hi.
///
/// \param N  The CONCAT_VECTORS node.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the end of the `NumSubvectors == 1` branch (presumably an
/// early return), the LoVT/HiVT declaration and the closing brace are not
/// visible in this listing - confirm against the upstream file.
void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
SDValue &Hi) {
assert(!(N->getNumOperands() & 1) && "Unsupported CONCAT_VECTORS");
SDLoc dl(N);
// Number of input subvectors that go into each half of the result.
unsigned NumSubvectors = N->getNumOperands() / 2;
if (NumSubvectors == 1) {
Lo = N->getOperand(0);
Hi = N->getOperand(1);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+NumSubvectors);
Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, LoOps);
SmallVector<SDValue, 8> HiOps(N->op_begin()+NumSubvectors, N->op_end());
Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HiVT, HiOps);
/// Split an EXTRACT_SUBVECTOR result into two narrower extracts from the same
/// source vector: Lo starts at the original index, and Hi starts at the
/// original index plus the element count of the low destination type.
///
/// \param N  The EXTRACT_SUBVECTOR node; operand 1 is cast to ConstantSDNode,
///           so the index must be a constant.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the LoVT/HiVT declaration, the leading arguments of the Hi
/// getNode call and the closing brace are not visible in this listing.
void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Vec = N->getOperand(0);
SDValue Idx = N->getOperand(1);
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx);
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
Hi = DAG.getNode(
DAG.getVectorIdxConstant(IdxVal + LoVT.getVectorNumElements(), dl));
/// Split an INSERT_SUBVECTOR result. When the insertion index is 0 and the
/// subvector fits inside the low half, the subvector is inserted straight into
/// the low half. Otherwise the whole vector is stored to a stack temporary,
/// the subvector is stored over it at the requested index, and the two halves
/// are loaded back.
///
/// \param N  The INSERT_SUBVECTOR node (operands: vector, subvector, index);
///           the index is cast to ConstantSDNode, so it must be a constant.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the LoVT/HiVT declaration, the end of the fast-path branch
/// (presumably a return), the trailing arguments of the getStore/getLoad calls
/// and the closing brace are not visible in this listing - confirm upstream.
void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Vec = N->getOperand(0);
SDValue SubVec = N->getOperand(1);
SDValue Idx = N->getOperand(2);
SDLoc dl(N);
GetSplitVector(Vec, Lo, Hi);
EVT VecVT = Vec.getValueType();
unsigned VecElems = VecVT.getVectorNumElements();
unsigned SubElems = SubVec.getValueType().getVectorNumElements();
// If we know the index is 0, and we know the subvector doesn't cross the
// boundary between the halves, we can avoid spilling the vector, and insert
// into the lower half of the split vector directly.
// TODO: The IdxVal == 0 constraint is artificial, we could do this whenever
// there is no boundary crossing. But those cases don't seem to get hit in
// practice.
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if ((IdxVal == 0) && (IdxVal + SubElems <= VecElems / 2)) {
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, LoVT, Lo, SubVec, Idx);
// Spill the vector to the stack.
// In cases where the vector is illegal it will be broken down into parts
// and stored in parts - we should use the alignment for the smallest part.
Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
SDValue StackPtr =
DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
// Store the new subvector into the specified index.
SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
Store = DAG.getStore(Store, dl, SubVec, SubVecPtr,
// Load the Lo part from the stack slot.
Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo,
// Increment the pointer to the other part.
unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
StackPtr = DAG.getMemBasePlusOffset(StackPtr, IncrementSize, dl);
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr,
PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
/// Split an FPOWI result: operand 0 is fetched as split halves and operand 1
/// is passed unchanged to both new FPOWI nodes.
///
/// \param N  The FPOWI node.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the closing brace is not visible in this listing.
void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc dl(N);
// Lo/Hi first hold the split input, then are overwritten with the results.
GetSplitVector(N->getOperand(0), Lo, Hi);
Lo = DAG.getNode(ISD::FPOWI, dl, Lo.getValueType(), Lo, N->getOperand(1));
Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, N->getOperand(1));
/// Split an FCOPYSIGN result. Operand 0 is fetched as split halves; operand 1
/// is obtained either from the legalizer's split map or via DAG.SplitVector.
/// One FCOPYSIGN is then built per half, typed after the operand-0 half.
///
/// \param N  The FCOPYSIGN node.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the declarations of LHSLo/LHSHi/RHSLo/RHSHi, the `else`
/// that presumably separates the two ways of splitting RHS, and the closing
/// brace are not visible in this listing - confirm against the upstream file.
void DAGTypeLegalizer::SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo,
SDValue &Hi) {
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
SDLoc DL(N);
SDValue RHS = N->getOperand(1);
EVT RHSVT = RHS.getValueType();
if (getTypeAction(RHSVT) == TargetLowering::TypeSplitVector)
GetSplitVector(RHS, RHSLo, RHSHi);
std::tie(RHSLo, RHSHi) = DAG.SplitVector(RHS, SDLoc(RHS));
Lo = DAG.getNode(ISD::FCOPYSIGN, DL, LHSLo.getValueType(), LHSLo, RHSLo);
Hi = DAG.getNode(ISD::FCOPYSIGN, DL, LHSHi.getValueType(), LHSHi, RHSHi);
/// Split the result of an "in register" operation: operand 0 is fetched as
/// split halves and the node's opcode is re-created for each half, typed
/// after the matching operand-0 half.
///
/// \param N  Node whose result is being split.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): this listing is heavily truncated here - the declarations of
/// LHSLo/LHSHi/LoVT/HiVT, the right-hand side of the std::tie assignment, the
/// trailing argument(s) of both getNode calls and the closing brace are not
/// visible. Confirm against the upstream file before relying on this summary.
void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
SDLoc dl(N);
std::tie(LoVT, HiVT) =
Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo,
Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi,
/// Split the result of an *_EXTEND_VECTOR_INREG style node. Only the low half
/// of the input feeds the result: Lo extends InLo directly, and Hi extends a
/// shuffle of InLo that moves the next OutNumElements elements to the bottom.
///
/// \param N  Node whose result is being split.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the `else` that presumably separates the two ways of
/// splitting operand 0, the OutLoVT/OutHiVT declaration and the closing brace
/// are not visible in this listing - confirm against the upstream file.
void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
unsigned Opcode = N->getOpcode();
SDValue N0 = N->getOperand(0);
SDLoc dl(N);
SDValue InLo, InHi;
if (getTypeAction(N0.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(N0, InLo, InHi);
std::tie(InLo, InHi) = DAG.SplitVectorOperand(N, 0);
EVT InLoVT = InLo.getValueType();
unsigned InNumElements = InLoVT.getVectorNumElements();
std::tie(OutLoVT, OutHiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
unsigned OutNumElements = OutLoVT.getVectorNumElements();
// InLo must hold at least the elements needed by both output halves.
assert((2 * OutNumElements) <= InNumElements &&
"Illegal extend vector in reg split");
// *_EXTEND_VECTOR_INREG instructions extend the lowest elements of the
// input vector (i.e. we only use InLo):
// OutLo will extend the first OutNumElements from InLo.
// OutHi will extend the next OutNumElements from InLo.
// Shuffle the elements from InLo for OutHi into the bottom elements to
// create a 'fake' InHi.
// Mask entries left at -1 are undefined lanes.
SmallVector<int, 8> SplitHi(InNumElements, -1);
for (unsigned i = 0; i != OutNumElements; ++i)
SplitHi[i] = i + OutNumElements;
InHi = DAG.getVectorShuffle(InLoVT, dl, InLo, DAG.getUNDEF(InLoVT), SplitHi);
Lo = DAG.getNode(Opcode, dl, OutLoVT, InLo);
Hi = DAG.getNode(Opcode, dl, OutHiVT, InHi);
/// Split the result of a chained (strict FP) vector node. Operand 0 is the
/// chain and is given to both halves; every remaining vector operand is split
/// and scalar operands are passed to both halves unchanged. The two new
/// output chains are merged with a TokenFactor that replaces result 1 of N.
///
/// \param N  Node whose result is being split; result 1 is its chain.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the LoVT/HiVT declaration, the `else` that presumably
/// separates the two ways of splitting an operand, the closing braces of the
/// loop and the trailing argument(s) of both getNode calls are not visible in
/// this listing - confirm against the upstream file.
void DAGTypeLegalizer::SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
unsigned NumOps = N->getNumOperands();
SDValue Chain = N->getOperand(0);
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
SmallVector<SDValue, 4> OpsLo(NumOps);
SmallVector<SDValue, 4> OpsHi(NumOps);
// The Chain is the first operand.
OpsLo[0] = Chain;
OpsHi[0] = Chain;
// Now process the remaining operands.
for (unsigned i = 1; i < NumOps; ++i) {
SDValue Op = N->getOperand(i);
// Non-vector operands keep the same value for both halves.
SDValue OpLo = Op;
SDValue OpHi = Op;
EVT InVT = Op.getValueType();
if (InVT.isVector()) {
// If the input also splits, handle it directly for a
// compile time speedup. Otherwise split it by hand.
if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
GetSplitVector(Op, OpLo, OpHi);
std::tie(OpLo, OpHi) = DAG.SplitVectorOperand(N, i);
OpsLo[i] = OpLo;
OpsHi[i] = OpHi;
// Each half produces a value plus an output chain.
EVT LoValueVTs[] = {LoVT, MVT::Other};
EVT HiValueVTs[] = {HiVT, MVT::Other};
Lo = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(LoValueVTs), OpsLo,
Hi = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(HiValueVTs), OpsHi,
// Build a factor node to remember that this Op is independent of the
// other one.
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Lo.getValue(1), Hi.getValue(1));
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Chain);
/// Unroll a chained (strict FP) vector node into per-element scalar nodes and
/// reassemble the scalars into a BUILD_VECTOR with \p ResNE elements. The
/// per-element chains are joined with a TokenFactor that replaces result 1
/// of \p N.
///
/// \param N     The vector node to unroll; operand 0 is its chain.
/// \param ResNE Requested element count of the result; 0 means "use the
///              element count of N's result type". When smaller than that
///              count, only the first ResNE elements are computed.
/// \returns The BUILD_VECTOR of type <ResNE x element type>.
///
/// NOTE(review): the statements that append to Scalars and Chains, the body
/// of the trailing `for (; i < ResNE; ++i)` loop and several closing braces
/// are not visible in this listing - confirm against the upstream file.
SDValue DAGTypeLegalizer::UnrollVectorOp_StrictFP(SDNode *N, unsigned ResNE) {
SDValue Chain = N->getOperand(0);
EVT VT = N->getValueType(0);
unsigned NE = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
SDLoc dl(N);
SmallVector<SDValue, 8> Scalars;
SmallVector<SDValue, 4> Operands(N->getNumOperands());
// If ResNE is 0, fully unroll the vector op.
if (ResNE == 0)
ResNE = NE;
else if (NE > ResNE)
NE = ResNE;
//The results of each unrolled operation, including the chain.
EVT ChainVTs[] = {EltVT, MVT::Other};
SmallVector<SDValue, 8> Chains;
// i is declared outside the loop because the later loop continues from it.
unsigned i;
for (i = 0; i != NE; ++i) {
// Every scalar node starts from the original input chain.
Operands[0] = Chain;
for (unsigned j = 1, e = N->getNumOperands(); j != e; ++j) {
SDValue Operand = N->getOperand(j);
EVT OperandVT = Operand.getValueType();
if (OperandVT.isVector()) {
// Vector operands contribute their i-th element.
EVT OperandEltVT = OperandVT.getVectorElementType();
Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT,
Operand, DAG.getVectorIdxConstant(i, dl));
} else {
Operands[j] = Operand;
SDValue Scalar = DAG.getNode(N->getOpcode(), dl, ChainVTs, Operands);
//Add in the scalar as well as its chain value to the
//result vectors.
for (; i < ResNE; ++i)
// Build a new factor node to connect the chain back together.
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
ReplaceValueWith(SDValue(N, 1), Chain);
// Create a new BUILD_VECTOR node
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, ResNE);
return DAG.getBuildVector(VecVT, dl, Scalars);
/// Split one result of a two-result overflow node (value in result 0,
/// overflow flag in result 1). Both operands are split, a low and a high node
/// are built with (value, overflow) result lists, and result \p ResNo of each
/// becomes Lo/Hi. The other result is also dealt with here: it is registered
/// as split if its type is split, otherwise it is replaced by a value built
/// from the two halves.
///
/// \param N     The overflow node.
/// \param ResNo Index (0 or 1) of the result being split.
/// \param Lo    Receives the low half of result ResNo.
/// \param Hi    Receives the high half of result ResNo.
///
/// NOTE(review): the declarations of the Lo*/Hi* value types and operand
/// halves, the leading arguments of the getNode call that builds OtherVal and
/// the closing braces are not visible in this listing - confirm upstream.
void DAGTypeLegalizer::SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
SDValue &Lo, SDValue &Hi) {
SDLoc dl(N);
EVT ResVT = N->getValueType(0);
EVT OvVT = N->getValueType(1);
std::tie(LoResVT, HiResVT) = DAG.GetSplitDestVTs(ResVT);
std::tie(LoOvVT, HiOvVT) = DAG.GetSplitDestVTs(OvVT);
// Use the legalizer's recorded halves when the value type is being split;
// otherwise split the operands directly.
if (getTypeAction(ResVT) == TargetLowering::TypeSplitVector) {
GetSplitVector(N->getOperand(0), LoLHS, HiLHS);
GetSplitVector(N->getOperand(1), LoRHS, HiRHS);
} else {
std::tie(LoLHS, HiLHS) = DAG.SplitVectorOperand(N, 0);
std::tie(LoRHS, HiRHS) = DAG.SplitVectorOperand(N, 1);
unsigned Opcode = N->getOpcode();
SDVTList LoVTs = DAG.getVTList(LoResVT, LoOvVT);
SDVTList HiVTs = DAG.getVTList(HiResVT, HiOvVT);
SDNode *LoNode = DAG.getNode(Opcode, dl, LoVTs, LoLHS, LoRHS).getNode();
SDNode *HiNode = DAG.getNode(Opcode, dl, HiVTs, HiLHS, HiRHS).getNode();
Lo = SDValue(LoNode, ResNo);
Hi = SDValue(HiNode, ResNo);
// Replace the other vector result not being explicitly split here.
// With two results, 1 - ResNo selects the result that is not ResNo.
unsigned OtherNo = 1 - ResNo;
EVT OtherVT = N->getValueType(OtherNo);
if (getTypeAction(OtherVT) == TargetLowering::TypeSplitVector) {
SetSplitVector(SDValue(N, OtherNo),
SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
} else {
SDValue OtherVal = DAG.getNode(
SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
/// Split an INSERT_VECTOR_ELT result. With a constant index the element is
/// inserted directly into whichever half contains it. Otherwise, after giving
/// the target a chance to custom-lower the node, the vector is spilled to a
/// stack temporary, the element is stored at the computed element address,
/// and both halves are reloaded (and truncated back if the element type had
/// to be widened to i8).
///
/// \param N  The INSERT_VECTOR_ELT node (operands: vector, element, index).
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): several lines are not visible in this listing - the start of
/// the Lo insertion statement, the else/return lines of the constant-index
/// path, the return after CustomLowerNode, the trailing arguments of
/// getVectorVT/getStore, the LoVT/HiVT declaration and the closing braces.
/// Confirm the exact control flow against the upstream file.
void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Vec = N->getOperand(0);
SDValue Elt = N->getOperand(1);
SDValue Idx = N->getOperand(2);
SDLoc dl(N);
GetSplitVector(Vec, Lo, Hi);
if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
unsigned IdxVal = CIdx->getZExtValue();
unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
// Indices below LoNumElts target the low half; the Hi insertion below
// rebases the index by LoNumElts.
if (IdxVal < LoNumElts)
Lo.getValueType(), Lo, Elt, Idx);
Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
DAG.getVectorIdxConstant(IdxVal - LoNumElts, dl));
// See if the target wants to custom expand this node.
if (CustomLowerNode(N, N->getValueType(0), true))
// Make the vector elements byte-addressable if they aren't already.
EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();
if (VecVT.getScalarSizeInBits() < 8) {
EltVT = MVT::i8;
VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
// Extend the element type to match if needed.
if (EltVT.bitsGT(Elt.getValueType()))
Elt = DAG.getNode(ISD::ANY_EXTEND, dl, EltVT, Elt);
// Spill the vector to the stack.
// In cases where the vector is illegal it will be broken down into parts
// and stored in parts - we should use the alignment for the smallest part.
Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
SDValue StackPtr =
DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
// Store the new element. This may be larger than the vector element type,
// so use a truncating store.
SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
Store = DAG.getTruncStore(
Store, dl, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF), EltVT,
commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
// Load the Lo part from the stack slot.
Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
// Increment the pointer to the other part.
unsigned IncrementSize = LoVT.getSizeInBits() / 8;
StackPtr = DAG.getMemBasePlusOffset(StackPtr, IncrementSize, dl);
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
// If we adjusted the original type, we need to truncate the results.
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
if (LoVT != Lo.getValueType())
Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Lo);
if (HiVT != Hi.getValueType())
Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
/// Split a SCALAR_TO_VECTOR result: the low half is a SCALAR_TO_VECTOR of the
/// same scalar operand in the low destination type, and the high half is
/// undefined.
///
/// \param N  The SCALAR_TO_VECTOR node.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result (an UNDEF of HiVT).
///
/// NOTE(review): the LoVT/HiVT declaration and the closing brace are not
/// visible in this listing.
void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
Lo = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoVT, N->getOperand(0));
Hi = DAG.getUNDEF(HiVT);
/// Split the result of an unindexed vector load into two loads: one of the
/// low memory type at the original address and one of the high memory type at
/// the address advanced by IncrementPointer. If either half's memory type is
/// not byte sized, the load is scalarized and the resulting vector is split.
/// The new chains replace result 1 of the original load.
///
/// \param LD The load being split; must be unindexed (asserted).
/// \param Lo Receives the low half of the loaded value.
/// \param Hi Receives the high half of the loaded value.
///
/// NOTE(review): the declarations of LoVT/HiVT/LoMemVT/HiMemVT, the end of
/// the scalarization branch (presumably a return), the second operand of the
/// TokenFactor and the closing brace are not visible in this listing.
void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
SDValue &Hi) {
assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
SDLoc dl(LD);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0));
ISD::LoadExtType ExtType = LD->getExtensionType();
SDValue Ch = LD->getChain();
SDValue Ptr = LD->getBasePtr();
// The offset operand passed to both new loads is an UNDEF value.
SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
EVT MemoryVT = LD->getMemoryVT();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
// Halves that are not a whole number of bytes cannot be loaded separately.
if (!LoMemVT.isByteSized() || !HiMemVT.isByteSized()) {
SDValue Value, NewChain;
std::tie(Value, NewChain) = TLI.scalarizeVectorLoad(LD, DAG);
std::tie(Lo, Hi) = DAG.SplitVector(Value, dl);
ReplaceValueWith(SDValue(LD, 1), NewChain);
Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset,
LD->getPointerInfo(), LoMemVT, LD->getOriginalAlign(),
MMOFlags, AAInfo);
// Advance Ptr past the low half and compute the matching pointer info.
MachinePointerInfo MPI;
IncrementPointer(LD, LoMemVT, MPI, Ptr);
Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset, MPI,
HiMemVT, LD->getOriginalAlign(), MMOFlags, AAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(LD, 1), Ch);
/// Split the result of an unindexed masked load into a low and a high masked
/// load. The mask and pass-through operands are split alongside the result;
/// the high load reads from the address advanced past the low part. When the
/// high memory type is reported empty, Hi simply aliases Lo. The new chains
/// replace result 1 of the original node.
///
/// \param MLD The masked load being split; must be unindexed (asserted).
/// \param Lo  Receives the low half of the loaded value.
/// \param Hi  Receives the high half of the loaded value.
///
/// NOTE(review): the declarations of LoVT/HiVT/LoMemVT/HiMemVT, the `else`
/// lines that presumably separate the alternative ways of splitting the mask
/// and pass-through, trailing arguments of getMaskedLoad /
/// IncrementMemoryAddress, the first argument of the second
/// getMachineMemOperand call, the second TokenFactor operand and the closing
/// braces are not visible in this listing - confirm against the upstream file.
void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
SDValue &Lo, SDValue &Hi) {
assert(MLD->isUnindexed() && "Indexed masked load during type legalization!");
SDLoc dl(MLD);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
SDValue Ch = MLD->getChain();
SDValue Ptr = MLD->getBasePtr();
SDValue Offset = MLD->getOffset();
assert(Offset.isUndef() && "Unexpected indexed masked load offset");
SDValue Mask = MLD->getMask();
SDValue PassThru = MLD->getPassThru();
Align Alignment = MLD->getOriginalAlign();
ISD::LoadExtType ExtType = MLD->getExtensionType();
// Split Mask operand
SDValue MaskLo, MaskHi;
// A SETCC mask is split by splitting the comparison itself.
if (Mask.getOpcode() == ISD::SETCC) {
SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
} else {
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
EVT MemoryVT = MLD->getMemoryVT();
// Set by GetDependentSplitDestVTs when the high memory part is empty.
bool HiIsEmpty = false;
std::tie(LoMemVT, HiMemVT) =
DAG.GetDependentSplitDestVTs(MemoryVT, LoVT, &HiIsEmpty);
SDValue PassThruLo, PassThruHi;
if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(PassThru, PassThruLo, PassThruHi);
std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MLD->getPointerInfo(), MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MLD->getAAInfo(), MLD->getRanges());
Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, Offset, MaskLo, PassThruLo, LoMemVT,
MMO, MLD->getAddressingMode(), ExtType,
if (HiIsEmpty) {
// The hi masked load has zero storage size. We therefore simply set it to
// the low masked load and rely on subsequent removal from the chain.
Hi = Lo;
} else {
// Generate hi masked load.
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG,
unsigned HiOffset = LoMemVT.getStoreSize();
MMO = DAG.getMachineFunction().getMachineMemOperand(
MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), Alignment,
MLD->getAAInfo(), MLD->getRanges());
Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, Offset, MaskHi, PassThruHi,
HiMemVT, MMO, MLD->getAddressingMode(), ExtType,
// Build a factor node to remember that this load is independent of the
// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(MLD, 1), Ch);
/// Split the result of a masked gather into a low and a high gather. The
/// mask, pass-through and index operands are split; the chain, base pointer
/// and scale are shared by both halves, as is a single memory operand of
/// unknown size. The new chains replace result 1 of the original node.
///
/// \param MGT The masked gather being split.
/// \param Lo  Receives the low half of the gathered value.
/// \param Hi  Receives the high half of the gathered value.
///
/// NOTE(review): the LoVT/HiVT declaration, the `else` lines that presumably
/// separate the alternative ways of splitting each operand, the last argument
/// of getMachineMemOperand, the second TokenFactor operand and the closing
/// braces are not visible in this listing - confirm against the upstream file.
void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
SDValue &Lo, SDValue &Hi) {
SDLoc dl(MGT);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));
SDValue Ch = MGT->getChain();
SDValue Ptr = MGT->getBasePtr();
SDValue Mask = MGT->getMask();
SDValue PassThru = MGT->getPassThru();
SDValue Index = MGT->getIndex();
SDValue Scale = MGT->getScale();
Align Alignment = MGT->getOriginalAlign();
// Split Mask operand
SDValue MaskLo, MaskHi;
// A SETCC mask is split by splitting the comparison itself.
if (Mask.getOpcode() == ISD::SETCC) {
SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
} else {
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
SDValue PassThruLo, PassThruHi;
if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(PassThru, PassThruLo, PassThruHi);
std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
SDValue IndexHi, IndexLo;
if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Index, IndexLo, IndexHi);
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
// One memory operand, of unknown size, is reused for both gathers.
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MGT->getPointerInfo(), MachineMemOperand::MOLoad,
MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),
SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo,
MMO, MGT->getIndexType());
SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi,
MMO, MGT->getIndexType());
// Build a factor node to remember that this load is independent of the
// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(MGT, 1), Ch);
/// Split the result of a vector comparison: both compared operands are split
/// and the node's opcode is rebuilt per half, with operand 2 passed unchanged
/// to both halves.
///
/// \param N  The comparison node; its result and operand 0 must be vectors
///           (asserted).
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the LoVT/HiVT declaration, the right-hand side of both `if`
/// conditions, the `else` lines that presumably separate the two ways of
/// splitting each operand, and the closing brace are not visible in this
/// listing - confirm against the upstream file.
void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operand types must be vectors");
SDLoc DL(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
// If the input also splits, handle it directly. Otherwise split it by hand.
// LL/LH are the halves of operand 0, RL/RH the halves of operand 1.
SDValue LL, LH, RL, RH;
if (getTypeAction(N->getOperand(0).getValueType()) ==
GetSplitVector(N->getOperand(0), LL, LH);
std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
if (getTypeAction(N->getOperand(1).getValueType()) ==
GetSplitVector(N->getOperand(1), RL, RH);
std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
/// Split the result of a single-input vector node. The input operand (operand
/// 1 for strict FP opcodes, operand 0 otherwise) is split and the opcode is
/// rebuilt per half in the split destination types. FP_ROUND additionally
/// forwards operand 1; every other opcode forwards the node flags.
///
/// \param N  Node whose result is being split.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the LoVT/HiVT declaration, the `else` that presumably
/// separates the two ways of splitting the input, the trailing argument(s) of
/// the FP_ROUND getNode calls and the closing braces are not visible in this
/// listing - confirm against the upstream file.
void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
// Get the dest types - they may not match the input types, e.g. int_to_fp.
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
// If the input also splits, handle it directly for a compile time speedup.
// Otherwise split it by hand.
// Strict FP opcodes use operand 1 as the input (operand 0 is skipped).
unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
EVT InVT = N->getOperand(OpNo).getValueType();
if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
GetSplitVector(N->getOperand(OpNo), Lo, Hi);
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, OpNo);
if (N->getOpcode() == ISD::FP_ROUND) {
Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1),
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getOperand(1),
} else {
Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getFlags());
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getFlags());
/// Split the result of a vector extend. When the extend more than doubles the
/// total vector width and the legality conditions listed below hold, the
/// source is first extended by one step (element width doubled), that
/// intermediate vector is split, and each half is extended the rest of the
/// way. Otherwise the generic unary-op splitting is used.
///
/// \param N  The extend node.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the LoVT/HiVT declaration, the end of the incremental-extend
/// branch (presumably a return) and the closing braces are not visible in
/// this listing - confirm against the upstream file.
void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc dl(N);
EVT SrcVT = N->getOperand(0).getValueType();
EVT DestVT = N->getValueType(0);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(DestVT);
// We can do better than a generic split operation if the extend is doing
// more than just doubling the width of the elements and the following are
// true:
// - The number of vector elements is even,
// - the source type is legal,
// - the type of a split source is illegal,
// - the type of an extended (by doubling element size) source is legal, and
// - the type of that extended source when split is legal.
// This won't necessarily completely legalize the operation, but it will
// more effectively move in the right direction and prevent falling down
// to scalarization in many cases due to the input vector being split too
// far.
if ((SrcVT.getVectorMinNumElements() & 1) == 0 &&
SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) {
LLVMContext &Ctx = *DAG.getContext();
// NewSrcVT: source with widened elements; SplitSrcVT: source with half the
// element count.
EVT NewSrcVT = SrcVT.widenIntegerVectorElementType(Ctx);
EVT SplitSrcVT = SrcVT.getHalfNumVectorElementsVT(Ctx);
EVT SplitLoVT, SplitHiVT;
std::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT);
if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) &&
TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) {
LLVM_DEBUG(dbgs() << "Split vector extend via incremental extend:";
N->dump(&DAG); dbgs() << "\n");
// Extend the source vector by one step.
SDValue NewSrc =
DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0));
// Get the low and high halves of the new, extended one step, vector.
std::tie(Lo, Hi) = DAG.SplitVector(NewSrc, dl);
// Extend those vector halves the rest of the way.
Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
// Fall back to the generic unary operator splitting otherwise.
SplitVecRes_UnaryOp(N, Lo, Hi);
/// Split the result of a VECTOR_SHUFFLE. The two shuffle operands are split
/// into four half-width inputs. For each output half, the relevant part of
/// the mask is scanned: if it references at most two of the four inputs, a
/// new half-width shuffle of those inputs is emitted; if it references none,
/// the half is UNDEF; otherwise the elements are extracted one by one and
/// assembled with a BUILD_VECTOR.
///
/// \param N  The shuffle node.
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): many lines are not visible in this listing - the bodies of
/// the "mask element does not index into any input" branches, the
/// break/continue statements of the operand search, the vector operand of the
/// EXTRACT_VECTOR_ELT call, the statement that resets Ops between halves (if
/// any) and the closing braces. Confirm exact control flow upstream.
void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
SDValue &Lo, SDValue &Hi) {
// The low and high parts of the original input give four input vectors.
SDValue Inputs[4];
SDLoc dl(N);
GetSplitVector(N->getOperand(0), Inputs[0], Inputs[1]);
GetSplitVector(N->getOperand(1), Inputs[2], Inputs[3]);
EVT NewVT = Inputs[0].getValueType();
unsigned NewElts = NewVT.getVectorNumElements();
// If Lo or Hi uses elements from at most two of the four input vectors, then
// express it as a vector shuffle of those two inputs. Otherwise extract the
// input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
SmallVector<int, 16> Ops;
// High == 0 builds Lo, High == 1 builds Hi.
for (unsigned High = 0; High < 2; ++High) {
SDValue &Output = High ? Hi : Lo;
// Build a shuffle mask for the output, discovering on the fly which
// input vectors to use as shuffle operands (recorded in InputUsed).
// If building a suitable shuffle vector proves too hard, then bail
// out with useBuildVector set.
unsigned InputUsed[2] = { -1U, -1U }; // Not yet discovered.
unsigned FirstMaskIdx = High * NewElts;
bool useBuildVector = false;
for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
// The mask element. This indexes into the input.
int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
// The input vector this mask element indexes into.
// The unsigned cast makes a negative Idx yield an out-of-range Input.
unsigned Input = (unsigned)Idx / NewElts;
if (Input >= array_lengthof(Inputs)) {
// The mask element does not index into any input vector.
// Turn the index into an offset from the start of the input vector.
Idx -= Input * NewElts;
// Find or create a shuffle vector operand to hold this input.
unsigned OpNo;
for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
if (InputUsed[OpNo] == Input) {
// This input vector is already an operand.
} else if (InputUsed[OpNo] == -1U) {
// Create a new operand for this input vector.
InputUsed[OpNo] = Input;
if (OpNo >= array_lengthof(InputUsed)) {
// More than two input vectors used! Give up on trying to create a
// shuffle vector. Insert all elements into a BUILD_VECTOR instead.
useBuildVector = true;
// Add the mask index for the new shuffle vector.
Ops.push_back(Idx + OpNo * NewElts);
if (useBuildVector) {
EVT EltVT = NewVT.getVectorElementType();
SmallVector<SDValue, 16> SVOps;
// Extract the input elements by hand.
for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
// The mask element. This indexes into the input.
int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
// The input vector this mask element indexes into.
unsigned Input = (unsigned)Idx / NewElts;
if (Input >= array_lengthof(Inputs)) {
// The mask element is "undef" or indexes off the end of the input.
// Turn the index into an offset from the start of the input vector.
Idx -= Input * NewElts;
// Extract the vector element by hand.
SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
DAG.getVectorIdxConstant(Idx, dl)));
// Construct the Lo/Hi output using a BUILD_VECTOR.
Output = DAG.getBuildVector(NewVT, dl, SVOps);
} else if (InputUsed[0] == -1U) {
// No input vectors were used! The result is undefined.
Output = DAG.getUNDEF(NewVT);
} else {
SDValue Op0 = Inputs[InputUsed[0]];
// If only one input was used, use an undefined vector for the other.
SDValue Op1 = InputUsed[1] == -1U ?
DAG.getUNDEF(NewVT) : Inputs[InputUsed[1]];
// At least one input vector was used. Create a new shuffle vector.
Output = DAG.getVectorShuffle(NewVT, dl, Op0, Op1, Ops);
/// Split the result of a VAARG node into two VAARG nodes of the half-width
/// vector type. The second node is chained on the first (it uses
/// Lo.getValue(1) as its chain), and the chain produced by the second node
/// replaces result 1 of the original node.
///
/// \param N  The VAARG node (operands: chain, pointer, source value).
/// \param Lo Receives the low half of the result.
/// \param Hi Receives the high half of the result.
///
/// NOTE(review): the initializer of Alignment and the closing brace are not
/// visible in this listing - confirm against the upstream file.
void DAGTypeLegalizer::SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
EVT OVT = N->getValueType(0);
// Both halves use the same type: OVT with half the number of elements.
EVT NVT = OVT.getHalfNumVectorElementsVT(*DAG.getContext());
SDValue Chain = N->getOperand(0);
SDValue Ptr = N->getOperand(1);
SDValue SV = N->getOperand(2);
SDLoc dl(N);
const Align Alignment =
Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, SV, Alignment.value());
Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, SV, Alignment.value());
Chain = Hi.getValue(1);
// Modified the chain - switch anything that used the old chain to use
// the new one.
ReplaceValueWith(SDValue(N, 1), Chain);
// Operand Vector Splitting
/// This method is called when the specified operand of the specified node is
/// found to need vector splitting. At this point, all of the result types of
/// the node are known to be legal, but other operands of the node may need
/// legalization as well as the specified one.
///
/// \param N    Node that has an operand requiring splitting.
/// \param OpNo Index of that operand.
/// \returns true when a handler returned \p N itself (the node was updated in
///          place); false when the target custom-lowered the node, when the
///          handler returned a null result, or after result 0 of \p N has
///          been replaced with the handler's result.
///
/// NOTE(review): many `case` labels, `break`s, the `default:` label, `else`
/// lines, the matching `#endif` and the closing braces are not visible in
/// this listing, so the opcode-to-handler mapping below is incomplete as
/// shown - confirm against the upstream file.
bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
LLVM_DEBUG(dbgs() << "Split node operand: "; N->dump(&DAG); dbgs() << "\n");
SDValue Res = SDValue();
// See if the target wants to custom split this node.
if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
return false;
if (!Res.getNode()) {
switch (N->getOpcode()) {
#ifndef NDEBUG
dbgs() << "SplitVectorOperand Op #" << OpNo << ": ";
dbgs() << "\n";
report_fatal_error("Do not know how to split this operator's "
case ISD::SETCC: Res = SplitVecOp_VSETCC(N); break;
case ISD::BITCAST: Res = SplitVecOp_BITCAST(N); break;
case ISD::CONCAT_VECTORS: Res = SplitVecOp_CONCAT_VECTORS(N); break;
Res = SplitVecOp_TruncateHelper(N);
case ISD::FP_ROUND: Res = SplitVecOp_FP_ROUND(N); break;
case ISD::FCOPYSIGN: Res = SplitVecOp_FCOPYSIGN(N); break;
case ISD::STORE:
Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
Res = SplitVecOp_MSCATTER(cast<MaskedScatterSDNode>(N), OpNo);
Res = SplitVecOp_MGATHER(cast<MaskedGatherSDNode>(N), OpNo);
Res = SplitVecOp_VSELECT(N, OpNo);
// A result narrower than the input operand is routed to the truncate helper.
if (N->getValueType(0).bitsLT(
N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType()))
Res = SplitVecOp_TruncateHelper(N);
Res = SplitVecOp_UnaryOp(N);
case ISD::CTTZ:
case ISD::CTLZ:
case ISD::CTPOP:
Res = SplitVecOp_UnaryOp(N);
Res = SplitVecOp_ExtVecInRegOp(N);
Res = SplitVecOp_VECREDUCE(N, OpNo);
// If the result is null, the sub-method took care of registering results etc.
if (!Res.getNode()) return false;
// If the result is N, the sub-method updated N in place. Tell the legalizer
// core about this.
if (Res.getNode() == N)
return true;
// Strict FP nodes are expected to have two results, all others exactly one.
if (N->isStrictFPOpcode())
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 2 &&
"Invalid operand expansion");
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
"Invalid operand expansion");
ReplaceValueWith(SDValue(N, 0), Res);
return false;
// Split a VSELECT whose mask operand (operand 0) has an illegal type. The
// mask and both value operands are split in half, a VSELECT is built for each
// half, and the two results are concatenated back to the original value type.
SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
// The only possibility for an illegal operand is the mask, since result type
// legalization would have handled this node already otherwise.
assert(OpNo == 0 && "Illegal operand must be mask");
SDValue Mask = N->getOperand(0);
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
EVT Src0VT = Src0.getValueType();
SDLoc DL(N);
assert(Mask.getValueType().isVector() && "VSELECT without a vector mask?");
SDValue Lo, Hi;
GetSplitVector(N->getOperand(0), Lo, Hi);
assert(Lo.getValueType() == Hi.getValueType() &&
"Lo and Hi have differing types");
// NOTE(review): the declaration of LoOpVT/HiOpVT is not visible in this
// excerpt.
std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(Src0VT);
assert(LoOpVT == HiOpVT && "Asymmetric vector split?");
SDValue LoOp0, HiOp0, LoOp1, HiOp1, LoMask, HiMask;
std::tie(LoOp0, HiOp0) = DAG.SplitVector(Src0, DL);
std::tie(LoOp1, HiOp1) = DAG.SplitVector(Src1, DL);
std::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL);
// Select per half, then glue the halves back together.
SDValue LoSelect =
DAG.getNode(ISD::VSELECT, DL, LoOpVT, LoMask, LoOp0, LoOp1);
SDValue HiSelect =
DAG.getNode(ISD::VSELECT, DL, HiOpVT, HiMask, HiOp0, HiOp1);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect);
// Split the vector operand of a VECREDUCE_* node. The two halves are combined
// element-wise with the binary opcode that corresponds to the reduction, and
// the original reduction is then applied to that half-width partial vector.
SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) {
EVT ResVT = N->getValueType(0);
SDValue Lo, Hi;
SDLoc dl(N);
SDValue VecOp = N->getOperand(OpNo);
EVT VecVT = VecOp.getValueType();
assert(VecVT.isVector() && "Can only split reduce vector operand");
GetSplitVector(VecOp, Lo, Hi);
// NOTE(review): the declaration of LoOpVT/HiOpVT is not visible in this
// excerpt.
std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT);
bool NoNaN = N->getFlags().hasNoNaNs();
// Map the reduction opcode to the binary opcode used to merge the halves.
unsigned CombineOpc = 0;
switch (N->getOpcode()) {
case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break;
case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break;
case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break;
case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break;
case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break;
case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break;
case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break;
case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break;
case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break;
case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break;
case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break;
llvm_unreachable("Unexpected reduce ISD node");
// Use the appropriate scalar instruction on the split subvectors before
// reducing the now partially reduced smaller vector.
SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi, N->getFlags());
return DAG.getNode(N->getOpcode(), dl, ResVT, Partial, N->getFlags());
// Split the vector input of a unary operation whose result type is legal.
// The operation is applied to each half of the input and the two results are
// concatenated into the original result type. For strict FP opcodes the input
// is operand 1 (operand 0 is passed through as the first operand of each
// half), and the node's chain result (value 1) is replaced.
SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);
SDValue Lo, Hi;
SDLoc dl(N);
GetSplitVector(N->getOperand(N->isStrictFPOpcode() ? 1 : 0), Lo, Hi);
EVT InVT = Lo.getValueType();
// Per-half output type: result element type with the element count of one
// input half. The two lines marked -/+ below are diff markers in this text.
EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
- InVT.getVectorNumElements());
+ InVT.getVectorElementCount());
if (N->isStrictFPOpcode()) {
Lo = DAG.getNode(N->getOpcode(), dl, { OutVT, MVT::Other },
{ N->getOperand(0), Lo });
Hi = DAG.getNode(N->getOpcode(), dl, { OutVT, MVT::Other },
{ N->getOperand(0), Hi });
// Build a factor node to remember that this operation is independent
// of the other one.
SDValue Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Ch);
} else {
Lo = DAG.getNode(N->getOpcode(), dl, OutVT, Lo);
Hi = DAG.getNode(N->getOpcode(), dl, OutVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
// Split the vector input of a BITCAST: both halves are converted to integers,
// joined into one integer, and that integer is bitcast to the result type.
SDValue DAGTypeLegalizer::SplitVecOp_BITCAST(SDNode *N) {
// For example, i64 = BITCAST v4i16 on alpha. Typically the vector will
// end up being split all the way down to individual components. Convert the
// split pieces into integers and reassemble.
SDValue Lo, Hi;
GetSplitVector(N->getOperand(0), Lo, Hi);
Lo = BitConvertToInteger(Lo);
Hi = BitConvertToInteger(Hi);
// On big-endian data layouts the two halves are exchanged before joining.
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
JoinIntegers(Lo, Hi));
// Split the vector input of an EXTRACT_SUBVECTOR. The extraction is redirected
// to whichever half contains the start index; for the Hi half the index is
// rebased by the (minimum) element count of the Lo half. The index operand is
// cast to ConstantSDNode, i.e. it is expected to be a constant.
// Lines marked -/+ below are diff markers in this text.
SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
// We know that the extracted result type is legal.
EVT SubVT = N->getValueType(0);
SDValue Idx = N->getOperand(1);
SDLoc dl(N);
SDValue Lo, Hi;
+ if (SubVT.isScalableVector() !=
+ N->getOperand(0).getValueType().isScalableVector())
+ report_fatal_error("Extracting a fixed-length vector from an illegal "
+ "scalable vector is not yet supported");
GetSplitVector(N->getOperand(0), Lo, Hi);
- uint64_t LoElts = Lo.getValueType().getVectorNumElements();
+ uint64_t LoElts = Lo.getValueType().getVectorMinNumElements();
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal < LoElts) {
- assert(IdxVal + SubVT.getVectorNumElements() <= LoElts &&
+ assert(IdxVal + SubVT.getVectorMinNumElements() <= LoElts &&
"Extracted subvector crosses vector split!");
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
} else {
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Hi,
DAG.getVectorIdxConstant(IdxVal - LoElts, dl));
// Split the vector input of an EXTRACT_VECTOR_ELT.
//  - Constant index: N is updated in place to extract from the Lo or Hi half
//    (with the index rebased for Hi).
//  - Otherwise: the target may custom-expand the node (null SDValue returned);
//    failing that, the vector is stored to a stack temporary and the requested
//    element is loaded back from the computed element address.
SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
SDValue Vec = N->getOperand(0);
SDValue Idx = N->getOperand(1);
EVT VecVT = Vec.getValueType();
if (isa<ConstantSDNode>(Idx)) {
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
SDValue Lo, Hi;
GetSplitVector(Vec, Lo, Hi);
uint64_t LoElts = Lo.getValueType().getVectorNumElements();
if (IdxVal < LoElts)
return SDValue(DAG.UpdateNodeOperands(N, Lo, Idx), 0);
return SDValue(DAG.UpdateNodeOperands(N, Hi,
DAG.getConstant(IdxVal - LoElts, SDLoc(N),
Idx.getValueType())), 0);
// See if the target wants to custom expand this node.
if (CustomLowerNode(N, N->getValueType(0), true))
return SDValue();
// Make the vector elements byte-addressable if they aren't already.
SDLoc dl(N);
EVT EltVT = VecVT.getVectorElementType();
// Elements narrower than 8 bits are any-extended to i8 elements first.
if (VecVT.getScalarSizeInBits() < 8) {
EltVT = MVT::i8;
VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
// Store the vector to the stack.
// In cases where the vector is illegal it will be broken down into parts
// and stored in parts - we should use the alignment for the smallest part.
Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
SDValue StackPtr =
DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
// Load back the required element.
StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
// FIXME: This is to handle i1 vectors with elements promoted to i8.
// i1 vector handling needs general improvement.
if (N->getValueType(0).bitsLT(EltVT)) {
SDValue Load = DAG.getLoad(EltVT, dl, Store, StackPtr,
return DAG.getZExtOrTrunc(Load, dl, N->getValueType(0));
// Otherwise load the element with an extending load to the result type.
return DAG.getExtLoad(
ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT,
commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
// Operand split for *_EXTEND_VECTOR_INREG nodes: reuses the result-splitting
// routine to obtain the two halves and concatenates them into the node's
// original result type.
SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
SDValue Lo, Hi;
// *_EXTEND_VECTOR_INREG only reference the lower half of the input, so
// splitting the result has the same effect as splitting the input operand.
SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi);
// Split an operand of a masked gather. The mask, pass-through and index
// vectors are each split (via GetSplitVector when their type action is
// TypeSplitVector; DAG.SplitVector is also called on them), two half-width
// gathers are built, their chains are joined with a TokenFactor and their
// values concatenated. Both results of MGT are replaced here, so a null
// SDValue is returned.
SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
unsigned OpNo) {
SDLoc dl(MGT);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));
SDValue Ch = MGT->getChain();
SDValue Ptr = MGT->getBasePtr();
SDValue Index = MGT->getIndex();
SDValue Scale = MGT->getScale();
SDValue Mask = MGT->getMask();
SDValue PassThru = MGT->getPassThru();
Align Alignment = MGT->getOriginalAlign();
SDValue MaskLo, MaskHi;
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
// Split Mask operand
GetSplitVector(Mask, MaskLo, MaskHi);
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
EVT MemoryVT = MGT->getMemoryVT();
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue PassThruLo, PassThruHi;
if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(PassThru, PassThruLo, PassThruHi);
std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
SDValue IndexHi, IndexLo;
if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Index, IndexLo, IndexHi);
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
// One memory operand (load, unknown size, original alignment) is created and
// passed to both half gathers.
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MGT->getPointerInfo(), MachineMemOperand::MOLoad,
MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),
// Both halves use the original chain, base pointer and scale.
SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl,
OpsLo, MMO, MGT->getIndexType());
SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl,
OpsHi, MMO, MGT->getIndexType());
// Build a factor node to remember that this load is independent of the
// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(MGT, 1), Ch);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MGT->getValueType(0), Lo,
ReplaceValueWith(SDValue(MGT, 0), Res);
return SDValue();
// Split an operand of an unindexed masked store into a Lo and a Hi masked
// store. Returns the Lo store alone when the Hi memory type is empty, and a
// TokenFactor of both stores otherwise.
SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
unsigned OpNo) {
assert(N->isUnindexed() && "Indexed masked store of vector?");
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
SDValue Offset = N->getOffset();
assert(Offset.isUndef() && "Unexpected indexed masked store offset");
SDValue Mask = N->getMask();
SDValue Data = N->getValue();
Align Alignment = N->getOriginalAlign();
SDLoc DL(N);
SDValue DataLo, DataHi;
if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
// Split Data operand
GetSplitVector(Data, DataLo, DataHi);
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
// Split Mask operand
SDValue MaskLo, MaskHi;
// A SETCC mask (when OpNo is 1) is split with the SETCC result splitter.
if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
} else {
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
EVT MemoryVT = N->getMemoryVT();
// The memory type is split to match the type of the Lo data half; HiIsEmpty
// is filled in by GetDependentSplitDestVTs.
bool HiIsEmpty = false;
std::tie(LoMemVT, HiMemVT) =
DAG.GetDependentSplitDestVTs(MemoryVT, DataLo.getValueType(), &HiIsEmpty);
SDValue Lo, Hi, Res;
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
N->getPointerInfo(), MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, N->getAAInfo(), N->getRanges());
Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, Offset, MaskLo, LoMemVT, MMO,
N->getAddressingMode(), N->isTruncatingStore(),
if (HiIsEmpty) {
// The hi masked store has zero storage size.
// Only the lo masked store is needed.
Res = Lo;
} else {
// Advance the pointer past the Lo part; the Hi memory operand is offset by
// the store size of the Lo memory type.
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
unsigned HiOffset = LoMemVT.getStoreSize();
MMO = DAG.getMachineFunction().getMachineMemOperand(
N->getPointerInfo().getWithOffset(HiOffset), MachineMemOperand::MOStore,
HiMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges());
Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, Offset, MaskHi, HiMemVT, MMO,
N->getAddressingMode(), N->isTruncatingStore(),
// Build a factor node to remember that this store is independent of the
// other one.
Res = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
return Res;
// Split an operand of a masked scatter into a Lo scatter followed by a Hi
// scatter. The Hi scatter takes the Lo scatter as its chain operand, and the
// Hi scatter node is returned.
SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
unsigned OpNo) {
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
SDValue Mask = N->getMask();
SDValue Index = N->getIndex();
SDValue Scale = N->getScale();
SDValue Data = N->getValue();
Align Alignment = N->getOriginalAlign();
SDLoc DL(N);
// Split all operands
SDValue DataLo, DataHi;
if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
// Split Data operand
GetSplitVector(Data, DataLo, DataHi);
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
// Split Mask operand
SDValue MaskLo, MaskHi;
// A SETCC mask (when OpNo is 1) is split with the SETCC result splitter.
if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
} else {
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
SDValue IndexHi, IndexLo;
if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Index, IndexLo, IndexHi);
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
SDValue Lo;
// One memory operand (store, unknown size, original alignment) is created
// and passed to both half scatters.
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
N->getPointerInfo(), MachineMemOperand::MOStore,
MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale};
Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
DL, OpsLo, MMO, N->getIndexType());
// The order of the Scatter operation after split is well defined. The "Hi"
// part comes after the "Lo". So these two operations should be chained one
// after another.
SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Scale};
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
DL, OpsHi, MMO, N->getIndexType());
// Split the stored value (operand 1) of an unindexed vector store into two
// stores, one per half, and return a TokenFactor of both. If either half of
// the memory type is not byte-sized, the store is scalarized instead.
SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
assert(N->isUnindexed() && "Indexed store of vector?");
assert(OpNo == 1 && "Can only split the stored value");
SDLoc DL(N);
bool isTruncating = N->isTruncatingStore();
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
EVT MemoryVT = N->getMemoryVT();
Align Alignment = N->getOriginalAlign();
MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
AAMDNodes AAInfo = N->getAAInfo();
SDValue Lo, Hi;
GetSplitVector(N->getOperand(1), Lo, Hi);
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
// Scalarize if the split halves are not byte-sized.
if (!LoMemVT.isByteSized() || !HiMemVT.isByteSized())
return TLI.scalarizeVectorStore(N, DAG);
// Store the Lo half at the original pointer; both halves use the original
// chain, alignment, flags and AA info.
if (isTruncating)
Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), LoMemVT,
Alignment, MMOFlags, AAInfo);
Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), Alignment, MMOFlags,
// IncrementPointer is given the Lo memory type and updates MPI and Ptr, which
// are then used for the Hi store.
MachinePointerInfo MPI;
IncrementPointer(N, LoMemVT, MPI, Ptr);
if (isTruncating)
Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr, MPI,
HiMemVT, Alignment, MMOFlags, AAInfo);
Hi = DAG.getStore(Ch, DL, Hi, Ptr, MPI, Alignment, MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
// Handle a CONCAT_VECTORS whose operands need splitting by rebuilding the
// result as a BUILD_VECTOR of every element extracted from every operand, in
// operand order.
SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
SDLoc DL(N);
// The input operands all must have the same type, and we know the result
// type is valid. Convert this to a buildvector which extracts all the
// input elements.
// TODO: If the input elements are power-two vectors, we could convert this to
// a new CONCAT_VECTORS node with elements that are half-wide.
SmallVector<SDValue, 32> Elts;
EVT EltVT = N->getValueType(0).getVectorElementType();
// One EXTRACT_VECTOR_ELT per element of each operand.
for (const SDValue &Op : N->op_values()) {
for (unsigned i = 0, e = Op.getValueType().getVectorNumElements();
i != e; ++i) {
Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op,
DAG.getVectorIdxConstant(i, DL)));
return DAG.getBuildVector(N->getValueType(0), DL, Elts);
// Split the input of a truncating operation (integer truncate or FP round,
// strict or non-strict) in two steps when a plain split would not help; see
// the detailed description inside the function. Falls back to
// SplitVecOp_UnaryOp in the simple cases.
// Lines marked -/+ below are diff markers in this text.
SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
// The result type is legal, but the input type is illegal. If splitting
// ends up with the result type of each half still being legal, just
// do that. If, however, that would result in an illegal result type,
// we can try to get more clever with power-two vectors. Specifically,
// split the input type, but also widen the result element size, then
// concatenate the halves and truncate again. For example, consider a target
// where v8i8 is legal and v8i32 is not (ARM, which doesn't have 256-bit
// vectors). To perform a "%res = v8i8 trunc v8i32 %in" we do:
// %inlo = v4i32 extract_subvector %in, 0
// %inhi = v4i32 extract_subvector %in, 4
// %lo16 = v4i16 trunc v4i32 %inlo
// %hi16 = v4i16 trunc v4i32 %inhi
// %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16
// %res = v8i8 trunc v8i16 %in16
// Without this transform, the original truncate would end up being
// scalarized, which is pretty much always a last resort.
// For strict FP opcodes operand 0 is the chain and the vector is operand 1.
unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
SDValue InVec = N->getOperand(OpNo);
EVT InVT = InVec->getValueType(0);
EVT OutVT = N->getValueType(0);
- unsigned NumElements = OutVT.getVectorNumElements();
+ ElementCount NumElements = OutVT.getVectorElementCount();
bool IsFloat = OutVT.isFloatingPoint();
- // Widening should have already made sure this is a power-two vector
- // if we're trying to split it at all. assert() that's true, just in case.
- assert(!(NumElements & 1) && "Splitting vector, but not in half!");
unsigned InElementSize = InVT.getScalarSizeInBits();
unsigned OutElementSize = OutVT.getScalarSizeInBits();
// Determine the split output VT. If it's legal we can just split directly.
std::tie(LoOutVT, HiOutVT) = DAG.GetSplitDestVTs(OutVT);
assert(LoOutVT == HiOutVT && "Unequal split?");
// If the input elements are at most twice the width of the result elements,
// just use the normal splitting. Our trick only works if there's room
// to split more than once.
if (isTypeLegal(LoOutVT) ||
InElementSize <= OutElementSize * 2)
return SplitVecOp_UnaryOp(N);
SDLoc DL(N);
// Don't touch if this will be scalarized.
// Keep halving the input type while it still needs splitting, then check
// the type action of what remains.
EVT FinalVT = InVT;
while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
FinalVT = FinalVT.getHalfNumVectorElementsVT(*DAG.getContext());
if (getTypeAction(FinalVT) == TargetLowering::TypeScalarizeVector)
return SplitVecOp_UnaryOp(N);
// Get the split input vector.
SDValue InLoVec, InHiVec;
GetSplitVector(InVec, InLoVec, InHiVec);
// Truncate them to 1/2 the element size.
+ //
+ // This assumes the number of elements is a power of two; any vector that
+ // isn't should be widened, not split.
EVT HalfElementVT = IsFloat ?
EVT::getFloatingPointVT(InElementSize/2) :
EVT::getIntegerVT(*DAG.getContext(), InElementSize/2);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT,
SDValue HalfLo;
SDValue HalfHi;
SDValue Chain;
if (N->isStrictFPOpcode()) {
HalfLo = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other},
{N->getOperand(0), InLoVec});
HalfHi = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other},
{N->getOperand(0), InHiVec});
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, HalfLo.getValue(1),
} else {
HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec);
HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec);
// Concatenate them to get the full intermediate truncation result.
EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements);
SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo,
// Now finish up by truncating all the way down to the original result
// type. This should normally be something that ends up being legal directly,
// but in theory if a target has very wide vectors and an annoyingly
// restricted set of legal types, this split can chain to build things up.
if (N->isStrictFPOpcode()) {
SDValue Res = DAG.getNode(
{Chain, InterVec,
DAG.getTargetConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()))});
// Relink the chain
ReplaceValueWith(SDValue(N, 1), SDValue(Res.getNode(), 1));
return Res;
// Non-strict case: FP_ROUND for floating-point results, TRUNCATE otherwise.
return IsFloat
? DAG.getNode(ISD::FP_ROUND, DL, OutVT, InterVec,
0, DL, TLI.getPointerTy(DAG.getDataLayout())))
: DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec);
// Split both compared operands of a vector SETCC. Each half is compared into
// an i1 vector, the halves are concatenated into a double-width i1 vector,
// and that vector is extended to the node's result type.
SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operand types must be vectors");
// The result has a legal vector type, but the input needs splitting.
SDValue Lo0, Hi0, Lo1, Hi1, LoRes, HiRes;
SDLoc DL(N);
GetSplitVector(N->getOperand(0), Lo0, Hi0);
GetSplitVector(N->getOperand(1), Lo1, Hi1);
// PartResVT: i1 vector with the element count of one half.
// WideResVT: i1 vector with twice that element count.
auto PartEltCnt = Lo0.getValueType().getVectorElementCount();
LLVMContext &Context = *DAG.getContext();
EVT PartResVT = EVT::getVectorVT(Context, MVT::i1, PartEltCnt);
EVT WideResVT = EVT::getVectorVT(Context, MVT::i1, PartEltCnt*2);
// Operand 2 of N is forwarded unchanged to both half comparisons.
LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2));
HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2));
SDValue Con = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideResVT, LoRes, HiRes);
EVT OpVT = N->getOperand(0).getValueType();
// NOTE(review): the expression that selects ExtendCode is not visible in
// this excerpt.
ISD::NodeType ExtendCode =
return DAG.getNode(ExtendCode, DL, N->getValueType(0), Con);
// Split the vector input of an FP_ROUND (or its strict FP variant). Each half
// is rounded separately with the node's extra operand forwarded, and the
// halves are concatenated into the result type. In the strict case the chain
// result (value 1) is replaced by a TokenFactor of both half chains.
SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);
SDValue Lo, Hi;
SDLoc DL(N);
GetSplitVector(N->getOperand(N->isStrictFPOpcode() ? 1 : 0), Lo, Hi);
EVT InVT = Lo.getValueType();
EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
if (N->isStrictFPOpcode()) {
// Strict form: operands are {chain, vector half, operand 2 of N}.
Lo = DAG.getNode(N->getOpcode(), DL, { OutVT, MVT::Other },
{ N->getOperand(0), Lo, N->getOperand(2) });
Hi = DAG.getNode(N->getOpcode(), DL, { OutVT, MVT::Other },
{ N->getOperand(0), Hi, N->getOperand(2) });
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
Lo.getValue(1), Hi.getValue(1));
ReplaceValueWith(SDValue(N, 1), NewChain);
} else {
// Non-strict form: operands are {vector half, operand 1 of N}.
Lo = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Lo, N->getOperand(1));
Hi = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Hi, N->getOperand(1));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
// Handle an FCOPYSIGN whose second input needs splitting by unrolling the
// whole node, using the element count of the result type.
SDValue DAGTypeLegalizer::SplitVecOp_FCOPYSIGN(SDNode *N) {
// The result (and the first input) has a legal vector type, but the second
// input needs splitting.
return DAG.UnrollVectorOp(N, N->getValueType(0).getVectorNumElements());
// Result Vector Widening
// Widen result number ResNo of node N. The target gets the first chance to
// custom-widen the node; otherwise the opcode is dispatched to the matching
// WidenVecRes_* handler, and a non-null result is recorded with
// SetWidenedVector.
// NOTE(review): several case labels, break statements and closing braces of
// this function are not visible in this excerpt.
void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
LLVM_DEBUG(dbgs() << "Widen node result " << ResNo << ": "; N->dump(&DAG);
dbgs() << "\n");
// See if the target wants to custom widen this node.
if (CustomWidenLowerNode(N, N->getValueType(ResNo)))
SDValue Res = SDValue();
switch (N->getOpcode()) {
#ifndef NDEBUG
dbgs() << "WidenVectorResult #" << ResNo << ": ";
dbgs() << "\n";
llvm_unreachable("Do not know how to widen the result of this operator!");
case ISD::MERGE_VALUES: Res = WidenVecRes_MERGE_VALUES(N, ResNo); break;
case ISD::BITCAST: Res = WidenVecRes_BITCAST(N); break;
case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break;
case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break;
case ISD::LOAD: Res = WidenVecRes_LOAD(N); break;
case ISD::SCALAR_TO_VECTOR: Res = WidenVecRes_SCALAR_TO_VECTOR(N); break;
case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break;
case ISD::SELECT: Res = WidenVecRes_SELECT(N); break;
case ISD::SELECT_CC: Res = WidenVecRes_SELECT_CC(N); break;
case ISD::SETCC: Res = WidenVecRes_SETCC(N); break;
case ISD::UNDEF: Res = WidenVecRes_UNDEF(N); break;
Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
case ISD::MLOAD:
Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
Res = WidenVecRes_MGATHER(cast<MaskedGatherSDNode>(N));
// Binary operations handled by WidenVecRes_Binary.
case ISD::ADD:
case ISD::AND:
case ISD::MUL:
case ISD::MULHS:
case ISD::MULHU:
case ISD::OR:
case ISD::SUB:
case ISD::XOR:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
Res = WidenVecRes_Binary(N);
// Binary operations handled by WidenVecRes_BinaryCanTrap.
case ISD::FADD:
case ISD::FMUL:
case ISD::FPOW:
case ISD::FSUB:
case ISD::FDIV:
case ISD::FREM:
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM:
Res = WidenVecRes_BinaryCanTrap(N);
// These are binary operations, but with an extra operand that shouldn't
// be widened (the scale).
Res = WidenVecRes_BinaryWithExtraScalarOp(N);
#include "llvm/IR/ConstrainedOps.def"
Res = WidenVecRes_StrictFP(N);
case ISD::UADDO:
case ISD::SADDO:
case ISD::USUBO:
case ISD::SSUBO:
case ISD::UMULO:
case ISD::SMULO:
Res = WidenVecRes_OverflowOp(N, ResNo);
Res = WidenVecRes_FCOPYSIGN(N);
case ISD::FPOWI:
Res = WidenVecRes_POWI(N);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
Res = WidenVecRes_Shift(N);
Res = WidenVecRes_Convert(N);
case ISD::FABS:
case ISD::FCEIL:
case ISD::FCOS:
case ISD::FEXP:
case ISD::FEXP2:
case ISD::FLOG:
case ISD::FLOG10:
case ISD::FLOG2:
case ISD::FRINT:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC: {
// We're going to widen this vector op to a legal type by padding with undef
// elements. If the wide vector op is eventually going to be expanded to
// scalar libcalls, then unroll into scalar ops now to avoid unnecessary
// libcalls on the undef elements.
EVT VT = N->getValueType(0);
EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
// Unroll when the op is neither legal nor custom on the widened type and is
// marked Expand on the scalar element type.
if (!TLI.isOperationLegalOrCustom(N->getOpcode(), WideVecVT) &&
TLI.isOperationExpand(N->getOpcode(), VT.getScalarType())) {
Res = DAG.UnrollVectorOp(N, WideVecVT.getVectorNumElements());
// If the target has custom/legal support for the scalar FP intrinsic ops
// (they are probably not destined to become libcalls), then widen those like
// any other unary ops.
case ISD::ABS:
case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTPOP:
case ISD::CTTZ:
case ISD::FNEG:
Res = WidenVecRes_Unary(N);
case ISD::FMA:
Res = WidenVecRes_Ternary(N);
// If Res is null, the sub-method took care of registering the result.
if (Res.getNode())
SetWidenedVector(SDValue(N, ResNo), Res);
// Widen a three-operand node: all three operands are replaced by their
// widened vectors and the same opcode is rebuilt on the widened result type.
SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
// Ternary op widening.
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
SDValue InOp3 = GetWidenedVector(N->getOperand(2));
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3);
// Widen a two-operand node: both operands are replaced by their widened
// vectors and the same opcode is rebuilt on the widened result type, keeping
// the node's flags.
SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
// Binary op widening.
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, N->getFlags());
// Widen a node with two vector operands plus a third operand that is passed
// through as-is: operands 0 and 1 are replaced by their widened vectors,
// operand 2 is forwarded unchanged.
SDValue DAGTypeLegalizer::WidenVecRes_BinaryWithExtraScalarOp(SDNode *N) {
// Binary op widening, but with an extra operand that shouldn't be widened.
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
SDValue InOp3 = N->getOperand(2);
// NOTE(review): the trailing argument(s) of this call are not visible in
// this excerpt.
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3,
// Given a vector of operations that have been broken up to widen, see
// if we can collect them together into the next widest legal VT. This
// implementation is trap-safe.
//
// ConcatOps holds the pieces and ConcatEnd is the number of valid entries in
// it. Pieces are merged from the back until the last one has type MaxVT; the
// list is then padded with undef MaxVT values and concatenated into WidenVT.
// The incoming VT argument is overwritten before it is read in the code
// visible here.
static SDValue CollectOpsToWiden(SelectionDAG &DAG, const TargetLowering &TLI,
SmallVectorImpl<SDValue> &ConcatOps,
unsigned ConcatEnd, EVT VT, EVT MaxVT,
EVT WidenVT) {
// Check to see if we have a single operation with the widen type.
if (ConcatEnd == 1) {
VT = ConcatOps[0].getValueType();
if (VT == WidenVT)
return ConcatOps[0];
SDLoc dl(ConcatOps[0]);
EVT WidenEltVT = WidenVT.getVectorElementType();
// while (Some element of ConcatOps is not of type MaxVT) {
// From the end of ConcatOps, collect elements of the same type and put
// them into an op of the next larger supported type
// }
while (ConcatOps[ConcatEnd-1].getValueType() != MaxVT) {
// Walk backwards over the trailing run of entries that share the type VT.
int Idx = ConcatEnd - 1;
VT = ConcatOps[Idx--].getValueType();
while (Idx >= 0 && ConcatOps[Idx].getValueType() == VT)
// Keep doubling the element count until the resulting vector type is legal.
int NextSize = VT.isVector() ? VT.getVectorNumElements() : 1;
do {
NextSize *= 2;
NextVT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NextSize);
} while (!TLI.isTypeLegal(NextVT));
if (!VT.isVector()) {
// Scalar type, create an INSERT_VECTOR_ELEMENT of type NextVT
SDValue VecOp = DAG.getUNDEF(NextVT);
unsigned NumToInsert = ConcatEnd - Idx - 1;
for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) {
VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp,
ConcatOps[OpIdx], DAG.getVectorIdxConstant(i, dl));
// The merged vector replaces the first scalar of the run; the list is
// shortened to end just after it.
ConcatOps[Idx+1] = VecOp;
ConcatEnd = Idx + 2;
} else {
// Vector type, create a CONCAT_VECTORS of type NextVT
SDValue undefVec = DAG.getUNDEF(VT);
unsigned OpsToConcat = NextSize/VT.getVectorNumElements();
SmallVector<SDValue, 16> SubConcatOps(OpsToConcat);
unsigned RealVals = ConcatEnd - Idx - 1;
unsigned SubConcatEnd = 0;
unsigned SubConcatIdx = Idx + 1;
// Copy the real pieces first, then pad with undef vectors of type VT.
while (SubConcatEnd < RealVals)
SubConcatOps[SubConcatEnd++] = ConcatOps[++Idx];
while (SubConcatEnd < OpsToConcat)
SubConcatOps[SubConcatEnd++] = undefVec;
ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl,
NextVT, SubConcatOps);
ConcatEnd = SubConcatIdx + 1;
// Check to see if we have a single operation with the widen type.
if (ConcatEnd == 1) {
VT = ConcatOps[0].getValueType();
if (VT == WidenVT)
return ConcatOps[0];
// add undefs of size MaxVT until ConcatOps grows to length of WidenVT
unsigned NumOps = WidenVT.getVectorNumElements()/MaxVT.getVectorNumElements();
if (NumOps != ConcatEnd ) {
SDValue UndefVal = DAG.getUNDEF(MaxVT);
for (unsigned j = ConcatEnd; j < NumOps; ++j)
ConcatOps[j] = UndefVal;
// NOTE(review): the first argument of makeArrayRef is not visible in this
// excerpt.
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
makeArrayRef(, NumOps));
// Widen a binary operation that may trap. If the operation cannot trap on the
// largest legal vector type found, it is widened like an ordinary binary op.
// If no legal vector type exists, the node is unrolled. Otherwise the
// operation is applied only to the original (unwidened) element count, in
// legal-sized chunks and finally scalars, and the pieces are reassembled with
// CollectOpsToWiden.
SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
// Binary op widening for operations that can trap.
unsigned Opcode = N->getOpcode();
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
EVT WidenEltVT = WidenVT.getVectorElementType();
EVT VT = WidenVT;
unsigned NumElts = VT.getVectorNumElements();
const SDNodeFlags Flags = N->getFlags();
// Halve the element count until the vector type is legal or one element
// remains.
while (!TLI.isTypeLegal(VT) && NumElts != 1) {
NumElts = NumElts / 2;
VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
if (NumElts != 1 && !TLI.canOpTrap(N->getOpcode(), VT)) {
// Operation doesn't trap so just widen as normal.
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, Flags);
// No legal vector version so unroll the vector operation and then widen.
if (NumElts == 1)
return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
// Since the operation can trap, apply operation on the original vector.
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
// CurNumElts counts the elements of the original result type that still
// have to be processed.
unsigned CurNumElts = N->getValueType(0).getVectorNumElements();
SmallVector<SDValue, 16> ConcatOps(CurNumElts);
unsigned ConcatEnd = 0; // Current ConcatOps index.
int Idx = 0; // Current Idx into input vectors.
// NumElts := greatest legal vector size (at most WidenVT)
// while (orig. vector has unhandled elements) {
// take munches of size NumElts from the beginning and add to ConcatOps
// NumElts := next smaller supported vector size or 1
// }
while (CurNumElts != 0) {
while (CurNumElts >= NumElts) {
SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1,
DAG.getVectorIdxConstant(Idx, dl));
SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2,
DAG.getVectorIdxConstant(Idx, dl));
ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2, Flags);
Idx += NumElts;
CurNumElts -= NumElts;
// Step down to the next smaller legal vector type (or a single element).
do {
NumElts = NumElts / 2;
VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
} while (!TLI.isTypeLegal(VT) && NumElts != 1);
// Remaining elements are processed one at a time as scalars.
if (NumElts == 1) {
for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) {
SDValue EOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
InOp1, DAG.getVectorIdxConstant(Idx, dl));
SDValue EOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
InOp2, DAG.getVectorIdxConstant(Idx, dl));
ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT,
EOp1, EOp2, Flags);
CurNumElts = 0;
// NOTE(review): the declaration of MaxVT is not visible in this excerpt.
return CollectOpsToWiden(DAG, TLI, ConcatOps, ConcatEnd, VT, MaxVT, WidenVT);
/// Widen the result of a strict floating-point vector operation.
///
/// Some opcodes are forwarded to WidenVecRes_STRICT_FSETCC() or
/// WidenVecRes_Convert_StrictFP(). For the rest, the widened type is halved
/// until it is legal or one element remains. With one element left the node is
/// unrolled via UnrollVectorOp_StrictFP(); otherwise the operation is applied
/// piecewise to the original number of elements, result 1 of N is replaced by
/// a combined chain, and the pieces are handed to CollectOpsToWiden().
SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) {
// NOTE(review): the case labels of this switch are not visible in this
// extract; only the two dispatch targets remain.
switch (N->getOpcode()) {
return WidenVecRes_STRICT_FSETCC(N);
return WidenVecRes_Convert_StrictFP(N);
// StrictFP op widening for operations that can trap.
unsigned NumOpers = N->getNumOperands();
unsigned Opcode = N->getOpcode();
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
EVT WidenEltVT = WidenVT.getVectorElementType();
EVT VT = WidenVT;
unsigned NumElts = VT.getVectorNumElements();
// Halve the element count until VT is legal or a single element is left.
while (!TLI.isTypeLegal(VT) && NumElts != 1) {
NumElts = NumElts / 2;
VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
// No legal vector version so unroll the vector operation and then widen.
if (NumElts == 1)
return UnrollVectorOp_StrictFP(N, WidenVT.getVectorNumElements());
// Since the operation can trap, apply operation on the original vector.
SmallVector<SDValue, 4> InOps;
unsigned CurNumElts = N->getValueType(0).getVectorNumElements();
SmallVector<SDValue, 16> ConcatOps(CurNumElts);
SmallVector<SDValue, 16> Chains;
unsigned ConcatEnd = 0; // Current ConcatOps index.
int Idx = 0; // Current Idx into input vectors.
// The Chain is the first operand.
// Now process the remaining operands.
// Vector operands must have the node's result type and are widened.
// NOTE(review): the statements that fill InOps and Chains are not visible in
// this extract.
for (unsigned i = 1; i < NumOpers; ++i) {
SDValue Oper = N->getOperand(i);
if (Oper.getValueType().isVector()) {
assert(Oper.getValueType() == N->getValueType(0) &&
"Invalid operand type to widen!");
Oper = GetWidenedVector(Oper);
// NumElts := greatest legal vector size (at most WidenVT)
// while (orig. vector has unhandled elements) {
// take munches of size NumElts from the beginning and add to ConcatOps
// NumElts := next smaller supported vector size or 1
// }
while (CurNumElts != 0) {
// Consume as many VT-sized chunks as still fit in the remaining elements.
while (CurNumElts >= NumElts) {
SmallVector<SDValue, 4> EOps;
for (unsigned i = 0; i < NumOpers; ++i) {
SDValue Op = InOps[i];
// NOTE(review): the first line of the statement guarded by this `if` is
// missing from this extract; only its trailing argument line remains.
if (Op.getValueType().isVector())
DAG.getVectorIdxConstant(Idx, dl));
// Each piece yields two results: a value of type VT and an MVT::Other result.
EVT OperVT[] = {VT, MVT::Other};
SDValue Oper = DAG.getNode(Opcode, dl, OperVT, EOps);
ConcatOps[ConcatEnd++] = Oper;
Idx += NumElts;
CurNumElts -= NumElts;
// Step down to the next smaller legal vector type, or to one element.
do {
NumElts = NumElts / 2;
VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
} while (!TLI.isTypeLegal(VT) && NumElts != 1);
// No smaller legal vector type: handle the remaining elements as scalars.
if (NumElts == 1) {
for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) {
SmallVector<SDValue, 4> EOps;
for (unsigned i = 0; i < NumOpers; ++i) {
SDValue Op = InOps[i];
if (Op.getValueType().isVector())
Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, Op,
DAG.getVectorIdxConstant(Idx, dl));
// Note: this local array reuses the name WidenVT, hiding the function-level
// WidenVT while it is in scope.
EVT WidenVT[] = {WidenEltVT, MVT::Other};
SDValue Oper = DAG.getNode(Opcode, dl, WidenVT, EOps);
ConcatOps[ConcatEnd++] = Oper;
CurNumElts = 0;
// Build a factor node to remember all the Ops that have been created.
// NOTE(review): as shown, the TokenFactor assignment is not guarded by an
// `else`; that line was presumably lost from this extract - confirm upstream.
SDValue NewChain;
if (Chains.size() == 1)
NewChain = Chains[0];
NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
// Redirect users of N's result 1 to the new chain.
ReplaceValueWith(SDValue(N, 1), NewChain);
// NOTE(review): MaxVT is not declared in the lines visible here - confirm
// upstream.
return CollectOpsToWiden(DAG, TLI, ConcatOps, ConcatEnd, VT, MaxVT, WidenVT);
/// Widen one result of a node that produces two vector results (result 0 of
/// type ResVT and result 1 of type OvVT).
///
/// \param N     The two-result node being legalized.
/// \param ResNo Index (0 or 1) of the result that triggered widening.
/// \returns Result ResNo of the newly created wide node. The other result of
///          N is either registered as widened or replaced by an
///          EXTRACT_SUBVECTOR of the wide node.
SDValue DAGTypeLegalizer::WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo) {
SDLoc DL(N);
EVT ResVT = N->getValueType(0);
EVT OvVT = N->getValueType(1);
EVT WideResVT, WideOvVT;
SDValue WideLHS, WideRHS;
// TODO: This might result in a widen/split loop.
// NOTE(review): several continuation lines in the two branches below (the
// element-count argument of getVectorVT and the opcode/type arguments of
// getNode) are missing from this extract.
if (ResNo == 0) {
// Result 0 drives the widening: its wide type comes from the target, the
// type of result 1 is built with EVT::getVectorVT, and the operands are
// widened directly.
WideResVT = TLI.getTypeToTransformTo(*DAG.getContext(), ResVT);
WideOvVT = EVT::getVectorVT(
*DAG.getContext(), OvVT.getVectorElementType(),
WideLHS = GetWidenedVector(N->getOperand(0));
WideRHS = GetWidenedVector(N->getOperand(1));
} else {
// Result 1 drives the widening: its wide type comes from the target, the
// type of result 0 is built with EVT::getVectorVT, and wide operands are
// built with DAG.getNode from the original operands and a zero index.
WideOvVT = TLI.getTypeToTransformTo(*DAG.getContext(), OvVT);
WideResVT = EVT::getVectorVT(
*DAG.getContext(), ResVT.getVectorElementType(),
SDValue Zero = DAG.getVectorIdxConstant(0, DL);
WideLHS = DAG.getNode(
N->getOperand(0), Zero);
WideRHS = DAG.getNode(
N->getOperand(1), Zero);
SDVTList WideVTs = DAG.getVTList(WideResVT, WideOvVT);
SDNode *WideNode = DAG.getNode(
N->getOpcode(), DL, WideVTs, WideLHS, WideRHS).getNode();
// Replace the other vector result not being explicitly widened here.
unsigned OtherNo = 1 - ResNo;
EVT OtherVT = N->getValueType(OtherNo);
if (getTypeAction(OtherVT) == TargetLowering::TypeWidenVector) {
// The other result is widened too: record the wide node's result for it.
SetWidenedVector(SDValue(N, OtherNo), SDValue(WideNode, OtherNo));
} else {
// Otherwise extract a subvector of the original type starting at index 0.
SDValue Zero = DAG.getVectorIdxConstant(0, DL);
SDValue OtherVal = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, DL, OtherVT, SDValue(WideNode, OtherNo), Zero);
ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
return SDValue(WideNode, ResNo);
/// Widen the result of a vector conversion whose input and result vector
/// types differ.
///
/// Strategies, in the order they appear below:
///  - the input is itself widened and ends up with the same element count as
///    the widened result: convert directly;
///  - the widened input has the same bit size as the widened result: use the
///    *_EXTEND_VECTOR_INREG form for extends;
///  - the input type with the widened element count is legal: pad the input
///    with undef vectors (CONCAT_VECTORS) or take its leading subvector
///    (EXTRACT_SUBVECTOR) and convert that;
///  - otherwise: convert the original elements one by one and build a vector.
/// A node with more than one operand has operand 1 and the node flags
/// forwarded to the new node(s).
SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
SDValue InOp = N->getOperand(0);
SDLoc DL(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
EVT InVT = InOp.getValueType();
EVT InEltVT = InVT.getVectorElementType();
// Input element type combined with the widened result's element count.
EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts);
unsigned Opcode = N->getOpcode();
unsigned InVTNumElts = InVT.getVectorNumElements();
const SDNodeFlags Flags = N->getFlags();
if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
// The input is widened as well; continue with its widened form.
InOp = GetWidenedVector(N->getOperand(0));
InVT = InOp.getValueType();
InVTNumElts = InVT.getVectorNumElements();
if (InVTNumElts == WidenNumElts) {
if (N->getNumOperands() == 1)
return DAG.getNode(Opcode, DL, WidenVT, InOp);
return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags);
if (WidenVT.getSizeInBits() == InVT.getSizeInBits()) {
// If both input and result vector types are of same width, extend
// operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which
// accepts fewer elements in the result than in the input.
// NOTE(review): the return statements for the SIGN_EXTEND and ZERO_EXTEND
// cases are missing from this extract.
if (Opcode == ISD::ANY_EXTEND)
return DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, WidenVT, InOp);
if (Opcode == ISD::SIGN_EXTEND)
if (Opcode == ISD::ZERO_EXTEND)
if (TLI.isTypeLegal(InWidenVT)) {
// Because the result and the input are different vector types, widening
// the result could create a legal type but widening the input might make
// it an illegal type that might lead to repeatedly splitting the input
// and then widening it. To avoid this, we widen the input only if
// it results in a legal type.
if (WidenNumElts % InVTNumElts == 0) {
// Widen the input and call convert on the widened input vector.
unsigned NumConcat = WidenNumElts/InVTNumElts;
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
Ops[0] = InOp;
SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops);
if (N->getNumOperands() == 1)
return DAG.getNode(Opcode, DL, WidenVT, InVec);
return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1), Flags);
if (InVTNumElts % WidenNumElts == 0) {
SDValue InVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InWidenVT, InOp,
DAG.getVectorIdxConstant(0, DL));
// Extract the input and convert the shortened input vector.
if (N->getNumOperands() == 1)
return DAG.getNode(Opcode, DL, WidenVT, InVal);
return DAG.getNode(Opcode, DL, WidenVT, InVal, N->getOperand(1), Flags);
// Otherwise unroll into some nasty scalar code and rebuild the vector.
EVT EltVT = WidenVT.getVectorElementType();
// Elements past the original count stay undef.
SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
// Use the original element count so we don't do more scalar ops than
// necessary.
unsigned MinElts = N->getValueType(0).getVectorNumElements();
// NOTE(review): the line that defines Val (the extracted input element) is
// truncated in this extract; only its trailing argument line remains.
for (unsigned i=0; i < MinElts; ++i) {
DAG.getVectorIdxConstant(i, DL));
if (N->getNumOperands() == 1)
Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val);
Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1), Flags);
return DAG.getBuildVector(WidenVT, DL, Ops);
/// Widen the result of a strict floating-point conversion by unrolling it.
///
/// The vector input is operand 1 of N. Each original element is extracted and
/// converted by a scalar node with result types {element type, MVT::Other}.
/// Result 1 of N is replaced by a TokenFactor, and the scalar results are
/// assembled into a BUILD_VECTOR of the widened type whose extra elements are
/// undef.
SDValue DAGTypeLegalizer::WidenVecRes_Convert_StrictFP(SDNode *N) {
SDValue InOp = N->getOperand(1);
SDLoc DL(N);
// Copy of N's operand list; slot 1 is overwritten with each scalar element.
SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
EVT InVT = InOp.getValueType();
EVT InEltVT = InVT.getVectorElementType();
unsigned Opcode = N->getOpcode();
// FIXME: Optimizations need to be implemented here.
// Otherwise unroll into some nasty scalar code and rebuild the vector.
EVT EltVT = WidenVT.getVectorElementType();
std::array<EVT, 2> EltVTs = {{EltVT, MVT::Other}};
SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
SmallVector<SDValue, 32> OpChains;
// Use the original element count so we don't do more scalar ops than
// necessary.
unsigned MinElts = N->getValueType(0).getVectorNumElements();
// NOTE(review): no statement appending to OpChains is visible in this loop;
// it was presumably dropped from this extract - confirm upstream.
for (unsigned i=0; i < MinElts; ++i) {
NewOps[1] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
DAG.getVectorIdxConstant(i, DL));
Ops[i] = DAG.getNode(Opcode, DL, EltVTs, NewOps);
// Merge the collected chains and redirect users of N's result 1 to them.
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OpChains);
ReplaceValueWith(SDValue(N, 1), NewChain);
return DAG.getBuildVector(WidenVT, DL, Ops);
/// Widen the result of an *_EXTEND_VECTOR_INREG node.
///
/// If the input is itself widened and its widened type has the same bit size
/// as the widened result, the node is re-emitted on the wide types. Otherwise
/// the leading elements are extracted, extended as scalars with the matching
/// ANY/SIGN/ZERO_EXTEND, and rebuilt into a vector of the widened type.
SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
unsigned Opcode = N->getOpcode();
SDValue InOp = N->getOperand(0);
SDLoc DL(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
EVT WidenSVT = WidenVT.getVectorElementType();
unsigned WidenNumElts = WidenVT.getVectorNumElements();
EVT InVT = InOp.getValueType();
EVT InSVT = InVT.getVectorElementType();
// Element count of the input as it was before any widening below.
unsigned InVTNumElts = InVT.getVectorNumElements();
if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
InOp = GetWidenedVector(InOp);
InVT = InOp.getValueType();
if (InVT.getSizeInBits() == WidenVT.getSizeInBits()) {
// NOTE(review): the case labels of this switch are missing from this
// extract; only the shared return statement remains.
switch (Opcode) {
return DAG.getNode(Opcode, DL, WidenVT, InOp);
// Unroll, extend the scalars and rebuild the vector.
SmallVector<SDValue, 16> Ops;
// NOTE(review): the line defining Val (the extracted element) is truncated
// in this extract, as are the case labels and push_back of the loop body.
for (unsigned i = 0, e = std::min(InVTNumElts, WidenNumElts); i != e; ++i) {
DAG.getVectorIdxConstant(i, DL));
switch (Opcode) {
Val = DAG.getNode(ISD::ANY_EXTEND, DL, WidenSVT, Val);
Val = DAG.getNode(ISD::SIGN_EXTEND, DL, WidenSVT, Val);
Val = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenSVT, Val);
llvm_unreachable("A *_EXTEND_VECTOR_INREG node was expected");
// Loop until Ops holds WidenNumElts entries.
// NOTE(review): the loop body is missing from this extract.
while (Ops.size() != WidenNumElts)
return DAG.getBuildVector(WidenVT, DL, Ops);
/// Widen the result of an FCOPYSIGN node.
///
/// When both operands have the same type the node is handled by
/// WidenVecRes_BinaryCanTrap(); otherwise it is unrolled to the widened
/// element count.
SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) {
// If this is an FCOPYSIGN with same input types, we can treat it as a
// normal (can trap) binary op.
if (N->getOperand(0).getValueType() == N->getOperand(1).getValueType())
return WidenVecRes_BinaryCanTrap(N);
// If the types are different, fall back to unrolling.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
/// Widen the result of a POWI-style node: operand 0 is widened, operand 1 is
/// passed through unchanged, and the same opcode is emitted on the widened
/// type.
SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp = GetWidenedVector(N->getOperand(0));
// Operand 1 is reused as-is (named ShOp here, it is not widened).
SDValue ShOp = N->getOperand(1);
return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp);
/// Widen the result of a vector shift.
///
/// Operand 0 is widened. Operand 1 (the shift amount) is widened if its type
/// is legalized by widening, and is then converted to ShWidenVT with
/// ModifyToType() if its type still differs from it.
SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp = GetWidenedVector(N->getOperand(0));
SDValue ShOp = N->getOperand(1);
EVT ShVT = ShOp.getValueType();
if (getTypeAction(ShVT) == TargetLowering::TypeWidenVector) {
ShOp = GetWidenedVector(ShOp);
ShVT = ShOp.getValueType();
// NOTE(review): the remaining arguments of this getVectorVT call are missing
// from this extract.
EVT ShWidenVT = EVT::getVectorVT(*DAG.getContext(),
if (ShVT != ShWidenVT)
ShOp = ModifyToType(ShOp, ShWidenVT);
return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp);
/// Widen the result of a single-operand vector node by widening its operand
/// and emitting the same opcode on the widened type.
SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) {
// Unary op widening.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp = GetWidenedVector(N->getOperand(0));
return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp);
/// Widen the result of a node whose second operand is a value type: operand 0
/// is widened and the node is re-emitted on the widened type with
/// DAG.getValueType(ExtVT) as the second operand.
SDValue DAGTypeLegalizer::WidenVecRes_InregOp(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
// NOTE(review): the remaining arguments of this getVectorVT call are missing
// from this extract.
EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
SDValue WidenLHS = GetWidenedVector(N->getOperand(0));
return DAG.getNode(N->getOpcode(), SDLoc(N),
WidenVT, WidenLHS, DAG.getValueType(ExtVT));
/// Widen result ResNo of a MERGE_VALUES node: obtain the value via
/// DisintegrateMERGE_VALUES() and return its widened form.
SDValue DAGTypeLegalizer::WidenVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) {
SDValue WidenVec = DisintegrateMERGE_VALUES(N, ResNo);
return GetWidenedVector(WidenVec);
/// Widen the result of a BITCAST.
///
/// Depending on how the input type is legalized, the input is first promoted
/// or widened; if the legalized input then has the same bit width as the
/// widened result, a single BITCAST is emitted. Otherwise, if the widened size
/// is a multiple of the input size and a legal vector type of the widened size
/// can be formed from the input, the input is padded (CONCAT_VECTORS with
/// undef, or SCALAR_TO_VECTOR) and bitcast. As a last resort the conversion
/// goes through CreateStackStoreLoad().
SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
SDValue InOp = N->getOperand(0);
EVT InVT = InOp.getValueType();
EVT VT = N->getValueType(0);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
SDLoc dl(N);
// NOTE(review): the `break;` lines that separate the cases of this switch
// are missing from this extract.
switch (getTypeAction(InVT)) {
case TargetLowering::TypeLegal:
case TargetLowering::TypeScalarizeScalableVector:
report_fatal_error("Scalarization of scalable vectors is not supported.");
case TargetLowering::TypePromoteInteger: {
// If the incoming type is a vector that is being promoted, then
// we know that the elements are arranged differently and that we
// must perform the conversion using a stack slot.
if (InVT.isVector())
// If the InOp is promoted to the same size, convert it. Otherwise,
// fall out of the switch and widen the promoted input.
SDValue NInOp = GetPromotedInteger(InOp);
EVT NInVT = NInOp.getValueType();
if (WidenVT.bitsEq(NInVT)) {
// For big endian targets we need to shift the input integer or the
// interesting bits will end up at the wrong place.
if (DAG.getDataLayout().isBigEndian()) {
// Shift left by the number of bits added by the promotion.
unsigned ShiftAmt = NInVT.getSizeInBits() - InVT.getSizeInBits();
EVT ShiftAmtTy = TLI.getShiftAmountTy(NInVT, DAG.getDataLayout());
assert(ShiftAmt < WidenVT.getSizeInBits() && "Too large shift amount!");
NInOp = DAG.getNode(ISD::SHL, dl, NInVT, NInOp,
DAG.getConstant(ShiftAmt, dl, ShiftAmtTy));
return DAG.getNode(ISD::BITCAST, dl, WidenVT, NInOp);
InOp = NInOp;
case TargetLowering::TypeSoftenFloat:
case TargetLowering::TypePromoteFloat:
case TargetLowering::TypeSoftPromoteHalf:
case TargetLowering::TypeExpandInteger:
case TargetLowering::TypeExpandFloat:
case TargetLowering::TypeScalarizeVector:
case TargetLowering::TypeSplitVector:
case TargetLowering::TypeWidenVector:
// If the InOp is widened to the same size, convert it. Otherwise, fall
// out of the switch and widen the widened input.
InOp = GetWidenedVector(InOp);
InVT = InOp.getValueType();
if (WidenVT.bitsEq(InVT))
// The input widens to the same size. Convert to the widen value.
return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp);
unsigned WidenSize = WidenVT.getSizeInBits();
unsigned InSize = InVT.getSizeInBits();
// x86mmx is not an acceptable vector element type, so don't try.
if (WidenSize % InSize == 0 && InVT != MVT::x86mmx) {
// Determine new input vector type. The new input vector type will use
// the same element type (if it's a vector) or use the input type as a
// vector. It is the same size as the type to widen to.
// NOTE(review): the declaration of NewInVT is missing from this extract.
unsigned NewNumElts = WidenSize / InSize;
if (InVT.isVector()) {
EVT InEltVT = InVT.getVectorElementType();
NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT,
WidenSize / InEltVT.getSizeInBits());
} else {
NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts);
if (TLI.isTypeLegal(NewInVT)) {
SDValue NewVec;
if (InVT.isVector()) {
// Because the result and the input are different vector types, widening
// the result could create a legal type but widening the input might make
// it an illegal type that might lead to repeatedly splitting the input
// and then widening it. To avoid this, we widen the input only if
// it results in a legal type.
SmallVector<SDValue, 16> Ops(NewNumElts, DAG.getUNDEF(InVT));
Ops[0] = InOp;
NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
} else {
// Scalar input: place it in a vector of the widened size.
NewVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewInVT, InOp);
return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec);
// No cheaper strategy applied; fall back to CreateStackStoreLoad().
return CreateStackStoreLoad(InOp, WidenVT);
/// Widen the result of a BUILD_VECTOR by appending undef operands until the
/// operand count equals the widened element count.
SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
SDLoc dl(N);
// Build a vector with undefined for the new nodes.
EVT VT = N->getValueType(0);
// Integer BUILD_VECTOR operands may be larger than the node's vector element
// type. The UNDEFs need to have the same type as the existing operands.
EVT EltVT = N->getOperand(0).getValueType();
unsigned NumElts = VT.getVectorNumElements();
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
unsigned WidenNumElts = WidenVT.getVectorNumElements();
// Start from the original operands, then pad with undef of the operand type.
SmallVector<SDValue, 16> NewOps(N->op_begin(), N->op_end());
assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT));
return DAG.getBuildVector(WidenVT, dl, NewOps);
/// Widen the result of a CONCAT_VECTORS node.
///
/// Strategies, in the order they appear below:
///  - inputs are not widened and the widened element count is a multiple of
///    the input element count: append undef input vectors;
///  - inputs widen to the same type as the result and every operand after the
///    first is undef: return the widened first operand;
///  - two operands: build a vector shuffle;
///  - otherwise: extract every element and build a vector.
///
/// Lines prefixed with `-` / `+` are unified-diff markers from the patch this
/// text was taken from: the `+` lines compute the element counts with
/// getVectorMinNumElements() in the undef-padding path and assert that the
/// widened type is not scalable before the shuffle and build-vector paths.
SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
EVT InVT = N->getOperand(0).getValueType();
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDLoc dl(N);
- unsigned WidenNumElts = WidenVT.getVectorNumElements();
- unsigned NumInElts = InVT.getVectorNumElements();
unsigned NumOperands = N->getNumOperands();
bool InputWidened = false; // Indicates we need to widen the input.
if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) {
- if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) {
+ unsigned WidenNumElts = WidenVT.getVectorMinNumElements();
+ unsigned NumInElts = InVT.getVectorMinNumElements();
+ if (WidenNumElts % NumInElts == 0) {
// Add undef vectors to widen to correct length.
- unsigned NumConcat = WidenVT.getVectorNumElements() /
- InVT.getVectorNumElements();
+ unsigned NumConcat = WidenNumElts / NumInElts;
SDValue UndefVal = DAG.getUNDEF(InVT);
SmallVector<SDValue, 16> Ops(NumConcat);
for (unsigned i=0; i < NumOperands; ++i)
Ops[i] = N->getOperand(i);
for (unsigned i = NumOperands; i != NumConcat; ++i)
Ops[i] = UndefVal;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Ops);
} else {
InputWidened = true;
if (WidenVT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) {
// The inputs and the result are widened to the same value.
// Scan the operands after the first for a non-undef one.
// NOTE(review): the statement guarded by the isUndef() test (presumably a
// `break;`) is missing from this extract.
unsigned i;
for (i=1; i < NumOperands; ++i)
if (!N->getOperand(i).isUndef())
if (i == NumOperands)
// Everything but the first operand is an UNDEF so just return the
// widened first operand.
return GetWidenedVector(N->getOperand(0));
if (NumOperands == 2) {
+ assert(!WidenVT.isScalableVector() &&
+ "Cannot use vector shuffles to widen CONCAT_VECTOR result");
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ unsigned NumInElts = InVT.getVectorNumElements();
// Replace concat of two operands with a shuffle.
// Mask entries default to -1; the first NumInElts lanes select indices
// 0..NumInElts-1 and the next NumInElts lanes select indices starting at
// WidenNumElts.
SmallVector<int, 16> MaskOps(WidenNumElts, -1);
for (unsigned i = 0; i < NumInElts; ++i) {
MaskOps[i] = i;
MaskOps[i + NumInElts] = i + WidenNumElts;
// NOTE(review): the remaining arguments of this getVectorShuffle call are
// missing from this extract.
return DAG.getVectorShuffle(WidenVT, dl,
+ assert(!WidenVT.isScalableVector() &&
+ "Cannot use build vectors to widen CONCAT_VECTOR result");
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ unsigned NumInElts = InVT.getVectorNumElements();
// Fall back to use extracts and build vector.
EVT EltVT = WidenVT.getVectorElementType();
SmallVector<SDValue, 16> Ops(WidenNumElts);
unsigned Idx = 0;
for (unsigned i=0; i < NumOperands; ++i) {
SDValue InOp = N->getOperand(i);
if (InputWidened)
InOp = GetWidenedVector(InOp);
// Only the first NumInElts (original) elements of each input are copied.
for (unsigned j = 0; j < NumInElts; ++j)
Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getVectorIdxConstant(j, dl));
// Fill the remaining lanes with undef.
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; Idx < WidenNumElts; ++Idx)
Ops[Idx] = UndefVal;
return DAG.getBuildVector(WidenVT, dl, Ops);
/// Widen the result of an EXTRACT_SUBVECTOR node.
///
/// The (possibly widened) input is returned directly when the index is zero
/// and its type already equals the widened result type. A wider
/// EXTRACT_SUBVECTOR is emitted when the index is a multiple of the widened
/// element count and the wide extract stays inside the input. Otherwise the
/// original elements are extracted one at a time and padded with undef.
SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
EVT VT = N->getValueType(0);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SDValue InOp = N->getOperand(0);
SDValue Idx = N->getOperand(1);
SDLoc dl(N);
if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
InOp = GetWidenedVector(InOp);
EVT InVT = InOp.getValueType();
// Check if we can just return the input vector after widening.
// The index operand is cast to ConstantSDNode, i.e. it must be a constant.
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0 && InVT == WidenVT)
return InOp;
// Check if we can extract from the vector.
// NOTE(review): the bound uses a strict `<`, so a wide extract that ends
// exactly at the last input element takes the element-wise path below;
// confirm whether `<=` was intended.
unsigned InNumElts = InVT.getVectorNumElements();
if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts < InNumElts)
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, WidenVT, InOp, Idx);
// We could try widening the input to the right length but for now, extract
// the original elements, fill the rest with undefs and build a vector.
SmallVector<SDValue, 16> Ops(WidenNumElts);
EVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
unsigned i;
for (i = 0; i < NumElts; ++i)
Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getVectorIdxConstant(IdxVal + i, dl));
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; i < WidenNumElts; ++i)
Ops[i] = UndefVal;
return DAG.getBuildVector(WidenVT, dl, Ops);
/// Widen the result of INSERT_VECTOR_ELT: widen the vector operand and
/// re-emit the insert on the widened type with operands 1 and 2 unchanged.
SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
SDValue InOp = GetWidenedVector(N->getOperand(0));
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N),
InOp.getValueType(), InOp,
N->getOperand(1), N->getOperand(2));
/// Widen the result of a vector load.
///
/// Loads whose memory type is not byte-sized are scalarized with
/// TLI.scalarizeVectorLoad(); both results of the load are replaced and an
/// empty SDValue is returned. Otherwise the load is expanded by
/// GenWidenVectorExtLoads() or GenWidenVectorLoads(), and result 1 of N is
/// replaced by the new chain.
SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::LoadExtType ExtType = LD->getExtensionType();
// A vector must always be stored in memory as-is, i.e. without any padding
// between the elements, since various code depend on it, e.g. in the
// handling of a bitcast of a vector type to int, which may be done with a
// vector store followed by an integer load. A vector that does not have
// elements that are byte-sized must therefore be stored as an integer
// built out of the extracted vector elements.
if (!LD->getMemoryVT().isByteSized()) {
SDValue Value, NewChain;
std::tie(Value, NewChain) = TLI.scalarizeVectorLoad(LD, DAG);
ReplaceValueWith(SDValue(LD, 0), Value);
ReplaceValueWith(SDValue(LD, 1), NewChain);
// Both results were replaced above, so there is no widened value to return.
return SDValue();
SDValue Result;
SmallVector<SDValue, 16> LdChain; // Chain for the series of load
// NOTE(review): as shown, no `else` separates the two assignments to Result;
// that line was presumably lost from this extract - confirm upstream.
if (ExtType != ISD::NON_EXTLOAD)
Result = GenWidenVectorExtLoads(LdChain, LD, ExtType);
Result = GenWidenVectorLoads(LdChain, LD);
// If we generate a single load, we can use that for the chain. Otherwise,
// build a factor node to remember the multiple loads are independent and
// chain to that.
// NOTE(review): as above, an `else` before the TokenFactor assignment is
// presumably missing from this extract.
SDValue NewChain;
if (LdChain.size() == 1)
NewChain = LdChain[0];
NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain);
// Modified the chain - switch anything that used the old chain to use
// the new one.
ReplaceValueWith(SDValue(N, 1), NewChain);
return Result;
/// Widen the result of a masked load.
///
/// The pass-through value is widened and the mask is converted to WideMaskVT
/// with ModifyToType(). A new masked load of the widened type is created with
/// the original chain, base pointer, offset, memory type, memory operand,
/// addressing mode, extension type and expanding flag. Result 1 of N is
/// replaced by result 1 of the new load.
SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0));
SDValue Mask = N->getMask();
EVT MaskVT = Mask.getValueType();
SDValue PassThru = GetWidenedVector(N->getPassThru());
ISD::LoadExtType ExtType = N->getExtensionType();
SDLoc dl(N);
// The mask should be widened as well
// NOTE(review): the remaining arguments of this getVectorVT call are missing
// from this extract.
EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
Mask = ModifyToType(Mask, WideMaskVT, true);
SDValue Res = DAG.getMaskedLoad(
WidenVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
ExtType, N->isExpandingLoad());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
return Res;
/// Widen the result of a masked gather.
///
/// The pass-through value is widened, and the mask and index operands are
/// converted to their wide types with ModifyToType(). A new masked gather
/// with result types {WideVT, MVT::Other} is created from the original chain,
/// base pointer, scale, memory type, memory operand and index type. Result 1
/// of N is replaced by result 1 of the new gather.
SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) {
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Mask = N->getMask();
EVT MaskVT = Mask.getValueType();
SDValue PassThru = GetWidenedVector(N->getPassThru());
SDValue Scale = N->getScale();
unsigned NumElts = WideVT.getVectorNumElements();
SDLoc dl(N);
// The mask should be widened as well
// NOTE(review): the remaining arguments of the two getVectorVT calls below
// are missing from this extract.
EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
Mask = ModifyToType(Mask, WideMaskVT, true);
// Widen the Index operand
SDValue Index = N->getIndex();
EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
Index = ModifyToType(Index, WideIndexVT);
SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
Scale };
SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other),
N->getMemoryVT(), dl, Ops,
N->getMemOperand(), N->getIndexType());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
return Res;
/// Widen the result of SCALAR_TO_VECTOR by re-emitting it with the widened
/// result type and the same scalar operand.
SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N),
WidenVT, N->getOperand(0));
// Return true if this is a SETCC node or a strict version of it.
// NOTE(review): only the ISD::SETCC label is visible in this extract; the
// labels for the strict variants and the `default:` label are missing.
static inline bool isSETCCOp(unsigned Opcode) {
switch (Opcode) {
case ISD::SETCC:
return true;
return false;
// Return true if this is a node that could have two SETCCs as operands,
// i.e. the opcode is AND, OR or XOR.
static inline bool isLogicalMaskOp(unsigned Opcode) {
switch (Opcode) {
case ISD::AND:
case ISD::OR:
case ISD::XOR:
return true;
return false;
// If N is a SETCC or a strict variant of it, return the type
// of the compare operands.
static inline EVT getSETCCOperandType(SDValue N) {
// For strict FP opcodes the compared operands start at index 1 rather than 0.
unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
return N->getOperand(OpNo).getValueType();
// This is used just for the assert in convertMask(). Check that this is either
// a SETCC or a previously handled SETCC by convertMask().
#ifndef NDEBUG
static inline bool isSETCCorConvertedSETCC(SDValue N) {
// Look through the wrappers visible here: an EXTRACT_SUBVECTOR, or a
// CONCAT_VECTORS whose operands after the first are all undef.
if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
N = N.getOperand(0);
else if (N.getOpcode() == ISD::CONCAT_VECTORS) {
for (unsigned i = 1; i < N->getNumOperands(); ++i)
if (!N->getOperand(i)->isUndef())
return false;
N = N.getOperand(0);
// Then look through one TRUNCATE or SIGN_EXTEND.
if (N.getOpcode() == ISD::TRUNCATE)
N = N.getOperand(0);
else if (N.getOpcode() == ISD::SIGN_EXTEND)
N = N.getOperand(0);
// A logical op is checked through its operands.
// NOTE(review): the second halves of the two return expressions below are
// missing from this extract.
if (isLogicalMaskOp(N.getOpcode()))
return isSETCCorConvertedSETCC(N.getOperand(0)) &&
return (isSETCCOp(N.getOpcode()) ||
// Return a mask of vector type MaskVT to replace InMask. Also adjust MaskVT
// to ToMaskVT if needed with vector extension or truncation.
//
// Steps: re-create the InMask node with result type MaskVT, sign-extend or
// truncate it so its element size matches ToMaskVT, then adjust the element
// count with EXTRACT_SUBVECTOR (too many elements) or CONCAT_VECTORS padded
// with undef (too few elements).
SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
EVT ToMaskVT) {
// Currently a SETCC or a AND/OR/XOR with two SETCCs are handled.
// FIXME: This code seems to be too restrictive, we might consider
// generalizing it or dropping it.
assert(isSETCCorConvertedSETCC(InMask) && "Unexpected mask argument.");
// Make a new Mask node, with a legal result VT.
// NOTE(review): the body of the operand-copy loop and the `else` between the
// strict and non-strict getNode calls are missing from this extract.
SDValue Mask;
SmallVector<SDValue, 4> Ops;
for (unsigned i = 0, e = InMask->getNumOperands(); i < e; ++i)
if (InMask->isStrictFPOpcode()) {
// Strict FP nodes get an extra MVT::Other result; result 1 of the old node
// is replaced by result 1 of the new node.
Mask = DAG.getNode(InMask->getOpcode(), SDLoc(InMask),
{ MaskVT, MVT::Other }, Ops);
ReplaceValueWith(InMask.getValue(1), Mask.getValue(1));
Mask = DAG.getNode(InMask->getOpcode(), SDLoc(InMask), MaskVT, Ops);
// If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign
// extend or truncate is needed.
// NOTE(review): the element-count argument of the two getVectorVT calls
// below is missing from this extract.
LLVMContext &Ctx = *DAG.getContext();
unsigned MaskScalarBits = MaskVT.getScalarSizeInBits();
unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits();
if (MaskScalarBits < ToMaskScalBits) {
EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask);
} else if (MaskScalarBits > ToMaskScalBits) {
EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask);
assert(Mask->getValueType(0).getScalarSizeInBits() ==
ToMaskVT.getScalarSizeInBits() &&
"Mask should have the right element size by now.");
// Adjust Mask to the right number of elements.
unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements();
if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) {
// Too many elements: extract a ToMaskVT-sized subvector.
// NOTE(review): the final argument line of this getNode call is missing from
// this extract.
SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(Mask));
Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask,
} else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) {
// Too few elements: concatenate the mask with undef vectors of its own type.
unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls);
EVT SubVT = Mask->getValueType(0);
SmallVector<SDValue, 16> SubOps(NumSubVecs, DAG.getUNDEF(SubVT));
SubOps[0] = Mask;
Mask = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubOps);
assert((Mask->getValueType(0) == ToMaskVT) &&
"A mask of ToMaskVT should have been produced by now.");
return Mask;
// This method tries to handle VSELECT and its mask by legalizing operands
// (which may require widening) and if needed adjusting the mask vector type
// to match that of the VSELECT. Without it, many cases end up with
// scalarization of the SETCC, with many unnecessary instructions.
//
// Returns an empty SDValue when the node is not handled here; otherwise
// returns a new VSELECT whose mask was produced by convertMask().
SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
LLVMContext &Ctx = *DAG.getContext();
SDValue Cond = N->getOperand(0);
if (N->getOpcode() != ISD::VSELECT)
return SDValue();
// Only conditions that are a SETCC or an AND/OR/XOR are candidates.
if (!isSETCCOp(Cond->getOpcode()) && !isLogicalMaskOp(Cond->getOpcode()))
return SDValue();
// If this is a split VSELECT that was previously already handled, do
// nothing.
EVT CondVT = Cond->getValueType(0);
if (CondVT.getScalarSizeInBits() != 1)
return SDValue();
EVT VSelVT = N->getValueType(0);
// Only handle vector types which are a power of 2.
if (!isPowerOf2_64(VSelVT.getSizeInBits()))
return SDValue();
// Don't touch if this will be scalarized.
// Follow the split steps to the type the VSELECT ends up with.
EVT FinalVT = VSelVT;
while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
FinalVT = FinalVT.getHalfNumVectorElementsVT(Ctx);
if (FinalVT.getVectorNumElements() == 1)
return SDValue();
// If there is support for an i1 vector mask, don't touch.
if (isSETCCOp(Cond.getOpcode())) {
// Legalize the compare operand type step by step, then ask for the SETCC
// result type of the legal operand type.
EVT SetCCOpVT = getSETCCOperandType(Cond);
while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal)
SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT);
EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
if (SetCCResVT.getScalarSizeInBits() == 1)
return SDValue();
} else if (CondVT.getScalarType() == MVT::i1) {
// If there is support for an i1 vector mask (or only scalar i1 conditions),
// don't touch.
while (TLI.getTypeAction(Ctx, CondVT) != TargetLowering::TypeLegal)
CondVT = TLI.getTypeToTransformTo(Ctx, CondVT);
if (CondVT.getScalarType() == MVT::i1)
return SDValue();
// Get the VT and operands for VSELECT, and widen if needed.
SDValue VSelOp1 = N->getOperand(1);
SDValue VSelOp2 = N->getOperand(2);
if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) {
VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT);
VSelOp1 = GetWidenedVector(VSelOp1);
VSelOp2 = GetWidenedVector(VSelOp2);
// The mask of the VSELECT should have integer elements.
EVT ToMaskVT = VSelVT;
if (!ToMaskVT.getScalarType().isInteger())
ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger();
SDValue Mask;
if (isSETCCOp(Cond->getOpcode())) {
// Single SETCC: convert it from its natural result type to ToMaskVT.
EVT MaskVT = getSetCCResultType(getSETCCOperandType(Cond));
Mask = convertMask(Cond, MaskVT, ToMaskVT);
} else if (isLogicalMaskOp(Cond->getOpcode()) &&
isSETCCOp(Cond->getOperand(0).getOpcode()) &&
isSETCCOp(Cond->getOperand(1).getOpcode())) {
// Cond is (AND/OR/XOR (SETCC, SETCC))
SDValue SETCC0 = Cond->getOperand(0);
SDValue SETCC1 = Cond->getOperand(1);
EVT VT0 = getSetCCResultType(getSETCCOperandType(SETCC0));
EVT VT1 = getSetCCResultType(getSETCCOperandType(SETCC1));
unsigned ScalarBits0 = VT0.getScalarSizeInBits();
unsigned ScalarBits1 = VT1.getScalarSizeInBits();
unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits();
// If the two SETCCs have different VTs, either extend/truncate one of
// them to the other "towards" ToMaskVT, or truncate one and extend the
// other to ToMaskVT.
// NOTE(review): the declaration of MaskVT and the `else` before the final
// `MaskVT = ToMaskVT;` are missing from this extract.
if (ScalarBits0 != ScalarBits1) {
EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1);
EVT WideVT = ((NarrowVT == VT0) ? VT1 : VT0);
if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits())
MaskVT = WideVT;
else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits())
MaskVT = NarrowVT;
MaskVT = ToMaskVT;
} else
// If the two SETCCs have the same VT, don't change it.
MaskVT = VT0;
// Make new SETCCs and logical nodes.
SETCC0 = convertMask(SETCC0, VT0, MaskVT);
SETCC1 = convertMask(SETCC1, VT1, MaskVT);
Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1);
// Convert the logical op for VSELECT if needed.
Mask = convertMask(Cond, MaskVT, ToMaskVT);
} else
return SDValue();
return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2);
// Widen the result of a select node. The result type becomes the type given by
// getTypeToTransformTo, both value operands are fetched with GetWidenedVector,
// and the new node keeps N's opcode. A vector condition is first offered to
// WidenVSELECTAndMask; if that yields nothing, the condition is widened or
// adjusted so that it has WidenNumElts elements.
// NOTE(review): no closing braces are visible in this excerpt, so the nesting
// of the statements below is inferred from the comments -- confirm.
SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SDValue Cond1 = N->getOperand(0);
EVT CondVT = Cond1.getValueType();
if (CondVT.isVector()) {
// Use the combined VSELECT-and-mask widening whenever it returns a value.
if (SDValue Res = WidenVSELECTAndMask(N))
return Res;
// Condition type with the original element type and the widened lane count.
EVT CondEltVT = CondVT.getVectorElementType();
EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(),
CondEltVT, WidenNumElts);
if (getTypeAction(CondVT) == TargetLowering::TypeWidenVector)
Cond1 = GetWidenedVector(Cond1);
// If we have to split the condition there is no point in widening the
// select. This would result in a cycle of widening the select ->
// widening the condition operand -> splitting the condition operand ->
// splitting the select -> widening the select. Instead split this select
// further and widen the resulting type.
if (getTypeAction(CondVT) == TargetLowering::TypeSplitVector) {
SDValue SplitSelect = SplitVecOp_VSELECT(N, 0);
SDValue Res = ModifyToType(SplitSelect, WidenVT);
return Res;
// The condition may still have a different type than expected; adjust it.
if (Cond1.getValueType() != CondWidenVT)
Cond1 = ModifyToType(Cond1, CondWidenVT);
SDValue InOp1 = GetWidenedVector(N->getOperand(1));
SDValue InOp2 = GetWidenedVector(N->getOperand(2));
assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT);
return DAG.getNode(N->getOpcode(), SDLoc(N),
WidenVT, Cond1, InOp1, InOp2);
// Widen the result of a SELECT_CC node. Only the two value operands (operands
// 2 and 3) are replaced by their widened forms; operands 0, 1 and 4 are passed
// through unchanged. The result type is the type of the widened operand 2.
SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) {
SDValue InOp1 = GetWidenedVector(N->getOperand(2));
SDValue InOp2 = GetWidenedVector(N->getOperand(3));
return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
InOp1.getValueType(), N->getOperand(0),
N->getOperand(1), InOp1, InOp2, N->getOperand(4));
// Widen an UNDEF result: return an undef value of the type that N's result
// type is transformed to.
SDValue DAGTypeLegalizer::WidenVecRes_UNDEF(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
return DAG.getUNDEF(WidenVT);
// Widen the result of a vector shuffle. Both inputs are fetched in widened
// form and a new shuffle mask is built for the wider inputs before emitting a
// shuffle of type WidenVT.
SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) {
EVT VT = N->getValueType(0);
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
unsigned NumElts = VT.getVectorNumElements();
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
// Adjust mask based on new input vector length.
SmallVector<int, 16> NewMask;
for (unsigned i = 0; i != NumElts; ++i) {
int Idx = N->getMaskElt(i);
// NOTE(review): as shown, the `Idx - NumElts + WidenNumElts` remapping is
// guarded by `Idx < NumElts`. That offset looks intended for indices at or
// above NumElts (second input); the push for the `Idx < NumElts` case and an
// `else` appear to be missing from this excerpt -- confirm against upstream.
if (Idx < (int)NumElts)
NewMask.push_back(Idx - NumElts + WidenNumElts);
// NOTE(review): the body of the following loop is not visible here. It
// presumably fills the extra WidenNumElts - NumElts mask slots; as written
// the return statement would become the loop body -- confirm.
for (unsigned i = NumElts; i != WidenNumElts; ++i)
return DAG.getVectorShuffle(WidenVT, dl, InOp1, InOp2, NewMask);
// Widen the result of a vector SETCC. When the operand type has to be split,
// the compare is split with SplitVecOp_VSETCC and the result is adjusted to
// WidenVT with ModifyToType. Otherwise both operands are widened and a new
// SETCC of type WidenVT is built with the original condition (operand 2).
// NOTE(review): closing braces are not visible in this excerpt; nesting is
// inferred from the comments.
SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operands must be vectors");
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SDValue InOp1 = N->getOperand(0);
EVT InVT = InOp1.getValueType();
assert(InVT.isVector() && "can not widen non-vector type");
// Operand type with the operand's element type and the widened lane count.
EVT WidenInVT = EVT::getVectorVT(*DAG.getContext(),
InVT.getVectorElementType(), WidenNumElts);
// The input and output types often differ here, and it could be that while
// we'd prefer to widen the result type, the input operands have been split.
// In this case, we also need to split the result of this node as well.
if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) {
SDValue SplitVSetCC = SplitVecOp_VSETCC(N);
SDValue Res = ModifyToType(SplitVSetCC, WidenVT);
return Res;
// If the inputs also widen, handle them directly. Otherwise widen by hand.
SDValue InOp2 = N->getOperand(1);
if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
InOp1 = GetWidenedVector(InOp1);
InOp2 = GetWidenedVector(InOp2);
} else {
InOp1 = DAG.WidenVector(InOp1, SDLoc(N));
InOp2 = DAG.WidenVector(InOp2, SDLoc(N));
// Assume that the input and output will be widened appropriately. If not,
// we will have to unroll it at some point.
assert(InOp1.getValueType() == WidenInVT &&
InOp2.getValueType() == WidenInVT &&
"Input not widened to expected type!");
return DAG.getNode(ISD::SETCC, SDLoc(N),
WidenVT, InOp1, InOp2, N->getOperand(2));
// Widen the result of a strict FP compare by fully unrolling it. One scalar
// node with N's opcode is created per original element, producing an i1 value
// and a chain. Each i1 is turned into an EltVT boolean with a select, the
// per-element chains are merged by a TokenFactor that replaces N's chain
// result (value 1), and the scalars are reassembled into a WidenVT build
// vector. Lanes beyond NumElts keep the undef they were initialised with.
SDValue DAGTypeLegalizer::WidenVecRes_STRICT_FSETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(1).getValueType().isVector() &&
"Operands must be vectors");
EVT VT = N->getValueType(0);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
unsigned WidenNumElts = WidenVT.getVectorNumElements();
unsigned NumElts = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
SDLoc dl(N);
// Operand layout used here: 0 = chain, 1 = LHS, 2 = RHS, 3 = CC.
SDValue Chain = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
SDValue CC = N->getOperand(3);
EVT TmpEltVT = LHS.getValueType().getVectorElementType();
// Fully unroll and reassemble.
SmallVector<SDValue, 8> Scalars(WidenNumElts, DAG.getUNDEF(EltVT));
SmallVector<SDValue, 8> Chains(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
// NOTE(review): the next two lines are the tails of the statements that
// define LHSElem and RHSElem; their first lines are missing from this
// excerpt. Presumably they extract element i of LHS and RHS -- confirm.
DAG.getVectorIdxConstant(i, dl));
DAG.getVectorIdxConstant(i, dl));
Scalars[i] = DAG.getNode(N->getOpcode(), dl, {MVT::i1, MVT::Other},
{Chain, LHSElem, RHSElem, CC});
// Save the chain before Scalars[i] is overwritten by the select below.
Chains[i] = Scalars[i].getValue(1);
Scalars[i] = DAG.getSelect(dl, EltVT, Scalars[i],
DAG.getBoolConstant(true, dl, EltVT, VT),
DAG.getBoolConstant(false, dl, EltVT, VT));
SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
ReplaceValueWith(SDValue(N, 1), NewChain);
return DAG.getBuildVector(WidenVT, dl, Scalars);
// Widen Vector Operand
// Entry point for widening operand OpNo of node N. The target is first given
// the chance to custom-lower the node; otherwise the opcode selects one of the
// WidenVecOp_* helpers and the value it returns replaces result 0 of N.
// Returns true only when the helper returned N itself (N was updated in
// place); returns false in every other visible path.
// NOTE(review): many lines of the switch (most case labels, `default:`,
// `break;`, `#endif`, closing braces) are not visible in this excerpt.
bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
LLVM_DEBUG(dbgs() << "Widen node operand " << OpNo << ": "; N->dump(&DAG);
dbgs() << "\n");
SDValue Res = SDValue();
// See if the target wants to custom widen this node.
if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
return false;
switch (N->getOpcode()) {
// NOTE(review): the debug output and llvm_unreachable below presumably sit
// under a `default:` label that is missing here -- confirm.
#ifndef NDEBUG
dbgs() << "WidenVectorOperand op #" << OpNo << ": ";
dbgs() << "\n";
llvm_unreachable("Do not know how to widen this operator's operand!");
case ISD::BITCAST: Res = WidenVecOp_BITCAST(N); break;
case ISD::CONCAT_VECTORS: Res = WidenVecOp_CONCAT_VECTORS(N); break;
case ISD::STORE: Res = WidenVecOp_STORE(N); break;
case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break;
case ISD::MGATHER: Res = WidenVecOp_MGATHER(N, OpNo); break;
case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break;
case ISD::SETCC: Res = WidenVecOp_SETCC(N); break;
case ISD::STRICT_FSETCCS: Res = WidenVecOp_STRICT_FSETCC(N); break;
case ISD::VSELECT: Res = WidenVecOp_VSELECT(N); break;
case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break;
// NOTE(review): the case labels that select the three helpers below are not
// visible in this excerpt.
Res = WidenVecOp_EXTEND(N);
Res = WidenVecOp_Convert(N);
Res = WidenVecOp_VECREDUCE(N);
// If Res is null, the sub-method took care of registering the result.
if (!Res.getNode()) return false;
// If the result is N, the sub-method updated N in place. Tell the legalizer
// core about this.
if (Res.getNode() == N)
return true;
// Strict FP nodes carry a chain as a second result, so two values are
// expected for them and one value otherwise.
// NOTE(review): an `else` between the two asserts appears to be missing.
if (N->isStrictFPOpcode())
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 2 &&
"Invalid operand expansion");
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
"Invalid operand expansion");
ReplaceValueWith(SDValue(N, 0), Res);
return false;
// Widen the operand of an extend node whose result type VT is kept. The
// operand is fetched in widened form and, if its total size still differs
// from VT, a legal vector type with the same element type and VT's size is
// searched for. If no such type brings the sizes into agreement, the node is
// handed to WidenVecOp_Convert.
// NOTE(review): several lines are missing from this excerpt (the loop over
// candidate types that defines `i`, the heads of the node-creation calls, the
// case labels of the final switch), so only the visible steps are described.
SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue InOp = N->getOperand(0);
assert(getTypeAction(InOp.getValueType()) ==
TargetLowering::TypeWidenVector &&
"Unexpected type action");
InOp = GetWidenedVector(InOp);
assert(VT.getVectorNumElements() <
InOp.getValueType().getVectorNumElements() &&
"Input wasn't widened!");
// We may need to further widen the operand until it has the same total
// vector size as the result.
EVT InVT = InOp.getValueType();
if (InVT.getSizeInBits() != VT.getSizeInBits()) {
EVT InEltVT = InVT.getVectorElementType();
// NOTE(review): `i` is presumably the induction variable of a loop over
// MVT vector value types whose header is not visible here -- confirm.
EVT FixedVT = (MVT::SimpleValueType)i;
EVT FixedEltVT = FixedVT.getVectorElementType();
// Accept a legal type of the result's size with the operand's element type.
if (TLI.isTypeLegal(FixedVT) &&
FixedVT.getSizeInBits() == VT.getSizeInBits() &&
FixedEltVT == InEltVT) {
assert(FixedVT.getVectorNumElements() >= VT.getVectorNumElements() &&
"Not enough elements in the fixed type for the operand!");
assert(FixedVT.getVectorNumElements() != InVT.getVectorNumElements() &&
"We can't have the same type as we started with!");
// NOTE(review): the heads of the two calls that rebuild InOp (one for a
// larger FixedVT, one otherwise) are missing; only argument tails remain.
if (FixedVT.getVectorNumElements() > InVT.getVectorNumElements())
DAG.getUNDEF(FixedVT), InOp,
DAG.getVectorIdxConstant(0, DL));
DAG.getVectorIdxConstant(0, DL));
InVT = InOp.getValueType();
if (InVT.getSizeInBits() != VT.getSizeInBits())
// We couldn't find a legal vector type that was a widening of the input
// and could be extended in-register to the result type, so we have to
// scalarize.
return WidenVecOp_Convert(N);
// Use special DAG nodes to represent the operation of extending the
// low lanes.
switch (N->getOpcode()) {
llvm_unreachable("Extend legalization on extend operation!");
// Handle an FCOPYSIGN whose second operand needs widening by unrolling the
// whole node with DAG.UnrollVectorOp.
SDValue DAGTypeLegalizer::WidenVecOp_FCOPYSIGN(SDNode *N) {
// The result (and first input) is legal, but the second input is illegal.
// We can't do much to fix that, so just unroll and let the extracts off of
// the second input be widened as needed later.
return DAG.UnrollVectorOp(N);
// Widen the operand of a conversion node whose result type VT is legal. If a
// wider result type WideVT is legal (and the node is not strict FP), the
// conversion is performed on the widened operand and the low part is
// extracted with EXTRACT_SUBVECTOR. Otherwise the conversion is unrolled into
// NumElts scalar operations that are reassembled with a build vector; for
// strict FP nodes the chain result (value 1) is replaced by a TokenFactor.
// NOTE(review): closing braces and a few statements are missing from this
// excerpt; nesting is inferred.
SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
// Since the result is legal and the input is illegal.
EVT VT = N->getValueType(0);
EVT EltVT = VT.getVectorElementType();
SDLoc dl(N);
unsigned NumElts = VT.getVectorNumElements();
// Strict FP nodes have the chain as operand 0, so the value is operand 1.
SDValue InOp = N->getOperand(N->isStrictFPOpcode() ? 1 : 0);
assert(getTypeAction(InOp.getValueType()) ==
TargetLowering::TypeWidenVector &&
"Unexpected type action");
InOp = GetWidenedVector(InOp);
EVT InVT = InOp.getValueType();
unsigned Opcode = N->getOpcode();
// See if a widened result type would be legal, if so widen the node.
// FIXME: This isn't safe for StrictFP. Other optimization here is needed.
// NOTE(review): the last argument of this getVectorVT call (the element
// count) is on a line missing from this excerpt.
EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
if (TLI.isTypeLegal(WideVT) && !N->isStrictFPOpcode()) {
SDValue Res;
// NOTE(review): if this test is nested in the `!N->isStrictFPOpcode()`
// guard above, the strict branch cannot be taken as written -- confirm.
if (N->isStrictFPOpcode()) {
// NOTE(review): an `else` between the two assignments appears missing.
if (Opcode == ISD::STRICT_FP_ROUND)
Res = DAG.getNode(Opcode, dl, { WideVT, MVT::Other },
{ N->getOperand(0), InOp, N->getOperand(2) });
Res = DAG.getNode(Opcode, dl, { WideVT, MVT::Other },
{ N->getOperand(0), InOp });
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
} else {
// FP_ROUND carries an extra operand (operand 1) that must be forwarded.
// NOTE(review): an `else` between the two assignments appears missing.
if (Opcode == ISD::FP_ROUND)
Res = DAG.getNode(Opcode, dl, WideVT, InOp, N->getOperand(1));
Res = DAG.getNode(Opcode, dl, WideVT, InOp);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
DAG.getVectorIdxConstant(0, dl));
EVT InEltVT = InVT.getVectorElementType();
// Unroll the convert into some scalar code and create a nasty build vector.
SmallVector<SDValue, 16> Ops(NumElts);
if (N->isStrictFPOpcode()) {
SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
SmallVector<SDValue, 32> OpChains;
for (unsigned i=0; i < NumElts; ++i) {
// Replace the value operand by element i of the widened input.
NewOps[1] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
DAG.getVectorIdxConstant(i, dl));
Ops[i] = DAG.getNode(Opcode, dl, { EltVT, MVT::Other }, NewOps);
// NOTE(review): nothing visible appends to OpChains before it is used
// below; the statement collecting each element's chain is presumably
// missing from this excerpt -- confirm.
SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OpChains);
ReplaceValueWith(SDValue(N, 1), NewChain);
} else {
// NOTE(review): the line that starts the inner element-extraction call is
// missing; only its argument tail remains.
for (unsigned i = 0; i < NumElts; ++i)
Ops[i] = DAG.getNode(Opcode, dl, EltVT,
InOp, DAG.getVectorIdxConstant(i, dl)));
return DAG.getBuildVector(VT, dl, Ops);
// Widen the operand of a BITCAST whose result type VT is kept. Two shortcuts
// avoid going through memory: for a non-vector VT, bitcast the widened input
// to a legal vector of VT elements and extract element 0; for a vector VT,
// bitcast to a legal vector of VT's element type and extract the leading
// subvector. If neither applies, fall back to CreateStackStoreLoad.
SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue InOp = GetWidenedVector(N->getOperand(0));
EVT InWidenVT = InOp.getValueType();
SDLoc dl(N);
// Check if we can convert between two legal vector types and extract.
unsigned InWidenSize = InWidenVT.getSizeInBits();
unsigned Size = VT.getSizeInBits();
// x86mmx is not an acceptable vector element type, so don't try.
if (InWidenSize % Size == 0 && !VT.isVector() && VT != MVT::x86mmx) {
// The widened input holds a whole number of VT-sized elements.
unsigned NewNumElts = InWidenSize / Size;
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts);
if (TLI.isTypeLegal(NewVT)) {
SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp,
DAG.getVectorIdxConstant(0, dl));
// Handle a case like bitcast v12i8 -> v3i32. Normally that would get widened
// to v16i8 -> v4i32, but for a target where v3i32 is legal but v12i8 is not,
// we end up here. Handling the case here with EXTRACT_SUBVECTOR avoids
// having to copy via memory.
if (VT.isVector()) {
EVT EltVT = VT.getVectorElementType();
unsigned EltSize = EltVT.getSizeInBits();
if (InWidenSize % EltSize == 0) {
unsigned NewNumElts = InWidenSize / EltSize;
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NewNumElts);
if (TLI.isTypeLegal(NewVT)) {
SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, BitOp,
DAG.getVectorIdxConstant(0, dl));
// No in-register conversion was found.
return CreateStackStoreLoad(InOp, VT);
// Widen the operands of a CONCAT_VECTORS whose result type VT is kept. If the
// first operand widens to exactly VT and all remaining operands are undef,
// the widened first operand is the result. Otherwise the first NumInElts
// elements of every widened operand are extracted and reassembled with a
// build vector.
SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
EVT VT = N->getValueType(0);
EVT EltVT = VT.getVectorElementType();
EVT InVT = N->getOperand(0).getValueType();
SDLoc dl(N);
// If the widen width for this operand is the same as the width of the concat
// and all but the first operand is undef, just use the widened operand.
unsigned NumOperands = N->getNumOperands();
if (VT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) {
unsigned i;
// NOTE(review): the statement controlled by the `isUndef` test below is not
// visible; presumably it leaves the loop at the first non-undef operand so
// that `i == NumOperands` means "all undef" -- confirm.
for (i = 1; i < NumOperands; ++i)
if (!N->getOperand(i).isUndef())
if (i == NumOperands)
return GetWidenedVector(N->getOperand(0));
// Otherwise, fall back to a nasty build vector.
unsigned NumElts = VT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(NumElts);
unsigned NumInElts = InVT.getVectorNumElements();
// Next free slot in Ops.
unsigned Idx = 0;
for (unsigned i=0; i < NumOperands; ++i) {
SDValue InOp = N->getOperand(i);
assert(getTypeAction(InOp.getValueType()) ==
TargetLowering::TypeWidenVector &&
"Unexpected type action");
InOp = GetWidenedVector(InOp);
// Only the original NumInElts elements of each operand are copied.
for (unsigned j = 0; j < NumInElts; ++j)
Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getVectorIdxConstant(j, dl));
return DAG.getBuildVector(VT, dl, Ops);
// Widen the vector operand of an EXTRACT_SUBVECTOR; the result type and
// operand 1 are passed on unchanged.
// NOTE(review): the line that starts the returned node-creation call is
// missing from this excerpt; only its argument tail is visible.
SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
SDValue InOp = GetWidenedVector(N->getOperand(0));
N->getValueType(0), InOp, N->getOperand(1));
// Widen the vector operand of an EXTRACT_VECTOR_ELT; the result type and
// operand 1 are passed on unchanged.
// NOTE(review): the line that starts the returned node-creation call is
// missing from this excerpt; only its argument tail is visible.
SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
SDValue InOp = GetWidenedVector(N->getOperand(0));
N->getValueType(0), InOp, N->getOperand(1));
// Widen the stored value of a store node. Stores whose memory element type is
// not byte sized are scalarized by the target lowering. Otherwise the store
// is broken into pieces collected in StChain; a single piece is returned
// directly and several pieces are joined by a TokenFactor.
SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
// We have to widen the value, but we want only to store the original
// vector type.
StoreSDNode *ST = cast<StoreSDNode>(N);
if (!ST->getMemoryVT().getScalarType().isByteSized())
return TLI.scalarizeVectorStore(ST, DAG);
SmallVector<SDValue, 16> StChain;
// NOTE(review): an `else` between the two generator calls appears to be
// missing from this excerpt; as written both would run for truncating
// stores -- confirm.
if (ST->isTruncatingStore())
GenWidenVectorTruncStores(StChain, ST);
GenWidenVectorStores(StChain, ST);
if (StChain.size() == 1)
return StChain[0];
return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
// Widen the data (OpNo == 1) or mask (OpNo == 3) operand of a masked store.
// Whichever operand triggered the call is widened first and the other one is
// adjusted with ModifyToType so that both end up with the same element count.
// A new masked store is then built with the original chain, base pointer,
// offset, memory VT, memory operand and addressing mode.
SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
assert((OpNo == 1 || OpNo == 3) &&
"Can widen only data or mask operand of mstore");
MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
SDValue Mask = MST->getMask();
EVT MaskVT = Mask.getValueType();
SDValue StVal = MST->getValue();
SDLoc dl(N);
if (OpNo == 1) {
// Widen the value.
StVal = GetWidenedVector(StVal);
// The mask should be widened as well.
EVT WideVT = StVal.getValueType();
// NOTE(review): the remaining arguments of this getVectorVT call are on a
// line missing from this excerpt.
EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
// The third argument `true` is forwarded to ModifyToType for the mask only.
Mask = ModifyToType(Mask, WideMaskVT, true);
} else {
// Widen the mask.
EVT WideMaskVT = TLI.getTypeToTransformTo(*DAG.getContext(), MaskVT);
Mask = ModifyToType(Mask, WideMaskVT, true);
EVT ValueVT = StVal.getValueType();
// NOTE(review): the remaining arguments of this getVectorVT call are on a
// line missing from this excerpt.
EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
StVal = ModifyToType(StVal, WideVT);
assert(Mask.getValueType().getVectorNumElements() ==
StVal.getValueType().getVectorNumElements() &&
"Mask and data vectors should have the same number of elements");
return DAG.getMaskedStore(MST->getChain(), dl, StVal, MST->getBasePtr(),
MST->getOffset(), Mask, MST->getMemoryVT(),
MST->getMemOperand(), MST->getAddressingMode(),
false, MST->isCompressingStore());
// Widen the index operand (OpNo == 4) of a masked gather. A new gather is
// built with the widened index and the original remaining operands, and both
// results of N are replaced explicitly. A null SDValue is returned because
// the replacement has already been registered here.
SDValue DAGTypeLegalizer::WidenVecOp_MGATHER(SDNode *N, unsigned OpNo) {
assert(OpNo == 4 && "Can widen only the index of mgather");
auto *MG = cast<MaskedGatherSDNode>(N);
SDValue DataOp = MG->getPassThru();
SDValue Mask = MG->getMask();
SDValue Scale = MG->getScale();
// Just widen the index. It's allowed to have extra elements.
SDValue Index = GetWidenedVector(MG->getIndex());
SDLoc dl(N);
// NOTE(review): the end of this initializer list is on a line missing from
// this excerpt (presumably it ends with Scale -- confirm).
SDValue Ops[] = {MG->getChain(), DataOp, Mask, MG->getBasePtr(), Index,
SDValue Res = DAG.getMaskedGather(MG->getVTList(), MG->getMemoryVT(), dl, Ops,
MG->getMemOperand(), MG->getIndexType());
// Replace the chain (value 1) and the loaded data (value 0).
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
ReplaceValueWith(SDValue(N, 0), Res.getValue(0));
return SDValue();
// Widen an operand of a masked scatter. For the data operand (OpNo == 1) the
// data is widened and the index and mask are adjusted to the same element
// count; for the index operand (OpNo == 4) only the index is widened. Any
// other operand number is a programming error. A new scatter is built from
// the updated operands.
SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
SDValue DataOp = MSC->getValue();
SDValue Mask = MSC->getMask();
SDValue Index = MSC->getIndex();
SDValue Scale = MSC->getScale();
if (OpNo == 1) {
DataOp = GetWidenedVector(DataOp);
// Element count every other vector operand has to match.
unsigned NumElts = DataOp.getValueType().getVectorNumElements();
// Widen index.
EVT IndexVT = Index.getValueType();
EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
IndexVT.getVectorElementType(), NumElts);
Index = ModifyToType(Index, WideIndexVT);
// The mask should be widened as well.
EVT MaskVT = Mask.getValueType();
EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
MaskVT.getVectorElementType(), NumElts);
Mask = ModifyToType(Mask, WideMaskVT, true);
} else if (OpNo == 4) {
// Just widen the index. It's allowed to have extra elements.
Index = GetWidenedVector(Index);
} else
llvm_unreachable("Can't widen this operand of mscatter");
// NOTE(review): the end of this initializer list is on a line missing from
// this excerpt (presumably it ends with Scale -- confirm).
SDValue Ops[] = {MSC->getChain(), DataOp, Mask, MSC->getBasePtr(), Index,
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
MSC->getMemoryVT(), SDLoc(N), Ops,
MSC->getMemOperand(), MSC->getIndexType());
// Widen the operands of a vector SETCC whose result type VT is kept. Both
// compare operands are fetched in widened form, a wide SETCC is built on
// them, and the result is brought back to VT.
// NOTE(review): several lines are missing from this excerpt (the tail of the
// vXi1 type computation, the definition of `CC`, the initializer of
// `ExtendCode`), so the final extract/extend steps are only partly visible.
SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
SDValue InOp0 = GetWidenedVector(N->getOperand(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(1));
SDLoc dl(N);
EVT VT = N->getValueType(0);
// WARNING: In this code we widen the compare instruction with garbage.
// This garbage may contain denormal floats which may be slow. Is this a real
// concern ? Should we zero the unused lanes if this is a float compare ?
// Get a new SETCC node to compare the newly widened operands.
// Only some of the compared elements are legal.
EVT SVT = getSetCCResultType(InOp0.getValueType());
// The result type is legal, if its vXi1, keep vXi1 for the new SETCC.
if (VT.getScalarType() == MVT::i1)
SVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
SDValue WideSETCC = DAG.getNode(ISD::SETCC, SDLoc(N),
SVT, InOp0, InOp1, N->getOperand(2));
// Extract the needed results from the result vector.
EVT ResVT = EVT::getVectorVT(*DAG.getContext(),
DAG.getVectorIdxConstant(0, dl));
EVT OpVT = N->getOperand(0).getValueType();
ISD::NodeType ExtendCode =
return DAG.getNode(ExtendCode, dl, VT, CC);
// Widen the operands of a strict FP compare whose result type VT is kept. The
// compare is unrolled into NumElts scalar nodes with N's opcode, each
// producing an i1 value and a chain. Each i1 is turned into an EltVT boolean
// with a select, the chains are merged by a TokenFactor that replaces N's
// chain result (value 1), and the scalars are returned as a VT build vector.
SDValue DAGTypeLegalizer::WidenVecOp_STRICT_FSETCC(SDNode *N) {
// Operand layout used here: 0 = chain, 1 = LHS, 2 = RHS, 3 = CC.
SDValue Chain = N->getOperand(0);
SDValue LHS = GetWidenedVector(N->getOperand(1));
SDValue RHS = GetWidenedVector(N->getOperand(2));
SDValue CC = N->getOperand(3);
SDLoc dl(N);
EVT VT = N->getValueType(0);
EVT EltVT = VT.getVectorElementType();
EVT TmpEltVT = LHS.getValueType().getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
// Unroll into a build vector.
SmallVector<SDValue, 8> Scalars(NumElts);
SmallVector<SDValue, 8> Chains(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
// NOTE(review): the next two lines are the tails of the statements that
// define LHSElem and RHSElem; their first lines are missing from this
// excerpt. Presumably they extract element i of LHS and RHS -- confirm.
DAG.getVectorIdxConstant(i, dl));
DAG.getVectorIdxConstant(i, dl));
Scalars[i] = DAG.getNode(N->getOpcode(), dl, {MVT::i1, MVT::Other},
{Chain, LHSElem, RHSElem, CC});
// Save the chain before Scalars[i] is overwritten by the select below.
Chains[i] = Scalars[i].getValue(1);
Scalars[i] = DAG.getSelect(dl, EltVT, Scalars[i],
DAG.getBoolConstant(true, dl, EltVT, VT),
DAG.getBoolConstant(false, dl, EltVT, VT));
SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
ReplaceValueWith(SDValue(N, 1), NewChain);
return DAG.getBuildVector(VT, dl, Scalars);
// Widen the operand of a vector reduction. The extra lanes introduced by
// widening are overwritten with a per-opcode neutral element so that they do
// not affect the result, and the reduction is re-emitted on the wide vector
// with N's opcode, result type and flags.
// NOTE(review): the case labels of the switch are missing from this excerpt,
// so which opcode maps to which neutral element cannot be read off here.
SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) {
SDLoc dl(N);
SDValue Op = GetWidenedVector(N->getOperand(0));
EVT OrigVT = N->getOperand(0).getValueType();
EVT WideVT = Op.getValueType();
EVT ElemVT = OrigVT.getVectorElementType();
SDValue NeutralElem;
switch (N->getOpcode()) {
// Candidate neutral elements: 0, 1, all ones, signed min, signed max, and
// the FP values 0.0, 1.0, -infinity and +infinity.
NeutralElem = DAG.getConstant(0, dl, ElemVT);
NeutralElem = DAG.getConstant(1, dl, ElemVT);
NeutralElem = DAG.getAllOnesConstant(dl, ElemVT);
NeutralElem = DAG.getConstant(
APInt::getSignedMinValue(ElemVT.getSizeInBits()), dl, ElemVT);
NeutralElem = DAG.getConstant(
APInt::getSignedMaxValue(ElemVT.getSizeInBits()), dl, ElemVT);
NeutralElem = DAG.getConstantFP(0.0, dl, ElemVT);
NeutralElem = DAG.getConstantFP(1.0, dl, ElemVT);
NeutralElem = DAG.getConstantFP(
-std::numeric_limits<double>::infinity(), dl, ElemVT);
NeutralElem = DAG.getConstantFP(
std::numeric_limits<double>::infinity(), dl, ElemVT);
// Pad the vector with the neutral element.
unsigned OrigElts = OrigVT.getVectorNumElements();
unsigned WideElts = WideVT.getVectorNumElements();
for (unsigned Idx = OrigElts; Idx < WideElts; Idx++)
Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem,
DAG.getVectorIdxConstant(Idx, dl));
return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Op, N->getFlags());
// Widen the condition operand of a select whose value type VT is legal but
// not a power-of-2 vector. The condition is fetched in widened form, both
// value operands are widened with DAG.WidenVector, the select is performed in
// the wide type, and the leading VT-sized subvector is extracted.
SDValue DAGTypeLegalizer::WidenVecOp_VSELECT(SDNode *N) {
// This only gets called in the case that the left and right inputs and
// result are of a legal odd vector type, and the condition is illegal i1 of
// the same odd width that needs widening.
EVT VT = N->getValueType(0);
assert(VT.isVector() && !VT.isPow2VectorType() && isTypeLegal(VT));
SDValue Cond = GetWidenedVector(N->getOperand(0));
SDValue LeftIn = DAG.WidenVector(N->getOperand(1), SDLoc(N));
SDValue RightIn = DAG.WidenVector(N->getOperand(2), SDLoc(N));
SDLoc DL(N);
// The wide select takes its type from the widened left input.
SDValue Select = DAG.getNode(N->getOpcode(), DL, LeftIn.getValueType(), Cond,
LeftIn, RightIn);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Select,
DAG.getVectorIdxConstant(0, DL));
// Vector Widening Utilities
// Utility function to find the type used to chop up a widened vector for a
// load/store.
// DAG: Selection DAG; supplies the LLVMContext for the type-action queries.
// TLI: Target lowering used to determine legal types.
// Width: Width that is left to load/store.
// WidenVT: The widened vector type to load to/store from.
// Align: If 0, don't allow use of a wider type.
// WidenEx: If Align is not 0, the additional amount we can load/store from.
// Returns the element type of WidenVT when Width equals the element width;
// otherwise the chosen integer or vector memory type, falling back to RetVT.
// NOTE(review): the headers of both search loops over `VT` (and their
// `continue`/`break` statements and closing braces) are missing from this
// excerpt, so the iteration order and exit conditions cannot be read off here.
static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
unsigned Width, EVT WidenVT,
unsigned Align = 0, unsigned WidenEx = 0) {
EVT WidenEltVT = WidenVT.getVectorElementType();
const bool Scalable = WidenVT.isScalableVector();
unsigned WidenWidth = WidenVT.getSizeInBits().getKnownMinSize();
unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
// Align is multiplied by 8 so it can be compared with the bit widths below.
unsigned AlignInBits = Align*8;
// If we have one element to load/store, return it.
EVT RetVT = WidenEltVT;
if (Width == WidenEltWidth)
return RetVT;
// See if there is larger legal integer than the element type to load/store.
unsigned VT;
// Don't bother looking for an integer type if the vector is scalable, skip
// to vector types.
if (!Scalable) {
EVT MemVT((MVT::SimpleValueType) VT);
unsigned MemVTWidth = MemVT.getSizeInBits();
if (MemVT.getSizeInBits() <= WidenEltWidth)
auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT);
// Accept a legal or promotable type that divides the widened width into a
// power-of-2 number of pieces and either fits in Width or, when alignment
// permits, fits in Width + WidenEx.
if ((Action == TargetLowering::TypeLegal ||
Action == TargetLowering::TypePromoteInteger) &&
(WidenWidth % MemVTWidth) == 0 &&
isPowerOf2_32(WidenWidth / MemVTWidth) &&
(MemVTWidth <= Width ||
(Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
if (MemVTWidth == WidenWidth)
return MemVT;
RetVT = MemVT;
// See if there is a larger vector type to load/store that has the same vector
// element type and is evenly divisible with the WidenVT.
EVT MemVT = (MVT::SimpleValueType) VT;
// Skip vector MVTs which don't match the scalable property of WidenVT.
if (Scalable != MemVT.isScalableVector())
unsigned MemVTWidth = MemVT.getSizeInBits().getKnownMinSize();
auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT);
// Same acceptance test as above, plus a matching element type.
if ((Action == TargetLowering::TypeLegal ||
Action == TargetLowering::TypePromoteInteger) &&
WidenEltVT == MemVT.getVectorElementType() &&
(WidenWidth % MemVTWidth) == 0 &&
isPowerOf2_32(WidenWidth / MemVTWidth) &&
(MemVTWidth <= Width ||
(Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
// Prefer the vector type when it is wider than the best scalar found so
// far, or when it is exactly the widened type.
if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT)
return MemVT;
return RetVT;
// Builds a vector from scalar loads.
// DAG: Selection DAG used to create the nodes.
// VecTy: Resulting vector type.
// LdOps: Load operations to build the vector from.
// [Start,End): The range of loads in LdOps to use.
// The first load seeds a vector via SCALAR_TO_VECTOR and the remaining loads
// are inserted element by element. When a load has a different type than the
// previous one, the partial vector is bitcast to a vector of the new type and
// the insert position is rescaled. The result is bitcast to VecTy.
// NOTE(review): closing braces are not visible in this excerpt; the type-change
// handling is presumably confined to the `NewLdTy != LdTy` branch.
static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
SmallVectorImpl<SDValue> &LdOps,
unsigned Start, unsigned End) {
SDLoc dl(LdOps[Start]);
EVT LdTy = LdOps[Start].getValueType();
unsigned Width = VecTy.getSizeInBits();
unsigned NumElts = Width / LdTy.getSizeInBits();
EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), LdTy, NumElts);
// Element 0 is occupied by LdOps[Start], so insertion starts at index 1.
unsigned Idx = 1;
SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT,LdOps[Start]);
for (unsigned i = Start + 1; i != End; ++i) {
EVT NewLdTy = LdOps[i].getValueType();
if (NewLdTy != LdTy) {
NumElts = Width / NewLdTy.getSizeInBits();
NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewLdTy, NumElts);
VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, VecOp);
// Readjust position and vector position based on new load type.
Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits();
LdTy = NewLdTy;
VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
DAG.getVectorIdxConstant(Idx++, dl));
return DAG.getNode(ISD::BITCAST, dl, VecTy, VecOp);
// Generate the loads that produce the widened value of vector load LD. The
// memory type for each piece is chosen by FindMemType. If one load covers the
// whole original width, it is turned into the widened type directly;
// otherwise several loads are issued and recombined with
// BuildVectorFromScalar and CONCAT_VECTORS, padding with undef up to WidenVT.
// NOTE(review): this excerpt is taken from a diff. Lines starting with `-`
// and `+` are removed/added lines of that diff and are left untouched. Other
// lines (closing braces, `else`, pushes onto LdChain/LdOps) are not visible,
// so only the visible steps are described.
SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
LoadSDNode *LD) {
// The strategy assumes that we can efficiently load power-of-two widths.
// The routine chops the vector into the largest vector loads with the same
// element type or scalar loads and then recombines it to the widen vector
// type.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
unsigned WidenWidth = WidenVT.getSizeInBits();
EVT LdVT = LD->getMemoryVT();
SDLoc dl(LD);
assert(LdVT.isVector() && WidenVT.isVector());
assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
// Load information
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
// Bits requested by the original load, and the extra bits added by widening.
int LdWidth = LdVT.getSizeInBits();
int WidthDiff = WidenWidth - LdWidth;
- // Allow wider loads.
+ // Allow wider loads if they are sufficiently aligned to avoid memory faults
+ // and if the original load is simple.
unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();
// Find the vector type that can load from.
EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
int NewVTWidth = NewVT.getSizeInBits();
SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
LD->getOriginalAlign(), MMOFlags, AAInfo);
// Check if we can load the element with one instruction.
if (LdWidth <= NewVTWidth) {
// A scalar load is placed in element 0 of a vector of that scalar type and
// bitcast to the widened type.
if (!NewVT.isVector()) {
unsigned NumElts = WidenWidth / NewVTWidth;
EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp);
if (NewVT == WidenVT)
return LdOp;
// A narrower vector load is padded with undef vectors up to WidenVT.
assert(WidenWidth % NewVTWidth == 0);
unsigned NumConcat = WidenWidth / NewVTWidth;
SmallVector<SDValue, 16> ConcatOps(NumConcat);
SDValue UndefVal = DAG.getUNDEF(NewVT);
ConcatOps[0] = LdOp;
for (unsigned i = 1; i != NumConcat; ++i)
ConcatOps[i] = UndefVal;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps);
// Load vector by using multiple loads from largest vector to scalar.
SmallVector<SDValue, 16> LdOps;
LdWidth -= NewVTWidth;
unsigned Offset = 0;
while (LdWidth > 0) {
// Advance the pointer past the piece that was just loaded.
unsigned Increment = NewVTWidth / 8;
Offset += Increment;
BasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Increment);
SDValue L;
if (LdWidth < NewVTWidth) {
// The current type we are using is too large. Find a better size.
NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
NewVTWidth = NewVT.getSizeInBits();
// NOTE(review): one argument line of the getLoad calls below (between
// BasePtr and the alignment) is missing from this excerpt.
L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
LD->getOriginalAlign(), MMOFlags, AAInfo);
- if (L->getValueType(0).isVector() && NewVTWidth >= LdWidth) {
- // Later code assumes the vector loads produced will be mergeable, so we
- // must pad the final entry up to the previous width. Scalars are
- // combined separately.
- SmallVector<SDValue, 16> Loads;
- Loads.push_back(L);
- unsigned size = L->getValueSizeInBits(0);
- while (size < LdOp->getValueSizeInBits(0)) {
- Loads.push_back(DAG.getUNDEF(L->getValueType(0)));
- size += L->getValueSizeInBits(0);
- }
- L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), Loads);
- }
} else {
L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
LD->getOriginalAlign(), MMOFlags, AAInfo);
LdOp = L;
LdWidth -= NewVTWidth;
// Build the vector from the load operations.
unsigned End = LdOps.size();
if (!LdOps[0].getValueType().isVector())
// All the loads are scalar loads.
return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End);
// If the load contains vectors, build the vector using concat vector.
// All of the vectors used to load are power-of-2, and the scalar loads can be
// combined to make a power-of-2 vector.
// ConcatOps is filled from the back; Idx is the first occupied slot.
SmallVector<SDValue, 16> ConcatOps(End);
int i = End - 1;
int Idx = End;
EVT LdTy = LdOps[i].getValueType();
// First, combine the scalar loads to a vector.
if (!LdTy.isVector()) {
for (--i; i >= 0; --i) {
LdTy = LdOps[i].getValueType();
if (LdTy.isVector())
ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i + 1, End);
ConcatOps[--Idx] = LdOps[i];
for (--i; i >= 0; --i) {
EVT NewLdTy = LdOps[i].getValueType();
if (NewLdTy != LdTy) {
// Create a larger vector.
+ unsigned NumOps = NewLdTy.getSizeInBits() / LdTy.getSizeInBits();
+ assert(NewLdTy.getSizeInBits() % LdTy.getSizeInBits() == 0);
+ SmallVector<SDValue, 16> WidenOps(NumOps);
+ unsigned j = 0;
+ for (; j != End-Idx; ++j)
+ WidenOps[j] = ConcatOps[Idx+j];
+ for (; j != NumOps; ++j)
+ WidenOps[j] = DAG.getUNDEF(LdTy);
ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy,
- makeArrayRef(&ConcatOps[Idx], End - Idx));
+ WidenOps);
Idx = End - 1;
LdTy = NewLdTy;
ConcatOps[--Idx] = LdOps[i];
// If the collected pieces already fill WidenVT, concatenate them directly.
if (WidenWidth == LdTy.getSizeInBits() * (End - Idx))
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
makeArrayRef(&ConcatOps[Idx], End - Idx));
// We need to fill the rest with undefs to build the vector.
unsigned NumOps = WidenWidth / LdTy.getSizeInBits();
SmallVector<SDValue, 16> WidenOps(NumOps);
SDValue UndefVal = DAG.getUNDEF(LdTy);
unsigned i = 0;
for (; i != End-Idx; ++i)
WidenOps[i] = ConcatOps[Idx+i];
for (; i != NumOps; ++i)
WidenOps[i] = UndefVal;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, WidenOps);
/// Widen an extending vector load by unrolling it: one extending scalar load
/// is issued per element of the memory type, and the remaining lanes of the
/// widened result are filled with undef.
///
/// \param LdChain  Caller-provided list of chains. NOTE(review): no write to
///                 LdChain is visible in this extract; presumably each load's
///                 chain is appended - confirm against the full source.
/// \param LD       The load being widened; its memory type must be a vector.
/// \param ExtType  Extension kind forwarded to every per-element load.
/// \returns A build-vector of the widened type.
DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
LoadSDNode *LD,
ISD::LoadExtType ExtType) {
// For extension loads, it may not be more efficient to chop up the vector
// and then extend it. Instead, we unroll the load and build a new vector.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
EVT LdVT = LD->getMemoryVT();
SDLoc dl(LD);
assert(LdVT.isVector() && WidenVT.isVector());
// Load information
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
// EltVT is the element type of the widened result; LdEltVT is the element
// type as it is laid out in memory.
EVT EltVT = WidenVT.getVectorElementType();
EVT LdEltVT = LdVT.getVectorElementType();
unsigned NumElts = LdVT.getVectorNumElements();
// Load each element and widen.
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(WidenNumElts);
// Byte distance between consecutive elements in memory.
unsigned Increment = LdEltVT.getSizeInBits() / 8;
// Element 0 is loaded from the unmodified base pointer.
Ops[0] =
DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, LD->getPointerInfo(),
LdEltVT, LD->getOriginalAlign(), MMOFlags, AAInfo);
unsigned i = 0, Offset = Increment;
// Elements 1..NumElts-1 are loaded from BasePtr + i * Increment.
for (i=1; i < NumElts; ++i, Offset += Increment) {
SDValue NewBasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Offset);
Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr,
LD->getPointerInfo().getWithOffset(Offset), LdEltVT,
LD->getOriginalAlign(), MMOFlags, AAInfo);
// Fill the rest with undefs.
// i is deliberately not reset: it continues from where the load loop stopped.
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; i != WidenNumElts; ++i)
Ops[i] = UndefVal;
return DAG.getBuildVector(WidenVT, dl, Ops);
/// Store the first StVT-sized part of a widened vector value by splitting it
/// into a sequence of stores, each using the type FindMemType selects for the
/// width that is still left to store.
///
/// \param StChain  Caller-provided list of chains. NOTE(review): the
///                 statements that open the store calls are not visible in
///                 this extract (only their argument tails are); presumably
///                 each store is appended to StChain - confirm.
/// \param ST       The store whose value operand has been widened.
void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
StoreSDNode *ST) {
// The strategy assumes that we can efficiently store power-of-two widths.
// The routine chops the vector into the largest vector stores with the same
// element type or scalar stores.
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
SDValue ValOp = GetWidenedVector(ST->getValue());
SDLoc dl(ST);
// StWidth counts the bits that still have to be stored; it is decremented
// after every store until it reaches zero.
EVT StVT = ST->getMemoryVT();
unsigned StWidth = StVT.getSizeInBits();
EVT ValVT = ValOp.getValueType();
unsigned ValWidth = ValVT.getSizeInBits();
EVT ValEltVT = ValVT.getVectorElementType();
unsigned ValEltWidth = ValEltVT.getSizeInBits();
assert(StVT.getVectorElementType() == ValEltVT);
int Idx = 0; // current index to store
unsigned Offset = 0; // offset from base to store
while (StWidth != 0) {
// Find the largest vector type we can store with.
EVT NewVT = FindMemType(DAG, TLI, StWidth, ValVT);
unsigned NewVTWidth = NewVT.getSizeInBits();
// Byte size of one store of type NewVT.
unsigned Increment = NewVTWidth / 8;
if (NewVT.isVector()) {
// Vector case: peel off NewVT-sized subvectors starting at element Idx.
unsigned NumVTElts = NewVT.getVectorNumElements();
do {
SDValue EOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
DAG.getVectorIdxConstant(Idx, dl));
Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset),
ST->getOriginalAlign(), MMOFlags, AAInfo));
StWidth -= NewVTWidth;
Offset += Increment;
Idx += NumVTElts;
// BasePtr itself is advanced, so it always addresses the next store.
BasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Increment);
} while (StWidth != 0 && StWidth >= NewVTWidth);
} else {
// Cast the vector to the scalar type we can store.
unsigned NumElts = ValWidth / NewVTWidth;
EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
SDValue VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, ValOp);
// Readjust index position based on new vector type.
Idx = Idx * ValEltWidth / NewVTWidth;
do {
SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
DAG.getVectorIdxConstant(Idx++, dl));
Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset),
ST->getOriginalAlign(), MMOFlags, AAInfo));
StWidth -= NewVTWidth;
Offset += Increment;
BasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Increment);
} while (StWidth != 0 && StWidth >= NewVTWidth);
// Restore index back to be relative to the original widen element type.
Idx = Idx * NewVTWidth / ValEltWidth;
DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
StoreSDNode *ST) {
// For extension loads, it may not be more efficient to truncate the vector
// and then store it. Instead, we extract each element and then store it.
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
SDValue ValOp = GetWidenedVector(ST->getValue());
SDLoc dl(ST);
EVT StVT = ST->getMemoryVT();
EVT ValVT = ValOp.getValueType();
// It must be true that the wide vector type is bigger than where we need to
// store.
assert(StVT.isVector() && ValOp.getValueType().isVector());
// For truncating stores, we can not play the tricks of chopping legal vector
// types and bitcast it to the right type. Instead, we unroll the store.
EVT StEltVT = StVT.getVectorElementType();
EVT ValEltVT = ValVT.getVectorElementType();
unsigned Increment = ValEltVT.getSizeInBits() / 8;
unsigned NumElts = StVT.getVectorNumElements();
SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
DAG.getVectorIdxConstant(0, dl));
DAG.getTruncStore(Chain, dl, EOp, BasePtr, ST->getPointerInfo(), StEltVT,
ST->getOriginalAlign(), MMOFlags, AAInfo));
unsigned Offset = Increment;
for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
SDValue NewBasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Offset);
SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
DAG.getVectorIdxConstant(0, dl));
Chain, dl, EOp, NewBasePtr, ST->getPointerInfo().getWithOffset(Offset),
StEltVT, ST->getOriginalAlign(), MMOFlags, AAInfo));
/// Modifies a vector input (widen or narrows) to a vector of NVT. The
/// input vector must have the same element type as NVT.
/// FillWithZeroes specifies that the vector should be widened with zeroes.
///
/// Three strategies are tried in order: concatenate InOp with fill vectors,
/// extract a leading subvector, or rebuild the vector element by element.
SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
bool FillWithZeroes) {
// Note that InOp might have been widened so it might already have
// the right width or it might need be narrowed.
EVT InVT = InOp.getValueType();
assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
"input and widen element type must match");
SDLoc dl(InOp);
// Check if InOp already has the right width.
if (InVT == NVT)
return InOp;
unsigned InNumElts = InVT.getVectorNumElements();
unsigned WidenNumElts = NVT.getVectorNumElements();
// Widening by a whole multiple: InOp followed by NumConcat-1 fill vectors.
if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) {
unsigned NumConcat = WidenNumElts / InNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
// NOTE(review): the false arm of this conditional is not visible in this
// extract.
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, InVT) :
Ops[0] = InOp;
for (unsigned i = 1; i != NumConcat; ++i)
Ops[i] = FillVal;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops);
// Narrowing: take the leading NVT-sized subvector.
// NOTE(review): as written this path is taken when InNumElts is NOT a
// multiple of WidenNumElts (non-zero remainder); exact multiples fall through
// to the element-wise path below. Confirm whether "== 0" was intended.
if (WidenNumElts < InNumElts && InNumElts % WidenNumElts)
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
DAG.getVectorIdxConstant(0, dl));
// Fall back to extract and build.
SmallVector<SDValue, 16> Ops(WidenNumElts);
EVT EltVT = NVT.getVectorElementType();
// Only the lanes present in both types are copied from InOp.
unsigned MinNumElts = std::min(WidenNumElts, InNumElts);
unsigned Idx;
for (Idx = 0; Idx < MinNumElts; ++Idx)
Ops[Idx] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getVectorIdxConstant(Idx, dl));
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
// Idx continues from the copy loop, so only the remaining lanes are filled.
for ( ; Idx < WidenNumElts; ++Idx)
Ops[Idx] = FillVal;
return DAG.getBuildVector(NVT, dl, Ops);
diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp
index 4796ef531054..8e7bf1eb0169 100644
--- a/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -1,1174 +1,1175 @@
//===- llvm/MC/WinCOFFObjectWriter.cpp ------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file contains an implementation of a Win32 COFF object file writer.
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolCOFF.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCWinCOFFObjectWriter.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Support/CRC.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <ctime>
#include <memory>
#include <string>
#include <vector>
using namespace llvm;
using llvm::support::endian::write32le;
#define DEBUG_TYPE "WinCOFFObjectWriter"
// File-local staging types used while building a COFF object file.
// NOTE(review): several lines of these declarations (enumerators, access
// specifiers, closing braces) are not visible in this extract.
namespace {
// Fixed-capacity string sized for a short (in-header) COFF name.
using name = SmallString<COFF::NameSize>;
// Discriminator for the kind of record held by an AuxSymbol. The enumerators
// used elsewhere in this file are ATWeakExternal, ATFile and
// ATSectionDefinition.
enum AuxiliaryType {
// One auxiliary symbol-table record together with its kind tag.
struct AuxSymbol {
AuxiliaryType AuxType;
COFF::Auxiliary Aux;
class COFFSection;
// Staging data for one COFF symbol-table entry.
class COFFSymbol {
// Raw symbol record that is eventually written to the object file.
COFF::symbol Data = {};
using AuxiliarySymbols = SmallVector<AuxSymbol, 1>;
name Name;
// Symbol-table index. Has no in-class initializer; it is assigned through
// setIndex().
int Index;
AuxiliarySymbols Aux;
// For a weak external: the default symbol it refers to (set in
// DefineSymbol).
COFFSymbol *Other = nullptr;
// Section this symbol is bound to, or null.
COFFSection *Section = nullptr;
int Relocations = 0;
// Originating MC symbol; set by DefineSymbol.
const MCSymbol *MC = nullptr;
COFFSymbol(StringRef Name) : Name(Name) {}
void set_name_offset(uint32_t Offset);
int64_t getIndex() const { return Index; }
void setIndex(int Value) {
Index = Value;
if (MC)
// This class contains staging data for a COFF relocation entry.
struct COFFRelocation {
COFF::relocation Data;
// Symbol the relocation refers to; its index is copied into
// Data.SymbolTableIndex in assignFileOffsets.
COFFSymbol *Symb = nullptr;
COFFRelocation() = default;
static size_t size() { return COFF::RelocationSize; }
using relocations = std::vector<COFFRelocation>;
// Staging data for one COFF section: header, name, number and relocations.
class COFFSection {
COFF::section Header = {};
std::string Name;
// Section number; has no in-class initializer. Other code in this file
// treats -1 as "not emitted".
int Number;
MCSectionCOFF const *MCSection = nullptr;
// The section's own symbol (created in defineSection).
COFFSymbol *Symbol = nullptr;
relocations Relocations;
COFFSection(StringRef Name) : Name(std::string(Name)) {}
// MCObjectWriter implementation that emits Windows COFF object files.
class WinCOFFObjectWriter : public MCObjectWriter {
support::endian::Writer W;
using symbols = std::vector<std::unique_ptr<COFFSymbol>>;
using sections = std::vector<std::unique_ptr<COFFSection>>;
using symbol_map = DenseMap<MCSymbol const *, COFFSymbol *>;
using section_map = DenseMap<MCSection const *, COFFSection *>;
using symbol_list = DenseSet<COFFSymbol *>;
std::unique_ptr<MCWinCOFFObjectTargetWriter> TargetObjectWriter;
// Root level file contents.
COFF::header Header = {};
sections Sections;
symbols Symbols;
StringTableBuilder Strings{StringTableBuilder::WinCOFF};
// Maps used during object file creation.
section_map SectionMap;
symbol_map SymbolMap;
symbol_list WeakDefaults;
// Set in writeObject when the section count exceeds
// COFF::MaxNumberOfSections16. Has no in-class initializer.
bool UseBigObj;
bool EmitAddrsigSection = false;
// Assigned in executePostLayoutBinding when EmitAddrsigSection is set.
// NOTE(review): unlike CGProfileSection below, this pointer has no in-class
// initializer.
MCSectionCOFF *AddrsigSection;
std::vector<const MCSymbol *> AddrsigSyms;
MCSectionCOFF *CGProfileSection = nullptr;
WinCOFFObjectWriter(std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS);
// Re-initialises the file header: zeroes it and restores the machine type.
void reset() override {
memset(&Header, 0, sizeof(Header));
Header.Machine = TargetObjectWriter->getMachine();
COFFSymbol *createSymbol(StringRef Name);
COFFSymbol *GetOrCreateCOFFSymbol(const MCSymbol *Symbol);
COFFSection *createSection(StringRef Name);
void defineSection(MCSectionCOFF const &Sec);
COFFSymbol *getLinkedSymbol(const MCSymbol &Symbol);
void DefineSymbol(const MCSymbol &Symbol, MCAssembler &Assembler,
const MCAsmLayout &Layout);
void SetSymbolName(COFFSymbol &S);
void SetSectionName(COFFSection &S);
bool IsPhysicalSection(COFFSection *S);
// Entity writing methods.
void WriteFileHeader(const COFF::header &Header);
void WriteSymbol(const COFFSymbol &S);
void WriteAuxiliarySymbols(const COFFSymbol::AuxiliarySymbols &S);
void writeSectionHeaders();
void WriteRelocation(const COFF::relocation &R);
uint32_t writeSectionContents(MCAssembler &Asm, const MCAsmLayout &Layout,
const MCSection &MCSec);
void writeSection(MCAssembler &Asm, const MCAsmLayout &Layout,
const COFFSection &Sec, const MCSection &MCSec);
// MCObjectWriter interface implementation.
void executePostLayoutBinding(MCAssembler &Asm,
const MCAsmLayout &Layout) override;
bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
const MCSymbol &SymA,
const MCFragment &FB, bool InSet,
bool IsPCRel) const override;
void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
MCValue Target, uint64_t &FixedValue) override;
void createFileSymbols(MCAssembler &Asm);
void setWeakDefaultNames();
void assignSectionNumbers();
void assignFileOffsets(MCAssembler &Asm, const MCAsmLayout &Layout);
// Requests emission of the address-significance section; acted on in
// executePostLayoutBinding.
void emitAddrsigSection() override { EmitAddrsigSection = true; }
void addAddrsigSymbol(const MCSymbol *Sym) override {
uint64_t writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
} // end anonymous namespace
// Symbol class implementation

// Long symbol names (more than 8 bytes) live in the string table. The 8-byte
// name field then holds four zero bytes followed by the little-endian
// string-table offset of the name.
void COFFSymbol::set_name_offset(uint32_t Offset) {
  auto *NameField = Data.Name;
  // Leading zero word marks the field as "offset follows".
  write32le(NameField, 0);
  // Trailing word is the offset into the string table.
  write32le(NameField + 4, Offset);
}
// WinCOFFObjectWriter class implementation
// Constructor: binds a little-endian writer to OS, takes ownership of the
// target writer MOTW and seeds the file header with its machine type.
// NOTE(review): the line carrying the qualified constructor name is not
// visible in this extract.
std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS)
: W(OS, support::little), TargetObjectWriter(std::move(MOTW)) {
Header.Machine = TargetObjectWriter->getMachine();
// Returns a non-owning pointer to the last entry of Symbols; ownership stays
// with the Symbols vector.
// NOTE(review): the statement that appends the new symbol built from Name is
// not visible in this extract.
COFFSymbol *WinCOFFObjectWriter::createSymbol(StringRef Name) {
return Symbols.back().get();
// Look up the staging symbol for an MC symbol, creating and caching it (named
// after the MC symbol) on first use.
COFFSymbol *WinCOFFObjectWriter::GetOrCreateCOFFSymbol(const MCSymbol *Symbol) {
  // operator[] default-inserts a null entry the first time a symbol is seen.
  COFFSymbol *&Slot = SymbolMap[Symbol];
  if (Slot)
    return Slot;

  Slot = createSymbol(Symbol->getName());
  return Slot;
}
// Returns a non-owning pointer to the last entry of Sections; ownership stays
// with the Sections vector.
// NOTE(review): the statement that appends the new section built from Name is
// not visible in this extract.
COFFSection *WinCOFFObjectWriter::createSection(StringRef Name) {
return Sections.back().get();
// Maps a section's alignment to a uint32_t that defineSection ORs into the
// section header's Characteristics. Only the power-of-two alignments from 1
// to 8192 are handled; anything else reaches llvm_unreachable.
// NOTE(review): the per-case return statements are not visible in this
// extract.
static uint32_t getAlignment(const MCSectionCOFF &Sec) {
switch (Sec.getAlignment()) {
case 1:
case 2:
case 4:
case 8:
case 16:
case 32:
case 64:
case 128:
case 256:
case 512:
case 1024:
case 2048:
case 4096:
case 8192:
llvm_unreachable("unsupported section alignment");
/// This function takes a section data object from the assembler
/// and creates the associated COFF section staging object.
///
/// A section and a same-named section symbol are created and cross-linked,
/// the section's characteristics (including alignment) are filled in, and
/// the result is registered in SectionMap under the MC section.
void WinCOFFObjectWriter::defineSection(const MCSectionCOFF &MCSec) {
COFFSection *Section = createSection(MCSec.getName());
COFFSymbol *Symbol = createSymbol(MCSec.getName());
Section->Symbol = Symbol;
Symbol->Section = Section;
Symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
// Create a COMDAT symbol if needed.
if (const MCSymbol *S = MCSec.getCOMDATSymbol()) {
COFFSymbol *COMDATSymbol = GetOrCreateCOFFSymbol(S);
// A COMDAT symbol may be bound to at most one section.
if (COMDATSymbol->Section)
report_fatal_error("two sections have the same comdat");
COMDATSymbol->Section = Section;
// In this case the auxiliary symbol is a Section Definition.
// NOTE(review): Aux[0] is indexed directly; the statement that sizes the Aux
// vector is not visible in this extract.
Symbol->Aux[0] = {};
Symbol->Aux[0].AuxType = ATSectionDefinition;
Symbol->Aux[0].Aux.SectionDefinition.Selection = MCSec.getSelection();
// Set section alignment.
Section->Header.Characteristics = MCSec.getCharacteristics();
Section->Header.Characteristics |= getAlignment(MCSec);
// Bind internal COFF section to MC section.
Section->MCSection = &MCSec;
SectionMap[&MCSec] = Section;
// Compute the Value field for a symbol: the common size for an external
// common symbol, otherwise the symbol's offset as reported by the layout, or
// 0 when the layout cannot provide one.
static uint64_t getSymbolValue(const MCSymbol &Symbol,
                               const MCAsmLayout &Layout) {
  const bool IsExternalCommon = Symbol.isCommon() && Symbol.isExternal();
  if (IsExternalCommon)
    return Symbol.getCommonSize();

  uint64_t Offset;
  const bool HasOffset = Layout.getSymbolOffset(Symbol, Offset);
  if (HasOffset)
    return Offset;
  return 0;
}
// For a variable symbol that resolves to a symbol reference whose target is
// undefined, returns the staging symbol of that target (creating it if
// needed). Returns nullptr in every other case.
COFFSymbol *WinCOFFObjectWriter::getLinkedSymbol(const MCSymbol &Symbol) {
if (!Symbol.isVariable())
return nullptr;
// NOTE(review): the initializer of SymRef is not visible in this extract.
const MCSymbolRefExpr *SymRef =
if (!SymRef)
return nullptr;
const MCSymbol &Aliasee = SymRef->getSymbol();
// Only an undefined aliasee is linked.
if (!Aliasee.isUndefined())
return nullptr;
return GetOrCreateCOFFSymbol(&Aliasee);
/// This function takes a symbol data object from the assembler
/// and creates the associated COFF symbol staging object.
///
/// Weak externals get a separate "default" symbol that carries the actual
/// definition; for all other symbols the staging symbol itself carries it.
/// "Local" below names whichever symbol receives value, type and storage
/// class.
void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &MCSym,
MCAssembler &Assembler,
const MCAsmLayout &Layout) {
COFFSymbol *Sym = GetOrCreateCOFFSymbol(&MCSym);
const MCSymbol *Base = Layout.getBaseSymbol(MCSym);
// Sec stays null unless the base symbol lives in a fragment.
COFFSection *Sec = nullptr;
if (Base && Base->getFragment()) {
Sec = SectionMap[Base->getFragment()->getParent()];
if (Sym->Section && Sym->Section != Sec)
report_fatal_error("conflicting sections for symbol");
COFFSymbol *Local = nullptr;
if (cast<MCSymbolCOFF>(MCSym).isWeakExternal()) {
// The weak external itself is not bound to a section; its default is.
+ Sym->Section = nullptr;
COFFSymbol *WeakDefault = getLinkedSymbol(MCSym);
if (!WeakDefault) {
// No linked symbol: synthesise ".weak.<name>.default" to hold the definition.
std::string WeakName = (".weak." + MCSym.getName() + ".default").str();
WeakDefault = createSymbol(WeakName);
if (!Sec)
WeakDefault->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
WeakDefault->Section = Sec;
Local = WeakDefault;
Sym->Other = WeakDefault;
// Setup the Weak External auxiliary symbol.
memset(&Sym->Aux[0], 0, sizeof(Sym->Aux[0]));
Sym->Aux[0].AuxType = ATWeakExternal;
// TagIndex is a placeholder here; writeObject later overwrites it with the
// index of Sym->Other.
Sym->Aux[0].Aux.WeakExternal.TagIndex = 0;
// NOTE(review): the value assigned to Characteristics is not visible in this
// extract.
Sym->Aux[0].Aux.WeakExternal.Characteristics =
} else {
if (!Base)
Sym->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
Sym->Section = Sec;
Local = Sym;
if (Local) {
Local->Data.Value = getSymbolValue(MCSym, Layout);
const MCSymbolCOFF &SymbolCOFF = cast<MCSymbolCOFF>(MCSym);
Local->Data.Type = SymbolCOFF.getType();
Local->Data.StorageClass = SymbolCOFF.getClass();
// If no storage class was specified in the streamer, define it here.
if (Local->Data.StorageClass == COFF::IMAGE_SYM_CLASS_NULL) {
// Treated as external when marked external, or when it has neither a
// fragment nor a variable value.
bool IsExternal = MCSym.isExternal() ||
(!MCSym.getFragment() && !MCSym.isVariable());
Local->Data.StorageClass = IsExternal ? COFF::IMAGE_SYM_CLASS_EXTERNAL
// Remember the originating MC symbol on the staging symbol.
Sym->MC = &MCSym;
// Maximum offsets for different string table entry encodings.
enum : unsigned { Max7DecimalOffset = 9999999U };
enum : uint64_t { MaxBase64Offset = 0xFFFFFFFFFULL }; // 64^6, including 0

// Encode a string table entry offset in base 64, padded to 6 chars, and
// prefixed with a double slash: '//AAAAAA', '//AAAAAB', ...
// Buffer must be at least 8 bytes large. No terminating null appended.
//
// Value must be greater than Max7DecimalOffset (smaller offsets use the
// decimal encoding) and no greater than MaxBase64Offset (the largest value
// that fits in six base-64 digits).
static void encodeBase64StringEntry(char *Buffer, uint64_t Value) {
  assert(Value > Max7DecimalOffset && Value <= MaxBase64Offset &&
         "Illegal section name encoding for value");

  // Standard base-64 alphabet; the digit value is the index into this table.
  static const char Alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                 "abcdefghijklmnopqrstuvwxyz"
                                 "0123456789+/";
  static_assert(sizeof(Alphabet) == 64 + 1,
                "base-64 alphabet must contain exactly 64 digits");

  Buffer[0] = '/';
  Buffer[1] = '/';

  // Emit the six digits least-significant first, filling Buffer[7] down to
  // Buffer[2] so the most significant digit ends up right after the prefix.
  char *Ptr = Buffer + 7;
  for (unsigned i = 0; i < 6; ++i) {
    unsigned Rem = Value % 64;
    Value /= 64;
    *(Ptr--) = Alphabet[Rem];
  }
}
// Fills in the Name field of a section header. Names that fit in
// COFF::NameSize bytes are copied in directly. Longer names are referenced by
// their string-table offset: offsets up to Max7DecimalOffset go through a
// small text buffer, larger ones up to MaxBase64Offset use the '//' base-64
// form, and anything beyond that is a fatal error.
// NOTE(review): several lines of this function (early return, the code that
// fills Buffer, and the source argument of the second memcpy) are not visible
// in this extract.
void WinCOFFObjectWriter::SetSectionName(COFFSection &S) {
if (S.Name.size() <= COFF::NameSize) {
std::memcpy(S.Header.Name, S.Name.c_str(), S.Name.size());
uint64_t StringTableEntry = Strings.getOffset(S.Name);
if (StringTableEntry <= Max7DecimalOffset) {
SmallVector<char, COFF::NameSize> Buffer;
// The formatted reference must fit the 8-byte field and be at least two
// characters long.
assert(Buffer.size() <= COFF::NameSize && Buffer.size() >= 2);
std::memcpy(S.Header.Name,, Buffer.size());
if (StringTableEntry <= MaxBase64Offset) {
// Starting with 10,000,000, offsets are encoded as base64.
encodeBase64StringEntry(S.Header.Name, StringTableEntry);
report_fatal_error("COFF string table is greater than 64 GB.");
// Fills in the name field of a symbol record. The visible lines branch on
// whether the name exceeds COFF::NameSize and copy the name bytes directly
// into S.Data.Name.
// NOTE(review): the statement taken for long names and the "else" that
// presumably guards the memcpy are not visible in this extract.
void WinCOFFObjectWriter::SetSymbolName(COFFSymbol &S) {
if (S.Name.size() > COFF::NameSize)
std::memcpy(S.Data.Name, S.Name.c_str(), S.Name.size());
// Tests the IMAGE_SCN_CNT_UNINITIALIZED_DATA bit of the section's
// characteristics. assignFileOffsets only reserves raw-data space for
// sections for which this returns true.
// NOTE(review): the right-hand side of the comparison is not visible in this
// extract.
bool WinCOFFObjectWriter::IsPhysicalSection(COFFSection *S) {
return (S->Header.Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) ==
// entity writing methods
// Writes the file header. The layout depends on UseBigObj; the big-object
// form includes the COFF::BigObjMagic bytes.
// NOTE(review): the individual field writes of both branches are not visible
// in this extract.
void WinCOFFObjectWriter::WriteFileHeader(const COFF::header &Header) {
if (UseBigObj) {
W.OS.write(COFF::BigObjMagic, sizeof(COFF::BigObjMagic));
} else {
// Writes one symbol-table record: the 8-byte name field first, and the
// storage class and auxiliary-symbol count as single bytes.
// NOTE(review): the writes of the remaining fields, including the statement
// selected by "if (UseBigObj)", are not visible in this extract.
void WinCOFFObjectWriter::WriteSymbol(const COFFSymbol &S) {
W.OS.write(S.Data.Name, COFF::NameSize);
if (UseBigObj)
W.OS << char(S.Data.StorageClass);
W.OS << char(S.Data.NumberOfAuxSymbols);
// Writes every auxiliary record of a symbol, dispatching on the record kind.
// In big-object mode the weak-external and section-definition records are
// padded with zeros from Symbol16Size up to Symbol32Size.
// NOTE(review): most field writes and the "break" statements between cases
// are not visible in this extract.
void WinCOFFObjectWriter::WriteAuxiliarySymbols(
const COFFSymbol::AuxiliarySymbols &S) {
for (const AuxSymbol &i : S) {
switch (i.AuxType) {
case ATWeakExternal:
if (UseBigObj)
W.OS.write_zeros(COFF::Symbol32Size - COFF::Symbol16Size);
case ATFile:
// File records are raw bytes: the whole record is written verbatim.
W.OS.write(reinterpret_cast<const char *>(&i.Aux),
UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size);
case ATSectionDefinition:
W.OS << char(i.Aux.SectionDefinition.Selection);
// Bits 16 and up of the section number, written as a 16-bit value.
W.write<uint16_t>(static_cast<int16_t>(i.Aux.SectionDefinition.Number >> 16));
if (UseBigObj)
W.OS.write_zeros(COFF::Symbol32Size - COFF::Symbol16Size);
// Write the section header.
// Headers are emitted in ascending section-number order.
// NOTE(review): the statement that fills Arr, the handling of sections
// numbered -1, the relocation-overflow handling and all field writes after
// the name are not visible in this extract.
void WinCOFFObjectWriter::writeSectionHeaders() {
// Section numbers must be monotonically increasing in the section
// header, but our Sections array is not sorted by section number,
// so make a copy of Sections and sort it.
std::vector<COFFSection *> Arr;
for (auto &Section : Sections)
llvm::sort(Arr, [](const COFFSection *A, const COFFSection *B) {
return A->Number < B->Number;
for (auto &Section : Arr) {
if (Section->Number == -1)
COFF::section &S = Section->Header;
// 0xffff or more relocations cannot be represented in the 16-bit count.
if (Section->Relocations.size() >= 0xffff)
W.OS.write(S.Name, COFF::NameSize);
// Writes a single relocation record.
// NOTE(review): the body of this function is not visible in this extract.
void WinCOFFObjectWriter::WriteRelocation(const COFF::relocation &R) {
// Write MCSec's contents. What this function does is essentially
// "Asm.writeSectionData(&MCSec, Layout)", but it's a bit complicated
// because it needs to compute a CRC.
//
// Returns the CRC of the section bytes, computed with JamCRC seeded with 0.
uint32_t WinCOFFObjectWriter::writeSectionContents(MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCSection &MCSec) {
// Save the contents of the section to a temporary buffer, we need this
// to CRC the data before we dump it into the object file.
SmallVector<char, 128> Buf;
raw_svector_ostream VecOS(Buf);
Asm.writeSectionData(VecOS, &MCSec, Layout);
// Write the section contents to the object file.
W.OS << Buf;
// Calculate our CRC with an initial value of '0', this is not how
// JamCRC is specified but it aligns with the expected output.
JamCRC JC(/*Init=*/0);
// NOTE(review): the pointer argument of this call is not visible in this
// extract.
JC.update(makeArrayRef(reinterpret_cast<uint8_t*>(, Buf.size()));
return JC.getCRC();
// Writes one section's raw data followed by its relocations, and records the
// data's CRC in the section-definition auxiliary symbol.
// NOTE(review): the early return for sections numbered -1, the branch
// structure around the relocation asserts and the WriteRelocation calls are
// not visible in this extract.
void WinCOFFObjectWriter::writeSection(MCAssembler &Asm,
const MCAsmLayout &Layout,
const COFFSection &Sec,
const MCSection &MCSec) {
if (Sec.Number == -1)
// Write the section contents.
if (Sec.Header.PointerToRawData != 0) {
// The stream position must match the offset assigned in assignFileOffsets.
assert(W.OS.tell() == Sec.Header.PointerToRawData &&
"Section::PointerToRawData is insane!");
uint32_t CRC = writeSectionContents(Asm, Layout, MCSec);
// Update the section definition auxiliary symbol to record the CRC.
// This local has the same name as the const "Sec" parameter; it is looked up
// again through SectionMap to obtain a mutable pointer.
COFFSection *Sec = SectionMap[&MCSec];
COFFSymbol::AuxiliarySymbols &AuxSyms = Sec->Symbol->Aux;
assert(AuxSyms.size() == 1 && AuxSyms[0].AuxType == ATSectionDefinition);
AuxSymbol &SecDef = AuxSyms[0];
SecDef.Aux.SectionDefinition.CheckSum = CRC;
// Write relocations for this section.
if (Sec.Relocations.empty()) {
assert(Sec.Header.PointerToRelocations == 0 &&
"Section::PointerToRelocations is insane!");
assert(W.OS.tell() == Sec.Header.PointerToRelocations &&
"Section::PointerToRelocations is insane!");
if (Sec.Relocations.size() >= 0xffff) {
// In case of overflow, write actual relocation count as first
// relocation. Including the synthetic reloc itself (+ 1).
COFF::relocation R;
R.VirtualAddress = Sec.Relocations.size() + 1;
R.SymbolTableIndex = 0;
R.Type = 0;
for (const auto &Relocation : Sec.Relocations)
// MCObjectWriter interface implementations
// Runs after layout: obtains the optional ".llvm_addrsig" and call-graph
// profile sections when they are needed, then creates staging entries for
// every section and every non-temporary symbol of the assembler.
// NOTE(review): the remaining arguments of both getCOFFSection calls and the
// statements that follow them are not visible in this extract.
void WinCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
const MCAsmLayout &Layout) {
if (EmitAddrsigSection) {
AddrsigSection = Asm.getContext().getCOFFSection(
".llvm_addrsig", COFF::IMAGE_SCN_LNK_REMOVE,
if (!Asm.CGProfile.empty()) {
CGProfileSection = Asm.getContext().getCOFFSection(
// "Define" each section & symbol. This creates section & symbol
// entries in the staging area.
for (const auto &Section : Asm)
defineSection(static_cast<const MCSectionCOFF &>(Section));
// Temporary symbols get no staging entry of their own.
for (const MCSymbol &Symbol : Asm.symbols())
if (!Symbol.isTemporary())
DefineSymbol(Symbol, Asm, Layout);
// Decides whether a symbol difference can be folded at assembly time. Returns
// false for the symbol types described below so that a relocation is kept;
// every other case is delegated to the MCObjectWriter base implementation.
// NOTE(review): the condition on Type that guards "return false" is not
// visible in this extract.
bool WinCOFFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
const MCAssembler &Asm, const MCSymbol &SymA, const MCFragment &FB,
bool InSet, bool IsPCRel) const {
// Don't drop relocations between functions, even if they are in the same text
// section. Multiple Visual C++ linker features depend on having the
// relocations present. The /INCREMENTAL flag will cause these relocations to
// point to thunks, and the /GUARD:CF flag assumes that it can use relocations
// to approximate the set of all address taken functions. LLD's implementation
// of /GUARD:CF also relies on the existence of these relocations.
uint16_t Type = cast<MCSymbolCOFF>(SymA).getType();
return false;
return MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(Asm, SymA, FB,
InSet, IsPCRel);
// Records one relocation for a fixup and computes the value to be stored at
// the fixup location.
//
// \param Fragment    Fragment containing the fixup; its parent section
//                    receives the relocation.
// \param Target      Relocatable value; symbol A is required, symbol B is
//                    optional and denotes a subtraction.
// \param FixedValue  Out: the addend to write at the fixup location.
// NOTE(review): the error-reporting calls, early returns, several assert
// openers, the ARM case labels and the final push of the relocation are not
// visible in this extract.
void WinCOFFObjectWriter::recordRelocation(MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFragment *Fragment,
const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) {
assert(Target.getSymA() && "Relocation must reference a symbol!");
const MCSymbol &A = Target.getSymA()->getSymbol();
// Symbol A must be registered and, if it is a temporary label, defined.
if (!A.isRegistered()) {
Twine("symbol '") + A.getName() +
"' can not be undefined");
if (A.isTemporary() && A.isUndefined()) {
Twine("assembler label '") + A.getName() +
"' can not be undefined");
MCSection *MCSec = Fragment->getParent();
// Mark this symbol as requiring an entry in the symbol table.
assert(SectionMap.find(MCSec) != SectionMap.end() &&
"Section must already have been defined in executePostLayoutBinding!");
COFFSection *Sec = SectionMap[MCSec];
const MCSymbolRefExpr *SymB = Target.getSymB();
if (SymB) {
// A - B form: B must live in a fragment so its offset is known.
const MCSymbol *B = &SymB->getSymbol();
if (!B->getFragment()) {
Twine("symbol '") + B->getName() +
"' can not be undefined in a subtraction expression");
// Offset of the symbol in the section
int64_t OffsetOfB = Layout.getSymbolOffset(*B);
// Offset of the relocation in the section
int64_t OffsetOfRelocation =
Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
FixedValue = (OffsetOfRelocation - OffsetOfB) + Target.getConstant();
} else {
FixedValue = Target.getConstant();
COFFRelocation Reloc;
// The symbol-table index is a placeholder; assignFileOffsets fills it in
// from Reloc.Symb.
Reloc.Data.SymbolTableIndex = 0;
Reloc.Data.VirtualAddress = Layout.getFragmentOffset(Fragment);
// Turn relocations for temporary symbols into section relocations.
if (A.isTemporary()) {
MCSection *TargetSection = &A.getSection();
SectionMap.find(TargetSection) != SectionMap.end() &&
"Section must already have been defined in executePostLayoutBinding!");
Reloc.Symb = SectionMap[TargetSection]->Symbol;
// The relocation now targets the section symbol, so A's offset within the
// section moves into the addend.
FixedValue += Layout.getSymbolOffset(A);
} else {
SymbolMap.find(&A) != SymbolMap.end() &&
"Symbol must already have been defined in executePostLayoutBinding!");
Reloc.Symb = SymbolMap[&A];
// VirtualAddress = fragment offset + offset of the fixup in the fragment.
Reloc.Data.VirtualAddress += Fixup.getOffset();
Reloc.Data.Type = TargetObjectWriter->getRelocType(
Asm.getContext(), Target, Fixup, SymB, Asm.getBackend());
// FIXME: Can anyone explain what this does other than adjust for the size
// of the offset?
if ((Header.Machine == COFF::IMAGE_FILE_MACHINE_AMD64 &&
Reloc.Data.Type == COFF::IMAGE_REL_AMD64_REL32) ||
(Header.Machine == COFF::IMAGE_FILE_MACHINE_I386 &&
Reloc.Data.Type == COFF::IMAGE_REL_I386_REL32))
FixedValue += 4;
if (Header.Machine == COFF::IMAGE_FILE_MACHINE_ARMNT) {
switch (Reloc.Data.Type) {
// IMAGE_REL_ARM_BRANCH11 and IMAGE_REL_ARM_BLX11 are only used for
// pre-ARMv7, which implicitly rules it out of ARMNT (it would be valid
// for Windows CE).
// only used for ARM mode code, which is documented as being unsupported
// by Windows on ARM. Empirical proof indicates that masm is able to
// generate the relocations however the rest of the MSVC toolchain is
// unable to handle it.
llvm_unreachable("unsupported relocation");
// perform a 4 byte adjustment to the relocation. Relative branches are
// offset by 4 on ARM, however, because there is no RELA relocations, all
// branches are offset by 4.
FixedValue = FixedValue + 4;
// The fixed value never makes sense for section indices, ignore it.
if (Fixup.getKind() == FK_SecRel_2)
FixedValue = 0;
// The target writer has the final say on whether the relocation is kept.
if (TargetObjectWriter->recordRelocation(Fixup))
// Current time for the header timestamp. If the clock value is negative or
// does not fit in an unsigned 32-bit field, UINT32_MAX is returned instead.
static std::time_t getTime() {
  const std::time_t Now = time(nullptr);
  if (Now >= 0 && isUInt<32>(Now))
    return Now;
  return UINT32_MAX;
}
// Create .file symbols.
// For every file name known to the assembler, a ".file" symbol is created
// (debug section number, FILE storage class) and the name is spread across
// its auxiliary records, SymbolSize bytes per record, with the last record
// zero-padded.
// NOTE(review): the statement that sizes File->Aux to Count is not visible
// in this extract.
void WinCOFFObjectWriter::createFileSymbols(MCAssembler &Asm) {
for (const std::string &Name : Asm.getFileNames()) {
// round up to calculate the number of auxiliary symbols required
unsigned SymbolSize = UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size;
unsigned Count = (Name.size() + SymbolSize - 1) / SymbolSize;
COFFSymbol *File = createSymbol(".file");
File->Data.SectionNumber = COFF::IMAGE_SYM_DEBUG;
File->Data.StorageClass = COFF::IMAGE_SYM_CLASS_FILE;
// Offset: start of the next chunk in Name. Length: bytes still to copy.
unsigned Offset = 0;
unsigned Length = Name.size();
for (auto &Aux : File->Aux) {
Aux.AuxType = ATFile;
if (Length > SymbolSize) {
// Full chunk: more of the name remains after this record.
memcpy(&Aux.Aux, Name.c_str() + Offset, SymbolSize);
Length = Length - SymbolSize;
} else {
// Final chunk: copy what is left and zero the rest of the record.
memcpy(&Aux.Aux, Name.c_str() + Offset, Length);
memset((char *)&Aux.Aux + Length, 0, SymbolSize - Length);
Offset += SymbolSize;
// Chooses one symbol of this object file ("Unique") whose name is used to
// make the names of the weak-default symbols in WeakDefaults unique.
// The search runs twice: first rejecting symbols in COMDAT sections, then
// accepting them.
// NOTE(review): the early return, the "continue"/"break" statements that
// follow each filter and the renaming loop body are not visible in this
// extract.
void WinCOFFObjectWriter::setWeakDefaultNames() {
if (WeakDefaults.empty())
// If multiple object files use a weak symbol (either with a regular
// defined default, or an absolute zero symbol as default), the defaults
// cause duplicate definitions unless their names are made unique. Look
// for a defined extern symbol, that isn't comdat - that should be unique
// unless there are other duplicate definitions. And if none is found,
// allow picking a comdat symbol, as that's still better than nothing.
COFFSymbol *Unique = nullptr;
for (bool AllowComdat : {false, true}) {
for (auto &Sym : Symbols) {
// Don't include the names of the defaults themselves
if (WeakDefaults.count(Sym.get()))
// Only consider external symbols
if (Sym->Data.StorageClass != COFF::IMAGE_SYM_CLASS_EXTERNAL)
// Only consider symbols defined in a section or that are absolute
if (!Sym->Section && Sym->Data.SectionNumber != COFF::IMAGE_SYM_ABSOLUTE)
if (!AllowComdat && Sym->Section &&
Sym->Section->Header.Characteristics & COFF::IMAGE_SCN_LNK_COMDAT)
Unique = Sym.get();
if (Unique)
// If we didn't find any unique symbol to use for the names, just skip this.
if (!Unique)
for (auto *Sym : WeakDefaults) {
// Classifies a section by the COMDAT selection stored in its section
// symbol's first auxiliary record. assignSectionNumbers handles the sections
// for which this is false before those for which it is true.
// NOTE(review): the constant the selection is compared against is not visible
// in this extract.
static bool isAssociative(const COFFSection &Section) {
return Section.Symbol->Aux[0].Aux.SectionDefinition.Selection ==
// Assigns section numbers starting at 1, in two passes over Sections:
// non-associative sections first, associative sections afterwards.
// NOTE(review): the increment of I, the end of the lambda and the Assign
// calls inside both loops are not visible in this extract.
void WinCOFFObjectWriter::assignSectionNumbers() {
size_t I = 1;
// Stores the number in the section, in its symbol and in the symbol's
// section-definition auxiliary record.
auto Assign = [&](COFFSection &Section) {
Section.Number = I;
Section.Symbol->Data.SectionNumber = I;
Section.Symbol->Aux[0].Aux.SectionDefinition.Number = I;
// Although it is not explicitly requested by the Microsoft COFF spec,
// we should avoid emitting forward associative section references,
// because MSVC link.exe as of 2017 cannot handle that.
for (const std::unique_ptr<COFFSection> &Section : Sections)
if (!isAssociative(*Section))
for (const std::unique_ptr<COFFSection> &Section : Sections)
if (isAssociative(*Section))
// Assign file offsets to COFF object file structures.
// Starting after the file header and the section-header table, each section
// gets an offset for its raw data (physical sections only) and for its
// relocations. The final offset becomes the symbol-table position.
// NOTE(review): the "continue" for sections numbered -1, some closing braces
// and the right-hand sides of the last two aux assignments are not visible in
// this extract.
void WinCOFFObjectWriter::assignFileOffsets(MCAssembler &Asm,
const MCAsmLayout &Layout) {
unsigned Offset = W.OS.tell();
Offset += UseBigObj ? COFF::Header32Size : COFF::Header16Size;
Offset += COFF::SectionSize * Header.NumberOfSections;
for (const auto &Section : Asm) {
COFFSection *Sec = SectionMap[&Section];
if (Sec->Number == -1)
Sec->Header.SizeOfRawData = Layout.getSectionAddressSize(&Section);
if (IsPhysicalSection(Sec)) {
Sec->Header.PointerToRawData = Offset;
Offset += Sec->Header.SizeOfRawData;
if (!Sec->Relocations.empty()) {
// The header's relocation count is 16 bits wide; 0xffff is reserved as the
// overflow marker.
bool RelocationsOverflow = Sec->Relocations.size() >= 0xffff;
if (RelocationsOverflow) {
// Signal overflow by setting NumberOfRelocations to max value. Actual
// size is found in reloc #0. Microsoft tools understand this.
Sec->Header.NumberOfRelocations = 0xffff;
} else {
Sec->Header.NumberOfRelocations = Sec->Relocations.size();
Sec->Header.PointerToRelocations = Offset;
if (RelocationsOverflow) {
// Reloc #0 will contain actual count, so make room for it.
Offset += COFF::RelocationSize;
Offset += COFF::RelocationSize * Sec->Relocations.size();
// Resolve each relocation's symbol to its final symbol-table index.
for (auto &Relocation : Sec->Relocations) {
assert(Relocation.Symb->getIndex() != -1);
Relocation.Data.SymbolTableIndex = Relocation.Symb->getIndex();
assert(Sec->Symbol->Aux.size() == 1 &&
"Section's symbol must have one aux!");
AuxSymbol &Aux = Sec->Symbol->Aux[0];
assert(Aux.AuxType == ATSectionDefinition &&
"Section's symbol's aux symbol must be a Section Definition!");
// Mirror the section's size into its section-definition aux record.
Aux.Aux.SectionDefinition.Length = Sec->Header.SizeOfRawData;
Aux.Aux.SectionDefinition.NumberOfRelocations =
Aux.Aux.SectionDefinition.NumberOfLinenumbers =
// The symbol table follows the last section's data and relocations.
Header.PointerToSymbolTable = Offset;
/// Finalises the header, symbols and sections, assigns file offsets, and
/// emits the section contents, symbol table and string table.
///
/// \returns the number of bytes written, computed as the stream position at
/// the end minus the position on entry.
// NOTE(review): many statements of this function (loop bodies, closing braces,
// several calls) are missing from this listing; the comments below describe
// only what is visible.
uint64_t WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
const MCAsmLayout &Layout) {
// Remember where we started so the return value is relative to this object.
uint64_t StartOffset = W.OS.tell();
// NOTE(review): the call that reports this message is missing from this
// listing.
if (Sections.size() > INT32_MAX)
"PE COFF object files can't have more than 2147483647 sections");
// Use the big-object format when there are more sections than the 16-bit
// section count allows.
UseBigObj = Sections.size() > COFF::MaxNumberOfSections16;
Header.NumberOfSections = Sections.size();
Header.NumberOfSymbols = 0;
for (auto &Symbol : Symbols) {
// Update section number & offset for symbols that have them.
if (Symbol->Section)
Symbol->Data.SectionNumber = Symbol->Section->Number;
// Update auxiliary symbol info.
Symbol->Data.NumberOfAuxSymbols = Symbol->Aux.size();
Header.NumberOfSymbols += Symbol->Data.NumberOfAuxSymbols;
// Build string table.
// Only names longer than COFF::NameSize are considered here.
for (const auto &S : Sections)
if (S->Name.size() > COFF::NameSize)
for (const auto &S : Symbols)
if (S->Name.size() > COFF::NameSize)
// Set names.
for (const auto &S : Sections)
for (auto &S : Symbols)
// Fixup weak external references.
for (auto &Symbol : Symbols) {
if (Symbol->Other) {
assert(Symbol->getIndex() != -1);
assert(Symbol->Aux.size() == 1 && "Symbol must contain one aux symbol!");
assert(Symbol->Aux[0].AuxType == ATWeakExternal &&
"Symbol's aux symbol must be a Weak External!");
// Point the weak-external aux record at the index of the other symbol.
Symbol->Aux[0].Aux.WeakExternal.TagIndex = Symbol->Other->getIndex();
// Fixup associative COMDAT sections.
for (auto &Section : Sections) {
// NOTE(review): the right-hand side of this comparison and the guarded
// statement are missing from this listing.
if (Section->Symbol->Aux[0].Aux.SectionDefinition.Selection !=
const MCSectionCOFF &MCSec = *Section->MCSection;
const MCSymbol *AssocMCSym = MCSec.getCOMDATSymbol();
// It's an error to try to associate with an undefined symbol or a symbol
// without a section.
if (!AssocMCSym->isInSection()) {
SMLoc(), Twine("cannot make section ") + MCSec.getName() +
Twine(" associative with sectionless symbol ") +
const auto *AssocMCSec = cast<MCSectionCOFF>(&AssocMCSym->getSection());
COFFSection *AssocSec = SectionMap[AssocMCSec];
// Skip this section if the associated section is unused.
if (AssocSec->Number == -1)
// Record the number of the section this one is associated with.
Section->Symbol->Aux[0].Aux.SectionDefinition.Number = AssocSec->Number;
// Create the contents of the .llvm_addrsig section.
if (EmitAddrsigSection) {
auto Frag = new MCDataFragment(AddrsigSection);
raw_svector_ostream OS(Frag->getContents());
// Each entry is a ULEB128-encoded symbol index: the symbol's own index for
// non-temporary symbols, otherwise the index of its section's symbol.
for (const MCSymbol *S : AddrsigSyms) {
if (!S->isTemporary()) {
encodeULEB128(S->getIndex(), OS);
MCSection *TargetSection = &S->getSection();
assert(SectionMap.find(TargetSection) != SectionMap.end() &&
"Section must already have been defined in "
encodeULEB128(SectionMap[TargetSection]->Symbol->getIndex(), OS);
// Create the contents of the call graph profile section (CGProfileSection).
if (CGProfileSection) {
auto *Frag = new MCDataFragment(CGProfileSection);
raw_svector_ostream OS(Frag->getContents());
// Each entry is written as: from-symbol index, to-symbol index, count.
for (const MCAssembler::CGProfileEntry &CGPE : Asm.CGProfile) {
uint32_t FromIndex = CGPE.From->getSymbol().getIndex();
uint32_t ToIndex = CGPE.To->getSymbol().getIndex();
support::endian::write(OS, FromIndex, W.Endian);
support::endian::write(OS, ToIndex, W.Endian);
support::endian::write(OS, CGPE.Count, W.Endian);
assignFileOffsets(Asm, Layout);
// MS LINK expects to be able to use this timestamp to implement their
// /INCREMENTAL feature.
if (Asm.isIncrementalLinkerCompatible()) {
Header.TimeDateStamp = getTime();
} else {
// Have deterministic output if /INCREMENTAL isn't needed. Also matches GNU.
Header.TimeDateStamp = 0;
// Write it all to disk...
// Write section contents.
// Sections and Asm are walked in lock-step; the loop stops when either ends.
sections::iterator I = Sections.begin();
sections::iterator IE = Sections.end();
MCAssembler::iterator J = Asm.begin();
MCAssembler::iterator JE = Asm.end();
for (; I != IE && J != JE; ++I, ++J)
writeSection(Asm, Layout, **I, *J);
// The stream must now sit exactly where assignFileOffsets() said the symbol
// table would start.
assert(W.OS.tell() == Header.PointerToSymbolTable &&
"Header::PointerToSymbolTable is insane!");
// Write a symbol table.
// Only symbols that were given an index are considered.
for (auto &Symbol : Symbols)
if (Symbol->getIndex() != -1)
// Write a string table, which completes the entire COFF file.
return W.OS.tell() - StartOffset;
/// Constructs the target writer and stores \p Machine_ in the Machine member.
MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_)
: Machine(Machine_) {}
// Pin the vtable to this file. The body is intentionally empty; the function
// exists only as an out-of-line definition.
void MCWinCOFFObjectTargetWriter::anchor() {}
// WinCOFFObjectWriter factory function
//
// Builds a WinCOFFObjectWriter from the target writer \p MOTW (moved into the
// new object) and the output stream \p OS, and returns it as a
// std::unique_ptr<MCObjectWriter>.
std::unique_ptr<MCObjectWriter> llvm::createWinCOFFObjectWriter(
std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW, raw_pwrite_stream &OS) {
return std::make_unique<WinCOFFObjectWriter>(std::move(MOTW), OS);
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index efa3fd5ca9ce..4789a9f02937 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1,3144 +1,3139 @@
//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file contains the AArch64 implementation of TargetFrameLowering class.
// On AArch64, stack frames are structured as follows:
// The stack grows downward.
// All of the individual frame areas on the frame below are optional, i.e. it's
// possible to create a function so that the particular area isn't present
// in the frame.
// At function entry, the "frame" looks as follows:
// | | Higher address
// |-----------------------------------|
// | |
// | arguments passed on the stack |
// | |
// |-----------------------------------| <- sp
// | | Lower address
// After the prologue has run, the frame has the following general structure.
// Note that this doesn't depict the case where a red-zone is used. Also,
// technically the last frame area (VLAs) doesn't get created until in the
// main function body, after the prologue is run. However, it's depicted here
// for completeness.
// | | Higher address
// |-----------------------------------|
// | |
// | arguments passed on the stack |
// | |
// |-----------------------------------|
// | |
// | (Win64 only) varargs from reg |
// | |
// |-----------------------------------|
// | |
// | callee-saved gpr registers | <--.
// | | | On Darwin platforms these
// |- - - - - - - - - - - - - - - - - -| | callee saves are swapped,
// | | | (frame record first)
// | prev_fp, prev_lr | <--'
// | (a.k.a. "frame record") |
// |-----------------------------------| <- fp(=x29)
// | |
// | callee-saved fp/simd/SVE regs |
// | |
// |-----------------------------------|
// | |
// | SVE stack objects |
// | |
// |-----------------------------------|
// |.empty.space.to.make.up.alignment..|
// |.for.locals.needing.more.than......| (size of this area is unknown at
// |.the.standard.16-byte.alignment....| compile time; if present)
// |-----------------------------------|
// | |
// | local variables of fixed size |
// | including spill slots |
// |-----------------------------------| <- bp(not defined by ABI,
// |.variable-sized.local.variables....| LLVM chooses X19)
// |.(VLAs)............................| (size of this area is unknown at
// |...................................| compile time)
// |-----------------------------------| <- sp
// | | Lower address
// To access the data in a frame, at-compile time, a constant offset must be
// computable from one of the pointers (fp, bp, sp) to access it. The size
// of the areas with a dotted background cannot be computed at compile-time
// if they are present, making it required to have all three of fp, bp and
// sp to be set up to be able to access all contents in the frame areas,
// assuming all of the frame areas are non-empty.
// For most functions, some of the frame areas are empty. For those functions,
// it may not be necessary to set up fp or bp:
// * A base pointer is definitely needed when there are both VLAs and local
// variables with more-than-default alignment requirements.
// * A frame pointer is definitely needed when there are local variables with
// more-than-default alignment requirements.
// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
// callee-saved area, since the unwind encoding does not allow for encoding
// this dynamically and existing tools depend on this layout. For other
// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
// area to allow SVE stack objects (allocated directly below the callee-saves,
// if available) to be accessed directly from the framepointer.
// The SVE spill/fill instructions have VL-scaled addressing modes such
// as:
// ldr z8, [fp, #-7 mul vl]
// For SVE the size of the vector length (VL) is not known at compile-time, so
// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
// layout, we don't need to add an unscaled offset to the framepointer before
// accessing the SVE object in the frame.
// In some cases when a base pointer is not strictly needed, it is generated
// anyway when offsets from the frame pointer to access local variables become
// so large that the offset can't be encoded in the immediate fields of loads
// or stores.
// FIXME: also explain the redzone concept.
// FIXME: also explain the concept of reserved call frames.
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64StackOffset.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "frame-info"
// Hidden boolean option "aarch64-redzone", off by default.
static cl::opt<bool> EnableRedZone("aarch64-redzone",
cl::desc("enable use of redzone on AArch64"),
cl::init(false), cl::Hidden);
// Hidden boolean option, off by default, for reversing the CSR restore
// sequence.
// NOTE(review): the line carrying this option's variable name and flag string
// is missing from this listing.
static cl::opt<bool>
cl::desc("reverse the CSR restore sequence"),
cl::init(false), cl::Hidden);
// Boolean option, on by default, for merging settag instructions in the
// function epilogue.
// NOTE(review): the flag string and the end of this declaration are missing
// from this listing.
static cl::opt<bool> StackTaggingMergeSetTag(
cl::desc("merge settag instruction in function epilog"), cl::init(true),
// Statistic counter for functions that use the red zone.
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
/// Returns the argument pop size.
///
/// If the last non-debug instruction of \p MBB is one of the TCRETURN opcodes
/// checked below, the size is read from that instruction's operand 1;
/// otherwise it is taken from AArch64FunctionInfo::getArgumentStackToRestore().
static uint64_t getArgumentPopSize(MachineFunction &MF,
MachineBasicBlock &MBB) {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
bool IsTailCallReturn = false;
// Only inspect the opcode when the block has a non-debug instruction.
if (MBB.end() != MBBI) {
unsigned RetOpcode = MBBI->getOpcode();
IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
RetOpcode == AArch64::TCRETURNri ||
RetOpcode == AArch64::TCRETURNriBTI;
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
uint64_t ArgumentPopSize = 0;
if (IsTailCallReturn) {
// Operand 1 of the tail-call return carries the stack adjustment.
MachineOperand &StackAdjust = MBBI->getOperand(1);
// For a tail-call in a callee-pops-arguments environment, some or all of
// the stack may actually be in use for the call's arguments, this is
// calculated during LowerCall and consumed here...
ArgumentPopSize = StackAdjust.getImm();
} else {
// ... otherwise the amount to pop is *all* of the argument space,
// conveniently stored in the MachineFunctionInfo by
// LowerFormalArguments. This will, of course, be zero for the C calling
// convention.
ArgumentPopSize = AFI->getArgumentStackToRestore();
return ArgumentPopSize;
/// This is the biggest offset to the stack pointer we can encode in aarch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exceptions here are vector stores/loads, which cannot encode
/// any displacements (see estimateRSStackSizeLimit(),
/// isAArch64FrameOffsetLegal()).
static const unsigned DefaultSafeSPDisplacement = 255;
/// Look at each instruction that references stack frames and return the stack
/// size limit beyond which some of these instructions will require a scratch
/// register during their expansion later.
///
/// The visible return values are 0 (from inside the scan) and
/// DefaultSafeSPDisplacement (the fall-through result).
static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
// FIXME: For now, just conservatively guesstimate based on unscaled indexing
// range. We'll end up allocating an unnecessary spill slot a lot, but
// realistically that's not a big deal at this stage of the game.
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
// Debug instructions, pseudos and ADDXri/ADDSXri are singled out here.
// NOTE(review): the guarded statement (presumably `continue;`) is missing
// from this listing.
if (MI.isDebugInstr() || MI.isPseudo() ||
MI.getOpcode() == AArch64::ADDXri ||
MI.getOpcode() == AArch64::ADDSXri)
for (const MachineOperand &MO : MI.operands()) {
// Only frame-index operands are of interest.
if (!MO.isFI())
StackOffset Offset;
// NOTE(review): the value this result is compared against is missing from
// this listing.
if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
return 0;
return DefaultSafeSPDisplacement;
/// Returns the stack ID used for scalable vectors: TargetStackID::SVEVector.
// NOTE(review): the line with this function's return type is missing from
// this listing.
AArch64FrameLowering::getStackIDForScalableVectors() const {
return TargetStackID::SVEVector;
/// Returns the size of the fixed object area (allocated next to sp on entry)
/// On Win64 this may include a var args area and an UnwindHelp object for EH.
///
/// The result is 0 unless \p IsWin64 is true and \p IsFunclet is false; in
/// that case it is the var-args GPR size plus the UnwindHelp size, rounded up
/// to a multiple of 16.
static unsigned getFixedObjectSize(const MachineFunction &MF,
const AArch64FunctionInfo *AFI, bool IsWin64,
bool IsFunclet) {
if (!IsWin64 || IsFunclet) {
// Only Win64 uses fixed objects, and then only for the function (not
// funclets)
return 0;
} else {
// Var args are stored here in the primary function.
const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
// To support EH funclets we allocate an UnwindHelp object
// (8 bytes when the function has EH funclets, otherwise nothing).
const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
return alignTo(VarArgsArea + UnwindHelpObject, 16);
/// Returns the size of the entire SVE stackframe (calleesaves + spills).
///
/// The value comes from AArch64FunctionInfo::getStackSizeSVE() and is wrapped
/// in a StackOffset tagged with MVT::nxv1i8.
static StackOffset getSVEStackSize(const MachineFunction &MF) {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8};
/// Returns whether \p MF may use the red zone.
///
/// Always false when the EnableRedZone option is off or the function carries
/// the NoRedZone attribute. Otherwise the function must not make calls, must
/// not need a frame pointer, and its local stack size must not exceed 128
/// bytes.
// NOTE(review): the final line(s) of the return expression are missing from
// this listing, so further conditions may apply.
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
if (!EnableRedZone)
return false;
// Don't use the red zone if the function explicitly asks us not to.
// This is typically used for kernel code.
if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
return false;
const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
uint64_t NumBytes = AFI->getLocalStackSize();
return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register.
///
/// The checks are evaluated in order and the first one that matches returns
/// true; if none match the result is false.
bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
// Win64 EH requires a frame pointer if funclets are present, as the locals
// are accessed off the frame pointer in both the parent function and the
// funclets.
if (MF.hasEHFunclets())
return true;
// Retain behavior of always omitting the FP for leaf functions when possible.
if (MF.getTarget().Options.DisableFramePointerElim(MF))
return true;
// Frame features that force a frame pointer.
// NOTE(review): the last operand of this condition is missing from this
// listing (presumably it is where RegInfo is used).
if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
MFI.hasStackMap() || MFI.hasPatchPoint() ||
return true;
// With large callframes around we may need to use FP to access the scavenging
// emergency spillslot.
// Unfortunately some calls to hasFP() like machine verifier ->
// getReservedReg() -> hasFP in the middle of global isel are too early
// to know the max call frame size. Hopefully conservatively returning "true"
// in those cases is fine.
// DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
if (!MFI.isMaxCallFrameSizeComputed() ||
MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
return true;
return false;
/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
/// not required, we reserve argument space for call sites in the function
/// immediately on entry to the current function. This eliminates the need for
/// add/sub sp brackets around call sites. Returns true if the call frame is
/// included as part of the stack frame.
///
/// In this implementation that is the case exactly when the function has no
/// variable-sized objects.
// NOTE(review): the line with this function's return type is missing from
// this listing.
AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
return !MF.getFrameInfo().hasVarSizedObjects();
/// Replaces the call-frame pseudo instruction at \p I with explicit SP
/// adjustments where needed, then erases the pseudo.
///
/// \returns the iterator returned by erasing \p I from \p MBB.
MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
const AArch64InstrInfo *TII =
static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
DebugLoc DL = I->getDebugLoc();
unsigned Opc = I->getOpcode();
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
// Only the destroy pseudo carries a callee-pop amount (its operand 1).
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
if (!hasReservedCallFrame(MF)) {
// Operand 0 is the frame size; round it up to the stack alignment and negate
// it for the setup pseudo.
int64_t Amount = I->getOperand(0).getImm();
Amount = alignTo(Amount, getStackAlign());
if (!IsDestroy)
Amount = -Amount;
// N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
// doesn't have to pop anything), then the first operand will be zero too so
// this adjustment is a no-op.
if (CalleePopAmount == 0) {
// FIXME: in-function stack adjustment for calls is limited to 24-bits
// because there's no guaranteed temporary register available.
// ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
// 1) For offset <= 12-bit, we use LSL #0
// 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
// LSL #0, and the other uses LSL #12.
// Most call frames will be allocated at the start of a function so
// this is OK, but it is a limitation that needs dealing with.
assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
// NOTE(review): the remaining arguments of this call are missing from this
// listing.
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, {Amount, MVT::i8},
} else if (CalleePopAmount != 0) {
// If the calling convention demands that the callee pops arguments from the
// stack, we want to add it back if we have a reserved call frame.
assert(CalleePopAmount < 0xffffff && "call frame too large");
emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
{-(int64_t)CalleePopAmount, MVT::i8}, TII);
return MBB.erase(I);
/// Decides from the "sign-return-address" function attribute whether the
/// return address of \p MF should be signed.
///
/// \returns false when the attribute is absent or "none", true when it is
/// "all", and for "non-leaf" true only if LR is among the callee-saved
/// registers recorded in the frame info.
static bool ShouldSignReturnAddress(MachineFunction &MF) {
// The function should be signed in the following situations:
// - sign-return-address=all
// - sign-return-address=non-leaf and the functions spills the LR
const Function &F = MF.getFunction();
if (!F.hasFnAttribute("sign-return-address"))
return false;
StringRef Scope = F.getFnAttribute("sign-return-address").getValueAsString();
if (Scope.equals("none"))
return false;
if (Scope.equals("all"))
return true;
assert(Scope.equals("non-leaf") && "Expected all, none or non-leaf");
// "non-leaf": sign only when LR is saved.
for (const auto &Info : MF.getFrameInfo().getCalleeSavedInfo())
if (Info.getReg() == AArch64::LR)
return true;
return false;
/// Emits, before \p MBBI, one CFI_INSTRUCTION per callee-saved register
/// recorded in the frame info, describing the stack offset at which that
/// register was saved.
void AArch64FrameLowering::emitCalleeSavedFrameMoves(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetSubtargetInfo &STI = MF.getSubtarget();
const MCRegisterInfo *MRI = STI.getRegisterInfo();
const TargetInstrInfo *TII = STI.getInstrInfo();
DebugLoc DL = MBB.findDebugLoc(MBBI);
// Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
// NOTE(review): the statement guarded by this check (presumably `return;`) is
// missing from this listing.
if (CSI.empty())
for (const auto &Info : CSI) {
unsigned Reg = Info.getReg();
// Offset of the save slot, adjusted by the offset of the local area.
int64_t Offset =
MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
// NOTE(review): the operands added to this instruction are missing from this
// listing.
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack pointer,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
//
// Returns X9 for the entry block; otherwise X9 if it is available, else the
// first available register of GPR64RegClass, else AArch64::NoRegister.
static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
MachineFunction *MF = MBB->getParent();
// If MBB is an entry block, use X9 as the scratch register
if (&MF->front() == MBB)
return AArch64::X9;
const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
LivePhysRegs LiveRegs(TRI);
// NOTE(review): the statement that seeds LiveRegs from MBB is not visible in
// this listing.
// Mark callee saved registers as used so we will not choose them.
// (CSRegs is iterated until its zero terminator.)
const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
for (unsigned i = 0; CSRegs[i]; ++i)
// Prefer X9 since it was historically used for the prologue scratch reg.
const MachineRegisterInfo &MRI = MF->getRegInfo();
if (LiveRegs.available(MRI, AArch64::X9))
return AArch64::X9;
// Otherwise take the first 64-bit GPR that is available.
for (unsigned Reg : AArch64::GPR64RegClass) {
if (LiveRegs.available(MRI, Reg))
return Reg;
return AArch64::NoRegister;
/// Returns whether \p MBB can hold the prologue: always true when the stack
/// does not need realignment, otherwise only if a scratch register can be
/// found for the block.
bool AArch64FrameLowering::canUseAsPrologue(
const MachineBasicBlock &MBB) const {
const MachineFunction *MF = MBB.getParent();
// findScratchNonCalleeSaveRegister() takes a non-const block pointer.
MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// Don't need a scratch register if we're not going to re-align the stack.
if (!RegInfo->needsStackRealignment(*MF))
return true;
// Otherwise, we can use any block as long as it has a scratch register
// available.
return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
/// Returns whether a stack of \p StackSizeInBytes needs a stack probe on
/// Windows. Always false for non-Windows targets. The threshold defaults to
/// 4096 bytes and can be overridden by the "stack-probe-size" function
/// attribute.
static bool windowsRequiresStackProbe(MachineFunction &MF,
uint64_t StackSizeInBytes) {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
if (!Subtarget.isTargetWindows())
return false;
const Function &F = MF.getFunction();
// TODO: When implementing stack protectors, take that into account
// for the probe threshold.
unsigned StackProbeSize = 4096;
// NOTE(review): the start of the expression that reads the attribute value is
// missing from this listing.
if (F.hasFnAttribute("stack-probe-size"))
.getAsInteger(0, StackProbeSize);
// NOTE(review): the second operand of this conjunction is missing from this
// listing.
return (StackSizeInBytes >= StackProbeSize) &&
/// Decides whether the callee-save area and the local stack area can be
/// allocated with one combined SP adjustment of \p StackBumpBytes.
///
/// Returns false as soon as any of the disqualifying conditions below holds,
/// and true otherwise.
bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
MachineFunction &MF, uint64_t StackBumpBytes) const {
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// No local area: nothing to combine.
if (AFI->getLocalStackSize() == 0)
return false;
// 512 is the maximum immediate for stp/ldp that will be used for
// callee-save save/restores
if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
return false;
if (MFI.hasVarSizedObjects())
return false;
if (RegInfo->needsStackRealignment(MF))
return false;
// This isn't strictly necessary, but it simplifies things a bit since the
// current RedZone handling code assumes the SP is adjusted by the
// callee-save save/restore code.
if (canUseRedZone(MF))
return false;
// When there is an SVE area on the stack, always allocate the
// callee-saves and spills/locals separately.
if (getSVEStackSize(MF))
return false;
return true;
/// Epilogue variant of shouldCombineCSRLocalStackBump(): additionally returns
/// false when the scan below reaches one of the listed MTE tag-store opcodes.
/// An empty block that passes the base check yields true.
bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
return false;
if (MBB.empty())
return true;
// Disable combined SP bump if the last instruction is an MTE tag store. It
// is almost always better to merge SP adjustment into those instructions.
// The scan starts at the first terminator and is bounded by the block start.
// NOTE(review): the iterator update and the statements guarded by the two
// `if`s below are missing from this listing.
MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
MachineBasicBlock::iterator Begin = MBB.begin();
while (LastI != Begin) {
if (LastI->isTransient())
if (!LastI->getFlag(MachineInstr::FrameDestroy))
switch (LastI->getOpcode()) {
case AArch64::STGloop:
case AArch64::STZGloop:
case AArch64::STGOffset:
case AArch64::STZGOffset:
case AArch64::ST2GOffset:
case AArch64::STZ2GOffset:
return false;
return true;
// Given a load or a store instruction, generate an appropriate unwinding SEH
// code on Windows.
//
// The SEH pseudo is chosen from the opcode of the instruction at MBBI, built
// with the instruction's debug location, and inserted directly after it. The
// iterator to the inserted instruction is returned. Opcodes not listed in the
// switch hit llvm_unreachable.
// NOTE(review): the `default:` label, `break;` statements, most builder
// operands and the closing braces are missing from this listing, and so is
// any use of the Flag parameter.
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
const TargetInstrInfo &TII,
MachineInstr::MIFlag Flag) {
unsigned Opc = MBBI->getOpcode();
MachineBasicBlock *MBB = MBBI->getParent();
MachineFunction &MF = *MBB->getParent();
DebugLoc DL = MBBI->getDebugLoc();
// The immediate offset is the instruction's last operand.
unsigned ImmIdx = MBBI->getNumOperands() - 1;
int Imm = MBBI->getOperand(ImmIdx).getImm();
MachineInstrBuilder MIB;
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
switch (Opc) {
llvm_unreachable("No SEH Opcode for this instruction");
// Pre/post-indexed FP register pair: the post-indexed load negates Imm before
// sharing the pre-indexed store's handling.
case AArch64::LDPDpost:
Imm = -Imm;
case AArch64::STPDpre: {
unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
.addImm(Imm * 8)
// Pre/post-indexed GPR pair; the FP/LR pair gets its own SEH opcode.
case AArch64::LDPXpost:
Imm = -Imm;
case AArch64::STPXpre: {
Register Reg0 = MBBI->getOperand(1).getReg();
Register Reg1 = MBBI->getOperand(2).getReg();
if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
.addImm(Imm * 8)
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
.addImm(Imm * 8)
// Pre/post-indexed single FP register.
case AArch64::LDRDpost:
Imm = -Imm;
case AArch64::STRDpre: {
unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
// Pre/post-indexed single GPR.
case AArch64::LDRXpost:
Imm = -Imm;
case AArch64::STRXpre: {
unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
// Offset-addressed FP register pair.
case AArch64::STPDi:
case AArch64::LDPDi: {
unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
.addImm(Imm * 8)
// Offset-addressed GPR pair; the FP/LR pair gets its own SEH opcode.
case AArch64::STPXi:
case AArch64::LDPXi: {
Register Reg0 = MBBI->getOperand(0).getReg();
Register Reg1 = MBBI->getOperand(1).getReg();
if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
.addImm(Imm * 8)
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
.addImm(Imm * 8)
// Offset-addressed single GPR.
case AArch64::STRXui:
case AArch64::LDRXui: {
int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
.addImm(Imm * 8)
// Offset-addressed single FP register.
case AArch64::STRDui:
case AArch64::LDRDui: {
unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
.addImm(Imm * 8)
// Place the SEH pseudo right after the instruction it describes.
auto I = MBB->insertAfter(MBBI, MIB);
return I;
// Fix up the SEH opcode associated with the save/restore instruction.
//
// For the SEH opcodes listed below, LocalStackSize is added to the
// instruction's last operand (its immediate offset).
// NOTE(review): the `default:` label and `break;` statements are missing from
// this listing.
static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
unsigned LocalStackSize) {
MachineOperand *ImmOpnd = nullptr;
unsigned ImmIdx = MBBI->getNumOperands() - 1;
switch (MBBI->getOpcode()) {
llvm_unreachable("Fix the offset in the SEH instruction");
case AArch64::SEH_SaveFPLR:
case AArch64::SEH_SaveRegP:
case AArch64::SEH_SaveReg:
case AArch64::SEH_SaveFRegP:
case AArch64::SEH_SaveFReg:
ImmOpnd = &MBBI->getOperand(ImmIdx);
// Apply the adjustment only if an immediate operand was selected above.
if (ImmOpnd)
ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
// Convert callee-save register save/restore instruction to do stack pointer
// decrement/increment to allocate/deallocate the callee-save stack area by
// converting store/load to use pre/post increment version.
//
// A new instruction with the pre/post-indexed opcode is built before MBBI,
// defining SP and carrying CSStackSizeInc / Scale as its immediate; the old
// instruction is erased and an iterator to the new one is returned. When
// NeedsWinCFI is set, *HasWinCFI is set to true and a matching SEH pseudo is
// inserted via InsertSEH().
// NOTE(review): `default:`/`break;` lines, loop bodies and closing braces are
// missing from this listing.
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
bool NeedsWinCFI, bool *HasWinCFI, bool InProlog = true) {
// Ignore instructions that do not operate on SP, i.e. shadow call stack
// instructions and associated CFI instruction.
while (MBBI->getOpcode() == AArch64::STRXpost ||
MBBI->getOpcode() == AArch64::LDRXpre ||
MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) {
if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION)
assert(MBBI->getOperand(0).getReg() != AArch64::SP);
// Map the offset-addressed opcode to its pre-indexed (store) or post-indexed
// (load) counterpart. Scale is 8 for X/D pairs, 16 for Q pairs, and keeps its
// initial value of 1 for the single-register forms.
unsigned NewOpc;
int Scale = 1;
switch (MBBI->getOpcode()) {
llvm_unreachable("Unexpected callee-save save/restore opcode!");
case AArch64::STPXi:
NewOpc = AArch64::STPXpre;
Scale = 8;
case AArch64::STPDi:
NewOpc = AArch64::STPDpre;
Scale = 8;
case AArch64::STPQi:
NewOpc = AArch64::STPQpre;
Scale = 16;
case AArch64::STRXui:
NewOpc = AArch64::STRXpre;
case AArch64::STRDui:
NewOpc = AArch64::STRDpre;
case AArch64::STRQui:
NewOpc = AArch64::STRQpre;
case AArch64::LDPXi:
NewOpc = AArch64::LDPXpost;
Scale = 8;
case AArch64::LDPDi:
NewOpc = AArch64::LDPDpost;
Scale = 8;
case AArch64::LDPQi:
NewOpc = AArch64::LDPQpost;
Scale = 16;
case AArch64::LDRXui:
NewOpc = AArch64::LDRXpost;
case AArch64::LDRDui:
NewOpc = AArch64::LDRDpost;
case AArch64::LDRQui:
NewOpc = AArch64::LDRQpost;
// Get rid of the SEH code associated with the old instruction.
if (NeedsWinCFI) {
auto SEH = std::next(MBBI);
if (AArch64InstrInfo::isSEHInstruction(*SEH))
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
// The pre/post-indexed form writes back to SP.
MIB.addReg(AArch64::SP, RegState::Define);
// Copy all operands other than the immediate offset.
unsigned OpndIdx = 0;
for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
"Unexpected immediate offset in first/last callee-save save/restore "
assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
"Unexpected base register in callee-save save/restore instruction!");
// The increment is encoded in units of Scale bytes.
assert(CSStackSizeInc % Scale == 0);
MIB.addImm(CSStackSizeInc / Scale);
// Generate a new SEH code that corresponds to the new instruction.
if (NeedsWinCFI) {
*HasWinCFI = true;
InsertSEH(*MIB, *TII,
InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
// erase() yields the instruction after the removed one; step back to the
// newly built instruction.
return std::prev(MBB.erase(MBBI));
// Fixup callee-save register save/restore instructions to take into account
// combined SP bump by adding the local stack size to the stack offsets.
//
// The immediate offset operand of MI is increased by LocalStackSize / Scale,
// where Scale is 8 or 16 depending on the opcode. When NeedsWinCFI is set,
// *HasWinCFI is set to true and the SEH instruction following MI is adjusted
// through fixupSEHOpcode().
// NOTE(review): `return;`, `break;` and `default:` lines are missing from this
// listing.
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
uint64_t LocalStackSize,
bool NeedsWinCFI,
bool *HasWinCFI) {
if (AArch64InstrInfo::isSEHInstruction(MI))
unsigned Opc = MI.getOpcode();
// Ignore instructions that do not operate on SP, i.e. shadow call stack
// instructions and associated CFI instruction.
if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre ||
Opc == AArch64::CFI_INSTRUCTION) {
if (Opc != AArch64::CFI_INSTRUCTION)
assert(MI.getOperand(0).getReg() != AArch64::SP);
// Offsets are scaled by 8 for X/D opcodes and by 16 for Q opcodes.
unsigned Scale;
switch (Opc) {
case AArch64::STPXi:
case AArch64::STRXui:
case AArch64::STPDi:
case AArch64::STRDui:
case AArch64::LDPXi:
case AArch64::LDRXui:
case AArch64::LDPDi:
case AArch64::LDRDui:
Scale = 8;
case AArch64::STPQi:
case AArch64::STRQui:
case AArch64::LDPQi:
case AArch64::LDRQui:
Scale = 16;
llvm_unreachable("Unexpected callee-save save/restore opcode!");
// The offset is the last explicit operand, preceded by the SP base register.
unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
"Unexpected base register in callee-save save/restore instruction!");
// Last operand is immediate offset that needs fixing.
MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
// All generated opcodes have scaled offsets.
assert(LocalStackSize % Scale == 0);
OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
if (NeedsWinCFI) {
*HasWinCFI = true;
// The SEH pseudo is expected to follow MI immediately.
auto MBBI = std::next(MachineBasicBlock::iterator(MI));
assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
"Expecting a SEH instruction");
fixupSEHOpcode(MBBI, LocalStackSize);
// Reorders callee-save pops inside MBB so that the load-store optimizer can
// fold the final SP adjustment into a post-indexed load. The reordering is
// a single MBB.splice() call that takes FirstSPPopI as the insertion point
// and LastPopI as the instruction to move; the ReverseCSRRestoreSeq option
// is consulted before the splice.
static void adaptForLdStOpt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator FirstSPPopI,
MachineBasicBlock::iterator LastPopI) {
// Sometimes (when we restore in the same order as we save), we can end up
// with code like this:
// ldp x26, x25, [sp]
// ldp x24, x23, [sp, #16]
// ldp x22, x21, [sp, #32]
// ldp x20, x19, [sp, #48]
// add sp, sp, #64
// In this case, it is always better to put the first ldp at the end, so
// that the load-store optimizer can run and merge the ldp and the add into
// a post-index ldp.
// If we managed to grab the first pop instruction, move it to the end.
if (ReverseCSRRestoreSeq)
MBB.splice(FirstSPPopI, &MBB, LastPopI);
// We should end up with something like this now:
// ldp x24, x23, [sp, #16]
// ldp x22, x21, [sp, #32]
// ldp x20, x19, [sp, #48]
// ldp x26, x25, [sp]
// add sp, sp, #64
// and the load-store optimizer can merge the last two instructions into:
// ldp x26, x25, [sp], #64
// Decides which pointer-authentication key signs the return address.
// Returns true (A key) when the function carries no
// "sign-return-address-key" attribute. Otherwise the key string is asserted
// to be "a_key" or "b_key" (compared case-insensitively) and the result is
// true only for "a_key".
static bool ShouldSignWithAKey(MachineFunction &MF) {
const Function &F = MF.getFunction();
if (!F.hasFnAttribute("sign-return-address-key"))
return true;
// NOTE(review): Key is presumably the string value of the attribute
// checked above — confirm against the full file.
const StringRef Key =
assert(Key.equals_lower("a_key") || Key.equals_lower("b_key"));
return Key.equals_lower("a_key");
// Reports whether Windows CFI must be emitted for MF. The result requires
// the target's MCAsmInfo to report usesWindowsCFI(), combined with a further
// condition on the function.
// NOTE(review): the second operand of the && is not visible in this
// excerpt — confirm against the full file.
static bool needsWinCFI(const MachineFunction &MF) {
const Function &F = MF.getFunction();
return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
// Convenience wrapper: forwards to AArch64Subtarget::isTargetDarwin() for
// the subtarget of MF.
static bool isTargetDarwin(const MachineFunction &MF) {
return MF.getSubtarget<AArch64Subtarget>().isTargetDarwin();
// Convenience wrapper: forwards to AArch64Subtarget::isTargetWindows() for
// the subtarget of MF.
static bool isTargetWindows(const MachineFunction &MF) {
return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
// Convenience function to determine whether I is an SVE callee save.
// Only the STR_ZXI, STR_PXI, LDR_ZXI and LDR_PXI opcodes are candidates;
// every other opcode yields false. For a candidate, the result depends on
// the instruction's frame flags (FrameSetup is tested first).
static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
switch (I->getOpcode()) {
return false;
case AArch64::STR_ZXI:
case AArch64::STR_PXI:
case AArch64::LDR_ZXI:
case AArch64::LDR_PXI:
return I->getFlag(MachineInstr::FrameSetup) ||
// Emits the function (or funclet) prologue into MBB, inserting at an
// iterator that starts at MBB.begin(). The visible steps are, in order:
//  - optional return-address signing (PACIASP / PACIBSP),
//  - the frameless path: a single SP adjustment or red zone use,
//  - the callee-save SP bump, either combined with the locals or converted
//    to a pre-indexed save, followed by offset fixups of the saves,
//  - frame pointer setup (skipped for funclets),
//  - the Windows stack probe sequence when windowsRequiresStackProbe(),
//  - allocation of the SVE callee-save and SVE locals areas,
//  - allocation (and optional realignment) of the remaining locals,
//  - base pointer setup, SEH_PrologEnd, and DWARF CFI when needsFrameMoves.
// HasWinCFI is written back to MF through a scope-exit callback.
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
const MachineFrameInfo &MFI = MF.getFrameInfo();
const Function &F = MF.getFunction();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool needsFrameMoves =
MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool HasFP = hasFP(MF);
bool NeedsWinCFI = needsWinCFI(MF);
bool HasWinCFI = false;
auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
bool IsFunclet = MBB.isEHFuncletEntry();
// At this point, we're going to decide whether or not the function uses a
// redzone. In most cases, the function doesn't have a redzone so let's
// assume that's false and set it to true in the case that there's a redzone.
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
DebugLoc DL;
if (ShouldSignReturnAddress(MF)) {
if (ShouldSignWithAKey(MF))
BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
else {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIBSP))
unsigned CFIIndex =
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
// Set tagged base pointer to the bottom of the stack frame.
// Ideally it should match SP value after prologue.
const StackOffset &SVEStackSize = getSVEStackSize(MF);
// getStackSize() includes all the locals in its size calculation. We don't
// include these locals when computing the stack size of a funclet, as they
// are allocated in the parent's stack frame and accessed via the frame
// pointer from the funclet. We only save the callee saved registers in the
// funclet, which are really the callee saved registers of the parent
// function, including the funclet.
int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
: MFI.getStackSize();
if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
assert(!HasFP && "unexpected function without stack frame but with FP");
assert(!SVEStackSize &&
"unexpected function without stack frame but with SVE objects");
// All of the stack allocation is for locals.
if (!NumBytes)
// REDZONE: If the stack size is less than 128 bytes, we don't need
// to actually allocate.
if (canUseRedZone(MF)) {
} else {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
{-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
false, NeedsWinCFI, &HasWinCFI);
if (!NeedsWinCFI && needsFrameMoves) {
// Label used to tie together the PROLOG_LABEL and the MachineMoves.
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
// Encode the stack size of the leaf function.
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
bool IsWin64 =
unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
// All of the remaining stack allocations are for locals.
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
if (CombineSPBump) {
assert(!SVEStackSize && "Cannot combine SP bump with SVE");
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
{-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false,
NeedsWinCFI, &HasWinCFI);
NumBytes = 0;
} else if (PrologueSaveSize != 0) {
MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
// Move past the saves of the callee-saved registers, fixing up the offsets
// and pre-inc if we decided to combine the callee-save and local stack
// pointer bump above.
MachineBasicBlock::iterator End = MBB.end();
while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
!IsSVECalleeSave(MBBI)) {
if (CombineSPBump)
fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
NeedsWinCFI, &HasWinCFI);
// For funclets the FP belongs to the containing function.
if (!IsFunclet && HasFP) {
// Only set up FP if we actually need to.
int64_t FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0;
if (CombineSPBump)
FPOffset += AFI->getLocalStackSize();
// Issue sub fp, sp, FPOffset or
// mov fp,sp when FPOffset is zero.
// Note: All stores of callee-saved registers are marked as "FrameSetup".
// This code marks the instruction(s) that set the FP also.
emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
{FPOffset, MVT::i8}, TII, MachineInstr::FrameSetup, false,
NeedsWinCFI, &HasWinCFI);
if (windowsRequiresStackProbe(MF, NumBytes)) {
// The probe size is expressed in 16-byte units (NumBytes >> 4).
uint64_t NumWords = NumBytes >> 4;
if (NeedsWinCFI) {
HasWinCFI = true;
// alloc_l can hold at most 256MB, so assume that NumBytes doesn't
// exceed this amount. We need to move at most 2^24 - 1 into x15.
// This is at most two instructions, MOVZ followed by MOVK.
// TODO: Fix to use multiple stack alloc unwind codes for stacks
// exceeding 256MB in size.
if (NumBytes >= (1 << 28))
report_fatal_error("Stack size cannot exceed 256MB for stack "
"unwinding purposes");
uint32_t LowNumWords = NumWords & 0xFFFF;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
if ((NumWords & 0xFFFF0000) != 0) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
.addImm((NumWords & 0xFFFF0000) >> 16) // High half
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
} else {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
switch (MF.getTarget().getCodeModel()) {
case CodeModel::Tiny:
case CodeModel::Small:
case CodeModel::Medium:
case CodeModel::Kernel:
BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
.addReg(AArch64::X15, RegState::Implicit)
.addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
.addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
.addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
case CodeModel::Large:
BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
.addReg(AArch64::X16, RegState::Define)
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
.addReg(AArch64::X16, RegState::Kill)
.addReg(AArch64::X15, RegState::Implicit | RegState::Define)
.addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
.addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
.addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
.addReg(AArch64::SP, RegState::Kill)
.addReg(AArch64::X15, RegState::Kill)
.addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
NumBytes = 0;
// AllocateBefore is emitted ahead of the SVE callee saves, AllocateAfter
// behind them; by default the whole SVE area is allocated before.
StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
// Process the SVE callee-saves to determine what space needs to be
// allocated.
- if (AFI->getSVECalleeSavedStackSize()) {
+ if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
// Find callee save instructions in frame.
CalleeSavesBegin = MBBI;
assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
CalleeSavesEnd = MBBI;
- int64_t OffsetToFirstCalleeSaveFromSP =
- MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
- StackOffset OffsetToCalleeSavesFromSP =
- StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
- AllocateBefore -= OffsetToCalleeSavesFromSP;
+ AllocateBefore = {CalleeSavedSize, MVT::nxv1i8};
AllocateAfter = SVEStackSize - AllocateBefore;
// Allocate space for the callee saves (if any).
emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
-AllocateBefore, TII,
// Finally allocate remaining SVE stack space.
emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
-AllocateAfter, TII,
// Allocate space for the rest of the frame.
if (NumBytes) {
// Alignment is required for the parent frame, not the funclet
const bool NeedsRealignment =
!IsFunclet && RegInfo->needsStackRealignment(MF);
unsigned scratchSPReg = AArch64::SP;
if (NeedsRealignment) {
scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
assert(scratchSPReg != AArch64::NoRegister);
// If we're a leaf function, try using the red zone.
if (!canUseRedZone(MF))
// FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
// the correct value here, as NumBytes also includes padding bytes,
// which shouldn't be counted here.
emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
{-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
false, NeedsWinCFI, &HasWinCFI);
if (NeedsRealignment) {
const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
assert(NrBitsToZero > 1);
assert(scratchSPReg != AArch64::SP);
// SUB X9, SP, NumBytes
// -- X9 is temporary register, so shouldn't contain any live data here,
// -- free to use. This is already produced by emitFrameOffset above.
// AND SP, X9, 0b11111...0000
// The logical immediates have a non-trivial encoding. The following
// formula computes the encoded immediate with all ones but
// NrBitsToZero zero bits as least significant bits.
uint32_t andMaskEncoded = (1 << 12) // = N
| ((64 - NrBitsToZero) << 6) // immr
| ((64 - NrBitsToZero - 1) << 0); // imms
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
.addReg(scratchSPReg, RegState::Kill)
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
.addImm(NumBytes & andMaskEncoded)
// If we need a base pointer, set it up here. It's whatever the value of the
// stack pointer is at this point. Any variable size objects will be allocated
// after this, so we can still use the base pointer to reference locals.
// FIXME: Clarify FrameSetup flags here.
// Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
// needed.
// For funclets the BP belongs to the containing function.
if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
// The very last FrameSetup instruction indicates the end of prologue. Emit a
// SEH opcode indicating the prologue end.
if (NeedsWinCFI && HasWinCFI) {
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
// SEH funclets are passed the frame pointer in X1. If the parent
// function uses the base register, then the base register is used
// directly, and is not retrieved from X1.
if (IsFunclet && F.hasPersonalityFn()) {
EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
if (isAsynchronousEHPersonality(Per)) {
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
if (needsFrameMoves) {
const DataLayout &TD = MF.getDataLayout();
const int StackGrowth = isTargetDarwin(MF)
? (2 * -TD.getPointerSize(0))
: -AFI->getCalleeSavedStackSize();
Register FramePtr = RegInfo->getFrameRegister(MF);
// An example of the prologue:
// .globl __foo
// .align 2
// __foo:
// Ltmp0:
// .cfi_startproc
// .cfi_personality 155, ___gxx_personality_v0
// Leh_func_begin:
// .cfi_lsda 16, Lexception33
// stp xa,bx, [sp, -#offset]!
// ...
// stp x28, x27, [sp, #offset-32]
// stp fp, lr, [sp, #offset-16]
// add fp, sp, #offset - 16
// sub sp, sp, #1360
// The Stack:
// +-------------------------------------------+
// 10000 | ........ | ........ | ........ | ........ |
// 10004 | ........ | ........ | ........ | ........ |
// +-------------------------------------------+
// 10008 | ........ | ........ | ........ | ........ |
// 1000c | ........ | ........ | ........ | ........ |
// +===========================================+
// 10010 | X28 Register |
// 10014 | X28 Register |
// +-------------------------------------------+
// 10018 | X27 Register |
// 1001c | X27 Register |
// +===========================================+
// 10020 | Frame Pointer |
// 10024 | Frame Pointer |
// +-------------------------------------------+
// 10028 | Link Register |
// 1002c | Link Register |
// +===========================================+
// 10030 | ........ | ........ | ........ | ........ |
// 10034 | ........ | ........ | ........ | ........ |
// +-------------------------------------------+
// 10038 | ........ | ........ | ........ | ........ |
// 1003c | ........ | ........ | ........ | ........ |
// +-------------------------------------------+
// [sp] = 10030 :: >>initial value<<
// sp = 10020 :: stp fp, lr, [sp, #-16]!
// fp = sp == 10020 :: mov fp, sp
// [sp] == 10020 :: stp x28, x27, [sp, #-16]!
// sp == 10010 :: >>final value<<
// The frame pointer (w29) points to address 10020. If we use an offset of
// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
// for w27, and -32 for w28:
// Ltmp1:
// .cfi_def_cfa w29, 16
// Ltmp2:
// .cfi_offset w30, -8
// Ltmp3:
// .cfi_offset w29, -16
// Ltmp4:
// .cfi_offset w27, -24
// Ltmp5:
// .cfi_offset w28, -32
if (HasFP) {
// Define the current CFA rule to use the provided FP.
unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - StackGrowth));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
} else {
// Encode the stack size of the leaf function.
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
// Now emit the moves for whatever callee saved regs we have (including FP,
// LR if those are saved).
emitCalleeSavedFrameMoves(MBB, MBBI);
// Inserts return-address authentication at the first terminator of MBB.
// ShouldSignReturnAddress(MF) is consulted first. The debug location is
// taken from the first terminator when there is one. The A-key or B-key
// variant of each opcode is chosen through ShouldSignWithAKey(MF).
static void InsertReturnAddressAuth(MachineFunction &MF,
MachineBasicBlock &MBB) {
if (!ShouldSignReturnAddress(MF))
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
DebugLoc DL;
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
// The AUTIASP instruction assembles to a hint instruction before v8.3a so
// this instruction can safely be used for any v8a architecture.
// From v8.3a onwards there are optimised authenticate LR and return
// instructions, namely RETA{A,B}, that can be used instead.
if (Subtarget.hasV8_3aOps() && MBBI != MBB.end() &&
MBBI->getOpcode() == AArch64::RET_ReallyLR) {
TII->get(ShouldSignWithAKey(MF) ? AArch64::RETAA : AArch64::RETAB))
} else {
TII->get(ShouldSignWithAKey(MF) ? AArch64::AUTIASP : AArch64::AUTIBSP))
// Returns true when MI's opcode is CATCHRET or CLEANUPRET, and false for
// any other opcode.
static bool isFuncletReturnInstr(const MachineInstr &MI) {
switch (MI.getOpcode()) {
return false;
case AArch64::CATCHRET:
case AArch64::CLEANUPRET:
return true;
// Emits the function (or funclet) epilogue into MBB. The visible steps are:
//  - determine NumBytes from the funclet frame size or the full stack size,
//  - schedule return-address authentication through a scope-exit callback,
//  - either convert the last callee-save pop to a post-indexed load or
//    defer its SP adjustment into AfterCSRPopSize,
//  - walk backwards over the FrameDestroy restores, fixing up their offsets
//    when the SP bump is combined,
//  - deallocate the SVE area around the SVE callee-save reloads,
//  - restore SP for the locals (from FP when the frame has variable-sized
//    objects or is realigned), then pop AfterCSRPopSize,
//  - emit SEH_EpilogStart / SEH_EpilogEnd when Windows CFI is needed.
void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL;
bool NeedsWinCFI = needsWinCFI(MF);
bool HasWinCFI = false;
bool IsFunclet = false;
auto WinCFI = make_scope_exit([&]() {
if (!MF.hasWinCFI())
if (MBB.end() != MBBI) {
DL = MBBI->getDebugLoc();
IsFunclet = isFuncletReturnInstr(*MBBI);
int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
: MFI.getStackSize();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
// Initial and residual are named for consistency with the prologue. Note that
// in the epilogue, the residual adjustment is executed first.
uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB);
// The stack frame should be like below,
// ---------------------- ---
// | | |
// | BytesInStackArgArea| CalleeArgStackSize
// | (NumReusableBytes) | (of tail call)
// | | ---
// | | |
// ---------------------| --- |
// | | | |
// | CalleeSavedReg | | |
// | (CalleeSavedStackSize)| | |
// | | | |
// ---------------------| | NumBytes
// | | StackSize (StackAdjustUp)
// | LocalStackSize | | |
// | (covering callee | | |
// | args) | | |
// | | | |
// ---------------------- --- ---
// So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
// = StackSize + ArgumentPopSize
// AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
// it as the 2nd argument of AArch64ISD::TC_RETURN.
auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); });
bool IsWin64 =
unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
uint64_t AfterCSRPopSize = ArgumentPopSize;
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
// We cannot rely on the local stack size set in emitPrologue if the function
// has funclets, as funclets have different local stack size requirements, and
// the current value set in emitPrologue may be that of the containing
// function.
if (MF.hasEHFunclets())
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
// Assume we can't combine the last pop with the sp restore.
if (!CombineSPBump && PrologueSaveSize != 0) {
MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
while (AArch64InstrInfo::isSEHInstruction(*Pop))
Pop = std::prev(Pop);
// Converting the last ldp to a post-index ldp is valid only if the last
// ldp's offset is 0.
const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
// If the offset is 0, convert it to a post-index ldp.
if (OffsetOp.getImm() == 0)
MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
else {
// If not, make sure to emit an add after the last ldp.
// We're doing this by transferring the size to be restored from the
// adjustment *before* the CSR pops to the adjustment *after* the CSR
// pops.
AfterCSRPopSize += PrologueSaveSize;
// Move past the restores of the callee-saved registers.
// If we plan on combining the sp bump of the local stack size and the callee
// save stack size, we might need to adjust the CSR save and restore offsets.
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
MachineBasicBlock::iterator Begin = MBB.begin();
while (LastPopI != Begin) {
if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
IsSVECalleeSave(LastPopI)) {
} else if (CombineSPBump)
fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
NeedsWinCFI, &HasWinCFI);
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
const StackOffset &SVEStackSize = getSVEStackSize(MF);
// If there is a single SP update, insert it before the ret and we're done.
if (CombineSPBump) {
assert(!SVEStackSize && "Cannot combine SP bump with SVE");
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
{NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
if (NeedsWinCFI && HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL,
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
// Process the SVE callee-saves to determine what space needs to be
// deallocated.
StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
- if (AFI->getSVECalleeSavedStackSize()) {
+ if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
RestoreBegin = std::prev(RestoreEnd);;
while (IsSVECalleeSave(RestoreBegin) &&
RestoreBegin != MBB.begin())
assert(IsSVECalleeSave(RestoreBegin) &&
IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
- int64_t OffsetToFirstCalleeSaveFromSP =
- MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
- StackOffset OffsetToCalleeSavesFromSP =
- StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
- DeallocateBefore = OffsetToCalleeSavesFromSP;
- DeallocateAfter = SVEStackSize - DeallocateBefore;
+ StackOffset CalleeSavedSizeAsOffset = {CalleeSavedSize, MVT::nxv1i8};
+ DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
+ DeallocateAfter = CalleeSavedSizeAsOffset;
// Deallocate the SVE area.
if (SVEStackSize) {
if (AFI->isStackRealigned()) {
- if (AFI->getSVECalleeSavedStackSize())
- // Set SP to start of SVE area, from which the callee-save reloads
- // can be done. The code below will deallocate the stack space
+ if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize())
+ // Set SP to start of SVE callee-save area from which they can
+ // be reloaded. The code below will deallocate the stack space
// by moving FP -> SP.
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
- -SVEStackSize, TII, MachineInstr::FrameDestroy);
+ {-CalleeSavedSize, MVT::nxv1i8}, TII,
+ MachineInstr::FrameDestroy);
} else {
if (AFI->getSVECalleeSavedStackSize()) {
// Deallocate the non-SVE locals first before we can deallocate (and
// restore callee saves) from the SVE area.
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
{NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy);
NumBytes = 0;
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
DeallocateBefore, TII, MachineInstr::FrameDestroy);
emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
DeallocateAfter, TII, MachineInstr::FrameDestroy);
if (!hasFP(MF)) {
bool RedZone = canUseRedZone(MF);
// If this was a redzone leaf function, we don't need to restore the
// stack pointer (but we may need to pop stack args for fastcc).
if (RedZone && AfterCSRPopSize == 0)
bool NoCalleeSaveRestore = PrologueSaveSize == 0;
int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
if (NoCalleeSaveRestore)
StackRestoreBytes += AfterCSRPopSize;
// If we were able to combine the local stack pop with the argument pop,
// then we're done.
bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
// If we're done after this, make sure to help the load store optimizer.
if (Done)
adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
{StackRestoreBytes, MVT::i8}, TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
if (Done) {
if (NeedsWinCFI) {
HasWinCFI = true;
BuildMI(MBB, MBB.getFirstTerminator(), DL,
NumBytes = 0;
// Restore the original stack pointer.
// FIXME: Rather than doing the math here, we should instead just use
// non-post-indexed loads for the restores if we aren't actually going to
// be able to save any instructions.
if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
int64_t OffsetToFrameRecord =
isTargetDarwin(MF) ? (-(int64_t)AFI->getCalleeSavedStackSize() + 16) : 0;
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
{OffsetToFrameRecord, MVT::i8},
TII, MachineInstr::FrameDestroy, false, NeedsWinCFI);
} else if (NumBytes)
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
{NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy, false,
// This must be placed after the callee-save restore code because that code
// assumes the SP is at the same location as it was after the callee-save save
// code in the prologue.
if (AfterCSRPopSize) {
// Find an insertion point for the first ldp so that it goes before the
// shadow call stack epilog instruction. This ensures that the restore of
// lr from x18 is placed after the restore from sp.
auto FirstSPPopI = MBB.getFirstTerminator();
while (FirstSPPopI != Begin) {
auto Prev = std::prev(FirstSPPopI);
if (Prev->getOpcode() != AArch64::LDRXpre ||
Prev->getOperand(0).getReg() == AArch64::SP)
FirstSPPopI = Prev;
adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
{(int64_t)AfterCSRPopSize, MVT::i8}, TII,
MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
if (NeedsWinCFI && HasWinCFI)
BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
/// debug info. It's the same as what we use for resolving the code-gen
/// references for now. FIXME: This can go wrong when references are
/// SP-relative and simple call frames aren't used.
///
/// The work is delegated to resolveFrameIndexReference(), which receives
/// FrameReg by reference alongside MF and FI.
int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
Register &FrameReg) const {
return resolveFrameIndexReference(
MF, FI, FrameReg,
/// Returns the offset of frame index FI as computed by
/// getSEHFrameIndexOffset(); this is a direct forward with no adjustment.
int AArch64FrameLowering::getNonLocalFrameIndexReference(
const MachineFunction &MF, int FI) const {
return getSEHFrameIndexOffset(MF, FI);
// Converts ObjectOffset into an offset relative to the frame pointer.
// The result is ObjectOffset + FixedObject + FPAdjust, tagged MVT::i8, where
// FixedObject comes from getFixedObjectSize() (queried with IsFunclet=false)
// and FPAdjust is 16 on Darwin or the callee-saved stack size otherwise.
static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset) {
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
bool IsWin64 =
unsigned FixedObject =
getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
unsigned FPAdjust = isTargetDarwin(MF)
? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo());
return {ObjectOffset + FixedObject + FPAdjust, MVT::i8};
// Converts ObjectOffset into an offset relative to the stack pointer by
// adding the frame's total stack size; the result is tagged MVT::i8.
static StackOffset getStackOffset(const MachineFunction &MF, int64_t ObjectOffset) {
const auto &MFI = MF.getFrameInfo();
return {ObjectOffset + (int64_t)MFI.getStackSize(), MVT::i8};
/// Returns the byte offset of frame index FI relative to the register that
/// getLocalAddressRegister() reports: the FP-relative offset when that
/// register is FP, and the SP-relative offset otherwise.
int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
int FI) const {
const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
? getFPOffset(MF, ObjectOffset).getBytes()
: getStackOffset(MF, ObjectOffset).getBytes();
/// Resolves frame index FI to a base register (written to FrameReg) plus a
/// StackOffset. This overload only gathers the per-object facts (its offset,
/// whether it is a fixed object, and whether its stack ID is SVEVector) and
/// forwards them, with PreferFP and ForSimm unchanged, to
/// resolveFrameOffsetReference().
StackOffset AArch64FrameLowering::resolveFrameIndexReference(
const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
int64_t ObjectOffset = MFI.getObjectOffset(FI);
bool isFixed = MFI.isFixedObjectIndex(FI);
bool isSVE = MFI.getStackID(FI) == TargetStackID::SVEVector;
return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
PreferFP, ForSimm);
StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
Register &FrameReg, bool PreferFP, bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
int64_t FPOffset = getFPOffset(MF, ObjectOffset).getBytes();
int64_t Offset = getStackOffset(MF, ObjectOffset).getBytes();
bool isCSR =
!isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
const StackOffset &SVEStackSize = getSVEStackSize(MF);
// Use frame pointer to reference fixed objects. Use it for locals if
// there are VLAs or a dynamically realigned SP (and thus the SP isn't
// reliable as a base). Make sure useFPForScavengingIndex() does the
// right thing for the emergency spill slot.
bool UseFP = false;
if (AFI->hasStackFrame() && !isSVE) {
// We shouldn't prefer using the FP when there is an SVE area
// in between the FP and the non-SVE locals/spills.
PreferFP &= !SVEStackSize;
// Note: Keeping the following as multiple 'if' statements rather than
// merging to a single expression for readability.
// Argument access should always use the FP.
if (isFixed) {
UseFP = hasFP(MF);
} else if (isCSR && RegInfo->needsStackRealignment(MF)) {
// References to the CSR area must use FP if we're re-aligning the stack
// since the dynamically-sized alignment padding is between the SP/BP and
// the CSR area.
assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
UseFP = true;
} else if (hasFP(MF) && !RegInfo->needsStackRealignment(MF)) {
// If the FPOffset is negative and we're producing a signed immediate, we
// have to keep in mind that the available offset range for negative
// offsets is smaller than for positive ones. If an offset is available
// via the FP and the SP, use whichever is closest.
bool FPOffsetFits = !ForSimm || FPOffset >= -256;
PreferFP |= Offset > -FPOffset;
if (MFI.hasVarSizedObjects()) {
// If we have variable sized objects, we can use either FP or BP, as the
// SP offset is unknown. We can use the base pointer if we have one and
// FP is not preferred. If not, we're stuck with using FP.
bool CanUseBP = RegInfo->hasBasePointer(MF);
if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
UseFP = PreferFP;
else if (!CanUseBP) // Can't use BP. Forced to use FP.
UseFP = true;
// else we can use BP and FP, but the offset from FP won't fit.
// That will make us scavenge registers which we can probably avoid by
// using BP. If it won't fit for BP either, we'll scavenge anyway.
} else if (FPOffset >= 0) {
// Use SP or FP, whichever gives us the best chance of the offset
// being in range for direct access. If the FPOffset is positive,
// that'll always be best, as the SP will be even further away.
UseFP = true;
} else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
// Funclets access the locals contained in the parent's stack frame
// via the frame pointer, so we have to use the FP in the parent
// function.
(void) Subtarget;
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) &&
"Funclets should only be present on Win64");
UseFP = true;
} else {
// We have the choice between FP and (SP or BP).
if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
UseFP = true;
assert(((isFixed || isCSR) || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
"In the presence of dynamic stack pointer realignment, "
"non-argument/CSR objects cannot be accessed through the frame pointer");
if (isSVE) {
int64_t OffsetToSVEArea =
MFI.getStackSize() - AFI->getCalleeSavedStackSize();
StackOffset FPOffset = {ObjectOffset, MVT::nxv1i8};
StackOffset SPOffset = SVEStackSize +
StackOffset(ObjectOffset, MVT::nxv1i8) +
StackOffset(OffsetToSVEArea, MVT::i8);
// Always use the FP for SVE spills if available and beneficial.
if (hasFP(MF) &&
(SPOffset.getBytes() ||
FPOffset.getScalableBytes() < SPOffset.getScalableBytes() ||
RegInfo->needsStackRealignment(MF))) {
FrameReg = RegInfo->getFrameRegister(MF);
return FPOffset;
FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
: (unsigned)AArch64::SP;
return SPOffset;
StackOffset ScalableOffset = {};
if (UseFP && !(isFixed || isCSR))
ScalableOffset = -SVEStackSize;
if (!UseFP && (isFixed || isCSR))
ScalableOffset = SVEStackSize;
if (UseFP) {
FrameReg = RegInfo->getFrameRegister(MF);
return StackOffset(FPOffset, MVT::i8) + ScalableOffset;
// Use the base pointer if we have one.
if (RegInfo->hasBasePointer(MF))
FrameReg = RegInfo->getBaseRegister();
else {
assert(!MFI.hasVarSizedObjects() &&
"Can't use SP when we have var sized objects.");
FrameReg = AArch64::SP;
// If we're using the red zone for this function, the SP won't actually
// be adjusted, so the offsets will be negative. They're also all
// within range of the signed 9-bit immediate instructions.
if (canUseRedZone(MF))
Offset -= AFI->getLocalStackSize();
return StackOffset(Offset, MVT::i8) + ScalableOffset;
/// Returns the kill-state flags to use for \p Reg when it is stored in the
/// prologue: the kill flag is requested only if \p Reg is not a live-in of
/// \p MF.
static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
// Do not set a kill flag on values that are also marked as live-in. This
// happens with the @llvm-returnaddress intrinsic and with arguments passed in
// callee saved registers.
// Omitting the kill flags is conservatively correct even if the live-in
// is not used after all.
bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
return getKillRegState(!IsLiveIn);
/// Predicate used by the callee-save code to decide whether registers must be
/// stored in pairs for MachO's compact unwind format.
/// The visible part of the condition requires a MachO target.
// NOTE(review): the rest of the condition (it involves swifterror support and
// the function attributes fetched into Attrs) is not fully visible here --
// confirm against the complete definition.
static bool produceCompactUnwindFrame(MachineFunction &MF) {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
AttributeList Attrs = MF.getFunction().getAttributes();
return Subtarget.isTargetMachO() &&
!(Subtarget.getTargetLowering()->supportSwiftError() &&
/// Returns true if \p Reg1 and \p Reg2 must not be paired under the Windows
/// rules. A pair whose second register is FP is always rejected. Otherwise
/// pairing is only restricted when \p NeedsWinCFI is set, in which case
/// \p Reg2 has to be the register immediately following \p Reg1.
static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
bool NeedsWinCFI) {
// If we are generating register pairs for a Windows function that requires
// EH support, then pair consecutive registers only. There are no unwind
// opcodes for saves/restores of non-consecutive register pairs.
// The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x.
// TODO: LR can be paired with any register. We don't support this yet in
// the MCLayer. We need to add support for the save_lrpair unwind code.
if (Reg2 == AArch64::FP)
return true;
// Without Windows CFI there is no unwind-code constraint on the pair.
if (!NeedsWinCFI)
return false;
// Consecutive registers can be described by the unwind opcodes above.
if (Reg2 == Reg1 + 1)
return false;
return true;
/// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
/// WindowsCFI requires that only consecutive registers can be paired.
/// LR and FP need to be allocated together when the frame needs to save
/// the frame-record. This means any other register pairing with LR is invalid.
///
/// \param UsesWinAAPCS when set, the decision is delegated entirely to
///        invalidateWindowsRegisterPairing().
/// \param NeedsWinCFI forwarded to the Windows check.
/// \param NeedsFrameRecord when set (and not using the Windows rules), any
///        pair whose second register is LR is rejected.
static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
bool UsesWinAAPCS, bool NeedsWinCFI, bool NeedsFrameRecord) {
// The Windows rules take precedence over the frame-record rule below.
if (UsesWinAAPCS)
return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI);
// If we need to store the frame record, don't pair any register
// with LR other than FP.
if (NeedsFrameRecord)
return Reg2 == AArch64::LR;
// No restriction applies.
return false;
namespace {
/// Describes one callee-saved register, or a pair of them, that is saved and
/// restored with a single store/load instruction.
struct RegPairInfo {
// First register of the entry.
unsigned Reg1 = AArch64::NoRegister;
// Second register of the entry; stays NoRegister when the entry is unpaired.
unsigned Reg2 = AArch64::NoRegister;
// Frame index of the spill slot associated with Reg1.
int FrameIdx;
// Offset expressed in units of getScale(), not in bytes.
int Offset;
// Register kind; selects the scale and whether the slot is scalable.
enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
RegPairInfo() = default;
// True when a second register has been assigned to this entry.
bool isPaired() const { return Reg2 != AArch64::NoRegister; }
// Scale factor per register kind: 2 for PPR, 8 for GPR/FPR64 and 16 for
// ZPR/FPR128.
unsigned getScale() const {
switch (Type) {
case PPR:
return 2;
case GPR:
case FPR64:
return 8;
case ZPR:
case FPR128:
return 16;
llvm_unreachable("Unsupported type");
// True for the SVE register kinds (PPR and ZPR).
bool isScalable() const { return Type == PPR || Type == ZPR; }
} // end anonymous namespace
/// Walks the callee-saved registers in \p CSI and builds a RegPairInfo for
/// each register or register pair: it classifies the register, tries to pair
/// it with the next register of the same class, and computes the frame index
/// and the scaled offset of the slot.
///
/// \param NeedShadowCallStackProlog set to true when LR is among the saved
///        registers and the function has the ShadowCallStack attribute.
/// \param NeedsFrameRecord forwarded to the GPR pairing check and used by the
///        frame-record assertions.
// NOTE(review): \p RegPairs is presumably where the entries are appended; the
// append itself is not visible in this excerpt -- confirm.
static void computeCalleeSaveRegisterPairs(
MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
if (CSI.empty())
bool IsWindows = isTargetWindows(MF);
bool NeedsWinCFI = needsWinCFI(MF);
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
MachineFrameInfo &MFI = MF.getFrameInfo();
CallingConv::ID CC = MF.getFunction().getCallingConv();
unsigned Count = CSI.size();
// MachO's compact unwind format relies on all registers being stored in
// pairs.
assert((!produceCompactUnwindFrame(MF) ||
CC == CallingConv::PreserveMost ||
(Count & 1) == 0) &&
"Odd number of callee-saved regs to spill!");
// Both running offsets start at the size of their area and are decremented
// as slots are handed out in the loop below.
int ByteOffset = AFI->getCalleeSavedStackSize();
int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
// On Linux, we will have either one or zero non-paired register. On Windows
// with CFI, we can have multiple unpaired registers in order to utilize the
// available unwind codes. This flag assures that the alignment fixup is done
// only once, as intended.
bool FixupDone = false;
for (unsigned i = 0; i < Count; ++i) {
RegPairInfo RPI;
RPI.Reg1 = CSI[i].getReg();
// Classify the register by the register class that contains it.
if (AArch64::GPR64RegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::GPR;
else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::FPR64;
else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::FPR128;
else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::ZPR;
else if (AArch64::PPRRegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::PPR;
llvm_unreachable("Unsupported register class.");
// Add the next reg to the pair if it is in the same register class.
if (i + 1 < Count) {
unsigned NextReg = CSI[i + 1].getReg();
switch (RPI.Type) {
case RegPairInfo::GPR:
if (AArch64::GPR64RegClass.contains(NextReg) &&
!invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, NeedsWinCFI,
RPI.Reg2 = NextReg;
case RegPairInfo::FPR64:
if (AArch64::FPR64RegClass.contains(NextReg) &&
!invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI))
RPI.Reg2 = NextReg;
case RegPairInfo::FPR128:
if (AArch64::FPR128RegClass.contains(NextReg))
RPI.Reg2 = NextReg;
case RegPairInfo::PPR:
case RegPairInfo::ZPR:
// If either of the registers to be saved is the lr register, it means that
// we also need to save lr in the shadow call stack.
if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) &&
MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) {
if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
report_fatal_error("Must reserve x18 to use shadow call stack");
NeedShadowCallStackProlog = true;
// GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
// list to come in sorted by frame index so that we can issue the store
// pair instructions directly. Assert if we see anything otherwise.
// The order of the registers in the list is controlled by
// getCalleeSavedRegs(), so they will always be in-order, as well.
assert((!RPI.isPaired() ||
(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) &&
"Out of order callee saved regs!");
assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
RPI.Reg1 == AArch64::LR) &&
"FrameRecord must be allocated together with LR");
// Windows AAPCS has FP and LR reversed.
assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
RPI.Reg2 == AArch64::LR) &&
"FrameRecord must be allocated together with LR");
// MachO's compact unwind format relies on all registers being stored in
// adjacent register pairs.
assert((!produceCompactUnwindFrame(MF) ||
CC == CallingConv::PreserveMost ||
(RPI.isPaired() &&
((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
RPI.Reg1 + 1 == RPI.Reg2))) &&
"Callee-save registers not saved as adjacent register pair!");
RPI.FrameIdx = CSI[i].getFrameIdx();
int Scale = RPI.getScale();
// Move the running offset down by the size of this slot (twice the scale
// for a pair).
if (RPI.isScalable())
ScalableByteOffset -= Scale;
ByteOffset -= RPI.isPaired() ? 2 * Scale : Scale;
assert(!(RPI.isScalable() && RPI.isPaired()) &&
"Paired spill/fill instructions don't exist for SVE vectors");
// Round up size of non-pair to pair size if we need to pad the
// callee-save area to ensure 16-byte alignment.
if (AFI->hasCalleeSaveStackFreeSpace() && !FixupDone &&
!RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
!RPI.isPaired()) {
FixupDone = true;
ByteOffset -= 8;
assert(ByteOffset % 16 == 0);
assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
// Convert the byte offset into units of the register scale.
int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
assert(Offset % Scale == 0);
RPI.Offset = Offset / Scale;
assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
(RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
"Offset out of bounds for LDP/STP immediate");
if (RPI.isPaired())
/// Emits the stores that spill the callee-saved registers listed in \p CSI
/// before \p MI in \p MBB. The registers are first grouped by
/// computeCalleeSaveRegisterPairs(), then one store (single or paired) is
/// built per entry, walking the entries in reverse. When required, a shadow
/// call stack prolog is emitted first. Returns true.
bool AArch64FrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
bool NeedsWinCFI = needsWinCFI(MF);
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
bool NeedShadowCallStackProlog = false;
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
NeedShadowCallStackProlog, hasFP(MF));
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (NeedShadowCallStackProlog) {
// Shadow call stack prolog: str x30, [x18], #8
BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost))
.addReg(AArch64::X18, RegState::Define)
if (NeedsWinCFI)
BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) {
// Emit a CFI instruction that causes 8 to be subtracted from the value of
// x18 when unwinding past this frame.
static const char CFIInst[] = {
18, // register
2, // length
static_cast<char>(-8) & 0x7f, // addend (sleb128)
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
nullptr, StringRef(CFIInst, sizeof(CFIInst))));
// This instruction also makes x18 live-in to the entry block.
// Walk the register pairs in reverse order.
for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
++RPII) {
RegPairInfo RPI = *RPII;
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
unsigned StrOpc;
// Issue sequence of spills for cs regs. The first spill may be converted
// to a pre-decrement store later by emitPrologue if the callee-save stack
// area allocation can't be combined with the local stack area allocation.
// For example:
// stp x22, x21, [sp, #0] // addImm(+0)
// stp x20, x19, [sp, #16] // addImm(+2)
// stp fp, lr, [sp, #32] // addImm(+4)
// Rationale: This sequence saves uop updates compared to a sequence of
// pre-increment spills like stp xi,xj,[sp,#-16]!
// Note: Similar rationale and sequence for restores in epilog.
unsigned Size;
Align Alignment;
// Select the store opcode and the size/alignment used for the memory
// operand from the register kind.
switch (RPI.Type) {
case RegPairInfo::GPR:
StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
Size = 8;
Alignment = Align(8);
case RegPairInfo::FPR64:
StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
Size = 8;
Alignment = Align(8);
case RegPairInfo::FPR128:
StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
Size = 16;
Alignment = Align(16);
case RegPairInfo::ZPR:
StrOpc = AArch64::STR_ZXI;
Size = 16;
Alignment = Align(16);
case RegPairInfo::PPR:
StrOpc = AArch64::STR_PXI;
Size = 2;
Alignment = Align(2);
LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
dbgs() << ") -> fi#(" << RPI.FrameIdx;
if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
dbgs() << ")\n");
assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
"Windows unwdinding requires a consecutive (FP,LR) pair");
// Windows unwind codes require consecutive registers if registers are
// paired. Make the switch here, so that the code below will save (x,x+1)
// and not (x+1,x).
unsigned FrameIdxReg1 = RPI.FrameIdx;
unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
if (NeedsWinCFI && RPI.isPaired()) {
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
if (!MRI.isReserved(Reg1))
if (RPI.isPaired()) {
if (!MRI.isReserved(Reg2))
MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOStore, Size, Alignment));
MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
.addImm(RPI.Offset) // [sp, #offset*scale],
// where factor*scale is implicit
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
MachineMemOperand::MOStore, Size, Alignment));
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameSetup);
// Update the StackIDs of the SVE stack slots.
MachineFrameInfo &MFI = MF.getFrameInfo();
if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
MFI.setStackID(RPI.FrameIdx, TargetStackID::SVEVector);
return true;
/// Emits the loads that restore the callee-saved registers listed in \p CSI
/// before \p MI in \p MBB. The registers are grouped by
/// computeCalleeSaveRegisterPairs() and each entry is materialized through the
/// local EmitMI lambda. Scalable (SVE) entries are handled in reverse order;
/// the order of the remaining entries depends on ReverseCSRRestoreSeq. When
/// required, a shadow call stack epilog is emitted last. Returns true.
bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
bool NeedsWinCFI = needsWinCFI(MF);
// Reuse the debug location of the insertion point when there is one.
if (MI != MBB.end())
DL = MI->getDebugLoc();
bool NeedShadowCallStackProlog = false;
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs,
NeedShadowCallStackProlog, hasFP(MF));
// Builds the load instruction for a single register pair entry.
auto EmitMI = [&](const RegPairInfo &RPI) {
unsigned Reg1 = RPI.Reg1;
unsigned Reg2 = RPI.Reg2;
// Issue sequence of restores for cs regs. The last restore may be converted
// to a post-increment load later by emitEpilogue if the callee-save stack
// area allocation can't be combined with the local stack area allocation.
// For example:
// ldp fp, lr, [sp, #32] // addImm(+4)
// ldp x20, x19, [sp, #16] // addImm(+2)
// ldp x22, x21, [sp, #0] // addImm(+0)
// Note: see comment in spillCalleeSavedRegisters()
unsigned LdrOpc;
unsigned Size;
Align Alignment;
// Select the load opcode and the size/alignment used for the memory
// operand from the register kind.
switch (RPI.Type) {
case RegPairInfo::GPR:
LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
Size = 8;
Alignment = Align(8);
case RegPairInfo::FPR64:
LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
Size = 8;
Alignment = Align(8);
case RegPairInfo::FPR128:
LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
Size = 16;
Alignment = Align(16);
case RegPairInfo::ZPR:
LdrOpc = AArch64::LDR_ZXI;
Size = 16;
Alignment = Align(16);
case RegPairInfo::PPR:
LdrOpc = AArch64::LDR_PXI;
Size = 2;
Alignment = Align(2);
LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
dbgs() << ") -> fi#(" << RPI.FrameIdx;
if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
dbgs() << ")\n");
// Windows unwind codes require consecutive registers if registers are
// paired. Make the switch here, so that the code below will save (x,x+1)
// and not (x+1,x).
unsigned FrameIdxReg1 = RPI.FrameIdx;
unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
if (NeedsWinCFI && RPI.isPaired()) {
std::swap(Reg1, Reg2);
std::swap(FrameIdxReg1, FrameIdxReg2);
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
if (RPI.isPaired()) {
MIB.addReg(Reg2, getDefRegState(true));
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
MachineMemOperand::MOLoad, Size, Alignment));
MIB.addReg(Reg1, getDefRegState(true))
.addImm(RPI.Offset) // [sp, #offset*scale]
// where factor*scale is implicit
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
MachineMemOperand::MOLoad, Size, Alignment));
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
// SVE objects are always restored in reverse order.
for (const RegPairInfo &RPI : reverse(RegPairs))
if (RPI.isScalable())
// Non-scalable entries: reverse or forward order, chosen by
// ReverseCSRRestoreSeq.
if (ReverseCSRRestoreSeq) {
for (const RegPairInfo &RPI : reverse(RegPairs))
if (!RPI.isScalable())
} else
for (const RegPairInfo &RPI : RegPairs)
if (!RPI.isScalable())
if (NeedShadowCallStackProlog) {
// Shadow call stack epilog: ldr x30, [x18, #-8]!
BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre))
.addReg(AArch64::X18, RegState::Define)
.addReg(AArch64::LR, RegState::Define)
return true;
/// Determines the set of callee-saved registers to save for \p MF on top of
/// the target-independent TargetFrameLowering::determineCalleeSaves() result.
/// It also computes the sizes of the regular and SVE callee-save areas,
/// decides whether an extra register spill or an emergency spill slot is
/// needed for register scavenging on big stacks, and records the results in
/// AArch64FunctionInfo.
void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// Track one unused callee-saved GPR (and its pair partner) that could be
// spilled later to obtain a scratch register.
unsigned UnspilledCSGPR = AArch64::NoRegister;
unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
MachineFrameInfo &MFI = MF.getFrameInfo();
const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
? RegInfo->getBaseRegister()
: (unsigned)AArch64::NoRegister;
unsigned ExtraCSSpill = 0;
// Figure out which callee-saved registers to save/restore.
for (unsigned i = 0; CSRegs[i]; ++i) {
const unsigned Reg = CSRegs[i];
// Add the base pointer register to SavedRegs if it is callee-save.
if (Reg == BasePointerReg)
bool RegUsed = SavedRegs.test(Reg);
unsigned PairedReg = AArch64::NoRegister;
if (AArch64::GPR64RegClass.contains(Reg) ||
AArch64::FPR64RegClass.contains(Reg) ||
// The pair partner is the neighbouring entry in CSRegs (index with the
// lowest bit flipped).
PairedReg = CSRegs[i ^ 1];
if (!RegUsed) {
if (AArch64::GPR64RegClass.contains(Reg) &&
!RegInfo->isReservedReg(MF, Reg)) {
UnspilledCSGPR = Reg;
UnspilledCSGPRPaired = PairedReg;
// MachO's compact unwind format relies on all registers being stored in
// pairs.
// FIXME: the usual format is actually better if unwinding isn't needed.
if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister &&
!SavedRegs.test(PairedReg)) {
if (AArch64::GPR64RegClass.contains(PairedReg) &&
!RegInfo->isReservedReg(MF, PairedReg))
ExtraCSSpill = PairedReg;
if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
!Subtarget.isTargetWindows()) {
// For Windows calling convention on a non-windows OS, where X18 is treated
// as reserved, back up X18 when entering non-windows code (marked with the
// Windows calling convention) and restore when returning regardless of
// whether the individual function uses it - it might call other functions
// that clobber it.
// Calculates the callee saved stack size.
unsigned CSStackSize = 0;
unsigned SVECSStackSize = 0;
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
for (unsigned Reg : SavedRegs.set_bits()) {
// Register size in bytes.
auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
if (AArch64::PPRRegClass.contains(Reg) ||
SVECSStackSize += RegSize;
CSStackSize += RegSize;
// Save number of saved regs, so we can easily update CSStackSize later.
unsigned NumSavedRegs = SavedRegs.count();
// The frame record needs to be created by saving the appropriate registers
uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
if (hasFP(MF) ||
windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
for (unsigned Reg
: SavedRegs.set_bits()) dbgs()
<< ' ' << printReg(Reg, RegInfo);
dbgs() << "\n";);
// If any callee-saved registers are used, the frame cannot be eliminated.
int64_t SVEStackSize =
alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
// The CSR spill slots have not been allocated yet, so estimateStackSize
// won't include them.
unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
// Conservatively always assume BigStack when there are SVE spills.
bool BigStack = SVEStackSize ||
(EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit;
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
// Estimate if we might need to scavenge a register at some point in order
// to materialize a stack offset. If so, either spill one additional
// callee-saved register or reserve a special spill slot to facilitate
// register scavenging. If we already spilled an extra callee-saved register
// above to keep the number of spills even, we don't need to do anything else
// here.
if (BigStack) {
if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
<< " to get a scratch register.\n");
// MachO's compact unwind format relies on all registers being stored in
// pairs, so if we need to spill one extra for BigStack, then we need to
// store the pair.
if (produceCompactUnwindFrame(MF))
ExtraCSSpill = UnspilledCSGPR;
// If we didn't find an extra callee-saved register to spill, create
// an emergency spill slot.
if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass &RC = AArch64::GPR64RegClass;
unsigned Size = TRI->getSpillSize(RC);
Align Alignment = TRI->getSpillAlign(RC);
int FI = MFI.CreateStackObject(Size, Alignment, false);
LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
<< " as the emergency spill slot.\n");
// Adding the size of additional 64bit GPR saves.
CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
<< EstimatedStackSize + AlignedCSStackSize
<< " bytes.\n");
assert((!MFI.isCalleeSavedInfoValid() ||
AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
"Should not invalidate callee saved info");
// Round up to register pair alignment to avoid additional SP adjustment
// instructions.
// Free space exists exactly when rounding up to 16 bytes added padding.
AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
/// Returns true when the callee-save area of \p MF was recorded as having
/// free space (see AArch64FunctionInfo::hasCalleeSaveStackFreeSpace()).
bool AArch64FrameLowering::enableStackSlotScavenging(
const MachineFunction &MF) const {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
return AFI->hasCalleeSaveStackFreeSpace();
/// Returns true if there are any SVE callee saves.
/// On return, \p Min and \p Max hold the smallest and largest frame index of
/// the callee-saved ZPR/PPR registers. If none are found (or the callee-saved
/// info is not valid yet) they keep their sentinel values
/// (numeric_limits<int>::max() and ::min() respectively).
static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
int &Min, int &Max) {
// Sentinels: any real frame index replaces them in the loop below.
Min = std::numeric_limits<int>::max();
Max = std::numeric_limits<int>::min();
if (!MFI.isCalleeSavedInfoValid())
return false;
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
for (auto &CS : CSI) {
if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
AArch64::PPRRegClass.contains(CS.getReg())) {
// The SVE callee-save slots are expected to have consecutive frame indices.
assert((Max == std::numeric_limits<int>::min() ||
Max + 1 == CS.getFrameIdx()) &&
"SVE CalleeSaves are not consecutive");
Min = std::min(Min, CS.getFrameIdx());
Max = std::max(Max, CS.getFrameIdx());
// Min still holds the sentinel when no SVE callee save was seen.
return Min != std::numeric_limits<int>::max();
// Process all the SVE stack objects and determine offsets for each
// object. If AssignOffsets is true, the offsets get assigned.
// Fills in the first and last callee-saved frame indices into
// Min/MaxCSFrameIndex, respectively.
// Returns the size of the stack.
// Offsets are assigned as negated running totals, i.e. each object is placed
// at -Offset after Offset has been advanced past it.
static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
int &MinCSFrameIndex,
int &MaxCSFrameIndex,
bool AssignOffsets) {
+#ifndef NDEBUG
// First process all fixed stack objects.
- int64_t Offset = 0;
for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
- if (MFI.getStackID(I) == TargetStackID::SVEVector) {
- int64_t FixedOffset = -MFI.getObjectOffset(I);
- if (FixedOffset > Offset)
- Offset = FixedOffset;
- }
+ assert(MFI.getStackID(I) != TargetStackID::SVEVector &&
+ "SVE vectors should never be passed on the stack by value, only by "
+ "reference.");
// Helper that records the chosen offset for a frame index.
auto Assign = [&MFI](int FI, int64_t Offset) {
LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
MFI.setObjectOffset(FI, Offset);
+ int64_t Offset = 0;
// Then process all callee saved slots.
if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
- // Make sure to align the last callee save slot.
- MFI.setObjectAlignment(MaxCSFrameIndex, Align(16));
// Assign offsets to the callee save slots.
for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
// Reserve room for the slot, then round up to the slot's alignment.
Offset += MFI.getObjectSize(I);
Offset = alignTo(Offset, MFI.getObjectAlign(I));
if (AssignOffsets)
Assign(I, -Offset);
+ // Ensure that the Callee-save area is aligned to 16bytes.
+ Offset = alignTo(Offset, Align(16U));
// Create a buffer of SVE objects to allocate and sort it.
SmallVector<int, 8> ObjectsToAllocate;
for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
unsigned StackID = MFI.getStackID(I);
// The checks below filter on: non-SVE objects, callee-save slots (already
// handled above) and dead objects.
if (StackID != TargetStackID::SVEVector)
if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
if (MFI.isDeadObjectIndex(I))
// Allocate all SVE locals and spills
for (unsigned FI : ObjectsToAllocate) {
Align Alignment = MFI.getObjectAlign(FI);
// FIXME: Given that the length of SVE vectors is not necessarily a power of
// two, we'd need to align every object dynamically at runtime if the
// alignment is larger than 16. This is not yet supported.
if (Alignment > Align(16))
"Alignment of scalable vectors > 16 bytes is not yet supported");
Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
if (AssignOffsets)
Assign(FI, -Offset);
return Offset;
/// Returns the size computed by determineSVEStackObjectOffsets() without
/// assigning any offsets (AssignOffsets is passed as false). The callee-save
/// frame index range it computes is discarded.
int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
MachineFrameInfo &MFI) const {
int MinCSFrameIndex, MaxCSFrameIndex;
return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
/// Forwards to determineSVEStackObjectOffsets() and returns its result,
/// passing the callee-save frame index range back through \p MinCSFrameIndex
/// and \p MaxCSFrameIndex.
// NOTE(review): the final (AssignOffsets) argument is not visible here;
// presumably true given the function name -- confirm.
int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
/// Assigns offsets to the SVE stack objects of \p MF and records the aligned
/// SVE stack size and the SVE callee-save frame index range in
/// AArch64FunctionInfo. For functions with EH funclets it additionally
/// creates the fixed UnwindHelp object and emits, after the frame-setup
/// instructions of the entry block, code that stores -2 into it.
void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF, RegScavenger *RS) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
"Upwards growing stack unsupported");
int MinCSFrameIndex, MaxCSFrameIndex;
int64_t SVEStackSize =
assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// The recorded SVE stack size is rounded up to a multiple of 16.
AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
// If this function isn't doing Win64-style C++ EH, we don't need to do
// anything.
if (!MF.hasEHFunclets())
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
MachineBasicBlock &MBB = MF.front();
auto MBBI = MBB.begin();
// Loop over the leading instructions flagged FrameSetup in the entry block.
while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
// Create an UnwindHelp object.
// The UnwindHelp object is allocated at the start of the fixed object area
int64_t FixedObject =
getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
/*SPOffset*/ -FixedObject,
EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
// We need to store -2 into the UnwindHelp object at the start of the
// function.
DebugLoc DL;
unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
assert(DstReg && "There must be a free register after frame setup");
// Materialize -2 in the scratch register and store it; the register is
// killed by the store.
BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
.addReg(DstReg, getKillRegState(true))
namespace {
// A tag store instruction together with an offset and a size.
// NOTE(review): Offset/Size presumably describe the byte range the
// instruction tags; TagStoreEdit::addInstruction() asserts that consecutive
// entries satisfy Offset + Size == next.Offset -- confirm units with callers.
struct TagStoreInstr {
MachineInstr *MI;
int64_t Offset, Size;
explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
: MI(MI), Offset(Offset), Size(Size) {}
// Collects a run of adjacent tag store instructions in one basic block and
// emits equivalent replacement code for them (see emitCode(), which has the
// emitUnrolled() and emitLoop() helpers available).
class TagStoreEdit {
MachineFunction *MF;
MachineBasicBlock *MBB;
MachineRegisterInfo *MRI;
// Tag store instructions that are being replaced.
SmallVector<TagStoreInstr, 8> TagStores;
// Combined memref arguments of the above instructions.
SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
// Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
// FrameRegOffset + Size) with the address tag of SP.
Register FrameReg;
StackOffset FrameRegOffset;
int64_t Size;
// If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
Optional<int64_t> FrameRegUpdate;
// MIFlags for any FrameReg updating instructions.
unsigned FrameRegUpdateFlags;
// Use zeroing instruction variants.
bool ZeroData;
DebugLoc DL;
// Code generation helpers, defined out of line.
void emitUnrolled(MachineBasicBlock::iterator InsertI);
void emitLoop(MachineBasicBlock::iterator InsertI);
// Binds the edit to MBB and caches its parent function and register info.
TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
: MBB(MBB), ZeroData(ZeroData) {
MF = MBB->getParent();
MRI = &MF->getRegInfo();
// Add an instruction to be replaced. Instructions must be added in the
// ascending order of Offset, and have to be adjacent.
void addInstruction(TagStoreInstr I) {
assert((TagStores.empty() ||
TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
"Non-adjacent tag store instructions.");
// Forget all collected instructions.
void clear() { TagStores.clear(); }
// Emit equivalent code at the given location, and erase the current set of
// instructions. May skip if the replacement is not profitable. May invalidate
// the input iterator and replace it with a valid one.
void emitCode(MachineBasicBlock::iterator &InsertI,
const AArch64FrameLowering *TFI, bool IsLast);
// Emits a straight-line sequence of tag store instructions before InsertI
// that covers Size bytes starting at FrameReg + FrameRegOffset. A 32-byte
// store (ST2G/STZ2G) is used while more than 16 bytes remain, otherwise a
// 16-byte store (STG/STZG); the zeroing variants are chosen when ZeroData is
// set. Size is consumed (counted down to zero) by this function.
void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
const AArch64InstrInfo *TII =
// Byte offset limits checked below; the stores encode the offset divided
// by 16 as their immediate.
const int64_t kMinOffset = -256 * 16;
const int64_t kMaxOffset = 255 * 16;
Register BaseReg = FrameReg;
int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes();
// If the offsets would fall outside the limits, compute the start address
// into a scratch virtual register and address relative to it from offset 0.
if (BaseRegOffsetBytes < kMinOffset ||
BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
{BaseRegOffsetBytes, MVT::i8}, TII);
BaseReg = ScratchReg;
BaseRegOffsetBytes = 0;
MachineInstr *LastI = nullptr;
while (Size) {
int64_t InstrSize = (Size > 16) ? 32 : 16;
unsigned Opcode =
InstrSize == 16
? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
: (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
.addImm(BaseRegOffsetBytes / 16)
// A store to [BaseReg, #0] should go last for an opportunity to fold the
// final SP adjustment in the epilogue.
if (BaseRegOffsetBytes == 0)
LastI = I;
BaseRegOffsetBytes += InstrSize;
Size -= InstrSize;
// Move the remembered offset-0 store to the insertion point, i.e. after the
// other stores emitted above.
if (LastI)
MBB->splice(InsertI, MBB, LastI);
// Emits a tag store loop pseudo-instruction (STGloop_wback, or STZGloop_wback
// when ZeroData is set) before InsertI. Depending on FrameRegUpdate it is
// followed by either a post-indexed 16-byte tag store or an ADD/SUB that
// applies the remaining base register adjustment.
void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
const AArch64InstrInfo *TII =
// When the frame register has to be updated anyway, use it directly as the
// loop base; otherwise work on a fresh virtual register.
Register BaseReg = FrameRegUpdate
? FrameReg
: MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
int64_t LoopSize = Size;
// If the loop size is not a multiple of 32, split off one 16-byte store at
// the end to fold BaseReg update into.
if (FrameRegUpdate && *FrameRegUpdate)
LoopSize -= LoopSize % 32;
MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
TII->get(ZeroData ? AArch64::STZGloop_wback
: AArch64::STGloop_wback))
if (FrameRegUpdate)
// Adjustment still owed to BaseReg relative to the end of the tagged region
// (FrameRegOffset + Size).
int64_t ExtraBaseRegUpdate =
FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0;
if (LoopSize < Size) {
assert(Size - LoopSize == 16);
// Tag 16 more bytes at BaseReg and update BaseReg.
BuildMI(*MBB, InsertI, DL,
TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
.addImm(1 + ExtraBaseRegUpdate / 16)
} else if (ExtraBaseRegUpdate) {
// Update BaseReg.
*MBB, InsertI, DL,
TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
// Check whether *II is an in-place ADDXri / SUBXri update of Reg that can be
// merged into an STGloop ending at (Reg + Size). The update qualifies when the
// adjustment left over after the loop, |Offset - Size|, fits an unshifted
// ADDXri / SUBXri immediate and is a multiple of 16. On success *TotalOffset
// receives the full signed adjustment encoded by *II; on failure it is left
// untouched.
bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
                       int64_t Size, int64_t *TotalOffset) {
  MachineInstr &Update = *II;

  const bool IsAdd = Update.getOpcode() == AArch64::ADDXri;
  const bool IsSub = Update.getOpcode() == AArch64::SUBXri;
  if (!IsAdd && !IsSub)
    return false;

  // Source and destination must both be Reg.
  if (Update.getOperand(0).getReg() != Reg)
    return false;
  if (Update.getOperand(1).getReg() != Reg)
    return false;

  // Decode the signed byte adjustment applied by the instruction.
  const unsigned ShiftAmount =
      AArch64_AM::getShiftValue(Update.getOperand(3).getImm());
  int64_t Delta = Update.getOperand(2).getImm() << ShiftAmount;
  if (IsSub)
    Delta = -Delta;

  // Max encoding for unshifted ADDXri / SUBXri
  const int64_t kMaxOffset = 0xFFF;
  const int64_t Residual = std::abs(Delta - Size);
  if (Residual > kMaxOffset || Residual % 16 != 0)
    return false;

  *TotalOffset = Delta;
  return true;
}
// Collect the memory operands of every instruction in TSE into MemRefs.
// NOTE(review): the body of the memoperands_empty() branch and the closing
// braces are missing from this listing; per the comment below it is meant to
// leave the caller with an empty list.
void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
SmallVectorImpl<MachineMemOperand *> &MemRefs) {
for (auto &TS : TSE) {
MachineInstr *MI = TS.MI;
// An instruction without memory operands may access anything. Be
// conservative and return an empty list.
if (MI->memoperands_empty()) {
MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
// Replace the accumulated run of adjacent tag stores with an equivalent
// sequence at InsertI. Runs smaller than kSetTagLoopThreshold take the
// unrolled path, larger ones the loop path; when IsLast is set the loop path
// also tries to absorb an immediately following frame-register update.
// NOTE(review): many statements (early returns, the emitUnrolled/emitLoop
// calls, erasure of the old instructions, closing braces) are missing from
// this listing; the comments describe only what is visible.
void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
const AArch64FrameLowering *TFI, bool IsLast) {
if (TagStores.empty())
TagStoreInstr &FirstTagStore = TagStores[0];
TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
// Total span in bytes from the first store's offset to the end of the last.
Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
DL = TagStores[0].MI->getDebugLoc();
// Resolve the first store's frame offset into a (register, offset) pair.
Register Reg;
FrameRegOffset = TFI->resolveFrameOffsetReference(
*MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
/*PreferFP=*/false, /*ForSimm=*/true);
FrameReg = Reg;
FrameRegUpdate = None;
mergeMemRefs(TagStores, CombinedMemRefs);
LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
for (const auto &Instr
: TagStores) { dbgs() << " " << *Instr.MI; });
// Size threshold where a loop becomes shorter than a linear sequence of
// tagging instructions.
const int kSetTagLoopThreshold = 176;
if (Size < kSetTagLoopThreshold) {
if (TagStores.size() < 2)
} else {
MachineInstr *UpdateInstr = nullptr;
int64_t TotalOffset;
if (IsLast) {
// See if we can merge base register update into the STGloop.
// This is done in AArch64LoadStoreOptimizer for "normal" stores,
// but STGloop is way too unusual for that, and also it only
// realistically happens in function epilogue. Also, STGloop is expanded
// before that pass.
if (InsertI != MBB->end() &&
canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size,
&TotalOffset)) {
// Remember the update instruction and step InsertI past it.
UpdateInstr = &*InsertI++;
LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
<< *UpdateInstr);
if (!UpdateInstr && TagStores.size() < 2)
if (UpdateInstr) {
// Record the folded update and the flags of the instruction it replaces.
FrameRegUpdate = TotalOffset;
FrameRegUpdateFlags = UpdateInstr->getFlags();
if (UpdateInstr)
for (auto &TS : TagStores)
// Decide whether MI is a tag store that the merging code can handle. On
// success, Offset receives the frame-object offset of the tagged region
// (plus 16 * the immediate for the offset forms), Size its length in bytes
// and ZeroData whether the opcode is one of the zeroing (STZ*) variants.
// NOTE(review): an `else` before the bare `return false;` and the closing
// braces appear to be missing from this listing.
bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
int64_t &Size, bool &ZeroData) {
MachineFunction &MF = *MI.getParent()->getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Opcode = MI.getOpcode();
ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
Opcode == AArch64::STZ2GOffset);
// Loop pseudos: operands 0 and 1 must be dead, operand 2 an immediate size
// and operand 3 a frame index.
if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
return false;
if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
return false;
Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
Size = MI.getOperand(2).getImm();
return true;
// Single stores: 16 bytes for STG/STZG, 32 bytes for ST2G/STZ2G.
if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
Size = 16;
else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
Size = 32;
return false;
// Operand 0 must be SP and operand 1 a frame index.
if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
return false;
// The immediate operand is in units of 16 bytes.
Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
16 * MI.getOperand(2).getImm();
return true;
// Detect a run of memory tagging instructions for adjacent stack frame slots,
// and replace them with a shorter instruction sequence:
// * replace STG + STG with ST2G
// * replace STGloop + STGloop with STGloop
// This code needs to run when stack slot offsets are already known, but before
// FrameIndex operands in STG instructions are eliminated.
//
// II is the instruction to start from; the returned iterator is where the
// caller should resume scanning.
// NOTE(review): several statements (break/continue, the Count increment, the
// sort call, TSE.addInstruction/clear calls, closing braces) are missing from
// this listing; the comments describe only what is visible.
MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
const AArch64FrameLowering *TFI,
RegScavenger *RS) {
bool FirstZeroData;
int64_t Size, Offset;
MachineInstr &MI = *II;
MachineBasicBlock *MBB = MI.getParent();
// From here on II already points at the instruction after MI.
MachineBasicBlock::iterator NextI = ++II;
// Nothing to merge with if MI is the last instruction of the block.
if (&MI == &MBB->instr_back())
return II;
if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
return II;
SmallVector<TagStoreInstr, 4> Instrs;
Instrs.emplace_back(&MI, Offset, Size);
// Upper bound on how many other instructions the forward scan may step over.
constexpr int kScanLimit = 10;
int Count = 0;
for (MachineBasicBlock::iterator E = MBB->end();
NextI != E && Count < kScanLimit; ++NextI) {
MachineInstr &MI = *NextI;
bool ZeroData;
int64_t Size, Offset;
// Collect instructions that update memory tags with a FrameIndex operand
// and (when applicable) constant size, and whose output registers are dead
// (the latter is almost always the case in practice). Since these
// instructions effectively have no inputs or outputs, we are free to skip
// any non-aliasing instructions in between without tracking used registers.
if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
if (ZeroData != FirstZeroData)
Instrs.emplace_back(&MI, Offset, Size);
// Only count non-transient, non-tagging instructions toward the scan
// limit.
if (!MI.isTransient())
// Just in case, stop before the epilogue code starts.
if (MI.getFlag(MachineInstr::FrameSetup) ||
// Reject anything that may alias the collected instructions.
if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
// New code will be inserted after the last tagging instruction we've found.
MachineBasicBlock::iterator InsertI = Instrs.back().MI;
// Comparator ordering the collected stores by ascending offset.
[](const TagStoreInstr &Left, const TagStoreInstr &Right) {
return Left.Offset < Right.Offset;
// Make sure that we don't have any overlapping stores.
int64_t CurOffset = Instrs[0].Offset;
for (auto &Instr : Instrs) {
if (CurOffset > Instr.Offset)
return NextI;
CurOffset = Instr.Offset + Instr.Size;
// Find contiguous runs of tagged memory and emit shorter instruction
// sequences for them when possible.
TagStoreEdit TSE(MBB, FirstZeroData);
Optional<int64_t> EndOffset;
for (auto &Instr : Instrs) {
if (EndOffset && *EndOffset != Instr.Offset) {
// Found a gap.
TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
EndOffset = Instr.Offset + Instr.Size;
// Flush the final run; only this one may fold a following register update.
TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
return InsertI;
} // namespace
/// When StackTaggingMergeSetTag is enabled, run tryMergeAdjacentSTG over every
/// instruction position of every basic block in MF.
void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS = nullptr) const {
  if (!StackTaggingMergeSetTag)
    return;

  for (auto &Block : MF) {
    // tryMergeAdjacentSTG hands back the position to resume from, so the
    // cursor is advanced only through its return value.
    MachineBasicBlock::iterator Cursor = Block.begin();
    while (Cursor != Block.end())
      Cursor = tryMergeAdjacentSTG(Cursor, this, RS);
  }
}
/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
/// before the update. This is easily retrieved as it is exactly the offset
/// that is set in processFunctionBeforeFrameFinalized.
int AArch64FrameLowering::getFrameIndexReferencePreferSP(
const MachineFunction &MF, int FI, Register &FrameReg,
bool IgnoreSPUpdates) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
if (IgnoreSPUpdates) {
LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
<< MFI.getObjectOffset(FI) << "\n");
FrameReg = AArch64::SP;
return MFI.getObjectOffset(FI);
return getFrameIndexReference(MF, FI, FrameReg);
/// The parent frame offset (aka dispFrame) is only used on X86_64, where it
/// retrieves the parent's frame pointer; on AArch64 this always reports 0.
unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
    const MachineFunction &MF) const {
  constexpr unsigned kNoParentFrameOffset = 0;
  return kNoParentFrameOffset;
}
/// Funclets only need to account for space for the callee saved registers,
/// as the locals are accounted for in the parent's stack frame.
/// The result is the CSR size plus the maximum call frame size, rounded up
/// by alignTo.
/// NOTE(review): the CSSize initializer, the alignment argument of alignTo
/// and the closing brace are missing from this listing.
unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
const MachineFunction &MF) const {
// This is the size of the pushed CSRs.
unsigned CSSize =
// This is the amount of stack a funclet needs to allocate.
return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 9d0a6d9eaf25..444740cb50ab 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -1,122 +1,128 @@
//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "AArch64StackOffset.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
/// AArch64 implementation of the TargetFrameLowering interface.
/// NOTE(review): access specifiers, some return types and the closing braces
/// are missing from this listing; only comments were added here.
class AArch64FrameLowering : public TargetFrameLowering {
/// The stack grows down, both alignment arguments are Align(16) and stack
/// realignment is enabled.
explicit AArch64FrameLowering()
: TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16),
true /*StackRealignable*/) {}
emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const override;
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
/// Frame-index resolution. The two resolve* helpers return a StackOffset and
/// report the chosen base register through FrameReg.
int getFrameIndexReference(const MachineFunction &MF, int FI,
Register &FrameReg) const override;
StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI,
Register &FrameReg, bool PreferFP,
bool ForSimm) const;
StackOffset resolveFrameOffsetReference(const MachineFunction &MF,
int64_t ObjectOffset, bool isFixed,
bool isSVE, Register &FrameReg,
bool PreferFP, bool ForSimm) const;
/// Callee-saved register spill / restore hooks.
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
MutableArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
/// Can this function use the red zone for local allocations.
bool canUseRedZone(const MachineFunction &MF) const;
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
/// Returns true if the target will correctly handle shrink wrapping.
bool enableShrinkWrapping(const MachineFunction &MF) const override {
return true;
bool enableStackSlotScavenging(const MachineFunction &MF) const override;
TargetStackID::Value getStackIDForScalableVectors() const override;
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const override;
processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
RegScavenger *RS) const override;
/// Windows exception-handling frame queries.
unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
Register &FrameReg,
bool IgnoreSPUpdates) const override;
int getNonLocalFrameIndexReference(const MachineFunction &MF,
int FI) const override;
int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const;
/// Reports which stack IDs this target handles: Default, SVEVector and
/// NoAlloc return true.
/// NOTE(review): the `default:` label before `return false;` appears to be
/// missing from this listing.
bool isSupportedStackID(TargetStackID::Value ID) const override {
switch (ID) {
return false;
case TargetStackID::Default:
case TargetStackID::SVEVector:
case TargetStackID::NoAlloc:
return true;
+ bool isStackIdSafeForLocalArea(unsigned StackId) const override {
+ // We don't support putting SVE objects into the pre-allocated local
+ // frame block at the moment.
+ return StackId != TargetStackID::SVEVector;
+ }
// Remaining helper declarations of AArch64FrameLowering (stack-bump
// combining decisions and SVE stack object offset assignment).
// NOTE(review): presumably private; the access specifier is not visible here.
bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
uint64_t StackBumpBytes) const;
int64_t estimateSVEStackObjectOffsets(MachineFrameInfo &MF) const;
int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
int &MinCSFrameIndex,
int &MaxCSFrameIndex) const;
bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB,
unsigned StackBumpBytes) const;
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 10c477853353..7799ebfbd68e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1,4931 +1,4940 @@
//===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file defines an instruction selector for the AArch64 target.
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Function.h" // To access function attributes.
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "aarch64-isel"
/// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
/// instructions for SelectionDAG operations.
/// NOTE(review): access specifiers and closing braces are missing from this
/// listing.
namespace {
class AArch64DAGToDAGISel : public SelectionDAGISel {
/// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
const AArch64Subtarget *Subtarget;
/// Subtarget starts out null; it is assigned in runOnMachineFunction.
explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
CodeGenOpt::Level OptLevel)
: SelectionDAGISel(tm, OptLevel), Subtarget(nullptr) {}
StringRef getPassName() const override {
return "AArch64 Instruction Selection";
/// Caches the function's AArch64Subtarget, then defers to the base class.
bool runOnMachineFunction(MachineFunction &MF) override {
Subtarget = &MF.getSubtarget<AArch64Subtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
void Select(SDNode *Node) override;
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
// Declarations of operand matchers; each reports success through its bool
// result and returns the matched pieces through the SDValue out-parameters.
template <signed Low, signed High, signed Scale>
bool SelectRDVLImm(SDValue N, SDValue &Imm);
bool tryMLAV64LaneV128(SDNode *N);
bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
// Shifted-register wrappers: the arithmetic form passes AllowROR = false,
// the logical form passes AllowROR = true.
bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
return SelectShiftedRegister(N, false, Reg, Shift);
bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
return SelectShiftedRegister(N, true, Reg, Shift);
// Fixed-size wrappers around SelectAddrModeIndexed7S. The numeric suffix
// divided by 8 is passed as the Size argument (8 -> 1, ..., 128 -> 16).
bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
// Size-16 variants with an explicit immediate: signed with BW = 9, and
// unsigned with BW = 6.
bool SelectAddrModeIndexedS9S128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexedBitWidth(N, true, 9, 16, Base, OffImm);
bool SelectAddrModeIndexedU6S128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexedBitWidth(N, false, 6, 16, Base, OffImm);
// Fixed-size wrappers around SelectAddrModeIndexed (Size = suffix / 8).
bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 1, Base, OffImm);
bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 2, Base, OffImm);
bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 4, Base, OffImm);
bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 8, Base, OffImm);
bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeIndexed(N, 16, Base, OffImm);
// Fixed-size wrappers around SelectAddrModeUnscaled (Size = suffix / 8).
bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeUnscaled(N, 1, Base, OffImm);
bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeUnscaled(N, 2, Base, OffImm);
bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeUnscaled(N, 4, Base, OffImm);
bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeUnscaled(N, 8, Base, OffImm);
bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
return SelectAddrModeUnscaled(N, 16, Base, OffImm);
// Template front-ends for the register-offset addressing modes: Width is
// divided by 8 and passed as the Size argument of the non-template overload.
template<int Width>
bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
SDValue &SignExtend, SDValue &DoShift) {
return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
template<int Width>
bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset,
SDValue &SignExtend, SDValue &DoShift) {
return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
// Matches an UNDEF node, or an AArch64ISD::DUP whose operand 0 is an integer
// constant with null value or a floating-point constant zero.
// NOTE(review): braces and the `break;` / `default:` lines appear to be
// missing from this listing.
bool SelectDupZeroOrUndef(SDValue N) {
switch(N->getOpcode()) {
case ISD::UNDEF:
return true;
case AArch64ISD::DUP:
auto Opnd0 = N->getOperand(0);
if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
if (CN->isNullValue())
return true;
if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
if (CN->isZero())
return true;
return false;
// Matches an AArch64ISD::DUP whose operand 0 is an integer constant with
// null value or a floating-point constant zero.
// NOTE(review): braces and the `break;` / `default:` lines appear to be
// missing from this listing.
bool SelectDupZero(SDValue N) {
switch(N->getOpcode()) {
case AArch64ISD::DUP:
auto Opnd0 = N->getOperand(0);
if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
if (CN->isNullValue())
return true;
if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
if (CN->isZero())
return true;
return false;
// Template front-ends for the SVE immediate matchers: each forwards its
// template arguments as ordinary arguments to the non-template overload.
template<MVT::SimpleValueType VT>
bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) {
return SelectSVEAddSubImm(N, VT, Imm, Shift);
template<MVT::SimpleValueType VT>
bool SelectSVELogicalImm(SDValue N, SDValue &Imm) {
return SelectSVELogicalImm(N, VT, Imm);
template <unsigned Low, unsigned High>
bool SelectSVEShiftImm64(SDValue N, SDValue &Imm) {
return SelectSVEShiftImm64(N, Low, High, Imm);
// Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
//
// N must be a constant. When Shift is set the constant is a shift amount and
// the multiplier is 1 << N; otherwise the constant is the multiplier itself.
// The multiplier must be divisible by |Scale|; after dividing by Scale it must
// lie in [Min, Max]. On success Imm receives the scaled multiplier as an i32
// target constant.
template<signed Min, signed Max, signed Scale, bool Shift>
bool SelectCntImm(SDValue N, SDValue &Imm) {
  if (!isa<ConstantSDNode>(N))
    return false;

  int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
  if (Shift) {
    // Shifting by a negative amount or by the full width or more is undefined
    // behaviour. Amounts above 62 are rejected so the shifted value stays a
    // positive int64_t.
    if (MulImm < 0 || MulImm > 62)
      return false;
    MulImm = 1LL << MulImm;
  }

  if ((MulImm % std::abs(Scale)) != 0)
    return false;

  MulImm /= Scale;
  if ((MulImm >= Min) && (MulImm <= Max)) {
    Imm = CurDAG->getTargetConstant(MulImm, SDLoc(N), MVT::i32);
    return true;
  }

  return false;
}
/// Form sequences of consecutive 64/128-bit registers for use in NEON
/// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
/// between 1 and 4 elements. If it contains a single element, that element is
/// returned unchanged; otherwise a REG_SEQUENCE value is returned.
SDValue createDTuple(ArrayRef<SDValue> Vecs);
SDValue createQTuple(ArrayRef<SDValue> Vecs);
// Form a sequence of SVE registers for instructions using list of vectors,
// e.g. structured loads and stores (ldN, stN).
SDValue createZTuple(ArrayRef<SDValue> Vecs);
/// Generic helper for the createDTuple/createQTuple
/// functions. Those should almost always be called instead.
SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[],
const unsigned SubRegs[]);
// Declarations of node-selection helpers for table lookups, tag pointers and
// the multi-vector / lane load forms. NumVecs and Opc are supplied by the
// caller.
void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
bool tryIndexedLoad(SDNode *N);
bool trySelectStackSlotTagP(SDNode *N);
void SelectTagP(SDNode *N);
void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
unsigned SubRegIdx);
void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
unsigned SubRegIdx);
void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
- void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc);
+ void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale,
+ unsigned Opc_rr, unsigned Opc_ri);
// SVE frame-index addressing mode matcher.
bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
/// SVE Reg+Imm addressing mode.
template <int64_t Min, int64_t Max>
bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base,
SDValue &OffImm);
/// SVE Reg+Reg address mode. Forwards the Scale template argument to the
/// non-template overload.
template <unsigned Scale>
bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) {
return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
// Declarations of node-selection helpers for the multi-vector / lane store
// forms.
void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
- template <unsigned Scale>
- void SelectPredicatedStore(SDNode *N, unsigned NumVecs, const unsigned Opc_rr,
- const unsigned Opc_ri);
- template <unsigned Scale>
+ void SelectPredicatedStore(SDNode *N, unsigned NumVecs, unsigned Scale,
+ unsigned Opc_rr, unsigned Opc_ri);
std::tuple<unsigned, SDValue, SDValue>
- findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr,
- const unsigned Opc_ri, const SDValue &OldBase,
- const SDValue &OldOffset);
+ findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, unsigned Opc_ri,
+ const SDValue &OldBase, const SDValue &OldOffset,
+ unsigned Scale);
// Declarations of the "try*" selection helpers; each takes the node under
// selection and returns a bool.
bool tryBitfieldExtractOp(SDNode *N);
bool tryBitfieldExtractOpFromSExt(SDNode *N);
bool tryBitfieldInsertOp(SDNode *N);
bool tryBitfieldInsertInZeroOp(SDNode *N);
bool tryShiftAmountMod(SDNode *N);
bool tryHighFPExt(SDNode *N);
bool tryReadRegister(SDNode *N);
bool tryWriteRegister(SDNode *N);
// Include the pieces autogenerated from the target description.
// NOTE(review): the include path below is empty in this listing.
#include ""
// Non-template implementations behind the wrappers declared earlier in the
// class.
bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
SDValue &Shift);
// Signed form with BW = 7, forwarded to SelectAddrModeIndexedBitWidth.
bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm) {
return SelectAddrModeIndexedBitWidth(N, true, 7, Size, Base, OffImm);
bool SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm, unsigned BW,
unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
SDValue &OffImm);
bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base,
SDValue &Offset, SDValue &SignExtend,
SDValue &DoShift);
bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
SDValue &Offset, SDValue &SignExtend,
SDValue &DoShift);
bool isWorthFolding(SDValue V) const;
bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
SDValue &Offset, SDValue &SignExtend);
// Template front-end: forwards RegWidth as the Width argument.
template<unsigned RegWidth>
bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
bool SelectCMP_SWAP(SDNode *N);
// SVE immediate and addressing-mode matchers.
bool SelectSVE8BitLslImm(SDValue N, SDValue &Imm, SDValue &Shift);
bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm);
bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
bool SelectSVEShiftImm64(SDValue N, uint64_t Low, uint64_t High,
SDValue &Imm);
bool SelectSVEArithImm(SDValue N, SDValue &Imm);
bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
SDValue &Offset);
} // end anonymous namespace
/// isIntImmediate - This method tests to see if the node is a constant
/// operand. If so Imm will receive the 32-bit value.
static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
Imm = C->getZExtValue();
return true;
return false;
// isIntImmediate - SDValue overload: tests whether the value's node is a
// constant operand. If so, Imm receives the value.
static bool isIntImmediate(SDValue N, uint64_t &Imm) {
  const SDNode *Node = N.getNode();
  return isIntImmediate(Node, Imm);
}
// isOpcWithIntImmediate - This method tests to see if the node is a specific
// opcode and that it has a immediate integer right operand.
// If so Imm will receive the 32 bit value.
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
uint64_t &Imm) {
return N->getOpcode() == Opc &&
isIntImmediate(N->getOperand(1).getNode(), Imm);
// Select the memory operand of an inline-asm expression for the 'm' and 'Q'
// constraints; any other constraint ID hits llvm_unreachable.
// NOTE(review): the `default:` label, the node-construction call that
// produces NewOp, the push onto OutOps and the closing braces are missing
// from this listing.
bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
switch(ConstraintID) {
llvm_unreachable("Unexpected asm memory constraint");
case InlineAsm::Constraint_m:
case InlineAsm::Constraint_Q:
// We need to make sure that this one operand does not end up in XZR, thus
// require the address to be in a PointerRegClass register.
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF);
SDLoc dl(Op);
// The pointer register class ID, as an i64 target constant.
SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64);
SDValue NewOp =
dl, Op.getValueType(),
Op, RC), 0);
return false;
return true;
/// SelectArithImmed - Match an immediate that can be represented as a 12-bit
/// value shifted left by either 0 or 12. On success returns true with Val set
/// to the 12-bit value and Shift set to the shifter operand; on failure Val
/// and Shift are left untouched.
bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
                                           SDValue &Shift) {
  // The addsub_shifted_imm ComplexPattern lists [imm] as the opcodes it is
  // interested in, but that list is only used for root-level opcode matching,
  // so the operand still has to be checked for being an immediate here.
  if (!isa<ConstantSDNode>(N.getNode()))
    return false;

  uint64_t Bits = cast<ConstantSDNode>(N.getNode())->getZExtValue();
  unsigned LslAmount = 0;
  const bool FitsUnshifted = (Bits >> 12) == 0;
  if (!FitsUnshifted) {
    // Otherwise the low 12 bits must be clear and nothing may be set at or
    // above bit 24, so the value is a 12-bit field shifted left by 12.
    const bool FitsShiftedBy12 = (Bits & 0xfff) == 0 && (Bits >> 24) == 0;
    if (!FitsShiftedBy12)
      return false;
    LslAmount = 12;
    Bits >>= 12;
  }

  const unsigned ShifterEncoding =
      AArch64_AM::getShifterImm(AArch64_AM::LSL, LslAmount);
  SDLoc Loc(N);
  Val = CurDAG->getTargetConstant(Bits, Loc, MVT::i32);
  Shift = CurDAG->getTargetConstant(ShifterEncoding, Loc, MVT::i32);
  return true;
}
/// SelectNegArithImmed - As SelectArithImmed, but negates the value before
/// trying to select it.
/// NOTE(review): an `else` between the two negation statements and the tail
/// of the final call are missing from this listing.
bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
SDValue &Shift) {
// This function is called from the addsub_shifted_imm ComplexPattern,
// which lists [imm] as the list of opcodes it's interested in; however,
// we still need to check whether the operand is actually an immediate
// here because the ComplexPattern opcode list is only used in
// root-level opcode matching.
if (!isa<ConstantSDNode>(N.getNode()))
return false;
// The immediate operand must be a 24-bit zero-extended immediate.
uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
// This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
// have the opposite effect on the C flag, so this pattern mustn't match under
// those circumstances.
if (Immed == 0)
return false;
// Two's-complement negation, done in 32 bits for i32 values.
if (N.getValueType() == MVT::i32)
Immed = ~((uint32_t)Immed) + 1;
Immed = ~Immed + 1ULL;
// Reject results that do not fit in 24 bits.
if (Immed & 0xFFFFFFFFFF000000ULL)
return false;
Immed &= 0xFFFFFFULL;
return SelectArithImmed(CurDAG->getConstant(Immed, SDLoc(N), MVT::i32), Val,
/// getShiftTypeForNode - Translate a shift node to the corresponding
/// ShiftType value: SHL -> LSL, SRL -> LSR, SRA -> ASR, ROTR -> ROR.
/// NOTE(review): the `default:` label before the InvalidShiftExtend return
/// and the closing braces appear to be missing from this listing.
static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
switch (N.getOpcode()) {
return AArch64_AM::InvalidShiftExtend;
case ISD::SHL:
return AArch64_AM::LSL;
case ISD::SRL:
return AArch64_AM::LSR;
case ISD::SRA:
return AArch64_AM::ASR;
case ISD::ROTR:
return AArch64_AM::ROR;
/// Determine whether it is worth folding an SHL into the addressing mode.
/// V must be an ISD::SHL node. Folding is considered worthwhile when the
/// shift amount is a constant of at most three and no non-memory user of the
/// shift itself has a non-memory user.
static bool isWorthFoldingSHL(SDValue V) {
  assert(V.getOpcode() == ISD::SHL && "invalid opcode");

  // Only a logical shift by a constant of up to three places qualifies.
  auto *ShiftAmount = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (ShiftAmount == nullptr)
    return false;
  const unsigned Places = ShiftAmount->getZExtValue();
  if (Places > 3)
    return false;

  // If the shift feeds a non-memory operation whose result is in turn used by
  // a non-memory operation, the computation is kept anyway, so folding it
  // into the address gains nothing.
  const SDNode *ShiftNode = V.getNode();
  for (SDNode *User : ShiftNode->uses()) {
    if (isa<MemSDNode>(*User))
      continue;
    for (SDNode *UserOfUser : User->uses()) {
      if (!isa<MemSDNode>(*UserOfUser))
        return false;
    }
  }
  return true;
}
/// Determine whether it is worth folding V into an extended register.
/// NOTE(review): the second line of the first hasLSLFast() condition and the
/// closing braces are missing from this listing.
bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
// Trivial if we are optimizing for code size or if there is only
// one use of the value.
if (CurDAG->shouldOptForSize() || V.hasOneUse())
return true;
// If a subtarget has a fastpath LSL we can fold a logical shift into
// the addressing mode and save a cycle.
if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
return true;
// With fast LSL, an ADD is also worth folding when either operand is an SHL
// that isWorthFoldingSHL accepts.
if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
const SDValue LHS = V.getOperand(0);
const SDValue RHS = V.getOperand(1);
if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
return true;
if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
return true;
// It hurts otherwise, since the value will be reused.
return false;
/// SelectShiftedRegister - Select a "shifted register" operand. N must be a
/// shift or rotate by a constant amount. The logical instructions allow the
/// shifted register to be rotated but the arithmetic instructions do not;
/// AllowROR says whether ROR is acceptable. When the shift amount is a
/// constant, Reg and Shift are written even if the final isWorthFolding
/// check fails.
bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
                                                SDValue &Reg, SDValue &Shift) {
  const AArch64_AM::ShiftExtendType Kind = getShiftTypeForNode(N);
  const bool Unsupported = Kind == AArch64_AM::InvalidShiftExtend ||
                           (Kind == AArch64_AM::ROR && !AllowROR);
  if (Unsupported)
    return false;

  ConstantSDNode *Amount = dyn_cast<ConstantSDNode>(N.getOperand(1));
  if (!Amount)
    return false;

  // Reduce the shift amount modulo the operand width before encoding it.
  const unsigned Width = N.getValueSizeInBits();
  const unsigned Masked = Amount->getZExtValue() & (Width - 1);
  const unsigned Encoded = AArch64_AM::getShifterImm(Kind, Masked);

  Reg = N.getOperand(0);
  Shift = CurDAG->getTargetConstant(Encoded, SDLoc(N), MVT::i32);
  return isWorthFolding(N);
}
/// getExtendTypeForNode - Translate an extend node to the corresponding
/// ExtendType value. Handles sign extends, zero/any extends and AND with an
/// all-ones low mask. When IsLoadStore is set, only the 32-bit forms
/// (SXTW / UXTW) are returned; the byte and halfword forms are skipped.
/// NOTE(review): the SrcVT declaration, an `else`, the `default:` label of
/// the AndMask switch and the closing braces are missing from this listing.
static AArch64_AM::ShiftExtendType
getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
if (N.getOpcode() == ISD::SIGN_EXTEND ||
N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
// For SIGN_EXTEND_INREG the source type is carried by operand 1; otherwise
// it is the type of operand 0.
if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
SrcVT = N.getOperand(0).getValueType();
if (!IsLoadStore && SrcVT == MVT::i8)
return AArch64_AM::SXTB;
else if (!IsLoadStore && SrcVT == MVT::i16)
return AArch64_AM::SXTH;
else if (SrcVT == MVT::i32)
return AArch64_AM::SXTW;
assert(SrcVT != MVT::i64 && "extend from 64-bits?");
return AArch64_AM::InvalidShiftExtend;
} else if (N.getOpcode() == ISD::ZERO_EXTEND ||
N.getOpcode() == ISD::ANY_EXTEND) {
EVT SrcVT = N.getOperand(0).getValueType();
if (!IsLoadStore && SrcVT == MVT::i8)
return AArch64_AM::UXTB;
else if (!IsLoadStore && SrcVT == MVT::i16)
return AArch64_AM::UXTH;
else if (SrcVT == MVT::i32)
return AArch64_AM::UXTW;
assert(SrcVT != MVT::i64 && "extend from 64-bits?");
return AArch64_AM::InvalidShiftExtend;
} else if (N.getOpcode() == ISD::AND) {
// An AND with 0xFF / 0xFFFF / 0xFFFFFFFF is treated as an unsigned extend
// from 8 / 16 / 32 bits.
ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!CSD)
return AArch64_AM::InvalidShiftExtend;
uint64_t AndMask = CSD->getZExtValue();
switch (AndMask) {
return AArch64_AM::InvalidShiftExtend;
case 0xFF:
return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
case 0xFFFF:
return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
case 0xFFFFFFFF:
return AArch64_AM::UXTW;
return AArch64_AM::InvalidShiftExtend;
// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
//
// Matches DUPLANE16/DUPLANE32 whose vector operand is an INSERT_SUBVECTOR of
// an EXTRACT_SUBVECTOR. On success, LaneOp is set to the vector the
// subvector was extracted from and LaneIdx to the sum of the dup lane index
// and the extract index; returns true. Returns false on any mismatch and
// leaves the outputs untouched.
static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
if (DL->getOpcode() != AArch64ISD::DUPLANE16 &&
DL->getOpcode() != AArch64ISD::DUPLANE32)
return false;
SDValue SV = DL->getOperand(0);
if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
return false;
SDValue EV = SV.getOperand(1);
if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return false;
// Both index operands are assumed to be constants (cast<> asserts this).
ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
LaneOp = EV.getOperand(0);
return true;
// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a
// high lane extract.
//
// Tries Op0 first and then Op1 as the lane-extract operand (the pair is
// swapped locally for the second attempt). On success StdOp receives the
// other operand, while LaneOp/LaneIdx are filled in by checkHighLaneIndex.
static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
SDValue &LaneOp, int &LaneIdx) {
if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
std::swap(Op0, Op1);
if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
return false;
StdOp = Op1;
return true;
/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand
/// is a lane in the upper half of a 128-bit vector. Recognize and select this
/// so that we don't emit unnecessary lane extracts.
///
/// Looks for a MUL in either operand of N whose operands satisfy
/// checkV64LaneV128, then replaces N with the indexed MLA machine node for
/// N's result type. Returns true if N was replaced, false if no match.
// NOTE(review): the switch below shows no `default:` label or `break;`
// statements in this listing; they look like extraction losses -- confirm.
bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) {
SDLoc dl(N);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue MLAOp1; // Will hold ordinary multiplicand for MLA.
SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA.
int LaneIdx = -1; // Will hold the lane index.
// Try operand 1 as the multiply; if that fails, swap and try operand 0.
if (Op1.getOpcode() != ISD::MUL ||
!checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
LaneIdx)) {
std::swap(Op0, Op1);
if (Op1.getOpcode() != ISD::MUL ||
!checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
return false;
SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
// Operand order: accumulator, plain multiplicand, lane vector, lane index.
SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
unsigned MLAOpc = ~0U;
switch (N->getSimpleValueType(0).SimpleTy) {
llvm_unreachable("Unrecognized MLA.");
case MVT::v4i16:
MLAOpc = AArch64::MLAv4i16_indexed;
case MVT::v8i16:
MLAOpc = AArch64::MLAv8i16_indexed;
case MVT::v2i32:
MLAOpc = AArch64::MLAv2i32_indexed;
case MVT::v4i32:
MLAOpc = AArch64::MLAv4i32_indexed;
ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops));
return true;
/// Select an indexed SMULL/UMULL for the NEON smull/umull intrinsic \p IntNo
/// when one of N's operands 1 and 2 is a high lane extract (as recognized by
/// checkV64LaneV128). Replaces N and returns true on a match, otherwise
/// returns false. Any other intrinsic id reaches llvm_unreachable.
// NOTE(review): the declarations of SMULLOp0/SMULLOp1, the tail of the
// checkV64LaneV128 call, and the `default:`/`break;` lines of both switches
// are not present in this listing -- confirm against upstream.
bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) {
SDLoc dl(N);
int LaneIdx;
if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
return false;
SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
unsigned SMULLOpc = ~0U;
// Pick the opcode from the intrinsic (signed/unsigned) and the result type.
if (IntNo == Intrinsic::aarch64_neon_smull) {
switch (N->getSimpleValueType(0).SimpleTy) {
llvm_unreachable("Unrecognized SMULL.");
case MVT::v4i32:
SMULLOpc = AArch64::SMULLv4i16_indexed;
case MVT::v2i64:
SMULLOpc = AArch64::SMULLv2i32_indexed;
} else if (IntNo == Intrinsic::aarch64_neon_umull) {
switch (N->getSimpleValueType(0).SimpleTy) {
llvm_unreachable("Unrecognized SMULL.");
case MVT::v4i32:
SMULLOpc = AArch64::UMULLv4i16_indexed;
case MVT::v2i64:
SMULLOpc = AArch64::UMULLv2i32_indexed;
} else
llvm_unreachable("Unrecognized intrinsic.");
ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops));
return true;
/// Instructions that accept extend modifiers like UXTW expect the register
/// being extended to be a GPR32, but the incoming DAG might be acting on a
/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
/// this is the case.
///
/// Returns N unchanged when it is already i32; otherwise returns a new
/// EXTRACT_SUBREG machine node taking the sub_32 subregister of N as i32.
static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
if (N.getValueType() == MVT::i32)
return N;
SDLoc dl(N);
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
dl, MVT::i32, N, SubReg);
return SDValue(Node, 0);
// Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
//
// Matches only when N is a constant that is a multiple of |Scale| and whose
// quotient N / Scale lies in the inclusive range [Low, High]. On success Imm
// is set to that quotient as an i32 target constant and true is returned;
// otherwise false.
template<signed Low, signed High, signed Scale>
bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) {
if (!isa<ConstantSDNode>(N))
return false;
int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
// Divisibility is tested against |Scale|; the division keeps Scale's sign.
if ((MulImm % std::abs(Scale)) == 0) {
int64_t RDVLImm = MulImm / Scale;
if ((RDVLImm >= Low) && (RDVLImm <= High)) {
Imm = CurDAG->getTargetConstant(RDVLImm, SDLoc(N), MVT::i32);
return true;
return false;
/// SelectArithExtendedRegister - Select a "extended register" operand. This
/// operand folds in an extend followed by an optional left shift.
///
/// Accepts either an extend node, or an SHL by a constant of at most 4 whose
/// shifted operand is an extend. On a match, Reg receives the (narrowed)
/// value being extended and Shift the encoded arith-extend immediate; the
/// result is then isWorthFolding(N). Returns false when nothing matches.
// NOTE(review): the final argument of the getTargetConstant call for Shift
// and the closing braces are missing from this listing -- confirm upstream.
bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
SDValue &Shift) {
unsigned ShiftVal = 0;
AArch64_AM::ShiftExtendType Ext;
if (N.getOpcode() == ISD::SHL) {
ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!CSD)
return false;
ShiftVal = CSD->getZExtValue();
// Shift amounts above 4 are rejected.
if (ShiftVal > 4)
return false;
Ext = getExtendTypeForNode(N.getOperand(0));
if (Ext == AArch64_AM::InvalidShiftExtend)
return false;
Reg = N.getOperand(0).getOperand(0);
} else {
Ext = getExtendTypeForNode(N);
if (Ext == AArch64_AM::InvalidShiftExtend)
return false;
Reg = N.getOperand(0);
// Don't match if free 32-bit -> 64-bit zext can be used instead.
if (Ext == AArch64_AM::UXTW &&
Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
return false;
// AArch64 mandates that the RHS of the operation must use the smallest
// register class that could contain the size being extended from. Thus,
// if we're folding a (sext i8), we need the RHS to be a GPR32, even though
// there might not be an actual 32-bit value in the program. We can
// (harmlessly) synthesize one by injected an EXTRACT_SUBREG here.
assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX);
Reg = narrowIfNeeded(CurDAG, Reg);
Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
return isWorthFolding(N);
/// If there's a use of this ADDlow that's not itself a load/store then we'll
/// need to create a real ADD instruction from it anyway and there's no point in
/// folding it into the mem op. Theoretically, it shouldn't matter, but there's
/// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding
/// leads to duplicated ADRP instructions.
///
/// Returns true only if every use of N is a LOAD, STORE, ATOMIC_LOAD or
/// ATOMIC_STORE whose ordering is not stronger than monotonic.
static bool isWorthFoldingADDlow(SDValue N) {
for (auto Use : N->uses()) {
if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
Use->getOpcode() != ISD::ATOMIC_LOAD &&
Use->getOpcode() != ISD::ATOMIC_STORE)
return false;
// ldar and stlr have much more restrictive addressing modes (just a
// register).
if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getOrdering()))
return false;
return true;
/// SelectAddrModeIndexedBitWidth - Select a "register plus scaled (un)signed BW-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
///
/// \p IsSignedImm selects between a signed range of [-2^(BW-1), 2^(BW-1)) and
/// an unsigned range of [0, 2^BW), both in units of Size. The offset must be
/// a multiple of Size. A FrameIndex base is rewritten to a target frame
/// index. As written here the function always returns true: if no
/// base+offset form matches, Base is N itself and OffImm is 0.
bool AArch64DAGToDAGISel::SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm,
unsigned BW, unsigned Size,
SDValue &Base,
SDValue &OffImm) {
SDLoc dl(N);
const DataLayout &DL = CurDAG->getDataLayout();
const TargetLowering *TLI = getTargetLowering();
// A bare frame index is addressed with offset 0.
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
return true;
// As opposed to the (12-bit) Indexed addressing mode below, the 7/9-bit signed
// selected here doesn't support labels/immediates, only base+offset.
if (CurDAG->isBaseWithConstantOffset(N)) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
if (IsSignedImm) {
int64_t RHSC = RHS->getSExtValue();
unsigned Scale = Log2_32(Size);
int64_t Range = 0x1LL << (BW - 1);
// Offset must be Size-aligned and within the scaled signed range.
if ((RHSC & (Size - 1)) == 0 && RHSC >= -(Range << Scale) &&
RHSC < (Range << Scale)) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
// The emitted immediate is the byte offset divided by Size.
OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
return true;
} else {
// unsigned Immediate
uint64_t RHSC = RHS->getZExtValue();
unsigned Scale = Log2_32(Size);
uint64_t Range = 0x1ULL << BW;
if ((RHSC & (Size - 1)) == 0 && RHSC < (Range << Scale)) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
return true;
// Base only. The address will be materialized into a register before
// the memory is accessed.
// add x0, Xbase, #offset
// stp x1, x2, [x0]
Base = N;
OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
return true;
/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
///
/// Tries, in order: a bare frame index; an ADDlow node worth folding; a
/// base plus a non-negative, Size-aligned constant below 0x1000 * Size. If
/// none match and the unscaled form can handle N, returns false so that form
/// is used instead. Otherwise falls back to Base = N with a zero offset.
// NOTE(review): the initializer of GAN (L44507) and closing braces are
// missing from this listing -- confirm against upstream.
bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
SDValue &Base, SDValue &OffImm) {
SDLoc dl(N);
const DataLayout &DL = CurDAG->getDataLayout();
const TargetLowering *TLI = getTargetLowering();
if (N.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
return true;
if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) {
GlobalAddressSDNode *GAN =
Base = N.getOperand(0);
OffImm = N.getOperand(1);
// Non-global operands are accepted as-is; globals must have an offset and
// pointer alignment compatible with Size.
if (!GAN)
return true;
if (GAN->getOffset() % Size == 0 &&
GAN->getGlobal()->getPointerAlignment(DL) >= Size)
return true;
if (CurDAG->isBaseWithConstantOffset(N)) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
int64_t RHSC = (int64_t)RHS->getZExtValue();
unsigned Scale = Log2_32(Size);
if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
return true;
// Before falling back to our general case, check if the unscaled
// instructions can handle this. If so, that's preferable.
if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
return false;
// Base only. The address will be materialized into a register before
// the memory is accessed.
// add x0, Xbase, #offset
// ldr x0, [x0]
Base = N;
OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
return true;
/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
/// immediate" address. This should only match when there is an offset that
/// is not valid for a scaled immediate addressing mode. The "Size" argument
/// is the size in bytes of the memory reference, which is needed here to know
/// what is valid for a scaled immediate.
///
/// Matches only base + constant where the constant is in [-256, 256) and is
/// not encodable as a scaled immediate. On success Base is the base (a
/// FrameIndex is rewritten to a target frame index) and OffImm the raw byte
/// offset as an i64 target constant.
bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
SDValue &Base,
SDValue &OffImm) {
if (!CurDAG->isBaseWithConstantOffset(N))
return false;
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
int64_t RHSC = RHS->getSExtValue();
// If the offset is valid as a scaled immediate, don't match here.
if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
RHSC < (0x1000 << Log2_32(Size)))
return false;
if (RHSC >= -256 && RHSC < 256) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
const TargetLowering *TLI = getTargetLowering();
Base = CurDAG->getTargetFrameIndex(
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64);
return true;
return false;
/// Produce an i64 value whose sub_32 subregister is N: an IMPLICIT_DEF of type
/// i64 is created and N is placed into it with INSERT_SUBREG. The remaining
/// bits come from the IMPLICIT_DEF.
static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
SDLoc dl(N);
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
SDValue ImpDef = SDValue(
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::i64), 0);
MachineSDNode *Node = CurDAG->getMachineNode(
TargetOpcode::INSERT_SUBREG, dl, MVT::i64, ImpDef, N, SubReg);
return SDValue(Node, 0);
/// Check if the given SHL node (\p N), can be used to form an
/// extended register for an addressing mode.
///
/// The shift amount must be a constant in [0, 7] and, further, either 0 or
/// Log2(Size). With \p WantExtend the shifted operand must itself be a
/// load/store-compatible extend; Offset is then the narrowed extend source
/// and SignExtend is 1 for SXTW. Without it, Offset is the shifted operand
/// and SignExtend is 0. The final answer is isWorthFolding(N).
// NOTE(review): the type argument closing the SignExtend getTargetConstant
// call (L44587) is missing from this listing -- confirm against upstream.
bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
bool WantExtend, SDValue &Offset,
SDValue &SignExtend) {
assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
// Reject non-constant shifts and amounts that do not fit in three bits.
if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue())
return false;
SDLoc dl(N);
if (WantExtend) {
AArch64_AM::ShiftExtendType Ext =
getExtendTypeForNode(N.getOperand(0), true);
if (Ext == AArch64_AM::InvalidShiftExtend)
return false;
Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
} else {
Offset = N.getOperand(0);
SignExtend = CurDAG->getTargetConstant(0, dl, MVT::i32);
// Only "no shift" or a shift equal to log2 of the access size is legal.
unsigned LegalShiftVal = Log2_32(Size);
unsigned ShiftVal = CSD->getZExtValue();
if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
return false;
return isWorthFolding(N);
/// Match an ADD address of the form base + extended offset, where the offset
/// is an extend accepted by getExtendTypeForNode(..., /*IsLoadStore=*/true),
/// optionally shifted left (see SelectExtendedSHL). \p Size is the access
/// size in bytes. On success Base/Offset hold the two registers, SignExtend
/// is 1 for a sign extension (SXTW) and DoShift is 1 when a shift was folded.
/// Returns false for non-ADD nodes, ADDs with a constant operand, ADDs with
/// a non-memory user, and when no extend form matches.
// NOTE(review): closing braces and the type argument of two SignExtend
// getTargetConstant calls are missing from this listing -- confirm upstream.
bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
SDValue &Base, SDValue &Offset,
SDValue &SignExtend,
SDValue &DoShift) {
if (N.getOpcode() != ISD::ADD)
return false;
SDValue LHS = N.getOperand(0);
SDValue RHS = N.getOperand(1);
SDLoc dl(N);
// We don't want to match immediate adds here, because they are better lowered
// to the register-immediate addressing modes.
if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
return false;
// Check if this particular node is reused in any non-memory related
// operation. If yes, do not try to fold this node into the address
// computation, since the computation will be kept.
const SDNode *Node = N.getNode();
for (SDNode *UI : Node->uses()) {
if (!isa<MemSDNode>(*UI))
return false;
// Remember if it is worth folding N when it produces extended register.
bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
// Try to match a shifted extend on the RHS.
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) {
Base = LHS;
DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
return true;
// Try to match a shifted extend on the LHS.
if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) {
Base = RHS;
DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
return true;
// There was no shift, whatever else we find.
DoShift = CurDAG->getTargetConstant(false, dl, MVT::i32);
AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend;
// Try to match an unshifted extend on the LHS.
if (IsExtendedRegisterWorthFolding &&
(Ext = getExtendTypeForNode(LHS, true)) !=
AArch64_AM::InvalidShiftExtend) {
Base = RHS;
Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
if (isWorthFolding(LHS))
return true;
// Try to match an unshifted extend on the RHS.
if (IsExtendedRegisterWorthFolding &&
(Ext = getExtendTypeForNode(RHS, true)) !=
AArch64_AM::InvalidShiftExtend) {
Base = LHS;
Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
if (isWorthFolding(RHS))
return true;
return false;
// Check if the given immediate is preferred by ADD. If an immediate can be
// encoded in an ADD, or it can be encoded in an "ADD LSL #12" and can not be
// encoded by one MOVZ, return true.
//
// ImmOff is the candidate immediate. Returns true when:
//   * only bits [11:0] are set (plain ADD immediate), or
//   * only bits [23:12] are set and those bits are confined to neither
//     bits [15:12] nor bits [23:16], the two sub-ranges where one MOVZ with a
//     16-bit-aligned payload would already cover the value.
// Returns false for everything else, including negative values (callers
// check the SUB form by passing the negated immediate).
static bool isPreferredADD(int64_t ImmOff) {
  // Constant in [0x0, 0xfff] can be encoded in ADD.
  if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
    return true;

  // Check if it can be encoded in an "ADD LSL #12".
  if ((ImmOff & 0xffffffffff000fffLL) == 0x0LL) {
    // As a single MOVZ is faster than a "ADD of LSL #12", ignore such constant.
    return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
           (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
  }

  return false;
}
/// Match an ADD address of the form base + register offset, where the offset
/// may be a left shift accepted by SelectExtendedSHL (without an extend).
/// \p Size is the access size in bytes. A constant RHS that is neither
/// encodable as a scaled load/store immediate nor preferred by a single
/// ADD/SUB is materialized with MOVi64imm so the reg+reg form can be used.
/// SignExtend is always 0 on the unshifted path; DoShift reports whether a
/// shift was folded. Returns false for non-ADD nodes, ADDs with a non-memory
/// user, and constants better served by other addressing modes.
bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
SDValue &Base, SDValue &Offset,
SDValue &SignExtend,
SDValue &DoShift) {
if (N.getOpcode() != ISD::ADD)
return false;
SDValue LHS = N.getOperand(0);
SDValue RHS = N.getOperand(1);
SDLoc DL(N);
// Check if this particular node is reused in any non-memory related
// operation. If yes, do not try to fold this node into the address
// computation, since the computation will be kept.
const SDNode *Node = N.getNode();
for (SDNode *UI : Node->uses()) {
if (!isa<MemSDNode>(*UI))
return false;
// Watch out if RHS is a wide immediate, it can not be selected into
// [BaseReg+Imm] addressing mode. Also it may not be able to be encoded into
// ADD/SUB. Instead it will use [BaseReg + 0] address mode and generate
// instructions like:
// MOV X0, WideImmediate
// ADD X1, BaseReg, X0
// LDR X2, [X1, 0]
// For such situation, using [BaseReg, XReg] addressing mode can save one
// MOV X0, WideImmediate
// LDR X2, [BaseReg, X0]
if (isa<ConstantSDNode>(RHS)) {
int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue();
unsigned Scale = Log2_32(Size);
// Skip the immediate can be selected by load/store addressing mode.
// Also skip the immediate can be encoded by a single ADD (SUB is also
// checked by using -ImmOff).
if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
return false;
SDValue Ops[] = { RHS };
SDNode *MOVI =
CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
SDValue MOVIV = SDValue(MOVI, 0);
// This ADD of two X register will be selected into [Reg+Reg] mode.
N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV);
// Remember if it is worth folding N when it produces extended register.
bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
// Try to match a shifted extend on the RHS.
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) {
Base = LHS;
DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
return true;
// Try to match a shifted extend on the LHS.
if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) {
Base = RHS;
DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
return true;
// Match any non-shifted, non-extend, non-immediate add expression.
Base = LHS;
Offset = RHS;
SignExtend = CurDAG->getTargetConstant(false, DL, MVT::i32);
DoShift = CurDAG->getTargetConstant(false, DL, MVT::i32);
// Reg1 + Reg2 is free: no check needed.
return true;
/// Build a register tuple from \p Regs using the DD/DDD/DDDD register classes
/// and the dsub0..dsub3 subregister indices; see createTuple for the details.
SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
static const unsigned RegClassIDs[] = {
AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
AArch64::dsub2, AArch64::dsub3};
return createTuple(Regs, RegClassIDs, SubRegs);
/// Build a register tuple from \p Regs using the QQ/QQQ/QQQQ register classes
/// and the qsub0..qsub3 subregister indices; see createTuple for the details.
SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
static const unsigned RegClassIDs[] = {
AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
AArch64::qsub2, AArch64::qsub3};
return createTuple(Regs, RegClassIDs, SubRegs);
/// Build a register tuple from \p Regs using the ZPR tuple register classes
/// and the zsub0..zsub3 subregister indices; see createTuple for the details.
// NOTE(review): the RegClassIDs initializer is truncated after
// ZPR2RegClassID in this listing -- confirm the remaining entries upstream.
SDValue AArch64DAGToDAGISel::createZTuple(ArrayRef<SDValue> Regs) {
static const unsigned RegClassIDs[] = {AArch64::ZPR2RegClassID,
static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1,
AArch64::zsub2, AArch64::zsub3};
return createTuple(Regs, RegClassIDs, SubRegs);
/// Combine 1 to 4 registers into a single value. A single register is
/// returned unchanged; 2 to 4 registers are wrapped in an untyped
/// REG_SEQUENCE node. \p RegClassIDs is indexed by Regs.size() - 2 and
/// \p SubRegs by the position of each register in \p Regs.
// NOTE(review): the `Ops.push_back(` opening the register-class operand and
// the push of Regs[i] inside the loop are not present in this listing --
// confirm against upstream.
SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
const unsigned RegClassIDs[],
const unsigned SubRegs[]) {
// There's no special register-class for a vector-list of 1 element: it's just
// a vector.
if (Regs.size() == 1)
return Regs[0];
assert(Regs.size() >= 2 && Regs.size() <= 4);
SDLoc DL(Regs[0]);
SmallVector<SDValue, 4> Ops;
// First operand of REG_SEQUENCE is the desired RegClass.
CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], DL, MVT::i32));
// Then we get pairs of source & subregister-position for the components.
for (unsigned i = 0; i < Regs.size(); ++i) {
Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], DL, MVT::i32));
SDNode *N =
CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
return SDValue(N, 0);
/// Replace N with machine opcode \p Opc, packing \p NumVecs consecutive
/// operands of N into a Q-register tuple. When \p isExt is set the vector
/// operands start one position later (operand 1 is skipped over) and the
/// operand following the vectors is appended to the machine node's operands.
// NOTE(review): the lines that add the leading operand(s) and RegSeq to Ops
// are not present in this listing -- confirm against upstream.
void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc,
bool isExt) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
// isExt doubles as a 0/1 operand offset.
unsigned ExtOff = isExt;
// Form a REG_SEQUENCE to force register allocation.
unsigned Vec0Off = ExtOff + 1;
SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
N->op_begin() + Vec0Off + NumVecs);
SDValue RegSeq = createQTuple(Regs);
SmallVector<SDValue, 6> Ops;
if (isExt)
Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
/// Select a pre- or post-indexed load for the LoadSDNode \p N.
///
/// Returns false for unindexed loads and for memory types without a matching
/// opcode. Otherwise picks the pre/post opcode from the memory VT, result VT
/// and extension type, emits the machine node (results: written-back base as
/// i64, loaded value, chain), rewires all three results of N and returns
/// true. When InsertTo64 is set, the 32-bit load result is wrapped in
/// SUBREG_TO_REG to form the i64 value.
// NOTE(review): several `else` lines, closing braces, and parts of the
// SUBREG_TO_REG construction are missing from this listing -- confirm
// against upstream before relying on the exact control flow shown here.
bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
if (LD->isUnindexed())
return false;
EVT VT = LD->getMemoryVT();
EVT DstVT = N->getValueType(0);
ISD::MemIndexedMode AM = LD->getAddressingMode();
bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
// We're not doing validity checking here. That was done when checking
// if we should mark the load as indexed or not. We're just selecting
// the right instruction.
unsigned Opcode = 0;
ISD::LoadExtType ExtType = LD->getExtensionType();
bool InsertTo64 = false;
if (VT == MVT::i64)
Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost;
else if (VT == MVT::i32) {
if (ExtType == ISD::NON_EXTLOAD)
Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
else if (ExtType == ISD::SEXTLOAD)
Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
else {
Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
InsertTo64 = true;
// The result of the load is only i32. It's the subreg_to_reg that makes
// it into an i64.
DstVT = MVT::i32;
} else if (VT == MVT::i16) {
if (ExtType == ISD::SEXTLOAD) {
if (DstVT == MVT::i64)
Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
} else {
Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
InsertTo64 = DstVT == MVT::i64;
// The result of the load is only i32. It's the subreg_to_reg that makes
// it into an i64.
DstVT = MVT::i32;
} else if (VT == MVT::i8) {
if (ExtType == ISD::SEXTLOAD) {
if (DstVT == MVT::i64)
Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
} else {
Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
InsertTo64 = DstVT == MVT::i64;
// The result of the load is only i32. It's the subreg_to_reg that makes
// it into an i64.
DstVT = MVT::i32;
} else if (VT == MVT::f16) {
Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
} else if (VT == MVT::bf16) {
Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
} else if (VT == MVT::f32) {
Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
} else if (VT == MVT::f64 || VT.is64BitVector()) {
Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
} else if (VT.is128BitVector()) {
Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
} else
return false;
SDValue Chain = LD->getChain();
SDValue Base = LD->getBasePtr();
// The offset operand is assumed to be a constant (cast<> asserts this).
ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
int OffsetVal = (int)OffsetOp->getZExtValue();
SDLoc dl(N);
SDValue Offset = CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
SDValue Ops[] = { Base, Offset, Chain };
SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
MVT::Other, Ops);
// Either way, we're replacing the node, so tell the caller that.
SDValue LoadedVal = SDValue(Res, 1);
if (InsertTo64) {
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
LoadedVal =
AArch64::SUBREG_TO_REG, dl, MVT::i64,
CurDAG->getTargetConstant(0, dl, MVT::i64), LoadedVal,
// N's results are (value, written-back base, chain); Res's are
// (written-back base, value, chain).
ReplaceUses(SDValue(N, 0), LoadedVal);
ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
return true;
/// Select a multi-vector load: emit machine opcode \p Opc producing one
/// untyped super-register plus a chain, then replace each of N's \p NumVecs
/// vector results with the subregister \p SubRegIdx + i of that
/// super-register and N's chain result with the new chain. The memory
/// operand of N is transferred to the new node.
// NOTE(review): the end of the Ops initializer (presumably the chain) is
// missing from this listing -- confirm against upstream.
void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
unsigned SubRegIdx) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue Chain = N->getOperand(0);
SDValue Ops[] = {N->getOperand(2), // Mem operand;
const EVT ResTys[] = {MVT::Untyped, MVT::Other};
SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
SDValue SuperReg = SDValue(Ld, 0);
for (unsigned i = 0; i < NumVecs; ++i)
ReplaceUses(SDValue(N, i),
CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
// Transfer memoperands.
MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
/// Select a post-incremented multi-vector load. The machine node produces
/// (write-back register as i64, untyped vector list, chain). N's results are
/// laid out as NumVecs vectors, then the write-back register, then the chain,
/// and each is redirected to the matching result of the new node.
// NOTE(review): the end of the Ops initializer, an `else` before the
// subregister loop, and `CurDAG->RemoveDeadNode(N)`-style cleanup (if any)
// are not visible in this listing -- confirm against upstream.
void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
unsigned Opc, unsigned SubRegIdx) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue Chain = N->getOperand(0);
SDValue Ops[] = {N->getOperand(1), // Mem operand
N->getOperand(2), // Incremental
const EVT ResTys[] = {MVT::i64, // Type of the write back register
MVT::Untyped, MVT::Other};
SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
// Update uses of write back register
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
// Update uses of vector list
SDValue SuperReg = SDValue(Ld, 1);
// A single vector is used directly; several are extracted by subregister.
if (NumVecs == 1)
ReplaceUses(SDValue(N, 0), SuperReg);
for (unsigned i = 0; i < NumVecs; ++i)
ReplaceUses(SDValue(N, i),
CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
// Update the chain
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
/// Optimize \param OldBase and \param OldOffset selecting the best addressing
/// mode. Returns a tuple consisting of an Opcode, an SDValue representing the
/// new Base and an SDValue representing the new offset.
///
/// A reg+imm form with an immediate in [-8, 7] is tried first; the reg+reg
/// form is tried only if that fails. The returned opcode is \p Opc_rr when
/// the reg+reg form matched and \p Opc_ri otherwise. If neither form
/// matches, the tuple carries the original base and offset.
// NOTE(review): lines starting with '-' or '+' are unified-diff markers. The
// patch shown here turns the `Scale` template parameter into a trailing
// runtime argument and forwards it to SelectSVERegRegAddrMode.
-template <unsigned Scale>
std::tuple<unsigned, SDValue, SDValue>
-AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr,
- const unsigned Opc_ri,
+AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr,
+ unsigned Opc_ri,
const SDValue &OldBase,
- const SDValue &OldOffset) {
+ const SDValue &OldOffset,
+ unsigned Scale) {
SDValue NewBase = OldBase;
SDValue NewOffset = OldOffset;
// Detect a possible Reg+Imm addressing mode.
const bool IsRegImm = SelectAddrModeIndexedSVE</*Min=*/-8, /*Max=*/7>(
N, OldBase, NewBase, NewOffset);
// Detect a possible reg+reg addressing mode, but only if we haven't already
// detected a Reg+Imm one.
const bool IsRegReg =
- !IsRegImm && SelectSVERegRegAddrMode<Scale>(OldBase, NewBase, NewOffset);
+ !IsRegImm && SelectSVERegRegAddrMode(OldBase, Scale, NewBase, NewOffset);
// Select the instruction.
return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset);
// Select a predicated multi-vector load. The machine node takes (predicate,
// base, offset, chain) and yields an untyped super-register plus a chain;
// each of N's NumVecs vector results is replaced with subregister
// zsub0 + i of the super-register, and N's chain result with the new chain.
// NOTE(review): lines starting with '-' or '+' are unified-diff markers. The
// patch shown here replaces the single `Opc` parameter with `Scale`,
// `Opc_ri` and `Opc_rr`, and picks the opcode, base and offset through
// findAddrModeSVELoadStore instead of always using a zero offset.
void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
- const unsigned Opc) {
+ unsigned Scale, unsigned Opc_ri,
+ unsigned Opc_rr) {
+ assert(Scale < 4 && "Invalid scaling value.");
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue Chain = N->getOperand(0);
+ // Optimize addressing mode.
+ SDValue Base, Offset;
+ unsigned Opc;
+ std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore(
+ N, Opc_rr, Opc_ri, N->getOperand(2),
+ CurDAG->getTargetConstant(0, DL, MVT::i64), Scale);
SDValue Ops[] = {N->getOperand(1), // Predicate
- N->getOperand(2), // Memory operand
- CurDAG->getTargetConstant(0, DL, MVT::i64), Chain};
+ Base, // Memory operand
+ Offset, Chain};
const EVT ResTys[] = {MVT::Untyped, MVT::Other};
SDNode *Load = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
SDValue SuperReg = SDValue(Load, 0);
for (unsigned i = 0; i < NumVecs; ++i)
ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
AArch64::zsub0 + i, DL, VT, SuperReg));
// Copy chain
unsigned ChainIdx = NumVecs;
ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1));
/// Select a multi-vector store. Operands 2 .. 2+NumVecs-1 of N are packed
/// into a Q tuple when the vector type is 128 bits wide and into a D tuple
/// otherwise; the machine node receives (tuple, operand NumVecs+2, chain).
/// N's memory operand is transferred and N is replaced by the new node.
void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
// Form a REG_SEQUENCE to force register allocation.
bool Is128Bit = VT.getSizeInBits() == 128;
SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
// Transfer memoperands.
MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
ReplaceNode(N, St);
// Select a predicated multi-vector store. Operands 2 .. 2+NumVecs-1 of N are
// packed into a Z tuple; operand NumVecs+2 is the predicate and operand
// NumVecs+3 the address, which findAddrModeSVELoadStore may split into a
// base and offset while choosing between Opc_rr and Opc_ri.
// NOTE(review): lines starting with '-' or '+' are unified-diff markers. The
// patch shown here turns the `Scale` template parameter into a runtime
// argument passed on to findAddrModeSVELoadStore.
-template <unsigned Scale>
void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs,
- const unsigned Opc_rr,
- const unsigned Opc_ri) {
+ unsigned Scale, unsigned Opc_rr,
+ unsigned Opc_ri) {
SDLoc dl(N);
// Form a REG_SEQUENCE to force register allocation.
SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
SDValue RegSeq = createZTuple(Regs);
// Optimize addressing mode.
unsigned Opc;
SDValue Offset, Base;
- std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore<Scale>(
+ std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore(
N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3),
- CurDAG->getTargetConstant(0, dl, MVT::i64));
+ CurDAG->getTargetConstant(0, dl, MVT::i64), Scale);
SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate
Base, // address
Offset, // offset
N->getOperand(0)}; // chain
SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
ReplaceNode(N, St);
/// Match an address that is exactly a frame index. On success Base is the
/// corresponding target frame index (pointer-typed) and OffImm is the i64
/// target constant 0; returns true. Any other node returns false.
bool AArch64DAGToDAGISel::SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base,
SDValue &OffImm) {
SDLoc dl(N);
const DataLayout &DL = CurDAG->getDataLayout();
const TargetLowering *TLI = getTargetLowering();
// Try to match it for the frame address
if (auto FINode = dyn_cast<FrameIndexSDNode>(N)) {
int FI = FINode->getIndex();
Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
return true;
return false;
/// Select a post-incremented multi-vector store. Operands 1 .. NumVecs of N
/// are packed into a Q tuple when the type of operand 2 is 128 bits wide and
/// into a D tuple otherwise. The machine node takes (tuple, base register,
/// increment, chain) and produces the written-back base as i64 plus a chain;
/// N is replaced by it.
void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
const EVT ResTys[] = {MVT::i64, // Type of the write back register
MVT::Other}; // Type for the Chain
// Form a REG_SEQUENCE to force register allocation.
bool Is128Bit = VT.getSizeInBits() == 128;
SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
SDValue Ops[] = {RegSeq,
N->getOperand(NumVecs + 1), // base register
N->getOperand(NumVecs + 2), // Incremental
N->getOperand(0)}; // Chain
SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
ReplaceNode(N, St);
namespace {
/// WidenVector - Given a value in the V64 register class, produce the
/// equivalent value in the V128 register class.
///
/// Callable object: it keeps a reference to the SelectionDAG it was built
/// with and creates the widening nodes in that DAG.
class WidenVector {
// DAG in which the widening nodes are created.
SelectionDAG &DAG;
WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
// Returns V64Reg inserted at the dsub subregister of an otherwise undefined
// vector with the same element type and twice as many elements.
SDValue operator()(SDValue V64Reg) {
EVT VT = V64Reg.getValueType();
unsigned NarrowSize = VT.getVectorNumElements();
MVT EltTy = VT.getVectorElementType().getSimpleVT();
MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
SDLoc DL(V64Reg);
// IMPLICIT_DEF supplies the undefined wide value that is inserted into.
SDValue Undef =
SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg);
} // namespace
/// NarrowVector - Given a value in the V128 register class, produce the
/// equivalent value in the V64 register class.
///
/// The result type keeps the element type of \p V128Reg and has half as many
/// elements; the value is obtained by extracting the dsub subregister.
static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
EVT VT = V128Reg.getValueType();
unsigned WideSize = VT.getVectorNumElements();
MVT EltTy = VT.getVectorElementType().getSimpleVT();
MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
// NOTE(review): the final argument of the call below is not visible in this
// excerpt (presumably V128Reg) -- confirm against upstream.
return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy,
/// Select a single-lane load into the machine node \p Opc.
///
/// Operands 2..NumVecs+1 of \p N are packed into a Q-register tuple. Operand
/// NumVecs+2 must be a constant and is used as the lane number. Each of the
/// first NumVecs results of \p N is replaced by the matching qsub subregister
/// of the new node's untyped result, narrowed again when the original result
/// type is 64 bits wide; result NumVecs (the chain) is replaced by the new
/// node's second result.
void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
// Form a REG_SEQUENCE to force register allocation.
SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
if (Narrow)
transform(Regs, Regs.begin(),
// NOTE(review): the remaining argument of the transform(...) call above is
// not visible in this excerpt (presumably a WidenVector functor) -- confirm.
SDValue RegSeq = createQTuple(Regs);
const EVT ResTys[] = {MVT::Untyped, MVT::Other};
unsigned LaneNo =
cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
// Machine-node operands: tuple, lane number, operand NumVecs+3 (presumably
// the address -- TODO confirm), chain.
SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
N->getOperand(NumVecs + 3), N->getOperand(0)};
SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
SDValue SuperReg = SDValue(Ld, 0);
// Type of one tuple member, taken from the tuple node's operand 1.
EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
AArch64::qsub2, AArch64::qsub3 };
for (unsigned i = 0; i < NumVecs; ++i) {
SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
if (Narrow)
NV = NarrowVector(NV, *CurDAG);
ReplaceUses(SDValue(N, i), NV);
// Redirect chain users to the new node.
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
/// Select a post-indexed single-lane load into the machine node \p Opc.
///
/// Operands 1..NumVecs of \p N are packed into a Q-register tuple and operand
/// NumVecs+1 must be a constant lane number. The new node's results are, in
/// order: the write-back register (i64), the vector tuple, and the chain.
/// They replace results NumVecs, 0..NumVecs-1 and NumVecs+1 of \p N
/// respectively.
void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
// Form a REG_SEQUENCE to force register allocation.
SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
if (Narrow)
transform(Regs, Regs.begin(),
// NOTE(review): the remaining argument of the transform(...) call above is
// not visible in this excerpt (presumably a WidenVector functor) -- confirm.
SDValue RegSeq = createQTuple(Regs);
const EVT ResTys[] = {MVT::i64, // Type of the write back register
RegSeq->getValueType(0), MVT::Other};
unsigned LaneNo =
cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
SDValue Ops[] = {RegSeq,
CurDAG->getTargetConstant(LaneNo, dl,
MVT::i64), // Lane Number
N->getOperand(NumVecs + 2), // Base register
N->getOperand(NumVecs + 3), // Incremental
// NOTE(review): the end of the Ops initializer is not visible in this excerpt
// (presumably the chain operand) -- confirm against upstream.
SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
// Update uses of the write back register
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
// Update uses of the vector list
SDValue SuperReg = SDValue(Ld, 1);
if (NumVecs == 1) {
// With a single vector the tuple result is used directly, without a
// subregister extract.
ReplaceUses(SDValue(N, 0),
Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
} else {
EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
AArch64::qsub2, AArch64::qsub3 };
for (unsigned i = 0; i < NumVecs; ++i) {
// NOTE(review): the last argument of the call below is not visible in this
// excerpt (presumably SuperReg) -- confirm.
SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
if (Narrow)
NV = NarrowVector(NV, *CurDAG);
ReplaceUses(SDValue(N, i), NV);
// Update the Chain
ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
/// Select a single-lane store into the machine node \p Opc.
///
/// Operands 2..NumVecs+1 of \p N are packed into a Q-register tuple and
/// operand NumVecs+2 must be a constant lane number. The new node produces
/// only a chain, inherits the memory operand of \p N, and replaces \p N.
void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
// Form a REG_SEQUENCE to force register allocation.
SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
if (Narrow)
transform(Regs, Regs.begin(),
// NOTE(review): the remaining argument of the transform(...) call above is
// not visible in this excerpt (presumably a WidenVector functor) -- confirm.
SDValue RegSeq = createQTuple(Regs);
unsigned LaneNo =
cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
// Machine-node operands: tuple, lane number, operand NumVecs+3 (presumably
// the address -- TODO confirm), chain.
SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
N->getOperand(NumVecs + 3), N->getOperand(0)};
SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
// Transfer memoperands.
MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
ReplaceNode(N, St);
/// Select a post-indexed single-lane store into the machine node \p Opc.
///
/// Operands 1..NumVecs of \p N are packed into a Q-register tuple and operand
/// NumVecs+1 must be a constant lane number. The new node inherits the memory
/// operand of \p N and replaces \p N.
void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
EVT VT = N->getOperand(2)->getValueType(0);
bool Narrow = VT.getSizeInBits() == 64;
// Form a REG_SEQUENCE to force register allocation.
SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
if (Narrow)
transform(Regs, Regs.begin(),
// NOTE(review): the remaining argument of the transform(...) call above is
// not visible in this excerpt (presumably a WidenVector functor) -- confirm.
SDValue RegSeq = createQTuple(Regs);
// NOTE(review): the rest of the ResTys initializer is not visible in this
// excerpt (presumably the chain type) -- confirm.
const EVT ResTys[] = {MVT::i64, // Type of the write back register
unsigned LaneNo =
cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
// NOTE(review): the end of the Ops initializer below is not visible in this
// excerpt (presumably the chain operand) -- confirm.
SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
N->getOperand(NumVecs + 2), // Base Register
N->getOperand(NumVecs + 3), // Incremental
SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
// Transfer memoperands.
MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
ReplaceNode(N, St);
/// Try to describe an AND-with-immediate node as an unsigned bitfield extract.
///
/// \param CurDAG  DAG used to widen the source in the ANY_EXTEND case.
/// \param N       Node to inspect; must be an ISD::AND of type i32 or i64.
/// \param Opc     Out: AArch64::UBFMWri or AArch64::UBFMXri.
/// \param Opd0    Out: the value the bitfield is extracted from.
/// \param LSB     Out: set to the right-shift amount.
/// \param MSB     Out: derived from the shift amount and the number of
///                trailing ones in the (adjusted) AND mask.
/// \param NumberOfIgnoredLowBits  Number of low mask bits that are treated as
///                set even if the AND immediate has them clear.
/// \param BiggerPattern  When true, an AND without an underlying SRL is also
///                accepted (treated as a shift by 0) and the shift-range
///                check is skipped.
/// \returns true and fills the out-parameters when the pattern matches.
static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
unsigned &Opc, SDValue &Opd0,
unsigned &LSB, unsigned &MSB,
unsigned NumberOfIgnoredLowBits,
bool BiggerPattern) {
assert(N->getOpcode() == ISD::AND &&
"N must be a AND operation to call this function");
EVT VT = N->getValueType(0);
// Here we can test the type of VT and return false when the type does not
// match, but since it is done prior to that call in the current context
// we turned that into an assert to avoid redundant code.
assert((VT == MVT::i32 || VT == MVT::i64) &&
"Type checking must have been done before calling this function");
// FIXME: simplify-demanded-bits in DAGCombine will probably have
// changed the AND node to a 32-bit mask operation. We'll have to
// undo that as part of the transform here if we want to catch all
// the opportunities.
// Currently the NumberOfIgnoredLowBits argument helps to recover
// from these situations when matching bigger pattern (bitfield insert).
// For unsigned extracts, check for a shift right and mask
uint64_t AndImm = 0;
if (!isOpcWithIntImmediate(N, ISD::AND, AndImm))
return false;
const SDNode *Op0 = N->getOperand(0).getNode();
// Because of simplify-demanded-bits in DAGCombine, the mask may have been
// simplified. Try to undo that
AndImm |= maskTrailingOnes<uint64_t>(NumberOfIgnoredLowBits);
// The immediate is a mask of the low bits iff imm & (imm+1) == 0
if (AndImm & (AndImm + 1))
return false;
bool ClampMSB = false;
uint64_t SrlImm = 0;
// Handle the SRL + ANY_EXTEND case.
if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) {
// Extend the incoming operand of the SRL to 64-bit.
Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
// Make sure to clamp the MSB so that we preserve the semantics of the
// original operations.
ClampMSB = true;
} else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
SrlImm)) {
// If the shift result was truncated, we can still combine them.
Opd0 = Op0->getOperand(0).getOperand(0);
// Use the type of SRL node.
VT = Opd0->getValueType(0);
} else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) {
Opd0 = Op0->getOperand(0);
} else if (BiggerPattern) {
// Let's pretend a 0 shift right has been performed.
// The resulting code will be at least as good as the original one
// plus it may expose more opportunities for bitfield insert pattern.
// FIXME: Currently we limit this to the bigger pattern, because
// some optimizations expect AND and not UBFM.
Opd0 = N->getOperand(0);
} else
return false;
// Bail out on large immediates. This happens when no proper
// combining/constant folding was performed.
if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) {
// NOTE(review): the opening of the debug-output statement is not visible in
// this excerpt; only its argument remains below.
(dbgs() << N
<< ": Found large shift immediate, this should not happen\n"));
return false;
LSB = SrlImm;
MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(AndImm)
: countTrailingOnes<uint64_t>(AndImm)) -
// NOTE(review): the right-hand operand of the trailing '-' above is not
// visible in this excerpt -- confirm against upstream.
if (ClampMSB)
// Since we're moving the extend before the right shift operation, we need
// to clamp the MSB to make sure we don't shift in undefined bits instead of
// the zeros which would get shifted in with the original right shift
// operation.
MSB = MSB > 31 ? 31 : MSB;
Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri;
return true;
/// Try to describe a SIGN_EXTEND_INREG of a constant right shift as a signed
/// bitfield extract.
///
/// \param N     Node to inspect; must be ISD::SIGN_EXTEND_INREG of type i32 or
///              i64.
/// \param Opc   Out: AArch64::SBFMWri or AArch64::SBFMXri.
/// \param Opd0  Out: the value being shifted.
/// \param Immr  Out: the shift amount.
/// \param Imms  Out: shift amount + extended-from width - 1.
/// \returns true when the operand (optionally seen through a TRUNCATE) is an
///          SRL or SRA by a constant and the extracted field fits in the
///          operand's bit width.
static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc,
SDValue &Opd0, unsigned &Immr,
unsigned &Imms) {
assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
EVT VT = N->getValueType(0);
unsigned BitWidth = VT.getSizeInBits();
assert((VT == MVT::i32 || VT == MVT::i64) &&
"Type checking must have been done before calling this function");
SDValue Op = N->getOperand(0);
// Look through a truncate and continue with the type of the untruncated
// value.
if (Op->getOpcode() == ISD::TRUNCATE) {
Op = Op->getOperand(0);
VT = Op->getValueType(0);
BitWidth = VT.getSizeInBits();
uint64_t ShiftImm;
if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) &&
!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
return false;
// Width of the type the value is sign-extended from.
unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
// The field [ShiftImm, ShiftImm + Width) must lie inside the source value.
if (ShiftImm + Width > BitWidth)
return false;
Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri;
Opd0 = Op.getOperand(0);
Immr = ShiftImm;
Imms = ShiftImm + Width - 1;
return true;
/// Try to describe "SRL (AND Value, MaskImm), ShiftImm" as a single unsigned
/// bitfield extract.
///
/// \param N     Node to inspect; anything other than ISD::SRL is rejected.
/// \param Opc   Out: UBFM opcode chosen from the node's value type.
/// \param Opd0  Out: the first operand of the AND. It is assigned as soon as
///              the AND is recognised, i.e. also on some failing paths.
/// \param LSB   Out: the shift amount.
/// \param MSB   Out: BitWide + shift amount - 1.
/// \returns true when MaskImm >> ShiftImm is a non-empty mask of low bits.
static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
SDValue &Opd0, unsigned &LSB,
unsigned &MSB) {
// We are looking for the following pattern which basically extracts several
// continuous bits from the source value and places it from the LSB of the
// destination value, all other bits of the destination value are set to zero:
// Value2 = AND Value, MaskImm
// SRL Value2, ShiftImm
// with MaskImm >> ShiftImm to search for the bit width.
// This gets selected into a single UBFM:
// UBFM Value, ShiftImm, BitWide + SrlImm -1
if (N->getOpcode() != ISD::SRL)
return false;
uint64_t AndMask = 0;
if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask))
return false;
Opd0 = N->getOperand(0).getOperand(0);
uint64_t SrlImm = 0;
if (!isIntImmediate(N->getOperand(1), SrlImm))
return false;
// Check whether we really have several bits extract here.
unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm));
if (BitWide && isMask_64(AndMask >> SrlImm)) {
if (N->getValueType(0) == MVT::i32)
Opc = AArch64::UBFMWri;
// NOTE(review): read literally, the next assignment also runs for i32 and
// overrides the choice above; an `else` appears to be missing from this
// excerpt -- confirm against upstream.
Opc = AArch64::UBFMXri;
LSB = SrlImm;
MSB = BitWide + SrlImm - 1;
return true;
return false;
/// Try to describe a right shift (SRL or SRA) as a bitfield extract.
///
/// Patterns tried, in order: AND + SRL (via isSeveralBitsExtractOpFromShr),
/// a shift of a constant SHL, an i32 SRL of a TRUNCATE, and -- only when
/// \p BiggerPattern is set -- a bare shift treated as a shift of "SHL by 0".
///
/// \param N     Node to inspect; must be ISD::SRA or ISD::SRL of type i32/i64.
/// \param Opc   Out: SBFM for SRA, UBFM for SRL, in the W or X form.
/// \param Opd0  Out: source value of the extract.
/// \param Immr  Out: SrlImm - ShlImm, wrapped by the bit width when negative.
/// \param Imms  Out: bit width - ShlImm - TruncBits - 1.
/// \param BiggerPattern  Enables the bare-shift fallback.
/// \returns true when one of the patterns matches.
static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
unsigned &Immr, unsigned &Imms,
bool BiggerPattern) {
assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
"N must be a SHR/SRA operation to call this function");
EVT VT = N->getValueType(0);
// Here we can test the type of VT and return false when the type does not
// match, but since it is done prior to that call in the current context
// we turned that into an assert to avoid redundant code.
assert((VT == MVT::i32 || VT == MVT::i64) &&
"Type checking must have been done before calling this function");
// Check for AND + SRL doing several bits extract.
if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms))
return true;
// We're looking for a shift of a shift.
uint64_t ShlImm = 0;
uint64_t TruncBits = 0;
if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) {
Opd0 = N->getOperand(0).getOperand(0);
} else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
// We are looking for a shift of truncate. Truncate from i64 to i32 could
// be considered as setting high 32 bits as zero. Our strategy here is to
// always generate 64bit UBFM. This consistency will help the CSE pass
// later find more redundancy.
Opd0 = N->getOperand(0).getOperand(0);
// Number of high bits dropped by the truncate.
TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
VT = Opd0.getValueType();
assert(VT == MVT::i64 && "the promoted type should be i64");
} else if (BiggerPattern) {
// Let's pretend a 0 shift left has been performed.
// FIXME: Currently we limit this to the bigger pattern case,
// because some optimizations expect AND and not UBFM
Opd0 = N->getOperand(0);
} else
return false;
// Missing combines/constant folding may have left us with strange
// constants.
if (ShlImm >= VT.getSizeInBits()) {
// NOTE(review): the opening of the debug-output statement is not visible in
// this excerpt; only its argument remains below.
(dbgs() << N
<< ": Found large shift immediate, this should not happen\n"));
return false;
uint64_t SrlImm = 0;
if (!isIntImmediate(N->getOperand(1), SrlImm))
return false;
assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() &&
"bad amount in shift node!");
// A negative difference is wrapped around by the bit width.
int immr = SrlImm - ShlImm;
Immr = immr < 0 ? immr + VT.getSizeInBits() : immr;
Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1;
// SRA requires a signed extraction
if (VT == MVT::i32)
Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
// NOTE(review): read literally, the next assignment also runs for i32 and
// overrides the choice above; an `else` appears to be missing from this
// excerpt -- confirm against upstream.
Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri;
return true;
/// Select "sign_extend (sra X, imm)" from i32 to i64 as a single SBFMXri.
///
/// \param N  Node to inspect; must be ISD::SIGN_EXTEND.
/// \returns true when \p N was morphed into SBFMXri; false when the types are
///          not i32 -> i64 or the operand is not an SRA by a constant.
bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
assert(N->getOpcode() == ISD::SIGN_EXTEND);
EVT VT = N->getValueType(0);
EVT NarrowVT = N->getOperand(0)->getValueType(0);
if (VT != MVT::i64 || NarrowVT != MVT::i32)
return false;
uint64_t ShiftImm;
SDValue Op = N->getOperand(0);
if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
return false;
SDLoc dl(N);
// Extend the incoming operand of the shift to 64-bits.
SDValue Opd0 = Widen(CurDAG, Op.getOperand(0));
// The extracted field starts at the shift amount and ends at the top bit of
// the narrow type.
unsigned Immr = ShiftImm;
unsigned Imms = NarrowVT.getSizeInBits() - 1;
SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
CurDAG->getTargetConstant(Imms, dl, VT)};
CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops);
return true;
/// Try to form fcvtl2 instructions from a floating-point extend of a high-half
/// extract of a subvector.
///
/// \param N  Node to inspect; must be ISD::FP_EXTEND.
/// \returns true when \p N was morphed into the selected opcode, taking the
///          full source vector of the extract as its operand.
bool AArch64DAGToDAGISel::tryHighFPExt(SDNode *N) {
assert(N->getOpcode() == ISD::FP_EXTEND);
// There are 2 forms of fcvtl2 - extend to double or extend to float.
SDValue Extract = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT NarrowVT = Extract.getValueType();
// Only v2f32 -> v2f64 and v4f16 -> v4f32 are handled.
if ((VT != MVT::v2f64 || NarrowVT != MVT::v2f32) &&
(VT != MVT::v4f32 || NarrowVT != MVT::v4f16))
return false;
// Optionally look past a bitcast.
Extract = peekThroughBitcasts(Extract);
if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return false;
// Match extract from start of high half index.
// Example: v8i16 -> v4i16 means the extract must begin at index 4.
unsigned ExtractIndex = Extract.getConstantOperandVal(1);
if (ExtractIndex != Extract.getValueType().getVectorNumElements())
return false;
auto Opcode = VT == MVT::v2f64 ? AArch64::FCVTLv4i32 : AArch64::FCVTLv8i16;
CurDAG->SelectNodeTo(N, Opcode, VT, Extract.getOperand(0));
return true;
/// Dispatcher: decide whether \p N can be expressed as a bitfield extract.
///
/// AND, SRL and SRA nodes are forwarded to the pattern-specific helpers;
/// nodes that are already SBFM/UBFM machine nodes have their operands read
/// back directly.
///
/// \param CurDAG  DAG forwarded to the AND matcher.
/// \param N       Node to inspect; only i32 and i64 results are considered.
/// \param Opc     Out: SBFM/UBFM opcode (W or X form).
/// \param Opd0    Out: source value of the extract.
/// \param Immr    Out: first immediate of the bitfield move.
/// \param Imms    Out: second immediate of the bitfield move.
/// \param NumberOfIgnoredLowBits  Forwarded to the AND matcher.
/// \param BiggerPattern           Forwarded to the AND and shift matchers.
/// \returns true when the out-parameters have been filled.
static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
SDValue &Opd0, unsigned &Immr, unsigned &Imms,
unsigned NumberOfIgnoredLowBits = 0,
bool BiggerPattern = false) {
if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
return false;
switch (N->getOpcode()) {
// NOTE(review): the label introducing the next two lines is not visible in
// this excerpt (presumably `default:`) -- confirm against upstream.
if (!N->isMachineOpcode())
return false;
case ISD::AND:
return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms,
NumberOfIgnoredLowBits, BiggerPattern);
case ISD::SRL:
case ISD::SRA:
return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern);
// NOTE(review): the case label for the call below is not visible in this
// excerpt (presumably ISD::SIGN_EXTEND_INREG, which the callee asserts).
return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms);
// Machine nodes: an existing bitfield move is reported as-is.
unsigned NOpc = N->getMachineOpcode();
switch (NOpc) {
return false;
case AArch64::SBFMWri:
case AArch64::UBFMWri:
case AArch64::SBFMXri:
case AArch64::UBFMXri:
Opc = NOpc;
Opd0 = N->getOperand(0);
Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
return true;
// Unreachable
return false;
/// Select \p N as an SBFM/UBFM machine node when it is a bitfield extract.
///
/// \param N  Node to select.
/// \returns false when isBitfieldExtractOp does not match; true after \p N
///          has been replaced or morphed.
bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) {
unsigned Opc, Immr, Imms;
SDValue Opd0;
if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms))
return false;
EVT VT = N->getValueType(0);
SDLoc dl(N);
// If the bit extract operation is 64bit but the original type is 32bit, we
// need to add one EXTRACT_SUBREG.
if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64),
CurDAG->getTargetConstant(Imms, dl, MVT::i64)};
SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64);
// Take the sub_32 subregister of the 64-bit result as the i32 value.
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
MVT::i32, SDValue(BFM, 0), SubReg));
return true;
// Same-width case: morph N in place.
SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
CurDAG->getTargetConstant(Imms, dl, VT)};
CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
/// Does DstMask form a complementary pair with the mask provided by
/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
/// this asks whether DstMask zeroes precisely those bits that will be set by
/// the other half.
///
/// \param DstMask                  Mask applied to the destination value.
/// \param BitsToBeInserted         Bits written by the inserted field.
/// \param NumberOfIgnoredHighBits  Number of top bits left out of the
///                                 comparison.
/// \param VT                       Mask type; must be i32 or i64.
/// \returns true when, over the compared bits, the two masks are disjoint and
///          together cover every bit.
static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted,
unsigned NumberOfIgnoredHighBits, EVT VT) {
assert((VT == MVT::i32 || VT == MVT::i64) &&
"i32 or i64 mask type expected!");
// Both masks are reduced to the low BitWidth bits before being compared.
unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
APInt SignificantDstMask = APInt(BitWidth, DstMask);
APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
(SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
// Look for bits that will be useful for later uses.
// A bit is considered useless as soon as it is dropped, provided it was never
// used before it was dropped.
// E.g., looking for useful bit of x
// 1. y = x & 0x7
// 2. z = y >> 2
// After #1, x useful bits are 0x7, then the useful bits of x, live through
// y.
// After #2, the useful bits of x are 0x4.
// However, if x is used on an unpredictable instruction, then all its bits
// are useful.
// E.g.
// 1. y = x & 0x7
// 2. z = y >> 2
// 3. str x, [@x]
// Forward declaration: the per-instruction helpers below call back into
// getUsefulBits, passing Depth + 1.
static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
/// Narrow \p UsefulBits for a value consumed by an AND-with-immediate machine
/// node.
///
/// \param Op          The AND node's result; its operand 1 holds the encoded
///                    logical immediate.
/// \param UsefulBits  In/out: intersected with the decoded immediate and then
///                    refined further by the users of \p Op.
/// \param Depth       Current recursion depth.
static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
unsigned Depth) {
uint64_t Imm =
cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
// The operand stores the encoded form; decode it to the actual bit mask.
Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
getUsefulBits(Op, UsefulBits, Depth + 1);
/// Narrow \p UsefulBits for a value consumed by a bitfield-move node described
/// by the immediates \p Imm and \p MSB.
///
/// \param Op          Result of the bitfield-move node.
/// \param UsefulBits  In/out: intersected with the bits computed here.
/// \param Imm         First immediate of the bitfield move.
/// \param MSB         Second immediate of the bitfield move.
/// \param Depth       Current recursion depth.
static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
uint64_t Imm, uint64_t MSB,
unsigned Depth) {
// inherit the bitwidth value
APInt OpUsefulBits(UsefulBits);
OpUsefulBits = 1;
if (MSB >= Imm) {
OpUsefulBits <<= MSB - Imm + 1;
// NOTE(review): as shown, OpUsefulBits holds a single set bit here rather
// than a run of low bits; a line may be missing from this excerpt -- confirm
// against upstream.
// The interesting part will be in the lower part of the result
getUsefulBits(Op, OpUsefulBits, Depth + 1);
// The interesting part was starting at Imm in the argument
OpUsefulBits <<= Imm;
} else {
OpUsefulBits <<= MSB + 1;
// The interesting part will be shifted in the result
OpUsefulBits <<= OpUsefulBits.getBitWidth() - Imm;
getUsefulBits(Op, OpUsefulBits, Depth + 1);
// The interesting part was at zero in the argument
OpUsefulBits.lshrInPlace(OpUsefulBits.getBitWidth() - Imm);
UsefulBits &= OpUsefulBits;
/// Narrow \p UsefulBits for a value consumed by a UBFM machine node.
///
/// Reads the node's two immediates (operands 1 and 2 of \p Op) and delegates
/// to getUsefulBitsFromBitfieldMoveOpd.
static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
unsigned Depth) {
uint64_t Imm =
cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
uint64_t MSB =
cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
/// Narrow \p UsefulBits for a value consumed as the shifted register operand
/// of an ORR (shifted register) machine node.
///
/// \param Op          Result of the ORR node; its operand 2 holds the encoded
///                    shift type and amount.
/// \param UsefulBits  In/out: intersected with the mask computed here.
/// \param Depth       Current recursion depth.
static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
unsigned Depth) {
uint64_t ShiftTypeAndValue =
cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
APInt Mask(UsefulBits);
if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) {
// Shift Left
uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
Mask <<= ShiftAmt;
getUsefulBits(Op, Mask, Depth + 1);
// NOTE(review): no shift back into the operand's bit positions is visible
// after the query above; a line may be missing from this excerpt -- confirm.
} else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) {
// Shift Right
// We do not handle AArch64_AM::ASR, because the sign will change the
// number of useful bits
uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
// NOTE(review): no right shift of Mask is visible before the query below; a
// line may be missing from this excerpt -- confirm.
getUsefulBits(Op, Mask, Depth + 1);
Mask <<= ShiftAmt;
} else
// NOTE(review): the statement controlled by the `else` above is not visible
// in this excerpt; read literally it would be the line below -- confirm.
UsefulBits &= Mask;
/// Narrow \p UsefulBits for a value consumed by a BFM machine node.
///
/// \param Op          Result of the BFM node; operands 2 and 3 hold its two
///                    immediates.
/// \param Orig        The value whose useful bits are being computed; it may
///                    be operand 0, operand 1, or both.
/// \param UsefulBits  In/out: intersected with the mask computed here.
/// \param Depth       Current recursion depth.
static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
unsigned Depth) {
uint64_t Imm =
cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
uint64_t MSB =
cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
APInt OpUsefulBits(UsefulBits);
OpUsefulBits = 1;
APInt ResultUsefulBits(UsefulBits.getBitWidth(), 0);
APInt Mask(UsefulBits.getBitWidth(), 0);
// Ask the users of the BFM result which of its bits matter.
getUsefulBits(Op, ResultUsefulBits, Depth + 1);
if (MSB >= Imm) {
// The instruction is a BFXIL.
uint64_t Width = MSB - Imm + 1;
uint64_t LSB = Imm;
OpUsefulBits <<= Width;
// NOTE(review): as shown, OpUsefulBits holds a single set bit here rather
// than a run of Width low bits; a line may be missing from this excerpt --
// confirm against upstream.
if (Op.getOperand(1) == Orig) {
// Copy the low bits from the result to bits starting from LSB.
Mask = ResultUsefulBits & OpUsefulBits;
Mask <<= LSB;
if (Op.getOperand(0) == Orig)
// Bits starting from LSB in the input contribute to the result.
Mask |= (ResultUsefulBits & ~OpUsefulBits);
} else {
// The instruction is a BFI.
uint64_t Width = MSB + 1;
uint64_t LSB = UsefulBits.getBitWidth() - Imm;
OpUsefulBits <<= Width;
OpUsefulBits <<= LSB;
if (Op.getOperand(1) == Orig) {
// Copy the bits from the result to the zero bits.
Mask = ResultUsefulBits & OpUsefulBits;
if (Op.getOperand(0) == Orig)
Mask |= (ResultUsefulBits & ~OpUsefulBits);
UsefulBits &= Mask;
/// Narrow \p UsefulBits according to how one user consumes \p Orig.
///
/// Dispatches on the user's machine opcode to the per-instruction helpers
/// (AND/ANDS immediate, UBFM, ORR shifted register, BFM) and handles byte and
/// halfword stores inline.
///
/// \param UserNode    A user of \p Orig.
/// \param UsefulBits  In/out: bits of \p Orig still considered useful.
/// \param Orig        The value whose useful bits are being computed.
/// \param Depth       Current recursion depth, forwarded to the helpers.
static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
SDValue Orig, unsigned Depth) {
// Users of this node should have already been instruction selected
// FIXME: Can we turn that into an assert?
// NOTE(review): the statements controlled by several `if`s in this function,
// and the trailing arguments of two calls, are not visible in this excerpt
// (presumably early `return;`s and `Depth);`) -- confirm against upstream.
if (!UserNode->isMachineOpcode())
switch (UserNode->getMachineOpcode()) {
case AArch64::ANDSWri:
case AArch64::ANDSXri:
case AArch64::ANDWri:
case AArch64::ANDXri:
// We increment Depth only when we call the getUsefulBits
return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
case AArch64::UBFMWri:
case AArch64::UBFMXri:
return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
case AArch64::ORRWrs:
case AArch64::ORRXrs:
if (UserNode->getOperand(1) != Orig)
return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
case AArch64::BFMWri:
case AArch64::BFMXri:
return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
case AArch64::STRBBui:
case AArch64::STURBBi:
if (UserNode->getOperand(0) != Orig)
// Byte-store opcodes: keep only the low 8 bits.
UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xff);
case AArch64::STRHHui:
case AArch64::STURHHi:
if (UserNode->getOperand(0) != Orig)
// Halfword-store opcodes: keep only the low 16 bits.
UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xffff);
/// Compute which bits of \p Op are useful to its users.
///
/// \param Op          Value being analysed.
/// \param UsefulBits  In/out: initialised here when \p Depth is 0, then
///                    intersected with the union of what each user needs.
/// \param Depth       Recursion depth; checked against
///                    SelectionDAG::MaxRecursionDepth.
static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
// NOTE(review): the statement controlled by the depth check below is not
// visible in this excerpt (presumably an early `return;`) -- confirm.
if (Depth >= SelectionDAG::MaxRecursionDepth)
// Initialize UsefulBits
if (!Depth) {
unsigned Bitwidth = Op.getScalarValueSizeInBits();
// At the beginning, assume every produced bits is useful
// NOTE(review): as shown, UsefulBits starts as all zeros, which contradicts
// the comment above; a line that sets all bits may be missing from this
// excerpt -- confirm against upstream.
UsefulBits = APInt(Bitwidth, 0);
// Union of the useful bits reported by each user.
APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
for (SDNode *Node : Op.getNode()->uses()) {
// A use cannot produce useful bits
APInt UsefulBitsForUse = APInt(UsefulBits);
getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth);
UsersUsefulBits |= UsefulBitsForUse;
// UsefulBits contains the produced bits that are meaningful for the
// current definition, thus a user cannot make a bit meaningful at
// this point
UsefulBits &= UsersUsefulBits;
/// Create a machine node performing a notional SHL of Op by ShlAmount. If
/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
/// 0, return Op unchanged.
///
/// Both shift directions are emitted as a UBFM (W form for 32-bit values, X
/// form otherwise) with immediates derived from the value's bit width.
///
/// \param CurDAG     DAG in which the machine node is created.
/// \param Op         Value to shift.
/// \param ShlAmount  Signed shift amount; positive = left, negative = right.
/// \returns the shifted value, or \p Op itself when \p ShlAmount is 0.
static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
if (ShlAmount == 0)
return Op;
EVT VT = Op.getValueType();
SDLoc dl(Op);
unsigned BitWidth = VT.getSizeInBits();
unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri;
SDNode *ShiftNode;
if (ShlAmount > 0) {
// LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
ShiftNode = CurDAG->getMachineNode(
UBFMOpc, dl, VT, Op,
CurDAG->getTargetConstant(BitWidth - ShlAmount, dl, VT),
CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, dl, VT));
} else {
// LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
assert(ShlAmount < 0 && "expected right shift");
int ShrAmount = -ShlAmount;
ShiftNode = CurDAG->getMachineNode(
UBFMOpc, dl, VT, Op, CurDAG->getTargetConstant(ShrAmount, dl, VT),
CurDAG->getTargetConstant(BitWidth - 1, dl, VT));
return SDValue(ShiftNode, 0);
/// Does this tree qualify as an attempt to move a bitfield into position,
/// essentially "(and (shl VAL, N), Mask)".
///
/// \param CurDAG         DAG used for known-bits queries and for creating an
///                       adjusting shift.
/// \param Op             Candidate value; must be 32 or 64 bits wide.
/// \param BiggerPattern  True when matching for BFI: a multi-use SHL and a
///                       mismatch between the SHL amount and the mask
///                       position are then tolerated.
/// \param Src            Out: the value to insert, shifted via getLeftShift
///                       when the SHL amount differs from the mask position.
/// \param ShiftAmount    Out: number of trailing zeros of the possibly
///                       non-zero bits of \p Op.
/// \param MaskWidth      Out: length of the run of possibly non-zero bits.
/// \returns true when the pattern matches and the outputs are set.
static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
bool BiggerPattern,
SDValue &Src, int &ShiftAmount,
int &MaskWidth) {
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
assert(BitWidth == 32 || BitWidth == 64);
KnownBits Known = CurDAG->computeKnownBits(Op);
// Non-zero in the sense that they're not provably zero, which is the key
// point if we want to use this value
uint64_t NonZeroBits = (~Known.Zero).getZExtValue();
// Discard a constant AND mask if present. It's safe because the node will
// already have been factored into the computeKnownBits calculation above.
uint64_t AndImm;
if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
assert((~APInt(BitWidth, AndImm) & ~Known.Zero) == 0);
Op = Op.getOperand(0);
// Don't match if the SHL has more than one use, since then we'll end up
// generating SHL+UBFIZ instead of just keeping SHL+AND.
if (!BiggerPattern && !Op.hasOneUse())
return false;
uint64_t ShlImm;
if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
return false;
Op = Op.getOperand(0);
// The possibly non-zero bits must form one contiguous run.
if (!isShiftedMask_64(NonZeroBits))
return false;
ShiftAmount = countTrailingZeros(NonZeroBits);
MaskWidth = countTrailingOnes(NonZeroBits >> ShiftAmount);
// BFI encompasses sufficiently many nodes that it's worth inserting an extra
// LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
// amount. BiggerPattern is true when this pattern is being matched for BFI,
// BiggerPattern is false when this pattern is being matched for UBFIZ, in
// which case it is not profitable to insert an extra shift.
if (ShlImm - ShiftAmount != 0 && !BiggerPattern)
return false;
Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
return true;
/// Width-aware wrapper around isShiftedMask_32 / isShiftedMask_64.
///
/// \param Mask  Value to test.
/// \param VT    Must be i32 or i64; selects which check is applied.
/// \returns the result of the 32-bit check for i32 and of the 64-bit check
///          otherwise.
static bool isShiftedMask(uint64_t Mask, EVT VT) {
assert(VT == MVT::i32 || VT == MVT::i64);
if (VT == MVT::i32)
return isShiftedMask_32(Mask);
return isShiftedMask_64(Mask);
// Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being
// inserted only sets known zero bits.
//
// N must be an ISD::OR node. On success N is morphed into a BFMWri/BFMXri
// whose inserted operand is a MOVi32imm/MOVi64imm of the shifted OR
// immediate, and true is returned; otherwise N is left alone and false is
// returned.
static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
return false;
unsigned BitWidth = VT.getSizeInBits();
uint64_t OrImm;
if (!isOpcWithIntImmediate(N, ISD::OR, OrImm))
return false;
// Skip this transformation if the ORR immediate can be encoded in the ORR.
// Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL, which is most likely
// performance neutral.
if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth))
return false;
uint64_t MaskImm;
SDValue And = N->getOperand(0);
// Must be a single use AND with an immediate operand.
if (!And.hasOneUse() ||
!isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm))
return false;
// Compute the Known Zero for the AND as this allows us to catch more general
// cases than just looking for AND with imm.
KnownBits Known = CurDAG->computeKnownBits(And);
// Non-zero in the sense that they're not provably zero, which is the key
// point if we want to use this value.
uint64_t NotKnownZero = (~Known.Zero).getZExtValue();
// The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
if (!isShiftedMask(Known.Zero.getZExtValue(), VT))
return false;
// The bits being inserted must only set those bits that are known to be zero.
if ((OrImm & NotKnownZero) != 0) {
// FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't
// currently handle this case.
return false;
// BFI/BFXIL dst, src, #lsb, #width.
int LSB = countTrailingOnes(NotKnownZero);
int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation();
// BFI/BFXIL is an alias of BFM, so translate to BFM operands.
unsigned ImmR = (BitWidth - LSB) % BitWidth;
unsigned ImmS = Width - 1;
// If we're creating a BFI instruction avoid cases where we need more
// instructions to materialize the BFI constant as compared to the original
// ORR. A BFXIL will use the same constant as the original ORR, so the code
// should be no worse in this case.
bool IsBFI = LSB != 0;
uint64_t BFIImm = OrImm >> LSB;
if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) {
// We have a BFI instruction and we know the constant can't be materialized
// with a ORR-immediate with the zero register.
unsigned OrChunks = 0, BFIChunks = 0;
// Walk both constants in 16-bit chunks.
// NOTE(review): the statements controlled by the two `if`s inside the loop
// are not visible in this excerpt (presumably increments of OrChunks and
// BFIChunks) -- confirm against upstream.
for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) {
if (((OrImm >> Shift) & 0xFFFF) != 0)
if (((BFIImm >> Shift) & 0xFFFF) != 0)
if (BFIChunks > OrChunks)
return false;
// Materialize the constant to be inserted.
SDLoc DL(N);
unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
SDNode *MOVI = CurDAG->getMachineNode(
MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT));
// Create the BFI/BFXIL instruction.
SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0),
CurDAG->getTargetConstant(ImmR, DL, VT),
CurDAG->getTargetConstant(ImmS, DL, VT)};
unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
/// Try to select an OR node as a bitfield insert (BFMWri/BFMXri).
///
/// \param N           Node to inspect; must be ISD::OR. Only i32 and i64 are
///                    handled.
/// \param UsefulBits  Bits of the OR result that its users need; its leading
///                    and trailing zero counts relax the mask checks.
/// \param CurDAG      DAG used for known-bits queries and node creation.
/// \returns true when \p N was morphed into a BFM node.
static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
SelectionDAG *CurDAG) {
assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
return false;
unsigned BitWidth = VT.getSizeInBits();
// Because of simplify-demanded-bits in DAGCombine, involved masks may not
// have the expected shape. Try to undo that.
unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
// Given a OR operation, check if we have the following pattern
// ubfm c, b, imm, imm2 (or something that does the same jobs, see
// isBitfieldExtractOp)
// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and
// countTrailingZeros(mask2) == imm2 - imm + 1
// f = d | c
// if yes, replace the OR instruction with:
// f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm, and MSB = imm2
// OR is commutative, check all combinations of operand order and values of
// BiggerPattern, i.e.
// Opd0, Opd1, BiggerPattern=false
// Opd1, Opd0, BiggerPattern=false
// Opd0, Opd1, BiggerPattern=true
// Opd1, Opd0, BiggerPattern=true
// Several of these combinations may match, so check with BiggerPattern=false
// first since that will produce better results by matching more instructions
// and/or inserting fewer extra instructions.
// NOTE(review): throughout the loop below, the statements controlled by
// several `if`/`else` lines are not visible in this excerpt (presumably
// `continue;` on the rejection paths) -- confirm against upstream.
for (int I = 0; I < 4; ++I) {
SDValue Dst, Src;
unsigned ImmR, ImmS;
// I = 0,1 use BiggerPattern=false; I = 2,3 use BiggerPattern=true.
bool BiggerPattern = I / 2;
SDValue OrOpd0Val = N->getOperand(I % 2);
SDNode *OrOpd0 = OrOpd0Val.getNode();
SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
SDNode *OrOpd1 = OrOpd1Val.getNode();
unsigned BFXOpc;
int DstLSB, Width;
if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
NumberOfIgnoredLowBits, BiggerPattern)) {
// Check that the returned opcode is compatible with the pattern,
// i.e., same type and zero extended (U and not S)
if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
(BFXOpc != AArch64::UBFMWri && VT == MVT::i32))
// Compute the width of the bitfield insertion
DstLSB = 0;
Width = ImmS - ImmR + 1;
// FIXME: This constraint is to catch bitfield insertion we may
// want to widen the pattern if we want to grab general bitfield
// move case
if (Width <= 0)
// If the mask on the insertee is correct, we have a BFXIL operation. We
// can share the ImmR and ImmS values from the already-computed UBFM.
// NOTE(review): one argument of the call below (between OrOpd0Val and Src)
// is not visible in this excerpt (presumably BiggerPattern) -- confirm.
} else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val,
Src, DstLSB, Width)) {
ImmR = (BitWidth - DstLSB) % BitWidth;
ImmS = Width - 1;
} else
// Check the second part of the pattern
EVT VT = OrOpd1Val.getValueType();
assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
// Compute the Known Zero for the candidate of the first operand.
// This allows to catch more general case than just looking for
// AND with imm. Indeed, simplify-demanded-bits may have removed
// the AND instruction because it proves it was useless.
KnownBits Known = CurDAG->computeKnownBits(OrOpd1Val);
// Check if there is enough room for the second operand to appear
// in the first one
APInt BitsToBeInserted =
APInt::getBitsSet(Known.getBitWidth(), DstLSB, DstLSB + Width);
if ((BitsToBeInserted & ~Known.Zero) != 0)
// Set the first operand
uint64_t Imm;
if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
// In that case, we can eliminate the AND
Dst = OrOpd1->getOperand(0);
// NOTE(review): read literally, the assignment below overrides the one
// above; an `else` appears to be missing from this excerpt -- confirm.
// Maybe the AND has been removed by simplify-demanded-bits
// or is useful because it discards more bits
Dst = OrOpd1Val;
// both parts match
SDLoc DL(N);
SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
CurDAG->getTargetConstant(ImmS, DL, VT)};
unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
// Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff
// Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted
// mask (e.g., 0x000ffff0).
uint64_t Mask0Imm, Mask1Imm;
SDValue And0 = N->getOperand(0);
SDValue And1 = N->getOperand(1);
if (And0.hasOneUse() && And1.hasOneUse() &&
isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) &&
isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) &&
APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) &&
(isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) {
// ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm),
// (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the
// bits to be inserted.
if (isShiftedMask(Mask0Imm, VT)) {
std::swap(And0, And1);
std::swap(Mask0Imm, Mask1Imm);
SDValue Src = And1->getOperand(0);
SDValue Dst = And0->getOperand(0);
unsigned LSB = countTrailingZeros(Mask1Imm);
int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation();
// The BFXIL inserts the low-order bits from a source register, so right
// shift the needed bits into place.
SDLoc DL(N);
unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
SDNode *LSR = CurDAG->getMachineNode(
ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT),
CurDAG->getTargetConstant(BitWidth - 1, DL, VT));
// BFXIL is an alias of BFM, so translate to BFM operands.
unsigned ImmR = (BitWidth - LSB) % BitWidth;
unsigned ImmS = Width - 1;
// Create the BFXIL instruction.
SDValue Ops[] = {Dst, SDValue(LSR, 0),
CurDAG->getTargetConstant(ImmR, DL, VT),
CurDAG->getTargetConstant(ImmS, DL, VT)};
unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
return false;
/// Try to select the ISD::OR node \p N as a bitfield insert.
///
/// Non-OR nodes are rejected. When getUsefulBits reports that no bit of the
/// result is useful, the node is rewritten to IMPLICIT_DEF. Otherwise
/// tryBitfieldInsertOpFromOr is tried first, then
/// tryBitfieldInsertOpFromOrAndImm.
///
/// \returns true if \p N was selected here, false otherwise.
bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
if (N->getOpcode() != ISD::OR)
return false;
APInt NUsefulBits;
getUsefulBits(SDValue(N, 0), NUsefulBits);
// If all bits are not useful, just return UNDEF.
if (!NUsefulBits) {
CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
return true;
// Prefer the general OR-based match; fall back to the OR-with-immediate form.
if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG))
return true;
return tryBitfieldInsertOpFromOrAndImm(N, CurDAG);
/// tryBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
/// equivalent of a left shift by a constant amount followed by an and masking
/// out a contiguous set of bits.
///
/// Only ISD::AND nodes of type i32 or i64 are considered. On a match, \p N is
/// rewritten in place to UBFMWri/UBFMXri.
///
/// \returns true if \p N was rewritten, false otherwise.
bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) {
if (N->getOpcode() != ISD::AND)
return false;
EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
return false;
SDValue Op0;
int DstLSB, Width;
if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
Op0, DstLSB, Width))
return false;
// ImmR is the rotate right amount.
unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
// ImmS is the most significant bit of the source to be moved.
unsigned ImmS = Width - 1;
SDLoc DL(N);
SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
CurDAG->getTargetConstant(ImmS, DL, VT)};
unsigned Opc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
/// tryShiftAmountMod - Take advantage of built-in mod of shift amount in
/// variable shift/rotate instructions.
///
/// Handles ROTR, SHL, SRL and SRA nodes of type i32 or i64. The shift amount
/// is simplified when it is X +/- N with N a multiple of the register size,
/// N - X with N a non-zero multiple of the register size (becomes a negate),
/// or an AND/ANDS whose mask keeps at least the low 5 (i32) or 6 (i64) bits.
///
/// \returns true if \p N was rewritten to a variable shift/rotate machine
/// node, false if none of the patterns applied.
///
/// NOTE(review): the switch below shows no `break`/`default` lines and the
/// SUBREG-style getMachineNode call near the end has no opcode argument in
/// this listing; the text looks truncated -- confirm against upstream.
bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned Opc;
// Choose the register-shift opcode that matches the node kind and width.
switch (N->getOpcode()) {
case ISD::ROTR:
Opc = (VT == MVT::i32) ? AArch64::RORVWr : AArch64::RORVXr;
case ISD::SHL:
Opc = (VT == MVT::i32) ? AArch64::LSLVWr : AArch64::LSLVXr;
case ISD::SRL:
Opc = (VT == MVT::i32) ? AArch64::LSRVWr : AArch64::LSRVXr;
case ISD::SRA:
Opc = (VT == MVT::i32) ? AArch64::ASRVWr : AArch64::ASRVXr;
return false;
// Size is the register width; Bits is the number of low shift-amount bits
// that a mask must preserve for it to be redundant (see the AND case below).
uint64_t Size;
uint64_t Bits;
if (VT == MVT::i32) {
Bits = 5;
Size = 32;
} else if (VT == MVT::i64) {
Bits = 6;
Size = 64;
} else
return false;
SDValue ShiftAmt = N->getOperand(1);
SDLoc DL(N);
SDValue NewShiftAmt;
// Skip over an extend of the shift amount.
if (ShiftAmt->getOpcode() == ISD::ZERO_EXTEND ||
ShiftAmt->getOpcode() == ISD::ANY_EXTEND)
ShiftAmt = ShiftAmt->getOperand(0);
if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
SDValue Add0 = ShiftAmt->getOperand(0);
SDValue Add1 = ShiftAmt->getOperand(1);
uint64_t Add0Imm;
uint64_t Add1Imm;
// If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
// to avoid the ADD/SUB.
if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0))
NewShiftAmt = Add0;
// If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
// generate a NEG instead of a SUB of a constant.
else if (ShiftAmt->getOpcode() == ISD::SUB &&
isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 &&
(Add0Imm % Size == 0)) {
unsigned NegOpc;
unsigned ZeroReg;
EVT SubVT = ShiftAmt->getValueType(0);
if (SubVT == MVT::i32) {
NegOpc = AArch64::SUBWrr;
ZeroReg = AArch64::WZR;
} else {
assert(SubVT == MVT::i64);
NegOpc = AArch64::SUBXrr;
ZeroReg = AArch64::XZR;
// Build the negation as (zero register - X).
SDValue Zero =
CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, ZeroReg, SubVT);
MachineSDNode *Neg =
CurDAG->getMachineNode(NegOpc, DL, SubVT, Zero, Add1);
NewShiftAmt = SDValue(Neg, 0);
} else
return false;
} else {
// If the shift amount is masked with an AND, check that the mask covers the
// bits that are implicitly ANDed off by the above opcodes and if so, skip
// the AND.
uint64_t MaskImm;
if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm) &&
!isOpcWithIntImmediate(ShiftAmt.getNode(), AArch64ISD::ANDS, MaskImm))
return false;
if (countTrailingOnes(MaskImm) < Bits)
return false;
NewShiftAmt = ShiftAmt->getOperand(0);
// Narrow/widen the shift amount to match the size of the shift operation.
if (VT == MVT::i32)
NewShiftAmt = narrowIfNeeded(CurDAG, NewShiftAmt);
else if (VT == MVT::i64 && NewShiftAmt->getValueType(0) == MVT::i32) {
SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, DL, MVT::i32);
MachineSDNode *Ext = CurDAG->getMachineNode(
CurDAG->getTargetConstant(0, DL, MVT::i64), NewShiftAmt, SubReg);
NewShiftAmt = SDValue(Ext, 0);
// Re-emit the shift with the simplified amount.
SDValue Ops[] = {N->getOperand(0), NewShiftAmt};
CurDAG->SelectNodeTo(N, Opc, VT, Ops);
return true;
/// Match a floating-point constant operand that is an exact power of two,
/// 2^fbits, and produce fbits as an i32 target constant.
///
/// \param N        Candidate operand: a ConstantFPSDNode, or a LoadSDNode
///                 whose address operand is an AArch64ISD::ADDlow.
/// \param FixedPos Receives the fbits target constant on success.
/// \param RegWidth Upper bound for fbits.
/// \returns true on a match; false if \p N has another form, the value is not
/// an exact power of two, or fbits is 0 or greater than \p RegWidth.
///
/// NOTE(review): the return-type line and parts of the load/constant-pool
/// check are not visible in this listing -- confirm against upstream.
AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
unsigned RegWidth) {
APFloat FVal(0.0);
if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
FVal = CN->getValueAPF();
else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) {
// Some otherwise illegal constants are allowed in this case.
if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow ||
return false;
ConstantPoolSDNode *CN =
FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF();
} else
return false;
// An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
// is between 1 and 32 for a destination w-register, or 1 and 64 for an
// x-register.
// By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
// want THIS_NODE to be 2^fbits. This is much easier to deal with using
// integers.
bool IsExact;
// fbits is between 1 and 64 in the worst-case, which means the fmul
// could have 2^64 as an actual operand. Need 65 bits of precision.
APSInt IntVal(65, true);
FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
// N.b. isPowerOf2 also checks for > 0.
if (!IsExact || !IntVal.isPowerOf2()) return false;
unsigned FBits = IntVal.logBase2();
// Checks above should have guaranteed that we haven't lost information in
// finding FBits, but it must still be in range.
if (FBits == 0 || FBits > RegWidth) return false;
FixedPos = CurDAG->getTargetConstant(FBits, SDLoc(N), MVT::i32);
return true;
// Inspects a register string of the form o0:op1:CRn:CRm:op2, extracts the
// fields of the string, obtains the integer values from them, and combines
// these into a single value to be used in the MRS/MSR instruction.
//
// Returns -1 when the string contains no ':' separator (a single field), so
// that callers can fall back to name-based lookup. Any other string is
// asserted to have exactly five fields, each a base-10 integer.
//
// NOTE(review): no statement that appends to `Ops` is visible in this listing
// even though Ops[0..4] are read below; the loop body looks truncated --
// confirm against upstream.
static int getIntOperandFromRegisterString(StringRef RegString) {
SmallVector<StringRef, 5> Fields;
RegString.split(Fields, ':');
if (Fields.size() == 1)
return -1;
assert(Fields.size() == 5
&& "Invalid number of fields in read register string");
SmallVector<int, 5> Ops;
bool AllIntFields = true;
for (StringRef Field : Fields) {
unsigned IntField;
// getAsInteger returns true on failure, hence the negation.
AllIntFields &= !Field.getAsInteger(10, IntField);
assert(AllIntFields &&
"Unexpected non-integer value in special register string.");
// Need to combine the integer fields of the string into a single value
// based on the bit encoding of MRS/MSR instruction.
return (Ops[0] << 14) | (Ops[1] << 11) | (Ops[2] << 7) |
(Ops[3] << 3) | (Ops[4]);
// Lower the read_register intrinsic to an MRS instruction node if the special
// register string argument is either of the form detailed in the ACLE (the
// form described in getIntOperandFromRegisterString) or is a named register
// known by the MRS SysReg mapper.
//
// The register string "pc" is handled separately and produces an ADR node.
// Returns true if N was replaced, false if the string matched nothing.
//
// NOTE(review): the dyn_cast results below are dereferenced without a null
// check, and the trailing operands of the getMachineNode calls as well as the
// rest of the `TheReg` condition are not visible in this listing -- confirm
// against upstream.
bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
SDLoc DL(N);
// First try the numeric o0:op1:CRn:CRm:op2 form.
int Reg = getIntOperandFromRegisterString(RegString->getString());
if (Reg != -1) {
ReplaceNode(N, CurDAG->getMachineNode(
AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
return true;
// Use the sysreg mapper to map the remaining possible strings to the
// value for the register to be used for the instruction operand.
auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
if (TheReg && TheReg->Readable &&
Reg = TheReg->Encoding;
Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
if (Reg != -1) {
ReplaceNode(N, CurDAG->getMachineNode(
AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
return true;
// Reading "pc" is lowered to ADR with a zero immediate.
if (RegString->getString() == "pc") {
ReplaceNode(N, CurDAG->getMachineNode(
AArch64::ADR, DL, N->getSimpleValueType(0), MVT::Other,
CurDAG->getTargetConstant(0, DL, MVT::i32),
return true;
return false;
// Lower the write_register intrinsic to an MSR instruction node if the special
// register string argument is either of the form detailed in the ACLE (the
// form described in getIntOperandFromRegisterString) or is a named register
// known by the MSR SysReg mapper.
//
// PSTATE field names are lowered to MSRpstateImm1/MSRpstateImm4 with the
// constant operand 2 as the immediate. Returns true if N was replaced, false
// if the string matched nothing.
//
// NOTE(review): the dyn_cast results below are dereferenced without a null
// check, and some call heads/tails (e.g. the ReplaceNode wrapper of the first
// MSR and the rest of the `TheReg` condition) are not visible in this
// listing -- confirm against upstream.
bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
SDLoc DL(N);
// First try the numeric o0:op1:CRn:CRm:op2 form.
int Reg = getIntOperandFromRegisterString(RegString->getString());
if (Reg != -1) {
N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
N->getOperand(2), N->getOperand(0)));
return true;
// Check if the register was one of those allowed as the pstatefield value in
// the MSR (immediate) instruction. To accept the values allowed in the
// pstatefield for the MSR (immediate) instruction, we also require that an
// immediate value has been provided as an argument, we know that this is
// the case as it has been ensured by semantic checking.
auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());
if (PMapper) {
assert (isa<ConstantSDNode>(N->getOperand(2))
&& "Expected a constant integer expression.");
unsigned Reg = PMapper->Encoding;
uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
unsigned State;
// PAN, UAO and SSBS take a 1-bit immediate; every other field takes 4 bits.
if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO || Reg == AArch64PState::SSBS) {
assert(Immed < 2 && "Bad imm");
State = AArch64::MSRpstateImm1;
} else {
assert(Immed < 16 && "Bad imm");
State = AArch64::MSRpstateImm4;
ReplaceNode(N, CurDAG->getMachineNode(
State, DL, MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
CurDAG->getTargetConstant(Immed, DL, MVT::i16),
return true;
// Use the sysreg mapper to attempt to map the remaining possible strings
// to the value for the register to be used for the MSR (register)
// instruction operand.
auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
if (TheReg && TheReg->Writeable &&
Reg = TheReg->Encoding;
Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
if (Reg != -1) {
ReplaceNode(N, CurDAG->getMachineNode(
AArch64::MSR, DL, MVT::Other,
CurDAG->getTargetConstant(Reg, DL, MVT::i32),
N->getOperand(2), N->getOperand(0)));
return true;
return false;
/// We've got special pseudo-instructions for these
///
/// Selects a compare-and-swap node \p N to one of the CMP_SWAP_{8,16,32,64}
/// pseudo-instructions, chosen by the memory type of \p N. Nothing is done
/// (and false is returned) when the subtarget has LSE.
///
/// \returns true if \p N's uses were redirected to the new pseudo node.
///
/// NOTE(review): the last element of `Ops` and the `else` before
/// llvm_unreachable are not visible in this listing -- confirm against
/// upstream.
bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
unsigned Opcode;
EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
// Leave IR for LSE if subtarget supports it.
if (Subtarget->hasLSE()) return false;
if (MemTy == MVT::i8)
Opcode = AArch64::CMP_SWAP_8;
else if (MemTy == MVT::i16)
Opcode = AArch64::CMP_SWAP_16;
else if (MemTy == MVT::i32)
Opcode = AArch64::CMP_SWAP_32;
else if (MemTy == MVT::i64)
Opcode = AArch64::CMP_SWAP_64;
llvm_unreachable("Unknown AtomicCmpSwap type");
// Only the 64-bit variant produces an i64 value; narrower ones produce i32.
MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32;
SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
SDNode *CmpSwap = CurDAG->getMachineNode(
Opcode, SDLoc(N),
CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops);
// Carry the memory operand over to the pseudo-instruction.
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
CurDAG->setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
// Result 0 maps to result 0 of the pseudo; result 1 of N maps to result 2
// (the MVT::Other value), skipping the pseudo's extra i32 result.
ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
return true;
/// Split a constant into a signed 8-bit value plus a left-shift amount.
///
/// \param N      Candidate operand; must be a ConstantSDNode.
/// \param Base   Receives the 8-bit value as a target constant of N's type.
/// \param Offset Receives the shift amount (0 or 8) as a target constant of
///               N's type.
/// \returns true if the constant is in [-128, 127] (shift 0) or is a multiple
/// of 256 in [-32768, 32512] (shift 8); false otherwise.
bool AArch64DAGToDAGISel::SelectSVE8BitLslImm(SDValue N, SDValue &Base,
SDValue &Offset) {
auto C = dyn_cast<ConstantSDNode>(N);
if (!C)
return false;
auto Ty = N->getValueType(0);
int64_t Imm = C->getSExtValue();
SDLoc DL(N);
// Fits directly in a signed byte: no shift needed.
if ((Imm >= -128) && (Imm <= 127)) {
Base = CurDAG->getTargetConstant(Imm, DL, Ty);
Offset = CurDAG->getTargetConstant(0, DL, Ty);
return true;
// A signed byte shifted left by 8.
if (((Imm % 256) == 0) && (Imm >= -32768) && (Imm <= 32512)) {
Base = CurDAG->getTargetConstant(Imm/256, DL, Ty);
Offset = CurDAG->getTargetConstant(8, DL, Ty);
return true;
return false;
/// Match a constant usable as an unsigned 8-bit add/sub immediate with an
/// optional left shift by 8.
///
/// \param N     Candidate operand; only ConstantSDNode is matched.
/// \param VT    Element type selecting which encodings are tried: i8 accepts
///              only an unshifted byte, i16/i32/i64 also accept a byte
///              shifted left by 8.
/// \param Imm   Receives the 8-bit immediate as an i32 target constant.
/// \param Shift Receives the shift amount (0 or 8) as an i32 target constant.
/// \returns true on a match, false otherwise.
///
/// NOTE(review): no `break`/`default` lines are visible in the switch in this
/// listing -- confirm the case structure against upstream.
bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
const int64_t ImmVal = CNode->getZExtValue();
SDLoc DL(N);
switch (VT.SimpleTy) {
case MVT::i8:
// Value has no bits set outside the low byte.
if ((ImmVal & 0xFF) == ImmVal) {
Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
case MVT::i16:
case MVT::i32:
case MVT::i64:
if ((ImmVal & 0xFF) == ImmVal) {
Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
// Value has bits set only in bits 8..15: encode as (byte, shift 8).
} else if ((ImmVal & 0xFF00) == ImmVal) {
Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
Imm = CurDAG->getTargetConstant(ImmVal >> 8, DL, MVT::i32);
return true;
return false;
/// Match a constant that fits in a signed 8-bit immediate.
///
/// \param N   Candidate operand; only ConstantSDNode is matched.
/// \param Imm Receives the value as an i32 target constant on success.
/// \returns true if the sign-extended constant is in [-128, 127].
bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
int64_t ImmVal = CNode->getSExtValue();
SDLoc DL(N);
if (ImmVal >= -128 && ImmVal < 128) {
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
return false;
/// Match a constant and produce its low 8 bits as an unsigned immediate.
///
/// \param N   Candidate operand; only ConstantSDNode is matched.
/// \param Imm Receives (value & 0xFF) as an i32 target constant.
/// \returns true for a ConstantSDNode, false otherwise.
///
/// NOTE(review): because the value is masked with 0xFF first, the
/// `ImmVal < 256` test below always holds, so every constant is accepted and
/// its upper bits are silently dropped -- confirm that this is intended.
bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, SDValue &Imm) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
uint64_t ImmVal = CNode->getSExtValue();
SDLoc DL(N);
ImmVal = ImmVal & 0xFF;
if (ImmVal < 256) {
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
return false;
/// Match a constant that can be encoded as a 64-bit logical immediate after
/// being replicated to 64 bits according to the element type.
///
/// \param N   Candidate operand; only ConstantSDNode is matched.
/// \param VT  Element type; i8, i16 and i32 values are truncated to the
///            element width and replicated to fill 64 bits.
/// \param Imm Receives the encoding as an i64 target constant on success.
/// \returns true if AArch64_AM::processLogicalImmediate accepts the
/// replicated value, false otherwise.
///
/// NOTE(review): no `break`/`default` lines are visible in the switch in this
/// listing (llvm_unreachable directly follows `case MVT::i64:`) -- confirm
/// the case structure against upstream.
bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
uint64_t ImmVal = CNode->getZExtValue();
SDLoc DL(N);
// Shift mask depending on type size.
switch (VT.SimpleTy) {
case MVT::i8:
// Keep one byte, then double the pattern width three times (8->16->32->64).
ImmVal &= 0xFF;
ImmVal |= ImmVal << 8;
ImmVal |= ImmVal << 16;
ImmVal |= ImmVal << 32;
case MVT::i16:
// Keep one halfword, then replicate 16->32->64.
ImmVal &= 0xFFFF;
ImmVal |= ImmVal << 16;
ImmVal |= ImmVal << 32;
case MVT::i32:
// Keep one word, then replicate 32->64.
ImmVal &= 0xFFFFFFFF;
ImmVal |= ImmVal << 32;
case MVT::i64:
llvm_unreachable("Unexpected type");
uint64_t encoding;
if (AArch64_AM::processLogicalImmediate(ImmVal, 64, encoding)) {
Imm = CurDAG->getTargetConstant(encoding, DL, MVT::i64);
return true;
return false;
// This method is only needed to "cast" i64s into i32s when the value
// is a valid shift which has been splatted into a vector with i64 elements.
// Every other type is fine in tablegen.
//
// Matches a ConstantSDNode whose zero-extended value lies in the inclusive
// range [Low, High] and returns it in Imm as an i32 target constant.
// Returns false for non-constants and out-of-range values.
bool AArch64DAGToDAGISel::SelectSVEShiftImm64(SDValue N, uint64_t Low,
uint64_t High, SDValue &Imm) {
if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
uint64_t ImmVal = CN->getZExtValue();
SDLoc DL(N);
if (ImmVal >= Low && ImmVal <= High) {
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
return false;
/// Try to select a tagp node whose pointer operand is a frame index and whose
/// tag operand comes from the aarch64_irg_sp intrinsic as a single TAGPstack
/// machine node.
///
/// \param N The tagp node; operand 1 is the pointer, operand 2 the tag
///          source, operand 3 the constant tag offset.
/// \returns true if \p N was replaced, false if the operands do not have the
/// required form.
bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
// tagp(FrameIndex, IRGstack, tag_offset):
// since the offset between FrameIndex and IRGstack is a compile-time
// constant, this can be lowered to a single ADDG instruction.
if (!(isa<FrameIndexSDNode>(N->getOperand(1)))) {
return false;
SDValue IRG_SP = N->getOperand(2);
// The tag source must be an INTRINSIC_W_CHAIN whose intrinsic id (operand 1)
// is aarch64_irg_sp.
if (IRG_SP->getOpcode() != ISD::INTRINSIC_W_CHAIN ||
cast<ConstantSDNode>(IRG_SP->getOperand(1))->getZExtValue() !=
Intrinsic::aarch64_irg_sp) {
return false;
const TargetLowering *TLI = getTargetLowering();
SDLoc DL(N);
int FI = cast<FrameIndexSDNode>(N->getOperand(1))->getIndex();
SDValue FiOp = CurDAG->getTargetFrameIndex(
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
// Operands: frame index, zero constant, tag source, tag offset.
SDNode *Out = CurDAG->getMachineNode(
AArch64::TAGPstack, DL, MVT::i64,
{FiOp, CurDAG->getTargetConstant(0, DL, MVT::i64), N->getOperand(2),
CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
ReplaceNode(N, Out);
return true;
/// Select the llvm.aarch64.tagp node \p N.
///
/// The stack-slot form is tried first via trySelectStackSlotTagP. The general
/// form is expanded to SUBP(Op1, Op2), then ADDXrr(that, Op2), then ADDG with
/// a zero constant and the constant tag offset from operand 3.
///
/// NOTE(review): no statement follows the `if (trySelectStackSlotTagP(N))`
/// guard in this listing (an early return is presumably missing) -- confirm
/// against upstream.
void AArch64DAGToDAGISel::SelectTagP(SDNode *N) {
assert(isa<ConstantSDNode>(N->getOperand(3)) &&
"llvm.aarch64.tagp third argument must be an immediate");
if (trySelectStackSlotTagP(N))
// FIXME: above applies in any case when offset between Op1 and Op2 is a
// compile-time constant, not just for stack allocations.
// General case for unrelated pointers in Op1 and Op2.
SDLoc DL(N);
int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
SDNode *N1 = CurDAG->getMachineNode(AArch64::SUBP, DL, MVT::i64,
{N->getOperand(1), N->getOperand(2)});
SDNode *N2 = CurDAG->getMachineNode(AArch64::ADDXrr, DL, MVT::i64,
{SDValue(N1, 0), N->getOperand(2)});
SDNode *N3 = CurDAG->getMachineNode(
AArch64::ADDG, DL, MVT::i64,
{SDValue(N2, 0), CurDAG->getTargetConstant(0, DL, MVT::i64),
CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
ReplaceNode(N, N3);
// Build a machine node that reinterprets the packed scalable vector V as the
// fixed length vector type VT.
//
// 64-bit and 128-bit results use EXTRACT_SUBREG with the dsub and zsub
// sub-register indices respectively; any other size uses COPY_TO_REGCLASS
// into the ZPR register class.
//
// NOTE: We cannot use EXTRACT_SUBREG in all cases because the fixed length
// vector types larger than NEON don't have a matching SubRegIndex.
static SDNode *extractSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
assert(V.getValueType().isScalableVector() &&
V.getValueType().getSizeInBits().getKnownMinSize() ==
AArch64::SVEBitsPerBlock &&
"Expected to extract from a packed scalable vector!");
assert(VT.isFixedLengthVector() &&
"Expected to extract a fixed length vector!");
SDLoc DL(V);
// Dispatch on the size in bits of the fixed length result type.
switch (VT.getSizeInBits()) {
case 64: {
auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
case 128: {
auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
default: {
auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
// Build a machine node that places the fixed length vector V into a packed
// scalable vector of type VT.
//
// 64-bit and 128-bit inputs are inserted into an IMPLICIT_DEF container with
// INSERT_SUBREG, using the dsub and zsub sub-register indices respectively;
// any other size uses COPY_TO_REGCLASS into the ZPR register class.
//
// NOTE: We cannot use INSERT_SUBREG in all cases because the fixed length
// vector types larger than NEON don't have a matching SubRegIndex.
static SDNode *insertSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
assert(VT.isScalableVector() &&
VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock &&
"Expected to insert into a packed scalable vector!");
assert(V.getValueType().isFixedLengthVector() &&
"Expected to insert a fixed length vector!");
SDLoc DL(V);
// Dispatch on the size in bits of the fixed length input type.
switch (V.getValueType().getSizeInBits()) {
case 64: {
auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
SDValue(Container, 0), V, SubReg);
case 128: {
auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
SDValue(Container, 0), V, SubReg);
default: {
auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
void AArch64DAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
// Few custom selection stuff.
EVT VT = Node->getValueType(0);
switch (Node->getOpcode()) {
if (SelectCMP_SWAP(Node))
if (tryReadRegister(Node))
if (tryWriteRegister(Node))
case ISD::ADD:
if (tryMLAV64LaneV128(Node))
case ISD::LOAD: {
// Try to select as an indexed load. Fall through to normal processing
// if we can't.
if (tryIndexedLoad(Node))
case ISD::SRL:
case ISD::AND:
case ISD::SRA:
if (tryBitfieldExtractOp(Node))
if (tryBitfieldInsertInZeroOp(Node))
case ISD::ROTR:
case ISD::SHL:
if (tryShiftAmountMod(Node))
if (tryBitfieldExtractOpFromSExt(Node))
if (tryHighFPExt(Node))
case ISD::OR:
if (tryBitfieldInsertOp(Node))
// Bail when not a "cast" like extract_subvector.
if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue() != 0)
// Bail when normal isel can do the job.
EVT InVT = Node->getOperand(0).getValueType();
if (VT.isScalableVector() || InVT.isFixedLengthVector())
// NOTE: We can only get here when doing fixed length SVE code generation.
// We do manual selection because the types involved are not linked to real
// registers (despite being legal) and must be coerced into SVE registers.
// NOTE: If the above changes, be aware that selection will still not work
// because the td definition of extract_vector does not support extracting
// a fixed length vector from a scalable vector.
ReplaceNode(Node, extractSubReg(CurDAG, VT, Node->getOperand(0)));
// Bail when not a "cast" like insert_subvector.
if (cast<ConstantSDNode>(Node->getOperand(2))->getZExtValue() != 0)
if (!Node->getOperand(0).isUndef())
// Bail when normal isel should do the job.
EVT InVT = Node->getOperand(1).getValueType();
if (VT.isFixedLengthVector() || InVT.isScalableVector())
// NOTE: We can only get here when doing fixed length SVE code generation.
// We do manual selection because the types involved are not linked to real
// registers (despite being legal) and must be coerced into SVE registers.
// NOTE: If the above changes, be aware that selection will still not work
// because the td definition of insert_vector does not support inserting a
// fixed length vector into a scalable vector.
ReplaceNode(Node, insertSubReg(CurDAG, VT, Node->getOperand(1)));
case ISD::Constant: {
// Materialize zero constants as copies from WZR/XZR. This allows
// the coalescer to propagate these into other instructions.
ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
if (ConstNode->isNullValue()) {
if (VT == MVT::i32) {
SDValue New = CurDAG->getCopyFromReg(
CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32);
ReplaceNode(Node, New.getNode());
} else if (VT == MVT::i64) {
SDValue New = CurDAG->getCopyFromReg(
CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64);
ReplaceNode(Node, New.getNode());
case ISD::FrameIndex: {
// Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
int FI = cast<FrameIndexSDNode>(Node)->getIndex();
unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
const TargetLowering *TLI = getTargetLowering();
SDValue TFI = CurDAG->getTargetFrameIndex(
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
SDLoc DL(Node);
SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32),
CurDAG->getTargetConstant(Shifter, DL, MVT::i32) };
CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
switch (IntNo) {
case Intrinsic::aarch64_ldaxp:
case Intrinsic::aarch64_ldxp: {
unsigned Op =
IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX;
SDValue MemAddr = Node->getOperand(2);
SDLoc DL(Node);
SDValue Chain = Node->getOperand(0);
SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64,
MVT::Other, MemAddr, Chain);
// Transfer memoperands.
MachineMemOperand *MemOp =
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
ReplaceNode(Node, Ld);
case Intrinsic::aarch64_stlxp:
case Intrinsic::aarch64_stxp: {
unsigned Op =
IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX;
SDLoc DL(Node);
SDValue Chain = Node->getOperand(0);
SDValue ValLo = Node->getOperand(2);
SDValue ValHi = Node->getOperand(3);
SDValue MemAddr = Node->getOperand(4);
// Place arguments in the right order.
SDValue Ops[] = {ValLo, ValHi, MemAddr, Chain};
SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
// Transfer memoperands.
MachineMemOperand *MemOp =
CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
ReplaceNode(Node, St);
case Intrinsic::aarch64_neon_ld1x2:
if (VT == MVT::v8i8) {
SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
case Intrinsic::aarch64_neon_ld1x3:
if (VT == MVT::v8i8) {
SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
case Intrinsic::aarch64_neon_ld1x4:
if (VT == MVT::v8i8) {
SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
case Intrinsic::aarch64_neon_ld2:
if (VT == MVT::v8i8) {
SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
case Intrinsic::aarch64_neon_ld3:
if (VT == MVT::v8i8) {
SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
case Intrinsic::aarch64_neon_ld4:
if (VT == MVT::v8i8) {
SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
case Intrinsic::aarch64_neon_ld2r:
if (VT == MVT::v8i8) {
SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
case Intrinsic::aarch64_neon_ld3r:
if (VT == MVT::v8i8) {
SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
case Intrinsic::aarch64_neon_ld4r:
if (VT == MVT::v8i8) {
SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
case Intrinsic::aarch64_neon_ld2lane:
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectLoadLane(Node, 2, AArch64::LD2i8);
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectLoadLane(Node, 2, AArch64::LD2i16);
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectLoadLane(Node, 2, AArch64::LD2i32);
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectLoadLane(Node, 2, AArch64::LD2i64);
case Intrinsic::aarch64_neon_ld3lane:
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectLoadLane(Node, 3, AArch64::LD3i8);
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectLoadLane(Node, 3, AArch64::LD3i16);
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectLoadLane(Node, 3, AArch64::LD3i32);
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectLoadLane(Node, 3, AArch64::LD3i64);
case Intrinsic::aarch64_neon_ld4lane:
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectLoadLane(Node, 4, AArch64::LD4i8);
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectLoadLane(Node, 4, AArch64::LD4i16);
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectLoadLane(Node, 4, AArch64::LD4i32);
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectLoadLane(Node, 4, AArch64::LD4i64);
} break;
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
switch (IntNo) {
case Intrinsic::aarch64_tagp:
case Intrinsic::aarch64_neon_tbl2:
SelectTable(Node, 2,
VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
case Intrinsic::aarch64_neon_tbl3:
SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
: AArch64::TBLv16i8Three,
case Intrinsic::aarch64_neon_tbl4:
SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
: AArch64::TBLv16i8Four,
case Intrinsic::aarch64_neon_tbx2:
SelectTable(Node, 2,
VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two,
case Intrinsic::aarch64_neon_tbx3:
SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
: AArch64::TBXv16i8Three,
case Intrinsic::aarch64_neon_tbx4:
SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
: AArch64::TBXv16i8Four,
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull:
if (tryMULLV64LaneV128(IntNo, Node))
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
if (Node->getNumOperands() >= 3)
VT = Node->getOperand(2)->getValueType(0);
switch (IntNo) {
case Intrinsic::aarch64_neon_st1x2: {
if (VT == MVT::v8i8) {
SelectStore(Node, 2, AArch64::ST1Twov8b);
} else if (VT == MVT::v16i8) {
SelectStore(Node, 2, AArch64::ST1Twov16b);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v4bf16) {
SelectStore(Node, 2, AArch64::ST1Twov4h);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
VT == MVT::v8bf16) {
SelectStore(Node, 2, AArch64::ST1Twov8h);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectStore(Node, 2, AArch64::ST1Twov2s);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectStore(Node, 2, AArch64::ST1Twov4s);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectStore(Node, 2, AArch64::ST1Twov2d);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectStore(Node, 2, AArch64::ST1Twov1d);
case Intrinsic::aarch64_neon_st1x3: {
if (VT == MVT::v8i8) {
SelectStore(Node, 3, AArch64::ST1Threev8b);
} else if (VT == MVT::v16i8) {
SelectStore(Node, 3, AArch64::ST1Threev16b);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v4bf16) {
SelectStore(Node, 3, AArch64::ST1Threev4h);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
VT == MVT::v8bf16) {
SelectStore(Node, 3, AArch64::ST1Threev8h);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectStore(Node, 3, AArch64::ST1Threev2s);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectStore(Node, 3, AArch64::ST1Threev4s);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectStore(Node, 3, AArch64::ST1Threev2d);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectStore(Node, 3, AArch64::ST1Threev1d);
case Intrinsic::aarch64_neon_st1x4: {
if (VT == MVT::v8i8) {
SelectStore(Node, 4, AArch64::ST1Fourv8b);
} else if (VT == MVT::v16i8) {
SelectStore(Node, 4, AArch64::ST1Fourv16b);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v4bf16) {
SelectStore(Node, 4, AArch64::ST1Fourv4h);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
VT == MVT::v8bf16) {
SelectStore(Node, 4, AArch64::ST1Fourv8h);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectStore(Node, 4, AArch64::ST1Fourv2s);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectStore(Node, 4, AArch64::ST1Fourv4s);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectStore(Node, 4, AArch64::ST1Fourv2d);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectStore(Node, 4, AArch64::ST1Fourv1d);
case Intrinsic::aarch64_neon_st2: {
if (VT == MVT::v8i8) {
SelectStore(Node, 2, AArch64::ST2Twov8b);
} else if (VT == MVT::v16i8) {
SelectStore(Node, 2, AArch64::ST2Twov16b);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v4bf16) {
SelectStore(Node, 2, AArch64::ST2Twov4h);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
VT == MVT::v8bf16) {
SelectStore(Node, 2, AArch64::ST2Twov8h);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectStore(Node, 2, AArch64::ST2Twov2s);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectStore(Node, 2, AArch64::ST2Twov4s);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectStore(Node, 2, AArch64::ST2Twov2d);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectStore(Node, 2, AArch64::ST1Twov1d);
case Intrinsic::aarch64_neon_st3: {
if (VT == MVT::v8i8) {
SelectStore(Node, 3, AArch64::ST3Threev8b);
} else if (VT == MVT::v16i8) {
SelectStore(Node, 3, AArch64::ST3Threev16b);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v4bf16) {
SelectStore(Node, 3, AArch64::ST3Threev4h);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
VT == MVT::v8bf16) {
SelectStore(Node, 3, AArch64::ST3Threev8h);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectStore(Node, 3, AArch64::ST3Threev2s);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectStore(Node, 3, AArch64::ST3Threev4s);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectStore(Node, 3, AArch64::ST3Threev2d);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectStore(Node, 3, AArch64::ST1Threev1d);
case Intrinsic::aarch64_neon_st4: {
if (VT == MVT::v8i8) {
SelectStore(Node, 4, AArch64::ST4Fourv8b);
} else if (VT == MVT::v16i8) {
SelectStore(Node, 4, AArch64::ST4Fourv16b);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v4bf16) {
SelectStore(Node, 4, AArch64::ST4Fourv4h);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
VT == MVT::v8bf16) {
SelectStore(Node, 4, AArch64::ST4Fourv8h);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectStore(Node, 4, AArch64::ST4Fourv2s);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectStore(Node, 4, AArch64::ST4Fourv4s);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectStore(Node, 4, AArch64::ST4Fourv2d);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectStore(Node, 4, AArch64::ST1Fourv1d);
case Intrinsic::aarch64_neon_st2lane: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectStoreLane(Node, 2, AArch64::ST2i8);
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectStoreLane(Node, 2, AArch64::ST2i16);
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectStoreLane(Node, 2, AArch64::ST2i32);
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectStoreLane(Node, 2, AArch64::ST2i64);
case Intrinsic::aarch64_neon_st3lane: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectStoreLane(Node, 3, AArch64::ST3i8);
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectStoreLane(Node, 3, AArch64::ST3i16);
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectStoreLane(Node, 3, AArch64::ST3i32);
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectStoreLane(Node, 3, AArch64::ST3i64);
case Intrinsic::aarch64_neon_st4lane: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectStoreLane(Node, 4, AArch64::ST4i8);
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectStoreLane(Node, 4, AArch64::ST4i16);
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectStoreLane(Node, 4, AArch64::ST4i32);
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectStoreLane(Node, 4, AArch64::ST4i64);
case Intrinsic::aarch64_sve_st2: {
if (VT == MVT::nxv16i8) {
- SelectPredicatedStore</*Scale=*/0>(Node, 2, AArch64::ST2B,
- AArch64::ST2B_IMM);
+ SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM);
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
(VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
- SelectPredicatedStore</*Scale=*/1>(Node, 2, AArch64::ST2H,
- AArch64::ST2H_IMM);
+ SelectPredicatedStore(Node, 2, 1, AArch64::ST2H, AArch64::ST2H_IMM);
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- SelectPredicatedStore</*Scale=*/2>(Node, 2, AArch64::ST2W,
- AArch64::ST2W_IMM);
+ SelectPredicatedStore(Node, 2, 2, AArch64::ST2W, AArch64::ST2W_IMM);
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- SelectPredicatedStore</*Scale=*/3>(Node, 2, AArch64::ST2D,
- AArch64::ST2D_IMM);
+ SelectPredicatedStore(Node, 2, 3, AArch64::ST2D, AArch64::ST2D_IMM);
case Intrinsic::aarch64_sve_st3: {
if (VT == MVT::nxv16i8) {
- SelectPredicatedStore</*Scale=*/0>(Node, 3, AArch64::ST3B,
- AArch64::ST3B_IMM);
+ SelectPredicatedStore(Node, 3, 0, AArch64::ST3B, AArch64::ST3B_IMM);
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
(VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
- SelectPredicatedStore</*Scale=*/1>(Node, 3, AArch64::ST3H,
- AArch64::ST3H_IMM);
+ SelectPredicatedStore(Node, 3, 1, AArch64::ST3H, AArch64::ST3H_IMM);
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- SelectPredicatedStore</*Scale=*/2>(Node, 3, AArch64::ST3W,
- AArch64::ST3W_IMM);
+ SelectPredicatedStore(Node, 3, 2, AArch64::ST3W, AArch64::ST3W_IMM);
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- SelectPredicatedStore</*Scale=*/3>(Node, 3, AArch64::ST3D,
- AArch64::ST3D_IMM);
+ SelectPredicatedStore(Node, 3, 3, AArch64::ST3D, AArch64::ST3D_IMM);
case Intrinsic::aarch64_sve_st4: {
if (VT == MVT::nxv16i8) {
- SelectPredicatedStore</*Scale=*/0>(Node, 4, AArch64::ST4B,
- AArch64::ST4B_IMM);
+ SelectPredicatedStore(Node, 4, 0, AArch64::ST4B, AArch64::ST4B_IMM);
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
(VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
- SelectPredicatedStore</*Scale=*/1>(Node, 4, AArch64::ST4H,
- AArch64::ST4H_IMM);
+ SelectPredicatedStore(Node, 4, 1, AArch64::ST4H, AArch64::ST4H_IMM);
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- SelectPredicatedStore</*Scale=*/2>(Node, 4, AArch64::ST4W,
- AArch64::ST4W_IMM);
+ SelectPredicatedStore(Node, 4, 2, AArch64::ST4W, AArch64::ST4W_IMM);
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- SelectPredicatedStore</*Scale=*/3>(Node, 4, AArch64::ST4D,
- AArch64::ST4D_IMM);
+ SelectPredicatedStore(Node, 4, 3, AArch64::ST4D, AArch64::ST4D_IMM);
case AArch64ISD::LD2post: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
case AArch64ISD::LD3post: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
case AArch64ISD::LD4post: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
case AArch64ISD::LD1x2post: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
case AArch64ISD::LD1x3post: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
case AArch64ISD::LD1x4post: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
case AArch64ISD::LD1DUPpost: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
case AArch64ISD::LD2DUPpost: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
case AArch64ISD::LD3DUPpost: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
case AArch64ISD::LD4DUPpost: {
if (VT == MVT::v8i8) {
SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
case AArch64ISD::LD1LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
case AArch64ISD::LD2LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
case AArch64ISD::LD3LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
case AArch64ISD::LD4LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
case AArch64ISD::ST2post: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v8i8) {
SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
case AArch64ISD::ST3post: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v8i8) {
SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
case AArch64ISD::ST4post: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v8i8) {
SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
case AArch64ISD::ST1x2post: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v8i8) {
SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
case AArch64ISD::ST1x3post: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v8i8) {
SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 ) {
SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
case AArch64ISD::ST1x4post: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v8i8) {
SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
} else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
} else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
} else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
} else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
} else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
case AArch64ISD::ST2LANEpost: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
case AArch64ISD::ST3LANEpost: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
case AArch64ISD::ST4LANEpost: {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v16i8 || VT == MVT::v8i8) {
SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32) {
SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
} else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
VT == MVT::v1f64) {
SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
case AArch64ISD::SVE_LD2_MERGE_ZERO: {
if (VT == MVT::nxv16i8) {
- SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM);
+ SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B);
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
(VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
- SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM);
+ SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H);
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM);
+ SelectPredicatedLoad(Node, 2, 2, AArch64::LD2W_IMM, AArch64::LD2W);
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM);
+ SelectPredicatedLoad(Node, 2, 3, AArch64::LD2D_IMM, AArch64::LD2D);
case AArch64ISD::SVE_LD3_MERGE_ZERO: {
if (VT == MVT::nxv16i8) {
- SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM);
+ SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B);
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
(VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
- SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM);
+ SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H);
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM);
+ SelectPredicatedLoad(Node, 3, 2, AArch64::LD3W_IMM, AArch64::LD3W);
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM);
+ SelectPredicatedLoad(Node, 3, 3, AArch64::LD3D_IMM, AArch64::LD3D);
case AArch64ISD::SVE_LD4_MERGE_ZERO: {
if (VT == MVT::nxv16i8) {
- SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM);
+ SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B);
} else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
(VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
- SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM);
+ SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H);
} else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
- SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM);
+ SelectPredicatedLoad(Node, 4, 2, AArch64::LD4W_IMM, AArch64::LD4W);
} else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
- SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM);
+ SelectPredicatedLoad(Node, 4, 3, AArch64::LD4D_IMM, AArch64::LD4D);
// Select the default instruction
/// createAArch64ISelDag - This pass converts a legalized DAG into an
/// AArch64-specific DAG, ready for instruction scheduling.
///
/// \p TM and \p OptLevel are forwarded unchanged to the AArch64DAGToDAGISel
/// constructor. The returned pass is allocated with `new`.
FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
CodeGenOpt::Level OptLevel) {
return new AArch64DAGToDAGISel(TM, OptLevel);
/// When \p PredVT is a scalable vector predicate in the form
/// MVT::nx<M>xi1, it builds the corresponding scalable vector of
-/// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. If the input
+/// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. When targeting
+/// structured vectors (NumVec >1), the output data type is
+/// MVT::nx<M*NumVec>xi<bits> s.t. M x bits = 128. If the input
/// PredVT is not in the form MVT::nx<M>xi1, it returns an invalid
/// EVT.
-static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) {
+static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT,
+ unsigned NumVec) {
+ assert(NumVec > 0 && NumVec < 5 && "Invalid number of vectors.");
// Reject anything that is not a scalable vector of i1 elements.
if (!PredVT.isScalableVector() || PredVT.getVectorElementType() != MVT::i1)
return EVT();
// Of the scalable i1 vectors, only nxv16i1, nxv8i1, nxv4i1 and nxv2i1 are
// accepted; any other element count yields an invalid EVT.
if (PredVT != MVT::nxv16i1 && PredVT != MVT::nxv8i1 &&
PredVT != MVT::nxv4i1 && PredVT != MVT::nxv2i1)
return EVT();
ElementCount EC = PredVT.getVectorElementCount();
// The integer element width is the SVE block size divided by the predicate's
// minimum element count, so fewer predicate lanes means wider elements.
EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min);
- EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC);
+ EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC * NumVec);
return MemVT;
/// Return the EVT of the data associated to a memory operation in \p
/// Root. If such EVT cannot be retrieved, it returns an invalid EVT.
///
/// \p Ctx is only used when the type has to be built from a predicate type.
static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) {
// Generic memory nodes report their memory type directly.
if (isa<MemSDNode>(Root))
return cast<MemSDNode>(Root)->getMemoryVT();
if (isa<MemIntrinsicSDNode>(Root))
return cast<MemIntrinsicSDNode>(Root)->getMemoryVT();
const unsigned Opcode = Root->getOpcode();
// For custom ISD nodes, we have to look at them individually to extract the
// type of the data moved to/from memory.
switch (Opcode) {
// LD1_MERGE_ZERO and ST1_PRED carry the memory type as a VTSDNode operand
// (operand 3 and operand 4 respectively).
case AArch64ISD::LD1_MERGE_ZERO:
return cast<VTSDNode>(Root->getOperand(3))->getVT();
case AArch64ISD::ST1_PRED:
return cast<VTSDNode>(Root->getOperand(4))->getVT();
// Structured loads: the value type of operand 1 is passed as the predicate
// type, and the result is widened by the number of vectors being loaded.
+ case AArch64ISD::SVE_LD2_MERGE_ZERO:
+ return getPackedVectorTypeFromPredicateType(
+ Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/2);
+ case AArch64ISD::SVE_LD3_MERGE_ZERO:
+ return getPackedVectorTypeFromPredicateType(
+ Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/3);
+ case AArch64ISD::SVE_LD4_MERGE_ZERO:
+ return getPackedVectorTypeFromPredicateType(
+ Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/4);
// Past this point only an INTRINSIC_VOID node for the SVE prefetch intrinsic
// is handled; everything else yields an invalid EVT.
if (Opcode != ISD::INTRINSIC_VOID)
return EVT();
// NOTE(review): the initializer expression of IntNo is not visible in this
// view; presumably it reads the intrinsic ID from one of Root's operands --
// confirm against the full file.
const unsigned IntNo =
if (IntNo != Intrinsic::aarch64_sve_prf)
return EVT();
// We are using an SVE prefetch intrinsic. Type must be inferred
// from the width of the predicate.
return getPackedVectorTypeFromPredicateType(
- Ctx, Root->getOperand(2)->getValueType(0));
+ Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/1);
/// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode:
/// Base + OffImm * sizeof(MemVT) for Min <= OffImm <= Max
/// where Root is the memory access using N for its address.
///
/// Returns true and sets \p Base and \p OffImm on a match. Returns false,
/// leaving both outputs untouched, when the memory type of \p Root cannot be
/// determined, when \p N is not an ADD whose second operand is a VSCALE node,
/// or when the VSCALE multiplier is not a whole multiple of the memory width
/// that lands inside [Min, Max].
template <int64_t Min, int64_t Max>
bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
SDValue &Base,
SDValue &OffImm) {
const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root);
// An invalid EVT means the memory type could not be determined.
if (MemVT == EVT())
return false;
// Only addresses of the form ADD(base, VSCALE(imm)) are handled.
if (N.getOpcode() != ISD::ADD)
return false;
SDValue VScale = N.getOperand(1);
if (VScale.getOpcode() != ISD::VSCALE)
return false;
// Work in bytes, using the known minimum size of the memory type.
TypeSize TS = MemVT.getSizeInBits();
int64_t MemWidthBytes = static_cast<int64_t>(TS.getKnownMinSize()) / 8;
// cast<> (rather than dyn_cast<>) assumes the VSCALE operand is a constant.
int64_t MulImm = cast<ConstantSDNode>(VScale.getOperand(0))->getSExtValue();
// The immediate is expressed in units of the memory width, so the multiplier
// must divide evenly.
if ((MulImm % MemWidthBytes) != 0)
return false;
int64_t Offset = MulImm / MemWidthBytes;
if (Offset < Min || Offset > Max)
return false;
Base = N.getOperand(0);
OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
return true;
/// Select register plus register addressing mode for SVE, with scaled
/// offset.
///
/// \p N must be an ADD. When \p Scale is 0, the two ADD operands are returned
/// as \p Base and \p Offset directly. Otherwise the second ADD operand must be
/// a SHL whose shift amount is a constant equal to \p Scale; \p Base is then
/// the first ADD operand and \p Offset is the value being shifted. Returns
/// false, without writing to the outputs, when \p N does not match.
bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale,
SDValue &Base,
SDValue &Offset) {
if (N.getOpcode() != ISD::ADD)
return false;
// Process an ADD node.
const SDValue LHS = N.getOperand(0);
const SDValue RHS = N.getOperand(1);
// 8 bit data does not come with the SHL node, so it is treated
// separately.
if (Scale == 0) {
Base = LHS;
Offset = RHS;
return true;
// Check if the RHS is a shift node with a constant.
if (RHS.getOpcode() != ISD::SHL)
return false;
const SDValue ShiftRHS = RHS.getOperand(1);
// A non-constant shift amount, or one that differs from Scale, is rejected.
if (auto *C = dyn_cast<ConstantSDNode>(ShiftRHS))
if (C->getZExtValue() == Scale) {
Base = LHS;
Offset = RHS.getOperand(0);
return true;
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 85db14ab66fe..1500da2fdfc7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1,15156 +1,15177 @@
//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file implements the AArch64TargetLowering class.
#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "aarch64-lower"
// Counters declared through the STATISTIC macro; each pairs a counter name
// with the description string given alongside it.
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
// NOTE(review): the initial value of this hidden option is not visible in this
// view; the FIXME above implies it is off by default -- confirm.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
"aarch64-elf-ldtls-generation", cl::Hidden,
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
// Hidden, file-local option "aarch64-enable-logical-imm".
// NOTE(review): the rest of its description and its initial value are not
// visible in this view.
static cl::opt<bool>
EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
cl::desc("Enable AArch64 logical imm instruction "
/// Value type used for condition codes (a 32-bit integer, MVT::i32).
static const MVT MVT_CC = MVT::i32;
/// Returns true if VT's elements occupy the lowest bit positions of its
/// associated register class without any intervening space.
/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
/// same register class, but only nxv8f16 can be treated as a packed vector.
///
/// \p VT must be a vector type that is legal for the target; this is asserted.
/// \p DAG is used only to query type legality for that assertion.
static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
"Expected legal vector type!");
// Fixed-length vectors always count as packed. A scalable vector is packed
// only when its known minimum size equals AArch64::SVEBitsPerBlock.
return VT.isFixedLengthVector() ||
VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
// we have to make something up. Arbitrarily, choose ZeroOrOne.
// When comparing vectors the result sets the different elements in the
// vector to all-one or all-zero.
// Set up the register classes.
addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
if (Subtarget->hasFPARMv8()) {
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
if (Subtarget->hasNEON()) {
addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
// Someone set us up the NEON.
if (Subtarget->hasSVE()) {
// Add legal sve predicate types
addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
// Add legal sve data types
addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
if (Subtarget->hasBF16()) {
addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
if (useSVEForFixedLengthVectors()) {
for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
addRegisterClass(VT, &AArch64::ZPRRegClass);
for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
addRegisterClass(VT, &AArch64::ZPRRegClass);
for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
for (auto VT :
{ MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
for (auto VT :
{ MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32,
MVT::nxv2f64 }) {
setCondCodeAction(ISD::SETO, VT, Expand);
setCondCodeAction(ISD::SETOLT, VT, Expand);
setCondCodeAction(ISD::SETOLE, VT, Expand);
setCondCodeAction(ISD::SETULT, VT, Expand);
setCondCodeAction(ISD::SETULE, VT, Expand);
setCondCodeAction(ISD::SETUGE, VT, Expand);
setCondCodeAction(ISD::SETUGT, VT, Expand);
setCondCodeAction(ISD::SETUEQ, VT, Expand);
setCondCodeAction(ISD::SETUNE, VT, Expand);
// Compute derived properties from the register classes
// Provide all sorts of operation actions
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
setOperationAction(ISD::SETCC, MVT::i32, Custom);
setOperationAction(ISD::SETCC, MVT::i64, Custom);
setOperationAction(ISD::SETCC, MVT::f16, Custom);
setOperationAction(ISD::SETCC, MVT::f32, Custom);
setOperationAction(ISD::SETCC, MVT::f64, Custom);
setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
setOperationAction(ISD::BR_CC, MVT::f16, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f16, Custom);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Custom);
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::FREM, MVT::f32, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f80, Expand);
setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
// Custom lowering hooks are needed for XOR
// to fold it into CSINC/CSINV.
setOperationAction(ISD::XOR, MVT::i32, Custom);
setOperationAction(ISD::XOR, MVT::i64, Custom);
// Virtually no operation on f128 is legal, but LLVM can't expand them when
// there's a valid register class, so we need custom operations in most cases.
setOperationAction(ISD::FABS, MVT::f128, Expand);
setOperationAction(ISD::FADD, MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
setOperationAction(ISD::FCOS, MVT::f128, Expand);
setOperationAction(ISD::FDIV, MVT::f128, Custom);
setOperationAction(ISD::FMA, MVT::f128, Expand);
setOperationAction(ISD::FMUL, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Expand);
setOperationAction(ISD::FPOW, MVT::f128, Expand);
setOperationAction(ISD::FREM, MVT::f128, Expand);
setOperationAction(ISD::FRINT, MVT::f128, Expand);
setOperationAction(ISD::FSIN, MVT::f128, Expand);
setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
setOperationAction(ISD::FSQRT, MVT::f128, Expand);
setOperationAction(ISD::FSUB, MVT::f128, Custom);
setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
setOperationAction(ISD::SETCC, MVT::f128, Custom);
setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
setOperationAction(ISD::BR_CC, MVT::f128, Custom);
setOperationAction(ISD::SELECT, MVT::f128, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
// Lowering for many of the conversions is actually specified by the non-f128
// type. The LowerXXX function will be trivial when f128 isn't involved.
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
// Variable arguments.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Custom);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
// Variable-sized objects.
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
if (Subtarget->isTargetWindows())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
// Constant pool entries
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
// BlockAddress
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
// Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
setOperationAction(ISD::ADDC, MVT::i32, Custom);
setOperationAction(ISD::ADDE, MVT::i32, Custom);
setOperationAction(ISD::SUBC, MVT::i32, Custom);
setOperationAction(ISD::SUBE, MVT::i32, Custom);
setOperationAction(ISD::ADDC, MVT::i64, Custom);
setOperationAction(ISD::ADDE, MVT::i64, Custom);
setOperationAction(ISD::SUBC, MVT::i64, Custom);
setOperationAction(ISD::SUBE, MVT::i64, Custom);
// AArch64 lacks both left-rotate and popcount instructions.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
// AArch64 doesn't have i32 MULH{S|U}.
setOperationAction(ISD::MULHU, MVT::i32, Expand);
setOperationAction(ISD::MULHS, MVT::i32, Expand);
// AArch64 doesn't have {U|S}MUL_LOHI.
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
setOperationAction(ISD::CTPOP, MVT::i128, Custom);
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::SREM, MVT::i64, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i64, Expand);
// Custom lower Add/Sub/Mul with overflow.
setOperationAction(ISD::SADDO, MVT::i32, Custom);
setOperationAction(ISD::SADDO, MVT::i64, Custom);
setOperationAction(ISD::UADDO, MVT::i32, Custom);
setOperationAction(ISD::UADDO, MVT::i64, Custom);
setOperationAction(ISD::SSUBO, MVT::i32, Custom);
setOperationAction(ISD::SSUBO, MVT::i64, Custom);
setOperationAction(ISD::USUBO, MVT::i32, Custom);
setOperationAction(ISD::USUBO, MVT::i64, Custom);
setOperationAction(ISD::SMULO, MVT::i32, Custom);
setOperationAction(ISD::SMULO, MVT::i64, Custom);
setOperationAction(ISD::UMULO, MVT::i32, Custom);
setOperationAction(ISD::UMULO, MVT::i64, Custom);
setOperationAction(ISD::FSIN, MVT::f32, Expand);
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f32, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FPOW, MVT::f32, Expand);
setOperationAction(ISD::FPOW, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
if (Subtarget->hasFullFP16())
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
setOperationAction(ISD::FREM, MVT::f16, Promote);
setOperationAction(ISD::FREM, MVT::v4f16, Expand);
setOperationAction(ISD::FREM, MVT::v8f16, Expand);
setOperationAction(ISD::FPOW, MVT::f16, Promote);
setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
setOperationAction(ISD::FPOWI, MVT::f16, Promote);
setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
setOperationAction(ISD::FCOS, MVT::f16, Promote);
setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
setOperationAction(ISD::FSIN, MVT::f16, Promote);
setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
setOperationAction(ISD::FEXP, MVT::f16, Promote);
setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
setOperationAction(ISD::FEXP2, MVT::f16, Promote);
setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG, MVT::f16, Promote);
setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG2, MVT::f16, Promote);
setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG10, MVT::f16, Promote);
setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
if (!Subtarget->hasFullFP16()) {
setOperationAction(ISD::SELECT, MVT::f16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
setOperationAction(ISD::SETCC, MVT::f16, Promote);
setOperationAction(ISD::BR_CC, MVT::f16, Promote);
setOperationAction(ISD::FADD, MVT::f16, Promote);
setOperationAction(ISD::FSUB, MVT::f16, Promote);
setOperationAction(ISD::FMUL, MVT::f16, Promote);
setOperationAction(ISD::FDIV, MVT::f16, Promote);
setOperationAction(ISD::FMA, MVT::f16, Promote);
setOperationAction(ISD::FNEG, MVT::f16, Promote);
setOperationAction(ISD::FABS, MVT::f16, Promote);
setOperationAction(ISD::FCEIL, MVT::f16, Promote);
setOperationAction(ISD::FSQRT, MVT::f16, Promote);
setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
setOperationAction(ISD::FRINT, MVT::f16, Promote);
setOperationAction(ISD::FROUND, MVT::f16, Promote);
setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
// promote v4f16 to v4f32 when that is known to be safe.
setOperationAction(ISD::FADD, MVT::v4f16, Promote);
setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
setOperationAction(ISD::FABS, MVT::v4f16, Expand);
setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
setOperationAction(ISD::FMA, MVT::v4f16, Expand);
setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
setOperationAction(ISD::FABS, MVT::v8f16, Expand);
setOperationAction(ISD::FADD, MVT::v8f16, Expand);
setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
setOperationAction(ISD::FMA, MVT::v8f16, Expand);
setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
// AArch64 has implementations of a lot of rounding-like FP operations.
for (MVT Ty : {MVT::f32, MVT::f64}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
setOperationAction(ISD::FMINNUM, Ty, Legal);
setOperationAction(ISD::FMAXNUM, Ty, Legal);
setOperationAction(ISD::FMINIMUM, Ty, Legal);
setOperationAction(ISD::FMAXIMUM, Ty, Legal);
setOperationAction(ISD::LROUND, Ty, Legal);
setOperationAction(ISD::LLROUND, Ty, Legal);
setOperationAction(ISD::LRINT, Ty, Legal);
setOperationAction(ISD::LLRINT, Ty, Legal);
if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
setOperationAction(ISD::FCEIL, MVT::f16, Legal);
setOperationAction(ISD::FRINT, MVT::f16, Legal);
setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
setOperationAction(ISD::FROUND, MVT::f16, Legal);
setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
// 128-bit loads and stores can be done without expanding
setOperationAction(ISD::LOAD, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::i128, Custom);
// 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
// custom lowering, as there are no un-paired non-temporal stores and
// legalization will break up 256 bit inputs.
setOperationAction(ISD::STORE, MVT::v32i8, Custom);
setOperationAction(ISD::STORE, MVT::v16i16, Custom);
setOperationAction(ISD::STORE, MVT::v16f16, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v8f32, Custom);
setOperationAction(ISD::STORE, MVT::v4f64, Custom);
setOperationAction(ISD::STORE, MVT::v4i64, Custom);
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
// Issue __sincos_stret if available.
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
} else {
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
if (Subtarget->getTargetTriple().isOSMSVCRT()) {
// MSVCRT doesn't have powi; fall back to pow
setLibcallName(RTLIB::POWI_F32, nullptr);
setLibcallName(RTLIB::POWI_F64, nullptr);
// Make floating-point constants legal for the large code model, so they don't
// become loads from the constant pool.
if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
// AArch64 does not have floating-point extending loads, i1 sign-extending
// load, floating-point truncating stores, or v2i32->v2i16 truncating store.
for (MVT VT : MVT::fp_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f128, MVT::f80, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
setOperationAction(ISD::BITCAST, MVT::f16, Custom);
setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
// Indexed loads and stores are supported.
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, MVT::i8, Legal);
setIndexedLoadAction(im, MVT::i16, Legal);
setIndexedLoadAction(im, MVT::i32, Legal);
setIndexedLoadAction(im, MVT::i64, Legal);
setIndexedLoadAction(im, MVT::f64, Legal);
setIndexedLoadAction(im, MVT::f32, Legal);
setIndexedLoadAction(im, MVT::f16, Legal);
setIndexedLoadAction(im, MVT::bf16, Legal);
setIndexedStoreAction(im, MVT::i8, Legal);
setIndexedStoreAction(im, MVT::i16, Legal);
setIndexedStoreAction(im, MVT::i32, Legal);
setIndexedStoreAction(im, MVT::i64, Legal);
setIndexedStoreAction(im, MVT::f64, Legal);
setIndexedStoreAction(im, MVT::f32, Legal);
setIndexedStoreAction(im, MVT::f16, Legal);
setIndexedStoreAction(im, MVT::bf16, Legal);
// Trap.
setOperationAction(ISD::TRAP, MVT::Other, Legal);
if (Subtarget->isTargetWindows())
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// We combine OR nodes for bitfield operations.
// Try to create BICs for vector ANDs.
// Vector add and sub nodes may conceal a high-half opportunity.
// Also, try to fold ADD into CSINC/CSINV.
if (Subtarget->supportsAddressTopByteIgnored())
// In case of strict alignment, avoid an excessive number of byte wide stores.
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemset = Subtarget->requiresStrictAlign()
? MaxStoresPerMemsetOptSize : 32;
MaxGluedStoresPerMemcpy = 4;
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
? MaxStoresPerMemcpyOptSize : 16;
MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
MaxLoadsPerMemcmpOptSize = 4;
MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
? MaxLoadsPerMemcmpOptSize : 8;
EnableExtLdPromotion = true;
// Set required alignment.
// Set preferred alignments.
setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));
// Only change the limit for entries in a jump table if specified by
// the sub target, but not at the command line.
unsigned MaxJT = STI.getMaximumJumpTableSize();
if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
if (Subtarget->hasNEON()) {
// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
// silliness like this:
setOperationAction(ISD::FABS, MVT::v1f64, Expand);
setOperationAction(ISD::FADD, MVT::v1f64, Expand);
setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
setOperationAction(ISD::FMA, MVT::v1f64, Expand);
setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
setOperationAction(ISD::FREM, MVT::v1f64, Expand);
setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
setOperationAction(ISD::MUL, MVT::v1i64, Expand);
// AArch64 doesn't have direct vector ->f32 conversion instructions for
// elements smaller than i32, so promote the input to i32 first.
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
// i8 vector elements also need promotion to i32 for v8i8
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
// Similarly, there is no direct i32 -> f64 vector conversion instruction.
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
// Or, direct i32 -> f16 vector conversion. Set it to custom, so the
// conversion happens in two steps: v4i32 -> v4f32 -> v4f16
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
if (Subtarget->hasFullFP16()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
} else {
// when AArch64 doesn't have fullfp16 support, promote the input
// to i32 first.
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
// AArch64 doesn't have MUL.2d:
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
// Custom handling for some quad-vector types to detect MULL.
setOperationAction(ISD::MUL, MVT::v8i16, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
// Vector reductions
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
// Saturates
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
setOperationAction(ISD::TRUNCATE, VT, Custom);
for (MVT VT : { MVT::v4f16, MVT::v2f32,
MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
// directly.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
setOperationAction(ISD::MULHS, VT, Legal);
setOperationAction(ISD::MULHU, VT, Legal);
} else {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
// AArch64 has implementations of a lot of rounding-like FP operations.
for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
if (Subtarget->hasFullFP16()) {
for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
if (Subtarget->hasSVE())
setOperationAction(ISD::VSCALE, MVT::i32, Custom);
setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
if (Subtarget->hasSVE()) {
// FIXME: Add custom lowering of MLOAD to handle different passthrus (not a
// splat of 0 or undef) once vector selects supported in SVE codegen. See
// D68877 for more details.
for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
if (isTypeLegal(VT)) {
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);
setOperationAction(ISD::SMIN, VT, Custom);
setOperationAction(ISD::UMIN, VT, Custom);
setOperationAction(ISD::SMAX, VT, Custom);
setOperationAction(ISD::UMAX, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
- if (VT.getScalarType() == MVT::i1)
+ if (VT.getScalarType() == MVT::i1) {
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
+ }
for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32})
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
if (isTypeLegal(VT)) {
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::FMA, VT, Custom);
// NOTE: Currently this has to happen after computeRegisterProperties rather
// than the preferred option of combining it with the addRegisterClass call.
if (useSVEForFixedLengthVectors()) {
for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
// 64bit results can mean a bigger than NEON input.
for (auto VT : {MVT::v8i8, MVT::v4i16})
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
// 128bit results imply a bigger than NEON input.
for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
setOperationAction(ISD::TRUNCATE, VT, Custom);
for (auto VT : {MVT::v8f16, MVT::v4f32})
setOperationAction(ISD::FP_ROUND, VT, Expand);
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
/// Configure the legalization actions (Legal / Custom / Expand / promoted)
/// for a NEON vector type.
///
/// \param VT the vector type being configured; asserted to be a vector.
/// \param PromotedBitwiseVT not referenced by any line visible in this
///        excerpt. NOTE(review): confirm whether it is still used upstream.
///
/// NOTE(review): this excerpt has lost its closing braces, so the exact extent
/// of each `if`/`for` body below cannot be determined from here.
void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
assert(VT.isVector() && "VT should be a vector type");
if (VT.isFloatingPoint()) {
// Floating-point vector loads and stores are promoted to the vector type with
// the same shape but integer elements.
MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
// Mark vector float intrinsics as expand.
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
// But we do support custom-lowering for FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
// Element access, vector construction, shuffles, shifts, OR and SETCC are
// all routed through custom lowering.
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::OR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
// All select forms are expanded for this type.
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
// Any-extending loads involving VT are expanded for every value type.
for (MVT InnerVT : MVT::all_valuetypes())
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// CNT supports only B element sizes, then use UADDLP to widen.
if (VT != MVT::v8i8 && VT != MVT::v16i8)
setOperationAction(ISD::CTPOP, VT, Custom);
// Division and remainder are expanded for this type.
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
if (!VT.isFloatingPoint())
setOperationAction(ISD::ABS, VT, Legal);
// [SU][MIN|MAX] are available for all NEON types apart from i64.
if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
setOperationAction(Opcode, VT, Legal);
// F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
// f16 element vectors only qualify when the subtarget has full FP16 support.
// NOTE(review): the opcode list of the range-for below is missing from this
// excerpt; restore it from upstream.
if (VT.isFloatingPoint() &&
(VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
for (unsigned Opcode :
setOperationAction(Opcode, VT, Legal);
// On little-endian subtargets every indexed addressing mode from PRE_INC up
// to (but excluding) LAST_INDEXED_MODE is marked Legal for loads and stores.
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, VT, Legal);
setIndexedStoreAction(im, VT, Legal);
/// Configure the legalization actions for a fixed length vector type.
///
/// Every opcode below ISD::BUILTIN_OP_END is first set to Expand; a small set
/// of operations is then switched to Custom lowering.
///
/// \param VT the type being configured; asserted to be a fixed length vector.
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
// By default everything must be expanded.
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
setOperationAction(Op, VT, Expand);
// We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
// Lower fixed length vector operations to scalable equivalents.
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::LOAD, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
/// Register \p VT in the FPR64 register class and apply the NEON type
/// configuration to it, passing v2i32 as the promoted bitwise type.
void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR64RegClass);
addTypeForNEON(VT, MVT::v2i32);
/// Register \p VT in the FPR128 register class and apply the NEON type
/// configuration to it, passing v4i32 as the promoted bitwise type.
void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR128RegClass);
addTypeForNEON(VT, MVT::v4i32);
/// Return the value type produced by a SETCC whose operands have type \p VT.
///
/// \param C  context used to build the scalable-vector result type.
/// \param VT operand type of the comparison.
/// \returns i32 for non-vector types; for scalable vectors, a vector of i1
///          with the same element count as \p VT; otherwise \p VT with its
///          element type changed to integer.
EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
LLVMContext &C, EVT VT) const {
if (!VT.isVector())
return MVT::i32;
if (VT.isScalableVector())
return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
return VT.changeVectorElementTypeToInteger();
/// Try to rewrite the immediate operand of a logical operation, using the
/// freedom given by non-demanded bits, into a value that is all-zeros,
/// all-ones, or encodable as a logical immediate.
///
/// \param Op       node being rewritten; its operand 0 is reused unchanged.
/// \param Size     bit width of the operation (used to build the mask and to
///                 encode the new immediate).
/// \param Imm      current immediate value.
/// \param Demanded bits of the result that are actually demanded.
/// \param TLO      provides the DAG and receives the replacement via CombineTo.
/// \param NewOpc   machine opcode used when a machine node is created.
/// \returns false if the immediate is left untouched; otherwise the result of
///          TLO.CombineTo.
///
/// NOTE(review): this excerpt has lost closing braces and a few short lines
/// (see the notes below); confirm against upstream before relying on it.
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
const APInt &Demanded,
TargetLowering::TargetLoweringOpt &TLO,
unsigned NewOpc) {
uint64_t OldImm = Imm, NewImm, Enc;
// Mask has the low Size bits set; OrigMask keeps that value because Mask is
// shrunk during the search below.
uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
// Return if the immediate is already all zeros, all ones, a bimm32 or a
// bimm64.
if (Imm == 0 || Imm == Mask ||
AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
return false;
unsigned EltSize = Size;
uint64_t DemandedBits = Demanded.getZExtValue();
// Clear bits that are not demanded.
Imm &= DemandedBits;
while (true) {
// The goal here is to set the non-demanded bits in a way that minimizes
// the number of switching between 0 and 1. In order to achieve this goal,
// we set the non-demanded bits to the value of the preceding demanded bits.
// For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
// non-demanded bit), we copy bit0 (1) to the least significant 'x',
// bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
// The final result is 0b11000011.
uint64_t NonDemandedBits = ~DemandedBits;
uint64_t InvertedImm = ~Imm & DemandedBits;
// NOTE(review): the right-hand operand of the trailing `&` below is missing
// from this excerpt.
uint64_t RotatedImm =
((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
uint64_t Sum = RotatedImm + NonDemandedBits;
bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
uint64_t Ones = (Sum + Carry) & NonDemandedBits;
NewImm = (Imm | Ones) & Mask;
// If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
// or all-ones or all-zeros, in which case we can stop searching. Otherwise,
// we halve the element size and continue the search.
// NOTE(review): the statement guarded by this `if` is missing from this
// excerpt (presumably a `break` out of the loop).
if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
// We cannot shrink the element size any further if it is 2-bits.
if (EltSize == 2)
return false;
EltSize /= 2;
Mask >>= EltSize;
uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
// Return if there is mismatch in any of the demanded bits of Imm and Hi.
if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
return false;
// Merge the upper and lower halves of Imm and DemandedBits.
Imm |= Hi;
DemandedBits |= DemandedBitsHi;
// Replicate the element across the register width.
while (EltSize < Size) {
NewImm |= NewImm << EltSize;
EltSize *= 2;
assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
"demanded bits should never be altered");
assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
// Create the new constant immediate node.
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue New;
// If the new constant immediate is all-zeros or all-ones, let the target
// independent DAG combine optimize this node.
if (NewImm == 0 || NewImm == OrigMask) {
New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
TLO.DAG.getConstant(NewImm, DL, VT));
// Otherwise, create a machine node so that target independent DAG combine
// doesn't undo this optimization.
} else {
Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
New = SDValue(
TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
return TLO.CombineTo(Op, New);
/// Target hook that tries to replace the constant operand of a scalar
/// AND/OR/XOR with a cheaper immediate, given which result bits are demanded.
///
/// \param Op           the node to inspect; its operand 1 must be a constant.
/// \param DemandedBits the result bits that are demanded.
/// \param DemandedElts not referenced by any line visible in this excerpt.
/// \param TLO          legalization state and DAG access.
/// \returns false when no change is made (not yet in the LegalOps phase,
///          optimization disabled, vector type, all bits demanded, or a
///          non-constant operand 1); otherwise the result of
///          optimizeLogicalImm.
bool AArch64TargetLowering::targetShrinkDemandedConstant(
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
TargetLoweringOpt &TLO) const {
// Delay this optimization to as late as possible.
if (!TLO.LegalOps)
return false;
if (!EnableOptimizeLogicalImm)
return false;
EVT VT = Op.getValueType();
if (VT.isVector())
return false;
unsigned Size = VT.getSizeInBits();
assert((Size == 32 || Size == 64) &&
"i32 or i64 is expected after legalization.");
// Exit early if we demand all bits.
if (DemandedBits.countPopulation() == Size)
return false;
// Pick the immediate-form machine opcode matching the operation and width.
// NOTE(review): the `default:` label and the `break;` after each case appear
// to be missing from this excerpt.
unsigned NewOpc;
switch (Op.getOpcode()) {
return false;
case ISD::AND:
NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
case ISD::OR:
NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
case ISD::XOR:
NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
// Only a constant second operand can be rewritten.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!C)
return false;
uint64_t Imm = C->getZExtValue();
return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them in Known.
///
/// \param Op    the target-specific node to analyse.
/// \param Known receives the known-zero / known-one bits.
/// \param DAG   used to query known bits of operands.
/// \param Depth current recursion depth; operand queries use Depth + 1.
///
/// NOTE(review): several `case` labels, `break;` statements and closing braces
/// are missing from this excerpt; the notes below mark the affected spots.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
const SDValue Op, KnownBits &Known,
const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
switch (Op.getOpcode()) {
case AArch64ISD::CSEL: {
// A bit is known for CSEL only if it is known, with the same value, in both
// operand 0 and operand 1.
KnownBits Known2;
Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
case AArch64ISD::LOADgot:
case AArch64ISD::ADDlow: {
// NOTE(review): the statement guarded by this `if` appears to be missing from
// this excerpt (presumably an early `break`), since the comment below applies
// to ILP32 only.
if (!Subtarget->isTargetILP32())
// In ILP32 mode all valid pointers are in the low 4GB of the address-space.
Known.Zero = APInt::getHighBitsSet(64, 32);
// NOTE(review): the `case` label introducing this section is missing from
// this excerpt. Here operand 1 carries the intrinsic ID.
ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
switch (IntID) {
default: return;
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
// All bits above the scalar size of the memory type are marked known zero.
unsigned BitWidth = Known.getBitWidth();
EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
// NOTE(review): the `case` label introducing this section is missing from
// this excerpt. Here operand 0 carries the intrinsic number.
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IntNo) {
case Intrinsic::aarch64_neon_umaxv:
case Intrinsic::aarch64_neon_uminv: {
// Figure out the datatype of the vector operand. The UMINV instruction
// will zero extend the result, so we can mark as known zero all the
// bits larger than the element datatype. 32-bit or larger doesn't need
// this as those are legal types and will be handled by isel directly.
MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
unsigned BitWidth = Known.getBitWidth();
if (VT == MVT::v8i8 || VT == MVT::v16i8) {
assert(BitWidth >= 8 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
Known.Zero |= Mask;
} else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
assert(BitWidth >= 16 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
Known.Zero |= Mask;
} break;
/// Return the type used for scalar shift amounts. Both parameters are
/// ignored; the result is always i64.
MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
EVT) const {
return MVT::i64;
/// Report whether misaligned memory accesses of type \p VT are allowed.
///
/// \param VT    the accessed value type.
/// \param Align the alignment of the access (a plain unsigned in this
///              overload).
/// \param Fast  optional out-parameter; when non-null it is set to whether the
///              access is also considered fast.
/// \returns false if the subtarget requires strict alignment, true otherwise.
/// AddrSpace and Flags are not referenced by the visible lines.
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Subtarget->requiresStrictAlign())
return false;
if (Fast) {
// Some CPUs are fine with unaligned stores except for 128-bit ones.
*Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
// See comments in performSTORECombine() for more details about
// these conditions.
// Code that uses clang vector extensions can mark that it
// wants unaligned accesses to be treated as fast by
// underspecifying alignment to be 1 or 2.
Align <= 2 ||
// Disregard v2i64. Memcpy lowering produces those and splitting
// them regresses performance on micro-benchmarks and olden/bh.
VT == MVT::v2i64;
return true;
// Same as above but handling LLTs instead.
/// Report whether misaligned memory accesses of low-level type \p Ty are
/// allowed.
///
/// \param Ty        the accessed low-level type.
/// \param Alignment the alignment of the access.
/// \param Fast      optional out-parameter; when non-null it is set to whether
///                  the access is also considered fast.
/// \returns false if the subtarget requires strict alignment, true otherwise.
/// AddrSpace and Flags are not referenced by the visible lines.
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Subtarget->requiresStrictAlign())
return false;
if (Fast) {
// Some CPUs are fine with unaligned stores except for 128-bit ones.
*Fast = !Subtarget->isMisaligned128StoreSlow() ||
Ty.getSizeInBytes() != 16 ||
// See comments in performSTORECombine() for more details about
// these conditions.
// Code that uses clang vector extensions can mark that it
// wants unaligned accesses to be treated as fast by
// underspecifying alignment to be 1 or 2.
Alignment <= 2 ||
// Disregard v2i64. Memcpy lowering produces those and splitting
// them regresses performance on micro-benchmarks and olden/bh.
Ty == LLT::vector(2, 64);
return true;
/// Create a FastISel instance by forwarding both arguments to
/// AArch64::createFastISel.
FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
return AArch64::createFastISel(funcInfo, libInfo);
/// Return the name of the AArch64ISD node with the given opcode, or nullptr
/// when no case matches.
///
/// MAKE_CASE(V) expands to a `case` that returns the stringified enumerator.
/// NOTE(review): the list of MAKE_CASE(...) entries inside the switch is
/// missing from this excerpt.
const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
#define MAKE_CASE(V) \
case V: \
return #V;
switch ((AArch64ISD::NodeType)Opcode) {
#undef MAKE_CASE
return nullptr;
/// Expand the F128CSEL pseudo-instruction into explicit control flow and a
/// PHI node.
///
/// \param MI  the pseudo; operand 0 is the destination register, operands 1
///            and 2 are the true/false source registers, operand 3 is the
///            condition code immediate and operand 4 is checked for a kill
///            flag.
/// \param MBB the block containing \p MI.
/// \returns the newly created EndBB.
///
/// NOTE(review): several lines (successor setup, the PHI operands, closing
/// braces) are missing from this excerpt.
MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// We materialise the F128CSEL pseudo-instruction as some control flow and a
// phi node:
// OrigBB:
//     [... previous instrs leading to comparison ...]
//     b.<cond> TrueBB
//     b EndBB
// TrueBB:
//     ; Fallthrough
// EndBB:
//     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
DebugLoc DL = MI.getDebugLoc();
// Insertion point for the new blocks: immediately after MBB.
MachineFunction::iterator It = ++MBB->getIterator();
Register DestReg = MI.getOperand(0).getReg();
Register IfTrueReg = MI.getOperand(1).getReg();
Register IfFalseReg = MI.getOperand(2).getReg();
unsigned CondCode = MI.getOperand(3).getImm();
bool NZCVKilled = MI.getOperand(4).isKill();
MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
MF->insert(It, TrueBB);
MF->insert(It, EndBB);
// Transfer rest of current basic-block to EndBB
// NOTE(review): the final argument(s) of this splice call are missing from
// this excerpt.
EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
// Conditional branch to TrueBB followed by an unconditional branch to EndBB.
BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
// TrueBB falls through to the end.
// NOTE(review): the body of this `if` and the operands of the PHI below are
// missing from this excerpt.
if (!NZCVKilled) {
BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
return EndBB;
/// Custom inserter for CATCHRET: returns \p BB unchanged after an assertion
/// on the function's personality ("SEH does not use catchret!").
/// NOTE(review): the opening of the assert expression is missing from this
/// excerpt.
MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
MachineInstr &MI, MachineBasicBlock *BB) const {
BB->getParent()->getFunction().getPersonalityFn())) &&
"SEH does not use catchret!");
return BB;
/// Dispatch a pseudo-instruction that needs custom insertion to its expander.
///
/// F128CSEL goes to EmitF128CSEL, STACKMAP and PATCHPOINT go to
/// emitPatchPoint, and CATCHRET goes to EmitLoweredCatchRet. Any other opcode
/// reaches the llvm_unreachable below.
/// NOTE(review): the `default:` label and the matching `#endif` appear to be
/// missing from this excerpt.
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
#ifndef NDEBUG
llvm_unreachable("Unexpected instruction for custom inserter!");
case AArch64::F128CSEL:
return EmitF128CSEL(MI, BB);
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
case AArch64::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
// AArch64 Lowering private implementation.
// Lowering Code
/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC
///
/// Visible mappings: SETNE->NE, SETEQ->EQ, SETGT->GT, SETGE->GE, SETLT->LT,
/// SETLE->LE. Unknown codes hit llvm_unreachable.
/// NOTE(review): the `default:` label and the `case` labels for the HI, HS, LO
/// and LS returns are missing from this excerpt (presumably the unsigned
/// comparisons); confirm against upstream.
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
switch (CC) {
llvm_unreachable("Unknown condition code!");
case ISD::SETNE:
return AArch64CC::NE;
case ISD::SETEQ:
return AArch64CC::EQ;
case ISD::SETGT:
return AArch64CC::GT;
case ISD::SETGE:
return AArch64CC::GE;
case ISD::SETLT:
return AArch64CC::LT;
case ISD::SETLE:
return AArch64CC::LE;
return AArch64CC::HI;
return AArch64CC::HS;
return AArch64CC::LO;
return AArch64CC::LS;
/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
///
/// \param CC        the DAG floating-point condition code.
/// \param CondCode  receives the primary AArch64 condition code.
/// \param CondCode2 receives a second condition code for conditions that need
///                  two checks; it is initialised to AL and stays AL when a
///                  single check suffices.
///
/// NOTE(review): many `case` labels, the `default:` label and every `break;`
/// are missing from this excerpt, so several assignments below have no visible
/// label; confirm the mapping against upstream.
static void changeFPCCToAArch64CC(ISD::CondCode CC,
AArch64CC::CondCode &CondCode,
AArch64CC::CondCode &CondCode2) {
CondCode2 = AArch64CC::AL;
switch (CC) {
llvm_unreachable("Unknown FP condition!");
case ISD::SETEQ:
CondCode = AArch64CC::EQ;
case ISD::SETGT:
CondCode = AArch64CC::GT;
case ISD::SETGE:
CondCode = AArch64CC::GE;
CondCode = AArch64CC::MI;
CondCode = AArch64CC::LS;
CondCode = AArch64CC::MI;
CondCode2 = AArch64CC::GT;
case ISD::SETO:
CondCode = AArch64CC::VC;
case ISD::SETUO:
CondCode = AArch64CC::VS;
CondCode = AArch64CC::EQ;
CondCode2 = AArch64CC::VS;
CondCode = AArch64CC::HI;
CondCode = AArch64CC::PL;
case ISD::SETLT:
CondCode = AArch64CC::LT;
case ISD::SETLE:
CondCode = AArch64CC::LE;
case ISD::SETNE:
CondCode = AArch64CC::NE;
/// Convert a DAG fp condition code to an AArch64 CC.
/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
/// should be AND'ed instead of OR'ed.
///
/// \param CC        the DAG floating-point condition code.
/// \param CondCode  receives the primary AArch64 condition code.
/// \param CondCode2 receives the second condition code; initialised to AL.
///
/// Conditions handled by the forwarding call are asserted to need only one
/// condition code. Two conditions get a dedicated AND-able pair, as derived in
/// the comments below.
/// NOTE(review): the `default:` and `case` labels are missing from this
/// excerpt.
static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
AArch64CC::CondCode &CondCode,
AArch64CC::CondCode &CondCode2) {
CondCode2 = AArch64CC::AL;
switch (CC) {
changeFPCCToAArch64CC(CC, CondCode, CondCode2);
assert(CondCode2 == AArch64CC::AL);
// (a one b)
// == ((a olt b) || (a ogt b))
// == ((a ord b) && (a une b))
CondCode = AArch64CC::VC;
CondCode2 = AArch64CC::NE;
// (a ueq b)
// == ((a uno b) || (a oeq b))
// == ((a ule b) && (a uge b))
CondCode = AArch64CC::PL;
CondCode2 = AArch64CC::LE;
/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
/// to get the same effect.
///
/// \param CC        DAG floating-point condition code to translate.
/// \param CondCode  Receives the primary AArch64 condition code.
/// \param CondCode2 Receives the secondary condition code (written directly
///                  here or by changeFPCCToAArch64CC).
/// \param Invert    Reset to false on entry; set to true on the paths that
///                  express the predicate through its inverse.
///
/// NOTE(review): as listed, this switch has no `default:` label, no `break`s
/// and no case labels ahead of the final inversion sequence. Those lines look
/// like they were dropped from this copy -- confirm against upstream before
/// relying on the control flow shown here.
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
AArch64CC::CondCode &CondCode,
AArch64CC::CondCode &CondCode2,
bool &Invert) {
Invert = false;
switch (CC) {
// Mostly the scalar mappings work fine.
changeFPCCToAArch64CC(CC, CondCode, CondCode2);
case ISD::SETUO:
Invert = true;
case ISD::SETO:
CondCode = AArch64CC::MI;
CondCode2 = AArch64CC::GE;
// All of the compare-mask comparisons are ordered, but we can switch
// between the two by a double inversion. E.g. ULE == !OGT.
Invert = true;
changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
CondCode, CondCode2);
/// Returns true if \p C is usable as an arithmetic immediate: either it fits
/// in the low 12 bits, or its low 12 bits are zero and the value fits in 24
/// bits. The verdict is also written to the debug stream.
static bool isLegalArithImmed(uint64_t C) {
// Matches AArch64DAGToDAGISel::SelectArithImmed().
bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
LLVM_DEBUG(dbgs() << "Is imm " << C
<< " legal: " << (IsLegal ? "yes\n" : "no\n"));
return IsLegal;
// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
// can be set differently by this operation. It comes down to whether
// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
// everything is fine. If not then the optimization is wrong. Thus general
// comparisons are only valid if op2 != 0.
// So, finally, the only LLVM-native comparisons that don't mention C and V
// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
// the absence of information about op2.
//
// The visible part of the check requires Op to be a SUB whose first operand
// is the constant zero.
// NOTE(review): the return expression ends in a dangling `&&` in this copy.
// Given the comment above, the missing operand presumably restricts CC to
// SETEQ/SETNE -- confirm against upstream.
static bool isCMN(SDValue Op, ISD::CondCode CC) {
return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
/// Emit a chained floating-point comparison of \p LHS and \p RHS.
///
/// The node is built with result types {VT, MVT::Other} and operands
/// {Chain, LHS, RHS}. f128 and f16 operands are rejected by assertion.
///
/// NOTE(review): the initializer of `Opcode` is missing from this copy.
/// Presumably it selects the opcode based on \p IsSignaling -- confirm
/// against upstream.
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
SelectionDAG &DAG, SDValue Chain,
bool IsSignaling) {
EVT VT = LHS.getValueType();
assert(VT != MVT::f128);
assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
unsigned Opcode =
return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
/// Emit the flag-setting comparison node for (\p LHS \p CC \p RHS).
///
/// - Floating point: an FCMP node; f16 operands are first extended to f32
///   when the subtarget lacks full FP16 support.
/// - Integer: SUBS by default; ADDS (CMN) when isCMN() accepts one of the
///   operands; for a compare against zero with a non-unsigned \p CC, an AND
///   on the left is turned into ANDS and an existing ANDS is reused.
///
/// NOTE(review): closing braces, the trailing operands of the ANDS getNode
/// call and the tail of the final return statement are missing from this
/// copy.
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) {
EVT VT = LHS.getValueType();
const bool FullFP16 =
static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
if (VT.isFloatingPoint()) {
assert(VT != MVT::f128);
// Without full FP16 support the comparison is performed on f32.
if (VT == MVT::f16 && !FullFP16) {
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
VT = MVT::f32;
return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
// The CMP instruction is just an alias for SUBS, and representing it as
// SUBS means that it's possible to get CSE with subtract operations.
// A later phase can perform the optimization of setting the destination
// register to WZR/XZR if it ends up being unused.
unsigned Opcode = AArch64ISD::SUBS;
if (isCMN(RHS, CC)) {
// Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction ?
Opcode = AArch64ISD::ADDS;
RHS = RHS.getOperand(1);
} else if (isCMN(LHS, CC)) {
// As we are looking for EQ/NE compares, the operands can be commuted ; can
// we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
Opcode = AArch64ISD::ADDS;
LHS = LHS.getOperand(1);
} else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
if (LHS.getOpcode() == ISD::AND) {
// Similarly, (CMP (and X, Y), 0) can be implemented with a TST
// (a.k.a. ANDS) except that the flags are only guaranteed to work for one
// of the signed comparisons.
const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
DAG.getVTList(VT, MVT_CC),
// Replace all users of (and X, Y) with newly generated (ands X, Y)
DAG.ReplaceAllUsesWith(LHS, ANDSNode);
// Result 1 of the ANDS node is the flags value.
return ANDSNode.getValue(1);
} else if (LHS.getOpcode() == AArch64ISD::ANDS) {
// Use result of ANDS
return LHS.getValue(1);
return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
/// \defgroup AArch64CCMP CMP;CCMP matching
/// These functions deal with the formation of CMP;CCMP;... sequences.
/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows to express arbitrary conjunctions, for
/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
/// expressed as:
/// cmp A
/// ccmp B, inv(CB), CA
/// check for CB flags
/// This naturally lets us implement chains of AND operations with SETCC
/// operands. And we can even implement some other situations by transforming
/// them:
/// - We can implement (NEG SETCC) i.e. negating a single comparison by
/// negating the flags used in a CCMP/FCCMP operations.
/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
/// by negating the flags we test for afterwards. i.e.
/// NEG (CMP CCMP CCMP ...) can be implemented.
/// - Note that we can only ever negate all previously processed results.
/// What we can not implement by flipping the flags to test is a negation
/// of two sub-trees (because the negation affects all sub-trees emitted so
/// far, so the 2nd sub-tree we emit would also affect the first).
/// With those tools we can implement some OR operations:
/// - (OR (SETCC A) (SETCC B)) can be implemented via:
/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
/// - After transforming OR to NEG/AND combinations we may be able to use NEG
/// elimination rules from earlier to implement the whole thing as a
/// CCMP/FCCMP chain.
/// As complete example:
/// or (or (setCA (cmp A)) (setCB (cmp B)))
/// (and (setCC (cmp C)) (setCD (cmp D)))"
/// can be reassociated to:
/// or (and (setCC (cmp C)) setCD (cmp D))
/// (or (setCA (cmp A)) (setCB (cmp B)))
/// can be transformed to:
/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
/// which can be implemented as:
/// cmp C
/// ccmp D, inv(CD), CC
/// ccmp A, CA, inv(CD)
/// ccmp B, CB, inv(CA)
/// check for CB flags
/// A counterexample is "or (and A B) (and C D)" which translates to
/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
/// can only implement 1 of the inner (not) operations, but not both!
/// @{
/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
///
/// \param LHS,RHS   Operands to compare (f16 is extended to f32 when the
///                  subtarget lacks full FP16).
/// \param CC        DAG condition code; only consulted to decide whether a
///                  (sub 0, x) right-hand side may be folded into CCMN.
/// \param CCOp      Flags value from the preceding comparison; passed as the
///                  last operand of the new node.
/// \param Predicate Condition encoded as the node's condition operand.
/// \param OutCC     Condition that will be tested on the result. The NZCV
///                  immediate is chosen to satisfy the inverse of \p OutCC.
///
/// NOTE(review): closing braces of the nested `if` blocks are missing from
/// this copy.
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
ISD::CondCode CC, SDValue CCOp,
AArch64CC::CondCode Predicate,
AArch64CC::CondCode OutCC,
const SDLoc &DL, SelectionDAG &DAG) {
// 0 means "not chosen yet"; it falls back to CCMP below.
unsigned Opcode = 0;
const bool FullFP16 =
static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
if (LHS.getValueType().isFloatingPoint()) {
assert(LHS.getValueType() != MVT::f128);
if (LHS.getValueType() == MVT::f16 && !FullFP16) {
LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
Opcode = AArch64ISD::FCCMP;
} else if (RHS.getOpcode() == ISD::SUB) {
SDValue SubOp0 = RHS.getOperand(0);
if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
// See emitComparison() on why we can only do this for SETEQ and SETNE.
Opcode = AArch64ISD::CCMN;
RHS = RHS.getOperand(1);
if (Opcode == 0)
Opcode = AArch64ISD::CCMP;
SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
// The NZCV immediate is computed from the inverse of OutCC.
AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
/// expressed as a conjunction. See \ref AArch64CCMP.
/// \param CanNegate Set to true if we can negate the whole sub-tree just by
/// changing the conditions on the SETCC tests.
/// (this means we can call emitConjunctionRec() with
/// Negate==true on this sub-tree)
/// \param MustBeFirst Set to true if this subtree needs to be negated and we
/// cannot do the negation naturally. We are required to
/// emit the subtree first in this case.
/// \param WillNegate Is true if we are called when the result of this
/// subexpression must be negated. This happens when the
/// outer expression is an OR. We can use this fact to know
/// that we have a double negation (or (or ...) ...) that
/// can be implemented for free.
/// \param Depth Current recursion depth; trees nested deeper than 6 levels
/// are rejected.
///
/// Nodes with more than one use, and SETCC nodes on f128 operands, are
/// rejected.
/// NOTE(review): closing braces are missing from this copy, so the nesting of
/// the trailing `return true;` / `return false;` is not visible here.
static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
bool &MustBeFirst, bool WillNegate,
unsigned Depth = 0) {
if (!Val.hasOneUse())
return false;
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
if (Val->getOperand(0).getValueType() == MVT::f128)
return false;
CanNegate = true;
MustBeFirst = false;
return true;
// Protect against exponential runtime and stack overflow.
if (Depth > 6)
return false;
if (Opcode == ISD::AND || Opcode == ISD::OR) {
bool IsOR = Opcode == ISD::OR;
SDValue O0 = Val->getOperand(0);
SDValue O1 = Val->getOperand(1);
bool CanNegateL;
bool MustBeFirstL;
if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
return false;
bool CanNegateR;
bool MustBeFirstR;
if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
return false;
// Only one sub-tree can be emitted first.
if (MustBeFirstL && MustBeFirstR)
return false;
if (IsOR) {
// For an OR expression we need to be able to naturally negate at least
// one side or we cannot do the transformation at all.
if (!CanNegateL && !CanNegateR)
return false;
// If the result of the OR will be negated and we can naturally negate
// the leaves, then this sub-tree as a whole negates naturally.
CanNegate = WillNegate && CanNegateL && CanNegateR;
// If we cannot naturally negate the whole sub-tree, then this must be
// emitted first.
MustBeFirst = !CanNegate;
} else {
assert(Opcode == ISD::AND && "Must be OR or AND");
// We cannot naturally negate an AND operation.
CanNegate = false;
MustBeFirst = MustBeFirstL || MustBeFirstR;
return true;
return false;
/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series compare
/// and conditional compare operations. @returns an NZCV flags producing node
/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
/// transformation was not possible.
/// \p Negate is true if we want this sub-tree being negated just by changing
/// SETCC conditions.
/// \p CCOp is the flags value of the comparison emitted so far (a null
/// SDValue when this is the first comparison in the chain) and \p Predicate
/// is the condition passed on to emitConditionalComparison() together with it.
///
/// NOTE(review): several structural lines are missing from this copy: closing
/// braces, the tail of the final emitConditionalComparison() call in the
/// SETCC branch, and apparently an `else` between the two ExtraCmp
/// assignments and the `NegateL`/`NegateAfterAll` lines that follow the inner
/// if/else of the OR branch. Confirm against upstream.
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
AArch64CC::CondCode Predicate) {
// We're at a tree leaf, produce a conditional comparison operation.
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
SDValue LHS = Val->getOperand(0);
SDValue RHS = Val->getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
bool isInteger = LHS.getValueType().isInteger();
// Negation of a leaf is done by inverting its condition code.
if (Negate)
CC = getSetCCInverse(CC, LHS.getValueType());
SDLoc DL(Val);
// Determine OutCC and handle FP special case.
if (isInteger) {
OutCC = changeIntCCToAArch64CC(CC);
} else {
AArch64CC::CondCode ExtraCC;
changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
// Some floating point conditions can't be tested with a single condition
// code. Construct an additional comparison in this case.
if (ExtraCC != AArch64CC::AL) {
SDValue ExtraCmp;
if (!CCOp.getNode())
ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
ExtraCC, DL, DAG);
// The extra comparison becomes the input of the comparison emitted below.
CCOp = ExtraCmp;
Predicate = ExtraCC;
// Produce a normal comparison if we are first in the chain
if (!CCOp)
return emitComparison(LHS, RHS, CC, DL, DAG);
// Otherwise produce a ccmp.
return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
bool IsOR = Opcode == ISD::OR;
SDValue LHS = Val->getOperand(0);
bool CanNegateL;
bool MustBeFirstL;
bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
assert(ValidL && "Valid conjunction/disjunction tree");
SDValue RHS = Val->getOperand(1);
bool CanNegateR;
bool MustBeFirstR;
bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
assert(ValidR && "Valid conjunction/disjunction tree");
// Swap sub-tree that must come first to the right side.
if (MustBeFirstL) {
assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
std::swap(LHS, RHS);
std::swap(CanNegateL, CanNegateR);
std::swap(MustBeFirstL, MustBeFirstR);
bool NegateR;
bool NegateAfterR;
bool NegateL;
bool NegateAfterAll;
if (Opcode == ISD::OR) {
// Swap the sub-tree that we can negate naturally to the left.
if (!CanNegateL) {
assert(CanNegateR && "at least one side must be negatable");
assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
std::swap(LHS, RHS);
NegateR = false;
NegateAfterR = true;
} else {
// Negate the right sub-tree if possible, otherwise negate the result.
NegateR = CanNegateR;
NegateAfterR = !CanNegateR;
NegateL = true;
NegateAfterAll = !Negate;
} else {
assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
assert(!Negate && "Valid conjunction/disjunction tree");
NegateL = false;
NegateR = false;
NegateAfterR = false;
NegateAfterAll = false;
// Emit sub-trees.
// The right sub-tree is emitted first; its flags and (possibly inverted)
// condition feed the emission of the left sub-tree.
AArch64CC::CondCode RHSCC;
SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
if (NegateAfterR)
RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
if (NegateAfterAll)
OutCC = AArch64CC::getInvertedCondCode(OutCC);
return CmpL;
/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
/// In some cases this is even possible with OR operations in the expression.
/// See \ref AArch64CCMP.
/// \see emitConjunctionRec().
///
/// \param Val   Root of the AND/OR/SETCC tree.
/// \param OutCC Receives the condition to test on the returned flags.
/// \returns the flags-producing node, or a null SDValue when
///          canEmitConjunction() rejects \p Val.
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
AArch64CC::CondCode &OutCC) {
// Only the yes/no answer matters here; the two out-flags are discarded.
bool DummyCanNegate;
bool DummyMustBeFirst;
if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
return SDValue();
// Start the chain with no previous comparison and no negation.
return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
/// @}
/// Returns how profitable it is to fold a comparison's operand's shift and/or
/// extension operations.
///
/// Scores visible in the code:
///  - 0 when \p Op has more than one use (also the final fallback value);
///  - 1 when \p Op itself is a supported extend;
///  - 2 for a constant shift (<= 4) of a supported extend, 1 for a larger one;
///  - 1 for a constant shift that stays within the bit width of an i32/i64.
///
/// NOTE(review): closing braces (including the end of the lambda) are missing
/// from this copy.
static unsigned getCmpOperandFoldingProfit(SDValue Op) {
// A "supported extend" is SIGN_EXTEND_INREG, or an AND with a constant mask
// of 0xFF, 0xFFFF or 0xFFFFFFFF.
auto isSupportedExtend = [&](SDValue V) {
if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
return true;
if (V.getOpcode() == ISD::AND)
if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
uint64_t Mask = MaskCst->getZExtValue();
return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
return false;
if (!Op.hasOneUse())
return 0;
if (isSupportedExtend(Op))
return 1;
unsigned Opc = Op.getOpcode();
if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
uint64_t Shift = ShiftCst->getZExtValue();
if (isSupportedExtend(Op.getOperand(0)))
return (Shift <= 4) ? 2 : 1;
EVT VT = Op.getValueType();
if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
return 1;
return 0;
/// Build the AArch64 comparison for (\p LHS \p CC \p RHS).
///
/// Steps visible in the code:
///  1. If \p RHS is a constant that is not a legal arithmetic immediate, try
///     the neighbouring constant (C - 1 or C + 1), guarding against
///     wrap-around.
///  2. Swap the operands when folding a shift/extend of the left operand is
///     more profitable than folding one of the right operand.
///  3. For EQ/NE against a constant, try a sign-extended 16-bit load compare
///     and, for constants 0 and 1, a CCMP conjunction.
///  4. Otherwise emit a plain comparison.
///
/// \param AArch64cc Receives the AArch64 condition code constant to test.
/// \returns the flags-producing comparison node.
///
/// NOTE(review): many lines are missing from this copy: closing braces,
/// `break`s, the case labels in front of the second and fourth adjustment
/// (presumably the unsigned predicates), the matching update of CC after
/// each adjustment, and parts of the SIGN_EXTEND_INREG / getConstant calls.
/// Confirm against upstream.
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &AArch64cc, SelectionDAG &DAG,
const SDLoc &dl) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
EVT VT = RHS.getValueType();
uint64_t C = RHSC->getZExtValue();
if (!isLegalArithImmed(C)) {
// Constant does not fit, try adjusting it by one?
switch (CC) {
case ISD::SETLT:
case ISD::SETGE:
// NOTE(review): the i64 guard compares against 0x80000000ULL, not the i64
// minimum -- verify whether that is intended.
if ((VT == MVT::i32 && C != 0x80000000 &&
isLegalArithImmed((uint32_t)(C - 1))) ||
(VT == MVT::i64 && C != 0x80000000ULL &&
isLegalArithImmed(C - 1ULL))) {
C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
RHS = DAG.getConstant(C, dl, VT);
if ((VT == MVT::i32 && C != 0 &&
isLegalArithImmed((uint32_t)(C - 1))) ||
(VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
RHS = DAG.getConstant(C, dl, VT);
case ISD::SETLE:
case ISD::SETGT:
if ((VT == MVT::i32 && C != INT32_MAX &&
isLegalArithImmed((uint32_t)(C + 1))) ||
(VT == MVT::i64 && C != INT64_MAX &&
isLegalArithImmed(C + 1ULL))) {
C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
RHS = DAG.getConstant(C, dl, VT);
if ((VT == MVT::i32 && C != UINT32_MAX &&
isLegalArithImmed((uint32_t)(C + 1))) ||
(VT == MVT::i64 && C != UINT64_MAX &&
isLegalArithImmed(C + 1ULL))) {
C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
RHS = DAG.getConstant(C, dl, VT);
// Comparisons are canonicalized so that the RHS operand is simpler than the
// LHS one, the extreme case being when RHS is an immediate. However, AArch64
// can fold some shift+extend operations on the RHS operand, so swap the
// operands if that can be done.
// For example:
// lsl w13, w11, #1
// cmp w13, w12
// can be turned into:
// cmp w12, w11, lsl #1
if (!isa<ConstantSDNode>(RHS) ||
!isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
std::swap(LHS, RHS);
CC = ISD::getSetCCSwappedOperands(CC);
SDValue Cmp;
AArch64CC::CondCode AArch64CC;
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
// The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
// For the i8 operand, the largest immediate is 255, so this can be easily
// encoded in the compare instruction. For the i16 operand, however, the
// largest immediate cannot be encoded in the compare.
// Therefore, use a sign extending load and cmn to avoid materializing the
// -1 constant. For example,
// movz w1, #65535
// ldrh w0, [x0, #0]
// cmp w0, w1
// >
// ldrsh w0, [x0, #0]
// cmn w0, #1
// Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
// if and only if (sext LHS) == (sext RHS). The checks are in place to
// ensure both the LHS and RHS are truly zero extended and to make sure the
// transformation is profitable.
if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
LHS.getNode()->hasNUsesOfValue(1, 0)) {
// The constant is reinterpreted as a signed 16-bit value.
int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
SDValue SExt =
DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
CC, dl, DAG);
AArch64CC = changeIntCCToAArch64CC(CC);
// Comparing a boolean tree against 0 or 1: try a CCMP chain.
if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
// Fallback: plain comparison.
if (!Cmp) {
Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC = changeIntCCToAArch64CC(CC);
AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
return Cmp;
/// Lower an overflow-checking arithmetic node (\p Op, of type i32 or i64) to
/// AArch64 nodes.
///
/// \param CC Receives the AArch64 condition code selected for the operation
///           (VS / HS / LO for add and sub variants, NE for multiplies).
/// \returns a pair (Value, Overflow): the arithmetic result and the value
///          assigned to Overflow for it.
///
/// Add/sub variants only select an opcode (ADDS/SUBS) and are emitted by the
/// `if (Opc)` block at the end; multiplies build their nodes inline.
///
/// NOTE(review): `default:`, `break`s, closing braces and the tails of
/// several getNode calls (the SUBS nodes in the multiply paths) are missing
/// from this copy. Confirm against upstream.
static std::pair<SDValue, SDValue>
getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
"Unsupported value type");
SDValue Value, Overflow;
SDLoc DL(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
// Stays 0 for the multiply cases, which build Value/Overflow themselves.
unsigned Opc = 0;
switch (Op.getOpcode()) {
llvm_unreachable("Unknown overflow instruction!");
case ISD::SADDO:
Opc = AArch64ISD::ADDS;
CC = AArch64CC::VS;
case ISD::UADDO:
Opc = AArch64ISD::ADDS;
CC = AArch64CC::HS;
case ISD::SSUBO:
Opc = AArch64ISD::SUBS;
CC = AArch64CC::VS;
case ISD::USUBO:
Opc = AArch64ISD::SUBS;
CC = AArch64CC::LO;
// Multiply needs a little bit extra work.
case ISD::SMULO:
case ISD::UMULO: {
CC = AArch64CC::NE;
bool IsSigned = Op.getOpcode() == ISD::SMULO;
if (Op.getValueType() == MVT::i32) {
unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
// For a 32 bit multiply with overflow check we want the instruction
// selector to generate a widening multiply (SMADDL/UMADDL). For that we
// need to generate the following pattern:
// (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
DAG.getConstant(0, DL, MVT::i64));
// On AArch64 the upper 32 bits are always zero extended for a 32 bit
// operation. We need to clear out the upper 32 bits, because we used a
// widening multiply that wrote all 64 bits. In the end this should be a
// noop.
Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
if (IsSigned) {
// The signed overflow check requires more than just a simple check for
// any bit set in the upper 32 bits of the result. These bits could be
// just the sign bits of a negative number. To perform the overflow
// check we have to arithmetic shift right the 32nd bit of the result by
// 31 bits. Then we compare the result to the upper 32 bits.
SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
DAG.getConstant(32, DL, MVT::i64));
UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
DAG.getConstant(31, DL, MVT::i64));
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
} else {
// The overflow check for unsigned multiply is easy. We only need to
// check if any of the upper 32 bits are set. This can be done with a
// CMP (shifted register). For that we need to generate the following
// pattern:
// (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
DAG.getConstant(32, DL, MVT::i64));
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
// For the 64 bit multiply
Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
if (IsSigned) {
// Compare the high half of the product (MULHS) with the low half's sign
// bits (Value shifted right arithmetically by 63).
SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
DAG.getConstant(63, DL, MVT::i64));
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
} else {
// Unsigned: SUBS with a zero constant and the high half (MULHU).
SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
} // switch (...)
if (Opc) {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
// Emit the AArch64 operation with overflow check.
Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
// Result 1 of the flag-setting node carries the flags.
Overflow = Value.getValue(1);
return std::make_pair(Value, Overflow);
/// Lower \p Op to a call of the runtime library routine \p Call.
///
/// For strict FP opcodes operand 0 is the chain: it is excluded from the
/// call arguments, threaded through makeLibCall, and returned merged with the
/// result. For non-strict opcodes only the call result is returned.
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
bool IsStrict = Op->isStrictFPOpcode();
// Skip the chain operand when collecting the call arguments.
unsigned Offset = IsStrict ? 1 : 0;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
MakeLibCallOptions CallOptions;
SDValue Result;
SDLoc dl(Op);
std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops,
CallOptions, dl, Chain);
return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
/// Custom lowering for XOR nodes. Two patterns are handled:
///  - (xor overflow_bit, 1): folded into a CSEL on the inverted overflow
///    condition;
///  - (xor x, (select_cc a, b, cc, 0, -1)): folded into a CSEL between x and
///    (xor x, -1).
/// In all other cases \p Op is returned unchanged. A null SDValue is returned
/// when the overflow operation's type is not legal.
///
/// NOTE(review): closing braces are missing from this copy, so the extent of
/// each `if` block is not visible here.
static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
SDValue Sel = Op.getOperand(0);
SDValue Other = Op.getOperand(1);
SDLoc dl(Sel);
// If the operand is an overflow checking operation, invert the condition
// code and kill the Not operation. I.e., transform:
// (xor (overflow_op_bool, 1))
// -->
// (csel 1, 0, invert(cc), overflow_op_bool)
// ... which later gets transformed to just a cset instruction with an
// inverted condition code, rather than a cset + eor sequence.
if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
return SDValue();
SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
AArch64CC::CondCode CC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
CCVal, Overflow);
// If neither operand is a SELECT_CC, give up.
if (Sel.getOpcode() != ISD::SELECT_CC)
std::swap(Sel, Other);
if (Sel.getOpcode() != ISD::SELECT_CC)
return Op;
// The folding we want to perform is:
// (xor x, (select_cc a, b, cc, 0, -1) )
// -->
// (csel x, (xor x, -1), cc ...)
// The latter will get matched to a CSINV instruction.
ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
SDValue LHS = Sel.getOperand(0);
SDValue RHS = Sel.getOperand(1);
SDValue TVal = Sel.getOperand(2);
SDValue FVal = Sel.getOperand(3);
// FIXME: This could be generalized to non-integer comparisons.
if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
return Op;
ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
// The values aren't constants, this isn't the pattern we're looking for.
if (!CFVal || !CTVal)
return Op;
// We can commute the SELECT_CC by inverting the condition. This
// might be needed to make this fit into a CSINV pattern.
if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
// If the constants line up, perform the transform!
if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
// Select between x (Other) and (xor x, -1).
FVal = Other;
TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
DAG.getConstant(-1ULL, dl, Other.getValueType()));
return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
CCVal, Cmp);
return Op;
/// Lower ADDC/SUBC/ADDE/SUBE to the flag-setting AArch64 nodes
/// ADDS/SUBS/ADCS/SBCS. The produced node has result types (VT, i32).
/// ADDE and SUBE set ExtraOp, which selects the second getNode form below.
/// Returns a null SDValue when the value type is not legal yet.
///
/// NOTE(review): `default:`, `break`s, closing braces and the last operand of
/// the final getNode call are missing from this copy.
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
unsigned Opc;
bool ExtraOp = false;
switch (Op.getOpcode()) {
llvm_unreachable("Invalid code");
case ISD::ADDC:
Opc = AArch64ISD::ADDS;
case ISD::SUBC:
Opc = AArch64ISD::SUBS;
case ISD::ADDE:
Opc = AArch64ISD::ADCS;
ExtraOp = true;
case ISD::SUBE:
Opc = AArch64ISD::SBCS;
ExtraOp = true;
if (!ExtraOp)
return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
/// Lower an overflow-checking arithmetic node to a MERGE_VALUES of the
/// arithmetic result and an i32 overflow value (0 or 1) produced by a CSEL on
/// the flags from getAArch64XALUOOp(). Returns a null SDValue when the value
/// type is not legal yet.
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
return SDValue();
SDLoc dl(Op);
AArch64CC::CondCode CC;
// The actual operation that sets the overflow or carry flag.
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
// We use 0 and 1 as false and true values.
SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
// We use an inverted condition, because the conditional select is inverted
// too. This will allow it to be selected to a single instruction:
// CSINC Wd, WZR, WZR, invert(cond).
SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
// Note the operand order: FVal first, TVal second, matching the inverted
// condition.
Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
CCVal, Overflow);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
// Prefetch operands are:
// 1: Address to prefetch
// 2: bool isWrite
// 3: int locality (0 = no locality ... 3 = extreme locality)
// 4: bool isDataCache
//
// Lowers the prefetch to an AArch64ISD::PREFETCH node whose operands are
// operand 0 of Op, the encoded prefetch-operation constant, and the address.
// NOTE(review): the closing brace of the `if (Locality)` block is missing
// from this copy.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
// Locality 0 is encoded as a streaming prefetch.
bool IsStream = !Locality;
// When the locality number is set
if (Locality) {
// The front-end should have filtered out the out-of-range values
assert(Locality <= 3 && "Prefetch locality out-of-range");
// The locality degree is the opposite of the cache speed.
// Put the number the other way around.
// The encoding starts at 0 for level 1
Locality = 3 - Locality;
// Build the mask value encoding the expected behavior.
unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
(!IsData << 3) | // IsDataCache bit
(Locality << 1) | // Cache level bits
(unsigned)IsStream; // Stream bit
return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
/// Lower an FP extension whose result type is f128 to the runtime library
/// call selected by RTLIB::getFPEXT for the source and result types.
SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
RTLIB::Libcall LC;
LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
return LowerF128Call(Op, DAG, LC);
/// Lower an FP rounding node (strict or non-strict).
///
/// Non-f128 sources are returned unchanged, except that a null SDValue is
/// returned for source types for which useSVEForFixedLengthVectorVT() is
/// true. f128 sources become a runtime library call; for strict nodes the
/// chain is threaded through and merged with the result.
///
/// NOTE(review): the closing brace of the `if (SrcVT != MVT::f128)` block is
/// missing from this copy.
SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
// Strict nodes carry the chain as operand 0, so the value is operand 1.
SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
EVT SrcVT = SrcVal.getValueType();
if (SrcVT != MVT::f128) {
// Expand cases where the input is a vector bigger than NEON.
if (useSVEForFixedLengthVectorVT(SrcVT))
return SDValue();
// It's legal except when f128 is involved
return Op;
RTLIB::Libcall LC;
LC = RTLIB::getFPROUND(SrcVT, Op.getValueType());
// FP_ROUND node has a second operand indicating whether it is known to be
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
MakeLibCallOptions CallOptions;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SDValue Result;
SDLoc dl(Op);
std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
CallOptions, dl, Chain);
return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
/// Lower a vector FP-to-integer conversion.
///
/// - f16 element inputs are extended to f32 first when the subtarget lacks
///   full FP16 support.
/// - When the result is narrower than the input, the conversion is done at
///   the input's integer-equivalent type and then truncated.
/// - When the result is wider than the input, the input is FP-extended first.
/// - Otherwise \p Op is returned unchanged.
///
/// NOTE(review): closing braces, the tail of the `Cv` getNode call and the
/// definition of `ExtVT` are missing from this copy.
SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
SelectionDAG &DAG) const {
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
// Any additional optimization in this function should be recorded
// in the cost tables.
EVT InVT = Op.getOperand(0).getValueType();
EVT VT = Op.getValueType();
unsigned NumElts = InVT.getVectorNumElements();
// f16 conversions are promoted to f32 when full fp16 is not supported.
if (InVT.getVectorElementType() == MVT::f16 &&
!Subtarget->hasFullFP16()) {
MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
SDLoc dl(Op);
return DAG.getNode(
Op.getOpcode(), dl, Op.getValueType(),
DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
if (VT.getSizeInBits() < InVT.getSizeInBits()) {
SDLoc dl(Op);
SDValue Cv =
DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
if (VT.getSizeInBits() > InVT.getSizeInBits()) {
SDLoc dl(Op);
SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
// Type changing conversions are illegal.
return Op;
/// Lower a scalar or vector FP-to-integer conversion (strict or non-strict).
///
/// Vector sources are forwarded to LowerVectorFP_TO_INT. f16 sources are
/// extended to f32 when the subtarget lacks full FP16 support. Other
/// non-f128 sources are returned unchanged. f128 sources become a runtime
/// library call.
///
/// NOTE(review): closing braces and, apparently, the `else` between the two
/// `LC =` assignments are missing from this copy.
SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
// Strict nodes carry the chain as operand 0, so the value is operand 1.
SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
if (SrcVal.getValueType().isVector())
return LowerVectorFP_TO_INT(Op, DAG);
// f16 conversions are promoted to f32 when full fp16 is not supported.
if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
SDLoc dl(Op);
return DAG.getNode(
Op.getOpcode(), dl, Op.getValueType(),
DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
if (SrcVal.getValueType() != MVT::f128) {
// It's legal except when f128 is involved
return Op;
RTLIB::Libcall LC;
if (Op.getOpcode() == ISD::FP_TO_SINT ||
Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType());
LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType());
return LowerF128Call(Op, DAG, LC);
/// Lower a vector integer-to-floating-point conversion.
///
///  * result narrower than the input: convert to an intermediate type
///    (`CastVT`) and FP_ROUND down to the result type.
///  * result wider than the input: first cast the integer input to the
///    integer-element version of the result type using `CastOpc`, then convert.
///  * otherwise: return the node unchanged.
///
/// NOTE(review): the initialisers of `CastVT` (narrowing case) and `CastOpc`
/// are not present in this excerpt -- confirm against the complete file.
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
// Any additional optimization in this function should be recorded
// in the cost tables.
EVT VT = Op.getValueType();
SDLoc dl(Op);
SDValue In = Op.getOperand(0);
EVT InVT = In.getValueType();
// Narrowing: convert first, then round the floating-point result to VT.
if (VT.getSizeInBits() < InVT.getSizeInBits()) {
MVT CastVT =
In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
// Widening: resize the integer input first, then convert.
if (VT.getSizeInBits() > InVT.getSizeInBits()) {
unsigned CastOpc =
EVT CastVT = VT.changeVectorElementTypeToInteger();
In = DAG.getNode(CastOpc, dl, CastVT, In);
return DAG.getNode(Op.getOpcode(), dl, VT, In);
// Same total size: nothing to do.
return Op;
/// Lower a (possibly strict) integer-to-floating-point conversion.
///
/// Vector results are forwarded to LowerVectorINT_TO_FP. For strict opcodes
/// the source is operand 1, otherwise operand 0. An f16 result without full
/// FP16 support is produced by converting to f32 and FP_ROUNDing to f16.
/// An i128 source yields an empty SDValue; per the comment below these
/// conversions are libcalls. Any other non-f128 result is returned unchanged.
/// f128 results go through LowerF128Call with the signed/unsigned conversion
/// libcall.
SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVectorINT_TO_FP(Op, DAG);
bool IsStrict = Op->isStrictFPOpcode();
SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
// f16 conversions are promoted to f32 when full fp16 is not supported.
if (Op.getValueType() == MVT::f16 &&
!Subtarget->hasFullFP16()) {
assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
SDLoc dl(Op);
return DAG.getNode(
ISD::FP_ROUND, dl, MVT::f16,
DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
DAG.getIntPtrConstant(0, dl));
// i128 conversions are libcalls.
if (SrcVal.getValueType() == MVT::i128)
return SDValue();
// Other conversions are legal, unless it's to the completely software-based
// fp128.
if (Op.getValueType() != MVT::f128)
return Op;
// NOTE(review): an `else` presumably separated the two LC assignments below;
// as shown here the second one would always overwrite the first -- confirm.
RTLIB::Libcall LC;
if (Op.getOpcode() == ISD::SINT_TO_FP ||
Op.getOpcode() == ISD::STRICT_SINT_TO_FP)
LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType());
LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType());
return LowerF128Call(Op, DAG, LC);
/// Lower FSINCOS to a call to a `sincos_stret` runtime routine.
///
/// Builds a single-argument call (no sign/zero extension on the argument)
/// whose return type is a two-element struct of the argument type, using the
/// Fast calling convention, and returns the call's result value
/// (`CallResult.first`).
///
/// NOTE(review): this excerpt is missing lines -- the second arm of the
/// `LC` ternary, the statement that adds `Entry` to `Args`, and the start of
/// the `CLI` builder chain that `.setLibCallee` belongs to. Confirm against
/// the complete file.
SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
SelectionDAG &DAG) const {
// For iOS, we want to call an alternative entry point: __sincos_stret,
// which returns the values in two S / D registers.
SDLoc dl(Op);
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
// Describe the single floating-point argument of the call.
ArgListTy Args;
ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
Entry.IsSExt = false;
Entry.IsZExt = false;
// Pick the libcall by argument type and resolve it to an external symbol.
RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
const char *LibcallName = getLibcallName(LC);
SDValue Callee =
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
// The callee returns both results as a struct {ArgTy, ArgTy}.
StructType *RetTy = StructType::get(ArgTy, ArgTy);
TargetLowering::CallLoweringInfo CLI(DAG);
.setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
/// Custom-lower a BITCAST whose result type is f16 or bf16.
///
/// Any other result type yields an empty SDValue. The operand is asserted to
/// be i16. It is any-extended to i32 and bitcast to f32, and the half-precision
/// result is then taken with an EXTRACT_SUBREG machine node using the `hsub`
/// sub-register index.
///
/// NOTE(review): the final line closing the `SDValue(...)` expression is not
/// present in this excerpt.
static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
EVT OpVT = Op.getValueType();
if (OpVT != MVT::f16 && OpVT != MVT::bf16)
return SDValue();
assert(Op.getOperand(0).getValueType() == MVT::i16);
SDLoc DL(Op);
// Widen the 16-bit integer to 32 bits and reinterpret it as f32.
Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
// Extract the `hsub` part of the f32 value as the 16-bit FP result.
return SDValue(
DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
/// Return the vector type that \p OrigVT should be extended to so that it is
/// 64 bits wide.
///
/// Types that are already at least 64 bits are returned unchanged. Of the
/// cases visible here, v2i8 and v2i16 map to v2i32 and v4i8 maps to v4i16;
/// any other simple type hits llvm_unreachable. \p OrigVT must be a simple
/// value type (asserted).
static EVT getExtensionTo64Bits(const EVT &OrigVT) {
if (OrigVT.getSizeInBits() >= 64)
return OrigVT;
assert(OrigVT.isSimple() && "Expecting a simple value type");
MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
switch (OrigSimpleTy) {
default: llvm_unreachable("Unexpected Vector Type");
case MVT::v2i8:
case MVT::v2i16:
return MVT::v2i32;
case MVT::v4i8:
return MVT::v4i16;
/// Ensure \p N is at least 64 bits wide so it can feed a widening multiply.
///
/// \param N         The pre-extension value.
/// \param OrigTy    Type of \p N before it was extended.
/// \param ExtTy     Type \p N had been extended to; asserted to be a 128-bit
///                  vector.
/// \param ExtOpcode Extension opcode to use when a new extension is needed.
/// \returns \p N itself when \p OrigTy is already >= 64 bits; otherwise a new
///          \p ExtOpcode node extending \p N to getExtensionTo64Bits(OrigTy).
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
const EVT &OrigTy,
const EVT &ExtTy,
unsigned ExtOpcode) {
// The vector originally had a size of OrigTy. It was then extended to ExtTy.
// We expect the ExtTy to be 128-bits total. If the OrigTy is less than
// 64-bits we need to insert a new extension so that it will be 64-bits.
assert(ExtTy.is128BitVector() && "Unexpected extension size");
if (OrigTy.getSizeInBits() >= 64)
return N;
// Must extend size to at least 64 bits to be used as an operand for VMULL.
EVT NewVT = getExtensionTo64Bits(OrigTy);
return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
/// Check whether \p N is a BUILD_VECTOR whose elements could have been
/// produced by extending values of half the element width.
///
/// Returns false immediately if \p N is not a BUILD_VECTOR. Each operand that
/// is a ConstantSDNode must fit in half of the vector's scalar size: tested
/// with isIntN on the sign-extended value when \p isSigned, or isUIntN on the
/// zero-extended value otherwise.
///
/// NOTE(review): closing braces and possibly a `continue;` are missing from
/// this excerpt, so the exact scope of the trailing `return false;` (likely
/// the non-constant-operand case) cannot be confirmed from here.
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
bool isSigned) {
EVT VT = N->getValueType(0);
if (N->getOpcode() != ISD::BUILD_VECTOR)
return false;
for (const SDValue &Elt : N->op_values()) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
unsigned EltSize = VT.getScalarSizeInBits();
unsigned HalfSize = EltSize / 2;
if (isSigned) {
if (!isIntN(HalfSize, C->getSExtValue()))
return false;
} else {
if (!isUIntN(HalfSize, C->getZExtValue()))
return false;
return false;
return true;
/// Return the narrow (pre-extension) form of an operand of a widening
/// multiply.
///
/// For SIGN_EXTEND / ZERO_EXTEND nodes the extension's operand is handed to
/// addRequiredExtensionForVectorMULL. Otherwise \p N must be a BUILD_VECTOR
/// (asserted) of ConstantSDNodes, and a new build vector with half-width
/// integer elements is created from the same constants.
///
/// NOTE(review): the remaining arguments of the
/// addRequiredExtensionForVectorMULL call are not present in this excerpt.
static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
EVT VT = N->getValueType(0);
SDLoc dl(N);
// Here EltSize is the *halved* scalar width, i.e. the narrow element size.
unsigned EltSize = VT.getScalarSizeInBits() / 2;
unsigned NumElts = VT.getVectorNumElements();
MVT TruncVT = MVT::getIntegerVT(EltSize);
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i != NumElts; ++i) {
// cast<> requires every operand to be a constant.
ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
const APInt &CInt = C->getAPIntValue();
// Element types smaller than 32 bits are not legal, so use i32 elements.
// The values are implicitly truncated so sext vs. zext doesn't matter.
Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
/// Return true if \p N is a SIGN_EXTEND node, or a BUILD_VECTOR accepted by
/// isExtendedBUILD_VECTOR in signed mode.
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
return N->getOpcode() == ISD::SIGN_EXTEND ||
isExtendedBUILD_VECTOR(N, DAG, true);
/// Return true if \p N is a ZERO_EXTEND node, or a BUILD_VECTOR accepted by
/// isExtendedBUILD_VECTOR in unsigned mode.
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
return N->getOpcode() == ISD::ZERO_EXTEND ||
isExtendedBUILD_VECTOR(N, DAG, false);
/// Return true if \p N is an ADD or SUB whose two operands each have a single
/// use and are both sign-extended (per isSignExtended). Any other opcode
/// yields false.
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
SDNode *N0 = N->getOperand(0).getNode();
SDNode *N1 = N->getOperand(1).getNode();
return N0->hasOneUse() && N1->hasOneUse() &&
isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
return false;
/// Return true if \p N is an ADD or SUB whose two operands each have a single
/// use and are both zero-extended (per isZeroExtended). Any other opcode
/// yields false.
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
SDNode *N0 = N->getOperand(0).getNode();
SDNode *N1 = N->getOperand(1).getNode();
return N0->hasOneUse() && N1->hasOneUse() &&
isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
return false;
/// Lower FLT_ROUNDS_ by reading the FPCR and remapping its rounding-mode
/// field.
///
/// The FPCR is read as an i64 through the `aarch64_get_fpcr` intrinsic (which
/// also yields an updated chain), truncated to i32, and transformed with
/// ((x + (1 << 22)) >> 22) & 3. The result and the new chain are returned
/// together as merged values.
SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SelectionDAG &DAG) const {
// The rounding mode is in bits 23:22 of the FPCR.
// The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
// The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
// so that the shift + and get folded into a bitfield extract.
SDLoc dl(Op);
SDValue Chain = Op.getOperand(0);
SDValue FPCR_64 = DAG.getNode(
ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
{Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
// Continue on the chain produced by the register read.
Chain = FPCR_64.getValue(1);
SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
// Adding 1 << 22 increments the 2-bit field; the final mask makes it wrap.
SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
DAG.getConstant(1U << 22, dl, MVT::i32));
SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
DAG.getConstant(22, dl, MVT::i32));
SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
DAG.getConstant(3, dl, MVT::i32));
return DAG.getMergeValues({AND, Chain}, dl);
/// Custom-lower a 128-bit integer vector MUL, forming SMULL/UMULL when the
/// operands are extended values.
///
/// Selection of the new opcode:
///  * both operands sign-extended       -> SMULL
///  * both operands zero-extended       -> UMULL
///  * one operand extended and the other an ADD/SUB of matching extended
///    values -> SMULL/UMULL in "MLA" form, i.e. the multiply is distributed
///    over the ADD/SUB (operands are swapped in the last visible case so the
///    ADD/SUB ends up in N0).
/// When no pattern matches, v2i64 yields an empty SDValue and every other
/// type is returned unchanged.
///
/// NOTE(review): closing braces are missing from this excerpt, so the nesting
/// of the branches below is inferred from the upstream layout -- confirm.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
// Multiplications are only custom-lowered for 128-bit vectors so that
// VMULL can be detected. Otherwise v2i64 multiplications are not legal.
EVT VT = Op.getValueType();
assert(VT.is128BitVector() && VT.isInteger() &&
"unexpected type for custom-lowering ISD::MUL");
SDNode *N0 = Op.getOperand(0).getNode();
SDNode *N1 = Op.getOperand(1).getNode();
// NewOpc == 0 means "no widening multiply pattern found".
unsigned NewOpc = 0;
bool isMLA = false;
bool isN0SExt = isSignExtended(N0, DAG);
bool isN1SExt = isSignExtended(N1, DAG);
if (isN0SExt && isN1SExt)
NewOpc = AArch64ISD::SMULL;
else {
bool isN0ZExt = isZeroExtended(N0, DAG);
bool isN1ZExt = isZeroExtended(N1, DAG);
if (isN0ZExt && isN1ZExt)
NewOpc = AArch64ISD::UMULL;
else if (isN1SExt || isN1ZExt) {
// Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
// into (s/zext A * s/zext C) + (s/zext B * s/zext C)
if (isN1SExt && isAddSubSExt(N0, DAG)) {
NewOpc = AArch64ISD::SMULL;
isMLA = true;
} else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
NewOpc = AArch64ISD::UMULL;
isMLA = true;
} else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
// Canonicalise so that the ADD/SUB is N0 and the plain extension is N1.
std::swap(N0, N1);
NewOpc = AArch64ISD::UMULL;
isMLA = true;
if (!NewOpc) {
if (VT == MVT::v2i64)
// Fall through to expand this. It is not legal.
return SDValue();
// Other vector multiplications are legal.
return Op;
// Legalize to a S/UMULL instruction
SDLoc DL(Op);
SDValue Op0;
SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
if (!isMLA) {
Op0 = skipExtensionForVectorMULL(N0, DAG);
assert(Op0.getValueType().is64BitVector() &&
Op1.getValueType().is64BitVector() &&
"unexpected types for extended operands to VMULL");
return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
// Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
// isel lowering to take advantage of no-stall back to back s/umul + s/umla.
// This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
EVT Op1VT = Op1.getValueType();
// Re-apply the original ADD/SUB (N0's opcode) to the two partial products;
// each narrow operand is bitcast to Op1's type before the multiply.
return DAG.getNode(N0->getOpcode(), DL, VT,
DAG.getNode(NewOpc, DL, VT,
DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
DAG.getNode(NewOpc, DL, VT,
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
/// Build an AArch64ISD::PTRUE node of type \p VT whose only operand is
/// \p Pattern encoded as an i32 target constant.
static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
int Pattern) {
return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
DAG.getTargetConstant(Pattern, DL, MVT::i32));
/// Custom-lower selected chain-less intrinsics to ISD / AArch64ISD nodes.
///
/// The intrinsic ID is read from operand 0 as a constant; the intrinsic's own
/// arguments therefore start at operand 1. Intrinsics without a case here
/// return an empty SDValue (no custom lowering).
///
/// NOTE(review): many statements in this excerpt are visibly truncated (the
/// trailing operand lines of several DAG.getNode calls, closing braces, and
/// the head of the report_fatal_error call in the eh_recoverfp case are
/// absent). Comments below describe only what is visible; confirm the rest
/// against the complete file.
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
case Intrinsic::aarch64_neon_abs: {
// Scalar i64 is routed through v1i64 so the generic ISD::ABS can be used;
// legal integer vectors use ISD::ABS directly; anything else is fatal.
EVT Ty = Op.getValueType();
if (Ty == MVT::i64) {
SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
} else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
} else {
report_fatal_error("Unexpected type for AArch64 NEON intrinic");
// NEON integer min/max map one-to-one onto the generic ISD opcodes.
case Intrinsic::aarch64_neon_smax:
return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_umax:
return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_smin:
return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_umin:
return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
// SVE unpack intrinsics map onto the same-named AArch64ISD nodes.
case Intrinsic::aarch64_sve_sunpkhi:
return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
case Intrinsic::aarch64_sve_sunpklo:
return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
case Intrinsic::aarch64_sve_uunpkhi:
return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
case Intrinsic::aarch64_sve_uunpklo:
return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
// SVE element-extraction and permute intrinsics: operands are forwarded
// unchanged to the same-named AArch64ISD node.
case Intrinsic::aarch64_sve_clasta_n:
return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::aarch64_sve_clastb_n:
return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::aarch64_sve_lasta:
return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_lastb:
return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_rev:
return DAG.getNode(AArch64ISD::REV, dl, Op.getValueType(),
case Intrinsic::aarch64_sve_tbl:
return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_trn1:
return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_trn2:
return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_uzp1:
return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_uzp2:
return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_zip1:
return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_zip2:
return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_ptrue:
return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
case Intrinsic::aarch64_sve_dupq_lane:
return LowerDUPQLane(Op, DAG);
case Intrinsic::aarch64_sve_convert_from_svbool:
return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
case Intrinsic::aarch64_sve_convert_to_svbool: {
EVT OutVT = Op.getValueType();
EVT InVT = Op.getOperand(1).getValueType();
// Return the operand if the cast isn't changing type,
// i.e. <n x 16 x i1> -> <n x 16 x i1>
if (InVT == OutVT)
return Op.getOperand(1);
// Otherwise, zero the newly introduced lanes.
// This is done by ANDing the reinterpreted operand with an all-true
// predicate of the *input* type reinterpreted to the output type.
SDValue Reinterpret =
DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1));
SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all);
SDValue MaskReinterpret =
DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Mask);
return DAG.getNode(ISD::AND, dl, OutVT, Reinterpret, MaskReinterpret);
case Intrinsic::aarch64_sve_insr: {
// i8 / i16 scalars are any-extended to i32 before forming the INSR node.
SDValue Scalar = Op.getOperand(2);
EVT ScalarTy = Scalar.getValueType();
if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
Op.getOperand(1), Scalar);
case Intrinsic::localaddress: {
// Read the register that the register info reports as the local-address
// register for this function.
const auto &MF = DAG.getMachineFunction();
const auto *RegInfo = Subtarget->getRegisterInfo();
unsigned Reg = RegInfo->getLocalAddressRegister(MF);
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
case Intrinsic::eh_recoverfp: {
// FIXME: This needs to be implemented to correctly handle highly aligned
// stack objects. For now we simply return the incoming FP. Refer D53541
// for more details.
SDValue FnOp = Op.getOperand(1);
SDValue IncomingFPOp = Op.getOperand(2);
// The first argument must resolve to a Function global.
GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
if (!Fn)
" must take a function as the first argument");
return IncomingFPOp;
case Intrinsic::aarch64_neon_vsri:
case Intrinsic::aarch64_neon_vsli: {
// Shift-and-insert: only vector types are accepted, and the shift amount
// (operand 3) is asserted not to exceed the element width.
EVT Ty = Op.getValueType();
if (!Ty.isVector())
report_fatal_error("Unexpected type for aarch64_neon_vsli");
assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
case Intrinsic::aarch64_neon_srhadd:
case Intrinsic::aarch64_neon_urhadd: {
// Pick the signed or unsigned AArch64ISD node for the rhadd intrinsic.
bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd;
unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
/// Return true only when the value type of \p ExtVal is a scalable vector.
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return ExtVal.getValueType().isScalableVector();
// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
//
// The stored v4i16 value is concatenated with an undef v4i16 to form v8i16,
// truncated to v8i8, bitcast to v2i32, and lane 0 (the four stored bytes) is
// extracted as an i32 and written with an ordinary store that reuses the
// original chain, base pointer and memory operand.
//
// NOTE(review): the body and the caller in LowerSTORE both use `VT` and
// `MemVT`, but the parameter line declaring them is not present in this
// excerpt -- confirm the signature against the complete file.
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
SelectionDAG &DAG) {
assert(VT.isVector() && "VT should be a vector type");
assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
SDValue Value = ST->getValue();
// First extend the promoted v4i16 to v8i16, truncate it to v8i8, and extract
// the word lane which represents the v4i8 subvector. This optimizes the
// store to:
// xtn v0.8b, v0.8h
// str s0, [x0]
SDValue Undef = DAG.getUNDEF(MVT::i16);
SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
{Undef, Undef, Undef, Undef});
SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
Value, UndefVec);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
Trunc, DAG.getConstant(0, DL, MVT::i64));
return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
ST->getBasePtr(), ST->getMemOperand());
// Custom lowering for any store, vector or scalar and/or default or with
// a truncate operations. Currently only custom lower truncate operation
// from vector v4i16 to v4i8 or volatile stores of i128.
//
// Visible handling, in order:
//  * fixed-length vectors that use SVE -> LowerFixedLengthVectorStoreToSVE;
//  * under-aligned vector stores the target does not allow -> scalarized;
//  * truncating vector stores -> LowerTruncateVectorStore;
//  * 256-bit non-temporal vector stores -> split into two halves and emitted
//    as an AArch64ISD::STNP memory intrinsic node;
//  * volatile i128 stores -> split into two i64 halves and emitted as an
//    AArch64ISD::STP memory intrinsic node;
//  * anything else -> empty SDValue.
//
// NOTE(review): closing braces and the heads of the two DAG.getNode calls
// that build `Lo`/`Hi` in the non-temporal case are missing from this
// excerpt -- confirm against the complete file.
SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc Dl(Op);
StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
assert (StoreNode && "Can only custom lower store nodes");
SDValue Value = StoreNode->getValue();
EVT VT = Value.getValueType();
EVT MemVT = StoreNode->getMemoryVT();
if (VT.isVector()) {
if (useSVEForFixedLengthVectorVT(VT))
return LowerFixedLengthVectorStoreToSVE(Op, DAG);
unsigned AS = StoreNode->getAddressSpace();
Align Alignment = StoreNode->getAlign();
// Scalarize when the store is under-aligned and such an access is not
// allowed for this type / address space.
if (Alignment < MemVT.getStoreSize() &&
!allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
nullptr)) {
return scalarizeVectorStore(StoreNode, DAG);
if (StoreNode->isTruncatingStore()) {
return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
// 256 bit non-temporal stores can be lowered to STNP. Do this as part of
// the custom lowering, as there are no un-paired non-temporal stores and
// legalization will break up 256 bit inputs.
if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
MemVT.getVectorElementCount().Min % 2u == 0 &&
((MemVT.getScalarSizeInBits() == 8u ||
MemVT.getScalarSizeInBits() == 16u ||
MemVT.getScalarSizeInBits() == 32u ||
MemVT.getScalarSizeInBits() == 64u))) {
// Lo starts at element 0, Hi at element (element count / 2).
SDValue Lo =
StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
SDValue Hi = DAG.getNode(
DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
{StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
StoreNode->getMemoryVT(), StoreNode->getMemOperand());
return Result;
} else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
// Volatile i128: split into element 0 and element 1 (each i64) and store
// them with a single STP node.
assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
SDValue Lo =
DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
DAG.getConstant(0, Dl, MVT::i64));
SDValue Hi =
DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
DAG.getConstant(1, Dl, MVT::i64));
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
{StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
StoreNode->getMemoryVT(), StoreNode->getMemOperand());
return Result;
// No custom lowering applies.
return SDValue();
/// Entry point for custom lowering: dispatch on the node's opcode to the
/// matching Lower* helper and return its result.
///
/// For several opcodes the visible code first checks
/// useSVEForFixedLengthVectorVT and, if true, routes the node to an SVE
/// specific lowering (LowerToPredicatedOp / LowerFixedLengthVectorLoadToSVE).
/// ISD::LOAD and ISD::ADD are only expected here for that SVE fixed-length
/// case; otherwise llvm_unreachable is hit.
///
/// NOTE(review): a large number of `case` labels, the `default:` label, the
/// debug dump of `Op` and all closing braces are missing from this excerpt,
/// so several `return` statements below appear without their label. Do not
/// infer the opcode-to-helper mapping for those lines from this text alone.
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
LLVM_DEBUG(dbgs() << "Custom lowering: ");
switch (Op.getOpcode()) {
llvm_unreachable("unimplemented operand");
return SDValue();
return LowerBITCAST(Op, DAG);
case ISD::GlobalAddress:
return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress:
return LowerGlobalTLSAddress(Op, DAG);
case ISD::SETCC:
return LowerSETCC(Op, DAG);
case ISD::BR_CC:
return LowerBR_CC(Op, DAG);
return LowerSELECT(Op, DAG);
return LowerSELECT_CC(Op, DAG);
case ISD::JumpTable:
return LowerJumpTable(Op, DAG);
case ISD::BR_JT:
return LowerBR_JT(Op, DAG);
case ISD::ConstantPool:
return LowerConstantPool(Op, DAG);
case ISD::BlockAddress:
return LowerBlockAddress(Op, DAG);
return LowerVASTART(Op, DAG);
return LowerVACOPY(Op, DAG);
case ISD::VAARG:
return LowerVAARG(Op, DAG);
// Carry / overflow arithmetic opcodes.
// NOTE(review): the return statement(s) for the ADDC..SUBE group may be
// among the lines missing from this excerpt.
case ISD::ADDC:
case ISD::ADDE:
case ISD::SUBC:
case ISD::SUBE:
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO:
return LowerXALUO(Op, DAG);
// Floating-point arithmetic: f128 operations become library calls.
case ISD::FADD:
if (useSVEForFixedLengthVectorVT(Op.getValueType()))
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
case ISD::FSUB:
return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
case ISD::FMUL:
return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
case ISD::FMA:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
case ISD::FDIV:
return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
return LowerFP_ROUND(Op, DAG);
return LowerFP_EXTEND(Op, DAG);
return LowerFRAMEADDR(Op, DAG);
return LowerSPONENTRY(Op, DAG);
return LowerRETURNADDR(Op, DAG);
return LowerBUILD_VECTOR(Op, DAG);
return LowerVECTOR_SHUFFLE(Op, DAG);
return LowerSPLAT_VECTOR(Op, DAG);
// Integer division and min/max are lowered to predicated AArch64ISD nodes.
case ISD::SDIV:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED);
case ISD::UDIV:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED);
case ISD::SMIN:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_MERGE_OP1);
case ISD::UMIN:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_MERGE_OP1);
case ISD::SMAX:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_MERGE_OP1);
case ISD::UMAX:
return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_MERGE_OP1);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL:
return LowerVectorSRA_SRL_SHL(Op, DAG);
return LowerShiftLeftParts(Op, DAG);
return LowerShiftRightParts(Op, DAG);
case ISD::CTPOP:
return LowerCTPOP(Op, DAG);
return LowerFCOPYSIGN(Op, DAG);
case ISD::OR:
return LowerVectorOR(Op, DAG);
case ISD::XOR:
return LowerXOR(Op, DAG);
return LowerPREFETCH(Op, DAG);
return LowerINT_TO_FP(Op, DAG);
return LowerFP_TO_INT(Op, DAG);
return LowerFSINCOS(Op, DAG);
return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL:
return LowerMUL(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
return LowerVECREDUCE(Op, DAG);
return LowerATOMIC_LOAD_SUB(Op, DAG);
return LowerATOMIC_LOAD_AND(Op, DAG);
return LowerVSCALE(Op, DAG);
return LowerTRUNCATE(Op, DAG);
// LOAD and ADD are only custom-lowered for SVE fixed-length vectors.
case ISD::LOAD:
if (useSVEForFixedLengthVectorVT(Op.getValueType()))
return LowerFixedLengthVectorLoadToSVE(Op, DAG);
llvm_unreachable("Unexpected request to lower ISD::LOAD");
case ISD::ADD:
if (useSVEForFixedLengthVectorVT(Op.getValueType()))
return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
llvm_unreachable("Unexpected request to lower ISD::ADD");
/// Return true when the subtarget has SVE and its minimum SVE vector size is
/// at least 256 bits.
bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
// Prefer NEON unless larger SVE registers are available.
return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
/// Decide whether the fixed-length vector type \p VT should be lowered using
/// SVE.
///
/// Returns false unless all of the following hold: SVE is used for
/// fixed-length vectors at all, \p VT is a fixed-length vector, its element
/// type is one of i8/i16/i32/i64/f16/f32/f64 (so not i1), its size is greater
/// than 128 bits and no larger than the minimum SVE vector size, and it is a
/// power-of-two vector type.
///
/// NOTE(review): the `default:` label before the first `return false;` in
/// the switch, the `break;` after the case list and the closing braces are
/// missing from this excerpt -- confirm against the complete file.
bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const {
if (!useSVEForFixedLengthVectors())
return false;
if (!VT.isFixedLengthVector())
return false;
// Fixed length predicates should be promoted to i8.
// NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
if (VT.getVectorElementType() == MVT::i1)
return false;
// Don't use SVE for vectors we cannot scalarize if required.
switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
return false;
case MVT::i8:
case MVT::i16:
case MVT::i32:
case MVT::i64:
case MVT::f16:
case MVT::f32:
case MVT::f64:
// Ensure NEON MVTs only belong to a single register class.
if (VT.getSizeInBits() <= 128)
return false;
// Don't use SVE for types that don't fit.
if (VT.getSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
return false;
// TODO: Perhaps an artificial restriction, but worth having whilst getting
// the base fixed length SVE support in place.
if (!VT.isPow2VectorType())
return false;
return true;
// Calling Convention Implementation
/// Selects the correct CCAssignFn for a given CallingConvention value.
///
/// \param CC       Calling convention of the call.
/// \param IsVarArg Whether the call is variadic; this only influences the
///                 C-like conventions and Win64.
/// \returns the argument-assignment function for the convention:
///  * WebKit_JS -> CC_AArch64_WebKit_JS; GHC -> CC_AArch64_GHC;
///  * C / Fast / PreserveMost / CXX_FAST_TLS / Swift -> Win64 vararg on
///    Windows variadic calls, AAPCS on non-Darwin targets, otherwise the
///    DarwinPCS variants (with a separate ILP32 variadic variant);
///  * Win64 -> Win64 vararg when variadic, else AAPCS;
///  * CFGuard_Check -> CC_AArch64_Win64_CFGuard_Check;
///  * AArch64_VectorCall / AArch64_SVE_VectorCall -> AAPCS.
///
/// NOTE(review): the `default:` label that presumably guards the
/// report_fatal_error call, and the closing braces, are missing from this
/// excerpt -- confirm against the complete file.
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) const {
switch (CC) {
report_fatal_error("Unsupported calling convention.");
case CallingConv::WebKit_JS:
return CC_AArch64_WebKit_JS;
case CallingConv::GHC:
return CC_AArch64_GHC;
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::PreserveMost:
case CallingConv::CXX_FAST_TLS:
case CallingConv::Swift:
// Order matters: the Windows variadic check comes before the Darwin check.
if (Subtarget->isTargetWindows() && IsVarArg)
return CC_AArch64_Win64_VarArg;
if (!Subtarget->isTargetDarwin())
return CC_AArch64_AAPCS;
if (!IsVarArg)
return CC_AArch64_DarwinPCS;
return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
: CC_AArch64_DarwinPCS_VarArg;
case CallingConv::Win64:
return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
case CallingConv::CFGuard_Check:
return CC_AArch64_Win64_CFGuard_Check;
case CallingConv::AArch64_VectorCall:
case CallingConv::AArch64_SVE_VectorCall:
return CC_AArch64_AAPCS;
/// Select the return-value assignment function for calling convention \p CC:
/// RetCC_AArch64_WebKit_JS for WebKit_JS, RetCC_AArch64_AAPCS for everything
/// else.
CCAssignFn *
AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
DenseMap<unsigned, SDValue> CopiedRegs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
// At this point, Ins[].VT may already be promoted to i32. To correctly
// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
// Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
// we use a special version of AnalyzeFormalArguments to pass in ValVT and
// LocVT.
unsigned NumArgs = Ins.size();
Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Ins[i].VT;
if (Ins[i].isOrigArg()) {
std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[i].getOrigArgIndex();
// Get type of the original argument.
EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
/*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
ValVT = MVT::i8;
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
bool Res =
AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
assert(!Res && "Call operand has unhandled type");
assert(ArgLocs.size() == Ins.size());
SmallVector<SDValue, 16> ArgValues;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (Ins[i].Flags.isByVal()) {
// Byval is used for HFAs in the PCS, but the system should work in a
// non-compliant manner for larger structs.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
int Size = Ins[i].Flags.getByValSize();
unsigned NumRegs = (Size + 7) / 8;
// FIXME: This works on big-endian for composite byvals, which are the common
// case. It should also work for fundamental types too.
unsigned FrameIdx =
MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue ArgValue;
if (VA.isRegLoc()) {
// Arguments stored in registers.
EVT RegVT = VA.getLocVT();
const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
RC = &AArch64::GPR32RegClass;
else if (RegVT == MVT::i64)
RC = &AArch64::GPR64RegClass;
else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
RC = &AArch64::FPR16RegClass;
else if (RegVT == MVT::f32)
RC = &AArch64::FPR32RegClass;
else if (RegVT == MVT::f64 || RegVT.is64BitVector())
RC = &AArch64::FPR64RegClass;
else if (RegVT == MVT::f128 || RegVT.is128BitVector())
RC = &AArch64::FPR128RegClass;
else if (RegVT.isScalableVector() &&
RegVT.getVectorElementType() == MVT::i1)
RC = &AArch64::PPRRegClass;
else if (RegVT.isScalableVector())
RC = &AArch64::ZPRRegClass;
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
// Transform the arguments in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
// If this is an 8, 16 or 32-bit value, it is really passed promoted
// to 64 bits. Insert an assert[sz]ext to capture this, then
// truncate to the right size.
switch (VA.getLocInfo()) {
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
case CCValAssign::AExt:
case CCValAssign::SExt:
case CCValAssign::ZExt:
case CCValAssign::AExtUpper:
ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
DAG.getConstant(32, DL, RegVT));
ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
} else { // VA.isRegLoc()
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
unsigned ArgOffset = VA.getLocMemOffset();
unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
? VA.getLocVT().getSizeInBits()
: VA.getValVT().getSizeInBits()) / 8;
uint32_t BEAlign = 0;
if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
BEAlign = 8 - ArgSize;
int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
// For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
MVT MemVT = VA.getValVT();
switch (VA.getLocInfo()) {
case CCValAssign::Trunc:
case CCValAssign::BCvt:
MemVT = VA.getLocVT();
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
MemVT = VA.getLocVT();
case CCValAssign::SExt:
case CCValAssign::ZExt:
case CCValAssign::AExt:
ArgValue = DAG.getExtLoad(
ExtType, DL, VA.getLocVT(), Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
if (VA.getLocInfo() == CCValAssign::Indirect) {
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
// If value is passed via pointer - do a load.
ArgValue =
DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
ArgValue, DAG.getValueType(MVT::i32));
// varargs
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
if (isVarArg) {
if (!Subtarget->isTargetDarwin() || IsWin64) {
// The AAPCS variadic function ABI is identical to the non-variadic
// one. As a result there may be more arguments in registers and we should
// save them for future reference.
// Win64 variadic functions also pass arguments in registers, but all float
// arguments are passed in integer registers.
saveVarArgRegisters(CCInfo, DAG, DL, Chain);
// This will point to the next argument passed via stack.
unsigned StackOffset = CCInfo.getNextStackOffset();
// We currently pass all varargs at 8-byte alignment, or 4 for ILP32
StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
if (MFI.hasMustTailInVarArgFunc()) {
SmallVector<MVT, 2> RegParmTypes;
// Compute the set of forwarded registers. The rest are scratch.
SmallVectorImpl<ForwardedRegister> &Forwards =
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
// Conservatively forward X8, since it might be used for aggregate return.
if (!CCInfo.isAllocated(AArch64::X8)) {
unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
// On Windows, InReg pointers must be returned, so record the pointer in a
// virtual register at the start of the function so it can be returned in the
// epilogue.
if (IsWin64) {
for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
if (Ins[I].Flags.isInReg()) {
MVT PtrTy = getPointerTy(DAG.getDataLayout());
Register Reg =
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
unsigned StackArgSize = CCInfo.getNextStackOffset();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
// This is a non-standard ABI so by fiat I say we're allowed to make full
// use of the stack area to be popped, which must be aligned to 16 bytes in
// any case:
StackArgSize = alignTo(StackArgSize, 16);
// If we're expected to restore the stack (e.g. fastcc) then we'll be adding
// a multiple of 16.
// This realignment carries over to the available bytes below. Our own
// callers will guarantee the space is free by giving an aligned value to
// Even if we're not expected to free up the space, it's useful to know how
// much is there while considering tail calls (because we can reuse it).
if (Subtarget->hasCustomCallingConv())
return Chain;
/// Save the argument registers that were not allocated to named parameters so
/// that a variadic function can find them in memory later.
///
/// \param CCInfo calling-convention state; queried for the index of the first
///               unallocated register in each argument-register list.
/// \param DAG    selection DAG in which the register copies and stores are
///               created.
/// \param DL     debug location attached to the created nodes.
/// \param Chain  in/out token chain; replaced by a TokenFactor over MemOps
///               when MemOps is not empty.
///
/// NOTE(review): this listing appears to have lost lines (closing braces, the
/// condition in front of the `? :` below, any statement appending to MemOps,
/// the use of FuncInfo) -- compare against the upstream file before editing.
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SelectionDAG &DAG,
const SDLoc &DL,
SDValue &Chain) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
SmallVector<SDValue, 8> MemOps;
// Integer argument registers X0-X7; everything from the first unallocated one
// onwards is saved, 8 bytes per register.
static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
AArch64::X3, AArch64::X4, AArch64::X5,
AArch64::X6, AArch64::X7 };
static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
int GPRIdx = 0;
if (GPRSaveSize != 0) {
// Win64: the save area is a fixed object at offset -GPRSaveSize. Otherwise it
// is an ordinary 8-byte-aligned stack object.
if (IsWin64) {
GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
if (GPRSaveSize & 15)
// The extra size here, if triggered, will always be 8.
// (GPRSaveSize is a multiple of 8, so this second fixed object pads the
// area out to a multiple of 16 bytes.)
MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
} else
GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
// Copy each remaining X register out of its live-in vreg and store it as i64.
for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
(i - FirstVariadicGPR) * 8)
: MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
// FP/SIMD argument registers Q0-Q7 are saved only when the subtarget reports
// hasFPARMv8() and the calling convention is not Win64; 16 bytes per register.
if (Subtarget->hasFPARMv8() && !IsWin64) {
static const MCPhysReg FPRArgRegs[] = {
AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
int FPRIdx = 0;
if (FPRSaveSize != 0) {
FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
// Same pattern as the GPR loop, using f128 values and a 16-byte stride.
for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getConstant(16, DL, PtrVT));
// Join all queued memory operations into the chain handed back to the caller.
if (!MemOps.empty()) {
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
/// \param Chain        token chain; updated after each register copy and
///                     returned at the end.
/// \param InFlag       glue value threaded through the register copies.
/// \param CallConv     calling convention; selects the return-value
///                     assignment function.
/// \param isVarArg     forwarded to the CCState constructor.
/// \param Ins          descriptions of the values returned by the call.
/// \param DL           debug location for the created nodes.
/// \param DAG          selection DAG in which nodes are created.
/// \param InVals       output list of lowered result values.
/// \param isThisReturn when true, result 0 gets special handling (see below).
/// \param ThisVal      the value to forward in the isThisReturn case.
///
/// NOTE(review): this listing appears to have lost lines (closing braces,
/// `break`/`default` statements, the pushes onto InVals) -- compare against
/// the upstream file before editing.
SDValue AArch64TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const {
// WebKit_JS has its own return-value assignment; every other convention uses
// the AAPCS one.
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
// Cache of physreg -> copied value, so each register is copied only once.
DenseMap<unsigned, SDValue> CopiedRegs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
CCInfo.AnalyzeCallResult(Ins, RetCC);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign VA = RVLocs[i];
// Pass 'this' value directly from the argument to return value, to avoid
// reg unit interference
if (i == 0 && isThisReturn) {
assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
"unexpected return calling convention register assignment");
// Avoid copying a physreg twice since RegAllocFast is incompetent and only
// allows one use of a physreg per block.
SDValue Val = CopiedRegs.lookup(VA.getLocReg());
if (!Val) {
Val =
DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
Chain = Val.getValue(1);
InFlag = Val.getValue(2);
CopiedRegs[VA.getLocReg()] = Val;
// Convert the copied location value back to the declared value type according
// to how the calling convention extended or packed it.
switch (VA.getLocInfo()) {
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
case CCValAssign::BCvt:
Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
case CCValAssign::AExtUpper:
// The value is taken from the upper half: shift right by 32 first.
Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
DAG.getConstant(32, DL, VA.getLocVT()));
case CCValAssign::AExt:
case CCValAssign::ZExt:
Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
return Chain;
/// Return true if the calling convention is one that we can guarantee TCO for.
/// In this implementation that is exactly CallingConv::Fast.
static bool canGuaranteeTCO(CallingConv::ID CC) {
return CC == CallingConv::Fast;
/// Return true if we might ever do TCO for calls with this calling convention.
/// C, PreserveMost and Swift are accepted outright; the remaining path defers
/// to canGuaranteeTCO(CC).
/// NOTE(review): the label in front of the final return is not visible in this
/// listing -- compare against the upstream file.
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
case CallingConv::C:
case CallingConv::PreserveMost:
case CallingConv::Swift:
return true;
return canGuaranteeTCO(CC);
/// Decide whether a call may be lowered as a tail call.
///
/// The function runs a sequence of checks and returns false as soon as one of
/// them fails; it returns true only after all of them have passed.
///
/// \param Callee   callee node; inspected for external-weak global symbols.
/// \param CalleeCC calling convention of the callee.
/// \param isVarArg whether the call is variadic.
/// \param Outs     descriptions of the outgoing arguments.
/// \param OutVals  outgoing argument values, passed to parametersInCSRMatch.
/// \param Ins      descriptions of the call results, passed to
///                 CCState::resultsCompatible.
/// \param DAG      selection DAG; source of the MachineFunction and context.
///
/// NOTE(review): this listing appears to have lost lines (closing braces and
/// the end of the any_of condition) -- compare against the upstream file
/// before editing.
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
// Conventions that are never tail-called are rejected up front.
if (!mayTailCallThisCC(CalleeCC))
return false;
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF.getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
// When using the Windows calling convention on a non-windows OS, we want
// to back up and restore X18 in such functions; we can't do a tail call
// from those functions.
if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
CalleeCC != CallingConv::Win64)
return false;
// Byval parameters hand the function a pointer directly into the stack area
// we want to reuse during a tail call. Working around this *is* possible (see
// X86) but less efficient and uglier in LowerCall.
for (Function::const_arg_iterator i = CallerF.arg_begin(),
e = CallerF.arg_end();
i != e; ++i) {
if (i->hasByValAttr())
return false;
// On Windows, "inreg" attributes signify non-aggregate indirect returns.
// In this case, it is necessary to save/restore X0 in the callee. Tail
// call opt interferes with this. So we disable tail call opt when the
// caller has an argument with "inreg" attribute.
// FIXME: Check whether the callee also has an "inreg" argument.
if (i->hasInRegAttr())
return false;
// With guaranteed TCO enabled the answer depends only on the convention:
// it must be one we can guarantee and must match the caller's.
if (getTargetMachine().Options.GuaranteedTailCallOpt)
return canGuaranteeTCO(CalleeCC) && CCMatch;
// Externally-defined functions with weak linkage should not be
// tail-called on AArch64 when the OS does not support dynamic
// pre-emption of symbols, as the AAELF spec requires normal calls
// to undefined weak functions to be replaced with a NOP or jump to the
// next instruction. The behaviour of branch instructions in this
// situation (as used for tail calls) is implementation-defined, so we
// cannot rely on the linker replacing the tail call with a return.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
const Triple &TT = getTargetMachine().getTargetTriple();
if (GV->hasExternalWeakLinkage() &&
(!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
return false;
// Now we search for cases where we can use a tail call without changing the
// ABI. Sibcall is used in some places (particularly gcc) to refer to this
// concept.
// I want anyone implementing a new calling convention to think long and hard
// about this assert.
assert((!isVarArg || CalleeCC == CallingConv::C) &&
"Unexpected variadic calling convention");
LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// At least two cases here: if caller is fastcc then we can't have any
// memory arguments (we'd be expected to clean up the stack afterwards). If
// caller is C then we could potentially use its argument area.
// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
for (const CCValAssign &ArgLoc : ArgLocs)
if (!ArgLoc.isRegLoc())
return false;
// Check that the call results are passed in the same way.
if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
CCAssignFnForCall(CalleeCC, isVarArg),
CCAssignFnForCall(CallerCC, isVarArg)))
return false;
// The callee has to preserve all registers the caller needs to preserve.
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
// The masks only need comparing when the two conventions differ.
if (!CCMatch) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (Subtarget->hasCustomCallingConv()) {
TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
// Nothing more to check if the callee is taking no arguments
if (Outs.empty())
return true;
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
// If any of the arguments is passed indirectly, it must be SVE, so the
// 'getBytesInStackArgArea' is not sufficient to determine whether we need to
// allocate space on the stack. That is why we determine this explicitly here
// the call cannot be a tailcall.
if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
assert((A.getLocInfo() != CCValAssign::Indirect ||
A.getValVT().isScalableVector()) &&
"Expected value to be scalable");
return A.getLocInfo() == CCValAssign::Indirect;
return false;
// If the stack arguments for this call do not fit into our own save area then
// the call cannot be made tail.
if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
return false;
// Final check, delegated to parametersInCSRMatch with the caller's preserved
// mask, the argument locations and the outgoing values.
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
return false;
return true;
/// Build a TokenFactor that orders loads of overlapping stack objects before
/// a later write to the frame object \p ClobberedFI.
///
/// \param Chain       incoming token chain; its debug location is used for the
///                    resulting TokenFactor.
/// \param DAG         selection DAG whose entry-node uses are scanned.
/// \param MFI         frame info used to look up object offsets and sizes.
/// \param ClobberedFI frame index of the object about to be overwritten.
/// \returns a TokenFactor node over the collected chains (ArgChains).
///
/// NOTE(review): the original comment below is cut off mid-sentence and no
/// statement adding \p Chain to ArgChains is visible in this listing --
/// compare against the upstream file before editing.
SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
SelectionDAG &DAG,
MachineFrameInfo &MFI,
int ClobberedFI) const {
SmallVector<SDValue, 8> ArgChains;
// Inclusive byte range [FirstByte, LastByte] covered by the clobbered object.
int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
// Include the original chain at the beginning of the list. When this is
// used by target LowerCall hooks, this helps legalize find the
// Add a chain value for each stack argument corresponding
for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
UE = DAG.getEntryNode().getNode()->use_end();
U != UE; ++U)
// Only loads whose base pointer is a frame index with a negative index are
// considered (presumably the fixed/incoming stack objects -- confirm).
if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
if (FI->getIndex() < 0) {
int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
int64_t InLastByte = InFirstByte;
InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
// Two inclusive ranges overlap when either one contains the other's start;
// in that case record the load's chain result (value #1).
if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
(FirstByte <= InFirstByte && InFirstByte <= LastByte))
ArgChains.push_back(SDValue(L, 1));
// Build a tokenfactor for all the chains.
return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
/// Return true when the callee is expected to restore (pop) the stack itself:
/// only for CallingConv::Fast and only when \p TailCallOpt is set.
///
/// \param CallCC      calling convention of the call.
/// \param TailCallOpt whether guaranteed tail-call optimisation is enabled
///                    (callers in this file pass Options.GuaranteedTailCallOpt).
bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
bool TailCallOpt) const {
return CallCC == CallingConv::Fast && TailCallOpt;
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
///
/// \param CLI    call lowering info: DAG, outgoing args (Outs/OutVals),
///               expected results (Ins), chain, callee, calling convention,
///               vararg flag and the in/out IsTailCall flag.
/// \param InVals output list for the lowered result values; filled by
///               LowerCallResult for non-tail calls.
/// \returns for a tail call, the TC_RETURN node; otherwise the value returned
///          by LowerCallResult.
///
/// NOTE(review): this listing appears to have lost many lines (the return
/// type, closing braces, `break`/`default` statements, several statement
/// heads/tails such as the pushes onto Ops and MemOpChains) -- compare
/// against the upstream file before editing.
AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
// Reference: clearing it below also updates CLI.IsTailCall for the caller.
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
MachineFunction::CallSiteInfo CSInfo;
bool IsThisReturn = false;
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
bool IsSibCall = false;
if (IsTailCall) {
// Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
// A musttail call site that cannot be tail-called is a fatal error.
if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// A sibling call is one where we're under the usual C ABI and not planning
// to change that but can still do a tail call:
if (!TailCallOpt && IsTailCall)
IsSibCall = true;
if (IsTailCall)
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
if (IsVarArg) {
// Handle fixed and variable vector arguments differently.
// Variable vector arguments always go into memory.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
// Fixed arguments use the non-variadic assignment function, the rest the
// variadic one.
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
/*IsVarArg=*/ !Outs[i].IsFixed);
bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
assert(!Res && "Call operand has unhandled type");
} else {
// At this point, Outs[].VT may already be promoted to i32. To correctly
// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
// Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
// we use a special version of AnalyzeCallOperands to pass in ValVT and
// LocVT.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Outs[i].VT;
// Get type of the original argument.
EVT ActualVT = getValueType(DAG.getDataLayout(),
/*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
ValVT = MVT::i8;
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
assert(!Res && "Call operand has unhandled type");
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
if (IsSibCall) {
// Since we're not changing the ABI to make this a tail call, the memory
// operands are already available in the caller's incoming argument space.
NumBytes = 0;
// FPDiff is the byte offset of the call's argument area from the callee's.
// Stores to callee stack arguments will be placed in FixedStackSlots offset
// by this amount for a tail call. In a sibling call it must be 0 because the
// caller will deallocate the entire stack and the callee still expects its
// arguments to begin at SP+0. Completely unused for non-tail calls.
int FPDiff = 0;
if (IsTailCall && !IsSibCall) {
unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
// Since callee will pop argument stack as a tail call, we must keep the
// popped size 16-byte aligned.
NumBytes = alignTo(NumBytes, 16);
// FPDiff will be negative if this tail call requires more space than we
// would automatically have in our incoming argument space. Positive if we
// can actually shrink the stack.
FPDiff = NumReusableBytes - NumBytes;
// The stack pointer must be 16-byte aligned at all times it's used for a
// memory operation, which in practice means at *all* times and in
// particular across call boundaries. Therefore our own arguments started at
// a 16-byte aligned SP and the delta applied for the tail call should
// satisfy the same constraint.
assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallSet<unsigned, 8> RegsUsed;
SmallVector<SDValue, 8> MemOpChains;
auto PtrVT = getPointerTy(DAG.getDataLayout());
// Variadic musttail calls: re-pass the registers recorded as forwarded
// must-tail register parameters in the function info.
if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
RegsToPass.emplace_back(F.PReg, Val);
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// Promote the value if needed.
switch (VA.getLocInfo()) {
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
case CCValAssign::AExt:
if (Outs[i].ArgVT == MVT::i1) {
// AAPCS requires i1 to be zero-extended to 8-bits by the caller.
Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
case CCValAssign::AExtUpper:
// The 32-bit value is any-extended and shifted into the upper half.
assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
DAG.getConstant(32, DL, VA.getLocVT()));
case CCValAssign::BCvt:
Arg = DAG.getBitcast(VA.getLocVT(), Arg);
case CCValAssign::Trunc:
Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
case CCValAssign::FPExt:
Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
case CCValAssign::Indirect:
// Indirect arguments are stored to a fresh stack object tagged with the
// SVEVector stack ID; the address of that slot is passed instead.
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
int FI = MFI.CreateStackObject(
VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false);
MFI.setStackID(FI, TargetStackID::SVEVector);
SDValue SpillSlot = DAG.getFrameIndex(
FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
Chain = DAG.getStore(
Chain, DL, Arg, SpillSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
Arg = SpillSlot;
if (VA.isRegLoc()) {
// First argument flagged 'returned' (and not swiftself) with type i64: mark
// the call as a 'this'-return call.
if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
Outs[0].VT == MVT::i64) {
assert(VA.getLocVT() == MVT::i64 &&
"unexpected calling convention register assignment");
assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
"unexpected use of 'returned'");
IsThisReturn = true;
if (RegsUsed.count(VA.getLocReg())) {
// If this register has already been used then we're trying to pack
// parts of an [N x i32] into an X-register. The extension type will
// take care of putting the two halves in the right place but we have to
// combine them.
SDValue &Bits =
std::find_if(RegsToPass.begin(), RegsToPass.end(),
[=](const std::pair<unsigned, SDValue> &Elt) {
return Elt.first == VA.getLocReg();
Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
// Call site info is used for function's parameter entry value
// tracking. For now we track only simple cases when parameter
// is transferred through whole register.
CSInfo.erase(std::remove_if(CSInfo.begin(), CSInfo.end(),
[&VA](MachineFunction::ArgRegPair ArgReg) {
return ArgReg.Reg == VA.getLocReg();
} else {
RegsToPass.emplace_back(VA.getLocReg(), Arg);
const TargetOptions &Options = DAG.getTarget().Options;
if (Options.EmitCallSiteInfo)
CSInfo.emplace_back(VA.getLocReg(), i);
} else {
// Memory location: compute the destination address and store or copy the
// argument there.
SDValue DstAddr;
MachinePointerInfo DstInfo;
// FIXME: This works on big-endian for composite byvals, which are the
// common case. It should also work for fundamental types too.
uint32_t BEAlign = 0;
unsigned OpSize;
if (VA.getLocInfo() == CCValAssign::Indirect)
OpSize = VA.getLocVT().getSizeInBits();
OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
: VA.getValVT().getSizeInBits();
// Convert the size from bits to bytes, rounding up.
OpSize = (OpSize + 7) / 8;
if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
!Flags.isInConsecutiveRegs()) {
if (OpSize < 8)
BEAlign = 8 - OpSize;
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset + BEAlign;
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
if (IsTailCall) {
// Tail calls address the slot through a fixed frame object displaced by
// FPDiff rather than through SP.
Offset = Offset + FPDiff;
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
DstAddr = DAG.getFrameIndex(FI, PtrVT);
DstInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
// Make sure any stack arguments overlapping with where we're storing
// are loaded before this eventual operation. Otherwise they'll be
// clobbered.
Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
} else {
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
if (Outs[i].Flags.isByVal()) {
// byval arguments are copied with a memcpy of the byval size.
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
SDValue Cpy = DAG.getMemcpy(
Chain, DL, DstAddr, Arg, SizeNode,
/*isVol = */ false, /*AlwaysInline = */ false,
/*isTailCall = */ false, DstInfo, MachinePointerInfo());
} else {
// Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
// promoted to a legal register type i32, we should truncate Arg back to
// i1/i8/i16.
if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
VA.getValVT() == MVT::i16)
Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (auto &RegToPass : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
RegToPass.second, InFlag);
InFlag = Chain.getValue(1);
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
auto GV = G->getGlobal();
unsigned OpFlags =
Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
// References classified as MO_GOT are loaded through LOADgot; others use the
// target global address directly.
if (OpFlags & AArch64II::MO_GOT) {
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
} else {
const GlobalValue *GV = G->getGlobal();
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
} else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
// External symbols go through the GOT only for the large code model on
// MachO targets.
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
Subtarget->isTargetMachO()) {
const char *Sym = S->getSymbol();
Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
} else {
const char *Sym = S->getSymbol();
Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
// We don't usually want to end the call-sequence here because we would tidy
// the frame up *after* the call, however in the ABI-changing tail-call case
// we've carefully laid out the parameters so that when sp is reset they'll be
// in the correct location.
if (IsTailCall && !IsSibCall) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
InFlag = Chain.getValue(1);
std::vector<SDValue> Ops;
if (IsTailCall) {
// Each tail call may have to adjust the stack by a different amount, so
// this information must travel along with the operation for eventual
// consumption by emitEpilogue.
Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
for (auto &RegToPass : RegsToPass)
// Check callee args/returns for SVE registers and set calling convention
// accordingly.
if (CallConv == CallingConv::C) {
bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
return Out.VT.isScalableVector();
bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
return In.VT.isScalableVector();
if (CalleeInSVE || CalleeOutSVE)
CallConv = CallingConv::AArch64_SVE_VectorCall;
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
if (IsThisReturn) {
// For 'this' returns, use the X0-preserving mask if applicable
Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
// No such mask available: fall back to the regular mask and drop the
// 'this'-return handling.
if (!Mask) {
IsThisReturn = false;
Mask = TRI->getCallPreservedMask(MF, CallConv);
} else
Mask = TRI->getCallPreservedMask(MF, CallConv);
if (Subtarget->hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(MF, &Mask);
if (TRI->isAnyArgRegReserved(MF))
assert(Mask && "Missing call preserved mask for calling convention");
if (InFlag.getNode())
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// If we're doing a tail call, use a TC_RETURN here rather than an
// actual call instruction.
if (IsTailCall) {
SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
return Ret;
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
// When the callee restores the stack it pops the 16-byte-aligned argument
// size; otherwise it pops nothing.
uint64_t CalleePopBytes =
DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
DAG.getIntPtrConstant(CalleePopBytes, DL, true),
InFlag, DL);
if (!Ins.empty())
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
InVals, IsThisReturn,
IsThisReturn ? OutVals[0] : SDValue());
/// Report whether the return values described by \p Outs can be assigned
/// locations by the return-value calling convention.
///
/// \param CallConv calling convention; WebKit_JS selects its own assignment
///                 function, everything else uses the AAPCS one.
/// \param MF       machine function passed to the CCState.
/// \param isVarArg forwarded to the CCState constructor.
/// \param Outs     descriptions of the values being returned.
/// \param Context  LLVM context passed to the CCState.
/// \returns the result of CCState::CheckReturn for \p Outs.
bool AArch64TargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC);
/// Lower a function return: copy each outgoing value into its assigned return
/// register and build the final RET_FLAG node.
///
/// \param Chain    incoming token chain.
/// \param CallConv calling convention; selects the return-value assignment
///                 function.
/// \param isVarArg forwarded to the CCState constructor.
/// \param Outs     descriptions of the values being returned.
/// \param OutVals  the values being returned.
/// \param DL       debug location for the created nodes.
/// \param DAG      selection DAG in which nodes are created.
/// \returns the AArch64ISD::RET_FLAG node built from RetOps.
///
/// NOTE(review): this listing appears to have lost lines (the return type,
/// closing braces, `break`/`default` statements, the heads of the
/// RetOps.push_back calls) -- compare against the upstream file before
/// editing.
AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
auto &MF = DAG.getMachineFunction();
auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
CCInfo.AnalyzeReturn(Outs, RetCC);
// Copy the result values into the output registers.
SDValue Flag;
// (register, value) pairs to return; values that share a register are OR-ed
// together below.
SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
SmallSet<unsigned, 4> RegsUsed;
for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[realRVLocIdx];
// Convert the value to its location type according to the assignment kind.
switch (VA.getLocInfo()) {
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
if (Outs[i].ArgVT == MVT::i1) {
// AAPCS requires i1 to be zero-extended to i8 by the producer of the
// value. This is strictly redundant on Darwin (which uses "zeroext
// i1"), but will be optimised out before ISel.
Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
case CCValAssign::AExt:
case CCValAssign::ZExt:
Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
case CCValAssign::AExtUpper:
// The 32-bit value is widened and shifted into the upper half.
assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
DAG.getConstant(32, DL, VA.getLocVT()));
// Register already holds a value: merge the new bits into the existing entry.
if (RegsUsed.count(VA.getLocReg())) {
SDValue &Bits =
std::find_if(RetVals.begin(), RetVals.end(),
[=](const std::pair<unsigned, SDValue> &Elt) {
return Elt.first == VA.getLocReg();
Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
} else {
RetVals.emplace_back(VA.getLocReg(), Arg);
// Operand 0 of the return node is the chain; it is refreshed near the end.
SmallVector<SDValue, 4> RetOps(1, Chain);
for (auto &RetVal : RetVals) {
Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
Flag = Chain.getValue(1);
DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
// Windows AArch64 ABIs require that for returning structs by value we copy
// the sret argument into X0 for the return.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into X0.
if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
unsigned RetValReg = AArch64::X0;
Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
// Walk the zero-terminated register list and add each register as an extra
// return operand; only GPR64 and FPR64 registers are expected.
if (I) {
for (; *I; ++I) {
if (AArch64::GPR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else if (AArch64::FPR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
// Other Lowering Code
/// Build the target global-address node for N. The node's global and offset
/// are forwarded unchanged; Ty is the result type and Flag the target operand
/// flags attached to the new node.
SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
N->getOffset(), Flag);
/// Build the target jump-table node for N from its jump-table index, with
/// result type Ty and target operand flags Flag.
SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
/// Build the target constant-pool node for N. The constant value, alignment
/// and offset are taken from N; Ty is the result type and Flag the target
/// operand flags.
SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
N->getOffset(), Flag);
/// Build the target block-address node for N with result type Ty and target
/// operand flags Flag. The offset passed on is always 0; N's own offset is
/// not consulted here.
SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
// (loadGOT sym)
/// Produce the address of N through a GOT load. NodeTy is any node type for
/// which a getTargetNode overload exists. Flags are extra target operand
/// flags; MO_GOT is always OR'ed in. The result has the pointer type of the
/// current data layout.
template <class NodeTy>
SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
// Symbol operand tagged for GOT access.
SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into two nodes instead of using a wrapper node.
return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
/// Produce the address of N as a single AArch64ISD::WrapperLarge node whose
/// four operands are target nodes for N tagged MO_G3, MO_G2, MO_G1 and MO_G0
/// respectively. All but the MO_G3 piece additionally carry MO_NC. Flags are
/// extra target operand flags OR'ed into every piece.
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
// Local shorthand to keep the operand list below readable.
const unsigned char MO_NC = AArch64II::MO_NC;
return DAG.getNode(
AArch64ISD::WrapperLarge, DL, Ty,
getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
// (addlow (adrp %hi(sym)) %lo(sym))
/// Produce the address of N as an ADRP of the MO_PAGE-tagged symbol followed
/// by an ADDlow of the MO_PAGEOFF | MO_NC-tagged symbol. Flags are extra
/// target operand flags OR'ed into both symbol operands.
template <class NodeTy>
SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
// Page part and page-offset part of the same symbol.
SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
SDValue Lo = getTargetNode(N, Ty, DAG,
AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
// (adr sym)
/// Produce the address of N as a single AArch64ISD::ADR node over the target
/// node for N. Flags are passed through as the symbol's target operand flags
/// with nothing added.
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
/// Lower a global-address node (Op must be a GlobalAddressSDNode).
/// The subtarget classifies the reference into operand flags; those flags and
/// the code model then choose between getGOT, getAddrLarge, getAddrTiny and
/// getAddr. For MO_DLLIMPORT / MO_COFFSTUB references the computed address is
/// additionally loaded through.
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GN->getGlobal();
unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
// Any non-trivial classification is only expected on nodes with offset 0.
if (OpFlags != AArch64II::MO_NO_FLAG)
assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
"unexpected offset in global node");
// This also catches the large code model case for Darwin, and tiny code
// model with got relocations.
if ((OpFlags & AArch64II::MO_GOT) != 0) {
return getGOT(GN, DAG, OpFlags);
// Not a GOT reference: pick the address-materialisation helper by code model.
SDValue Result;
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
Result = getAddrLarge(GN, DAG, OpFlags);
} else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
Result = getAddrTiny(GN, DAG, OpFlags);
} else {
Result = getAddr(GN, DAG, OpFlags);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// For dllimport / COFF-stub references Result is replaced by a pointer-typed
// load from the address computed above, chained on the entry node.
if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
return Result;
/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address (for Darwin, currently) and
/// return an SDValue containing the final node.
/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
/// + "extern __thread" declaration.
/// + Defined in a possibly unknown dynamic library.
/// The general system is that each __thread variable has a [3 x i64] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first xword, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "x0".
/// Since this descriptor may be in a different unit, in general even the
/// descriptor must be accessed via an indirect load. The "ideal" code sequence
/// is:
/// adrp x0, _var@TLVPPAGE
/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
/// ; the function pointer
/// blr x1 ; Uses descriptor address in x0
/// ; Address of _var is now in x0.
/// If the address of _var's descriptor *is* known to the linker, then it can
/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
/// a slight efficiency gain.
///
/// Op must be a GlobalAddressSDNode. The returned value is the copy out of X0
/// made after the call, typed as the DAG pointer type.
AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetDarwin() &&
"This function expects a Darwin target");
SDLoc DL(Op);
// PtrVT is the in-register pointer type, PtrMemVT the in-memory one; they are
// queried separately because they may differ (see the ILP32 note below).
MVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
// Address of the variable's TLS descriptor, obtained via a LOADgot of the
// MO_TLS-tagged symbol.
SDValue TLVPAddr =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
// The first entry in the descriptor is a function pointer that we must call
// to obtain the address of the variable.
SDValue Chain = DAG.getEntryNode();
SDValue FuncTLVGet = DAG.getLoad(
PtrMemVT, DL, Chain, DescAddr,
/* Alignment = */ PtrMemVT.getSizeInBits() / 8,
MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
Chain = FuncTLVGet.getValue(1);
// Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getTLSCallPreservedMask();
// Let the register info adjust the mask when a custom calling convention is
// in effect for this subtarget.
if (Subtarget->hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
// Finally, we can make the call. This is just a degenerate version of a
// normal AArch64 call node: x0 takes the address of the descriptor, and
// returns the address of the variable in this thread.
Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
// Chain.getValue(1) is the glue from the copy above; it keeps the X0 copy
// attached to the call, and the call's own glue to the copy-out below.
Chain =
DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
DAG.getRegisterMask(Mask), Chain.getValue(1));
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
/// Convert a thread-local variable reference into a sequence of instructions to
/// compute the variable's address for the local exec TLS model of ELF targets.
/// The sequence depends on the maximum TLS area size.
///
/// GV is the thread-local global, ThreadBase the already-computed thread
/// pointer value, and DL the debug location used for every created node.
/// The TLS size is read from the target options (TLSSize); the cases visible
/// here are 12, 24, 32 and 48, and any other value is treated as unreachable.
SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
SDValue ThreadBase,
const SDLoc &DL,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue TPOff, Addr;
switch (DAG.getTarget().Options.TLSSize) {
llvm_unreachable("Unexpected TLS size");
// 12-bit offsets: a single ADD of the low 12 bits onto the thread pointer.
case 12: {
// mrs x0, TPIDR_EL0
// add x0, x0, :tprel_lo12:a
SDValue Var = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
DAG.getTargetConstant(0, DL, MVT::i32)),
// 24-bit offsets: two chained ADDs (MO_HI12 part, then MO_PAGEOFF part).
case 24: {
// mrs x0, TPIDR_EL0
// add x0, x0, :tprel_hi12:a
// add x0, x0, :tprel_lo12_nc:a
SDValue HiVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
DAG.getTargetConstant(0, DL, MVT::i32)),
return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
DAG.getTargetConstant(0, DL, MVT::i32)),
// 32-bit offsets: build the offset with MOVZ (shift 16) + MOVK (shift 0),
// then add it to the thread pointer.
case 32: {
// mrs x1, TPIDR_EL0
// movz x0, #:tprel_g1:a
// movk x0, #:tprel_g0_nc:a
// add x0, x1, x0
SDValue HiVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
DAG.getTargetConstant(16, DL, MVT::i32)),
TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
// 48-bit offsets: MOVZ (shift 32) + MOVK (shift 16) + MOVK (shift 0), then
// add the assembled offset to the thread pointer.
case 48: {
// mrs x1, TPIDR_EL0
// movz x0, #:tprel_g2:a
// movk x0, #:tprel_g1_nc:a
// movk x0, #:tprel_g0_nc:a
// add x0, x1, x0
SDValue HiVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
SDValue MiVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
DAG.getTargetConstant(32, DL, MVT::i32)),
TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
DAG.getTargetConstant(16, DL, MVT::i32)),
TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
/// When accessing thread-local variables under either the general-dynamic or
/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
/// is a function pointer to carry out the resolution.
/// The sequence is:
/// adrp x0, :tlsdesc:var
/// ldr x1, [x0, #:tlsdesc_lo12:var]
/// add x0, x0, #:tlsdesc_lo12:var
/// .tlsdesccall var
/// blr x1
/// (TPIDR_EL0 offset now in x0)
/// The above sequence must be produced unscheduled, to enable the linker to
/// optimize/relax this sequence.
/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
/// above sequence, and expanded really late in the compilation flow, to ensure
/// the sequence is produced as per above.
///
/// SymAddr is the symbol operand handed to the pseudo; DL is the debug
/// location for the created nodes. Returns the pointer-typed value copied out
/// of X0 after the pseudo.
SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
const SDLoc &DL,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// The sequence is chained on the entry node and yields (chain, glue).
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain =
DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
// Glue ties the X0 read below to the call sequence.
SDValue Glue = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
/// Lower a thread-local global address for ELF targets (Op must be a
/// GlobalAddressSDNode). The TLS model reported by the target machine selects
/// the sequence: local-exec is delegated to LowerELFTLSLocalExec and returned
/// directly; the other models compute an offset (TPOff) that is finally added
/// to the thread pointer.
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetELF() && "This function expects an ELF target");
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
// With local-dynamic generation disabled, such accesses are handled as
// general-dynamic instead.
if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
if (Model == TLSModel::LocalDynamic)
Model = TLSModel::GeneralDynamic;
// Under the large code model only local-exec is accepted.
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
Model != TLSModel::LocalExec)
report_fatal_error("ELF TLS only supported in small memory model or "
"in local exec TLS model");
// Different choices can be made for the maximum size of the TLS area for a
// module. For the small address model, the default TLS size is 16MiB and the
// maximum TLS size is 4GiB.
// FIXME: add tiny and large code model support for TLS access models other
// than local exec. We currently generate the same code as small for tiny,
// which may be larger than needed.
SDValue TPOff;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
const GlobalValue *GV = GA->getGlobal();
SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
if (Model == TLSModel::LocalExec) {
return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
} else if (Model == TLSModel::InitialExec) {
// Initial-exec: the offset comes from a LOADgot of the MO_TLS-tagged symbol.
TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
} else if (Model == TLSModel::LocalDynamic) {
// Local-dynamic accesses proceed in two phases. A general-dynamic TLS
// descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
// the beginning of the module's TLS region, followed by a DTPREL offset
// calculation.
// These accesses will need deduplicating if there's more than one.
AArch64FunctionInfo *MFI =
// The call needs a relocation too for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
// the address.
SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
// Now we can calculate the offset from TPIDR_EL0 to this module's
// thread-local area.
TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
// Now use :dtprel_whatever: operations to calculate this variable's offset
// in its thread-storage area.
SDValue HiVar = DAG.getTargetGlobalAddress(
GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, MVT::i64, 0,
AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
// Two chained ADDXri nodes fold the high and low parts onto the module base.
TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
} else if (Model == TLSModel::GeneralDynamic) {
// The call needs a relocation too for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
// the address.
SDValue SymAddr =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
// Finally we can make a call to calculate the offset from tpidr_el0.
TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
} else
llvm_unreachable("Unsupported ELF TLS access model");
// Every model other than local-exec ends here: thread pointer plus offset.
return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
/// Lower a thread-local global address for Windows targets (Op must be a
/// GlobalAddressSDNode). The sequence loads the TLS array pointer from the
/// TEB (held in X18), loads the 32-bit _tls_index, uses it to index the TLS
/// array, and finally adds the variable's offset to the loaded slot value.
AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
SDValue Chain = DAG.getEntryNode();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
// Load the ThreadLocalStoragePointer from the TEB
// A pointer to the TLS array is located at offset 0x58 from the TEB.
SDValue TLSArray =
DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
// Thread each load's output chain into the next load.
Chain = TLSArray.getValue(1);
// Load the TLS index from the C runtime;
// This does the same as getAddr(), but without having a GlobalAddressSDNode.
// This also does the same as LOADgot, but using a generic i32 load,
// while LOADgot only loads i64.
SDValue TLSIndexHi =
DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
"_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
SDValue TLSIndex =
DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
Chain = TLSIndex.getValue(1);
// The pointer to the thread's TLS data area is at the TLS Index scaled by 8
// offset into the TLSArray.
TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
// Shift left by 3 is the "scaled by 8" mentioned above.
SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
DAG.getConstant(3, DL, PtrVT));
SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
Chain = TLS.getValue(1);
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GA->getGlobal();
// High (MO_HI12) and low (MO_PAGEOFF | MO_NC) parts of the variable's offset.
SDValue TGAHi = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue TGALo = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
// Add the offset from the start of the .tls section (section base).
SDValue Addr =
SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
DAG.getTargetConstant(0, DL, MVT::i32)),
Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
return Addr;
/// Entry point for lowering a thread-local global address (Op must be a
/// GlobalAddressSDNode). Emulated TLS, when enabled on the target, takes
/// precedence over every platform scheme; otherwise the Darwin, ELF or
/// Windows lowering is chosen from the subtarget, checked in that order. Any
/// other platform is treated as unreachable.
SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
if (Subtarget->isTargetDarwin())
return LowerDarwinGlobalTLSAddress(Op, DAG);
if (Subtarget->isTargetELF())
return LowerELFGlobalTLSAddress(Op, DAG);
if (Subtarget->isTargetWindows())
return LowerWindowsGlobalTLSAddress(Op, DAG);
llvm_unreachable("Unexpected platform trying to use TLS");
/// Lower a BR_CC node. Operands of Op, in order: chain, condition code, LHS,
/// RHS, branch destination. Depending on the operand types and constants the
/// result is a TBZ/TBNZ, CBZ/CBNZ, or one or two BRCOND nodes. An empty
/// SDValue is returned for an overflow-intrinsic comparison whose type is not
/// legal.
SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
SDValue LHS = Op.getOperand(2);
SDValue RHS = Op.getOperand(3);
SDValue Dest = Op.getOperand(4);
SDLoc dl(Op);
MachineFunction &MF = DAG.getMachineFunction();
// Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
// will not be produced, as they are conditional branch instructions that do
// not set flags.
bool ProduceNonFlagSettingCondBr =
// Handle f128 first, since lowering it will result in comparing the return
// value of a libcall against zero, which is just what the rest of LowerBR_CC
// is expecting to deal with.
if (LHS.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
if (!RHS.getNode()) {
RHS = DAG.getConstant(0, dl, LHS.getValueType());
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
// instruction.
if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
return SDValue();
// The actual operation with overflow check.
AArch64CC::CondCode OFCC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
// For SETNE the branch is taken on the inverted overflow condition.
if (CC == ISD::SETNE)
OFCC = getInvertedCondCode(OFCC);
SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
// Integer comparisons: try the flag-free branch forms before falling back to
// a compare + BRCOND.
if (LHS.getValueType().isInteger()) {
assert((LHS.getValueType() == RHS.getValueType()) &&
(LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
// If the RHS of the comparison is zero, we can potentially fold this
// to a specialized branch.
const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
if (CC == ISD::SETEQ) {
// See if we can use a TBZ to fold in an AND as well.
// TBZ has a smaller branch displacement than CBZ. If the offset is
// out of bounds, a late MI-layer pass rewrites branches.
// 403.gcc is an example that hits this case.
if (LHS.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
// The bit number tested is log2 of the single-bit AND mask.
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETNE) {
// See if we can use a TBZ to fold in an AND as well.
// TBZ has a smaller branch displacement than CBZ. If the offset is
// out of bounds, a late MI-layer pass rewrites branches.
// 403.gcc is an example that hits this case.
if (LHS.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
// "x < 0" becomes a test of the top bit (index = bit width - 1).
uint64_t Mask = LHS.getValueSizeInBits() - 1;
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
DAG.getConstant(Mask, dl, MVT::i64), Dest);
// "x > -1" is the mirror case: branch when the top bit is clear.
if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
uint64_t Mask = LHS.getValueSizeInBits() - 1;
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
DAG.getConstant(Mask, dl, MVT::i64), Dest);
// General integer case: emit the compare and branch on its condition code.
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
// Floating-point comparisons from here on.
assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue BR1 =
DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
// CC2 == AL means a single branch is enough; otherwise a second BRCOND is
// chained after the first.
if (CC2 != AArch64CC::AL) {
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
return BR1;
/// Lower an FCOPYSIGN node with a vector bit-select. Operand 1 is first
/// converted to the result type if its type is narrower or wider. Both
/// operands are then placed in vector registers (scalars via subregister
/// insertion, vectors via bitcast) and combined with an AArch64ISD::BIT node
/// under a per-element mask that has only the high bit set. Scalar results
/// are extracted back out of the vector; vector results are bitcast to VT.
SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue In1 = Op.getOperand(0);
SDValue In2 = Op.getOperand(1);
EVT SrcVT = In2.getValueType();
// Bring In2 to the result type.
if (SrcVT.bitsLT(VT))
In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
else if (SrcVT.bitsGT(VT))
In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
uint64_t EltMask;
SDValue VecVal1, VecVal2;
// Helper that fills VecVal1/VecVal2 with the vector forms of In1/In2. Idx is
// the subregister index used when VT is scalar.
auto setVecVal = [&] (int Idx) {
if (!VT.isVector()) {
VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
DAG.getUNDEF(VecVT), In1);
VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
DAG.getUNDEF(VecVT), In2);
} else {
VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
// Choose the integer vector type and element mask by element width.
if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
EltMask = 0x80000000ULL;
} else if (VT == MVT::f64 || VT == MVT::v2f64) {
VecVT = MVT::v2i64;
// We want to materialize a mask with the high bit set, but the AdvSIMD
// immediate moves cannot materialize that in a single instruction for
// 64-bit elements. Instead, materialize zero and then negate it.
EltMask = 0;
} else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
EltMask = 0x8000ULL;
} else {
llvm_unreachable("Invalid type for copysign!");
SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
// If we couldn't materialize the mask above, then the mask vector will be
// the zero vector, and we need to negate it here.
if (VT == MVT::f64 || VT == MVT::v2f64) {
BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
// Bit-select between the two inputs under the mask.
SDValue Sel =
DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
// Scalars come back out through the matching subregister.
if (VT == MVT::f16)
return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
if (VT == MVT::f32)
return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
else if (VT == MVT::f64)
return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
/// Lower a CTPOP node using byte-wise vector popcount. Scalar i32/i64/i128
/// inputs are bitcast to a byte vector, counted per byte, and summed with the
/// aarch64_neon_uaddlv intrinsic. Vector inputs are counted per byte and then
/// widened back to the element size with repeated aarch64_neon_uaddlp steps.
/// Returns an empty SDValue when a function-attribute check fires or when
/// NEON is unavailable.
SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
return SDValue();
if (!Subtarget->hasNEON())
return SDValue();
// While there is no integer popcount instruction, it can
// be more efficiently lowered to the following sequence that uses
// AdvSIMD registers/instructions as long as the copies to/from
// the AdvSIMD registers are cheap.
// FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
// CNT V0.8B, V0.8B // 8xbyte pop-counts
// ADDV B0, V0.8B // sum 8xbyte pop-counts
// UMOV X0, V0.B[0] // copy byte result back to integer reg
SDValue Val = Op.getOperand(0);
SDLoc DL(Op);
EVT VT = Op.getValueType();
if (VT == MVT::i32 || VT == MVT::i64) {
// i32 inputs are zero-extended so both scalar widths share the v8i8 path.
if (VT == MVT::i32)
Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
SDValue UaddLV = DAG.getNode(
DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
// Widen the sum back for the i64 case.
if (VT == MVT::i64)
UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
return UaddLV;
} else if (VT == MVT::i128) {
// Same scheme over sixteen bytes.
Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
SDValue UaddLV = DAG.getNode(
DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
// Vector inputs from here on.
assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
"Unexpected type for custom ctpop lowering");
EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
Val = DAG.getBitcast(VT8Bit, Val);
Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
// Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
unsigned EltSize = 8;
unsigned NumElts = VT.is64BitVector() ? 8 : 16;
// Each step doubles the element width and halves the element count until
// the element width matches VT's.
while (EltSize != VT.getScalarSizeInBits()) {
EltSize *= 2;
NumElts /= 2;
MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
Val = DAG.getNode(
DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
return Val;
/// Lower a scalar SETCC (or strict FP SETCC) node; vector results are handed
/// to LowerVSETCC. For strict FP opcodes operand 0 is the chain and the
/// compare operands are shifted by one, and the result is merged with a chain
/// value. The boolean result is produced as 1/0 of the result type via CSEL
/// nodes over the emitted comparison.
SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVSETCC(Op, DAG);
bool IsStrict = Op->isStrictFPOpcode();
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
// Strict nodes carry the chain first, so LHS/RHS/CC start at operand 1.
unsigned OpNo = IsStrict ? 1 : 0;
SDValue Chain;
if (IsStrict)
Chain = Op.getOperand(0);
SDValue LHS = Op.getOperand(OpNo + 0);
SDValue RHS = Op.getOperand(OpNo + 1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
SDLoc dl(Op);
// We chose ZeroOrOneBooleanContents, so use zero and one.
EVT VT = Op.getValueType();
SDValue TVal = DAG.getConstant(1, dl, VT);
SDValue FVal = DAG.getConstant(0, dl, VT);
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets picked up by the next if statement.
if (LHS.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
// If softenSetCCOperands returned a scalar, use it.
if (!RHS.getNode()) {
assert(LHS.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
if (LHS.getValueType().isInteger()) {
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(
LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
// Now we know we're dealing with FP values.
assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
LHS.getValueType() == MVT::f64);
// If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
// and do the comparison.
SDValue Cmp;
// NOTE(review): as written here the second assignment overwrites the strict
// comparison unconditionally; an `else` between the two statements looks to
// be missing from this text - confirm against upstream.
if (IsStrict)
Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
SDValue Res;
// CC2 == AL means one AArch64 condition suffices for this FP predicate.
if (CC2 == AArch64CC::AL) {
changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
} else {
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
// totally clean. Some of them require two CSELs to implement. As is in
// this case, we emit the first CSEL and then emit a second using the output
// of the first as the RHS. We're effectively OR'ing the two CC's together.
// FIXME: It would be nice if we could match the two CSELs to two CSINCs.
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue CS1 =
DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
// In strict mode the chain result is taken from value 1 of the comparison.
return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
/// Lower "(LHS CC RHS) ? TVal : FVal" to AArch64 conditional-select nodes.
///
/// f128 comparisons are softened first via softenSetCCOperands, and f16
/// operands are extended to f32 when the subtarget lacks full FP16. For
/// integer comparisons the select operands may be swapped (with CC inverted)
/// so that CSINV, CSNEG or CSINC can be used in place of CSEL. For
/// floating-point comparisons one CSEL is emitted, plus a second one when
/// changeFPCCToAArch64CC yields a second condition code other than AL.
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
SDValue RHS, SDValue TVal,
SDValue FVal, const SDLoc &dl,
SelectionDAG &DAG) const {
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
if (LHS.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
if (!RHS.getNode()) {
RHS = DAG.getConstant(0, dl, LHS.getValueType());
// Also handle f16, for which we need to do a f32 comparison.
if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
// Next, handle integers.
if (LHS.getValueType().isInteger()) {
assert((LHS.getValueType() == RHS.getValueType()) &&
(LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
// Start from a plain CSEL; the checks below may upgrade the opcode.
unsigned Opcode = AArch64ISD::CSEL;
// If both the TVal and the FVal are constants, see if we can swap them in
// order to form a CSINV or CSINC out of them.
ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
} else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
} else if (TVal.getOpcode() == ISD::XOR) {
// If TVal is a NOT we want to swap TVal and FVal so that we can match
// with a CSINV rather than a CSEL.
if (isAllOnesConstant(TVal.getOperand(1))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
} else if (TVal.getOpcode() == ISD::SUB) {
// If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
// that we can match with a CSNEG rather than a CSEL.
if (isNullConstant(TVal.getOperand(0))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
} else if (CTVal && CFVal) {
const int64_t TrueVal = CTVal->getSExtValue();
const int64_t FalseVal = CFVal->getSExtValue();
bool Swap = false;
// If both TVal and FVal are constants, see if FVal is the
// inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
// instead of a CSEL in that case.
if (TrueVal == ~FalseVal) {
Opcode = AArch64ISD::CSINV;
} else if (TrueVal == -FalseVal) {
Opcode = AArch64ISD::CSNEG;
} else if (TVal.getValueType() == MVT::i32) {
// If our operands are only 32-bit wide, make sure we use 32-bit
// arithmetic for the check whether we can use CSINC. This ensures that
// the addition in the check will wrap around properly in case there is
// an overflow (which would not be the case if we do the check with
// 64-bit arithmetic).
const uint32_t TrueVal32 = CTVal->getZExtValue();
const uint32_t FalseVal32 = CFVal->getZExtValue();
if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
Opcode = AArch64ISD::CSINC;
if (TrueVal32 > FalseVal32) {
Swap = true;
// 64-bit check whether we can use CSINC.
} else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
Opcode = AArch64ISD::CSINC;
if (TrueVal > FalseVal) {
Swap = true;
// Swap TVal and FVal if necessary.
if (Swap) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, LHS.getValueType());
if (Opcode != AArch64ISD::CSEL) {
// Drop FVal since we can get its value by simply inverting/negating
// TVal.
FVal = TVal;
// Avoid materializing a constant when possible by reusing a known value in
// a register. However, don't perform this optimization if the known value
// is one, zero or negative one in the case of a CSEL. We can always
// materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
// FVal, respectively.
ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
!RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
// Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
// "a != C ? x : a" to avoid materializing C.
// Note: the constant operands are matched by node pointer identity here.
if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
TVal = LHS;
else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
FVal = LHS;
} else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
// Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
// avoid materializing C.
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
Opcode = AArch64ISD::CSINV;
TVal = LHS;
FVal = DAG.getConstant(0, dl, FVal.getValueType());
// Emit the integer comparison and the chosen conditional-select node.
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
EVT VT = TVal.getValueType();
return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
// Now we know we're dealing with FP values.
assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
LHS.getValueType() == MVT::f64);
assert(LHS.getValueType() == RHS.getValueType());
EVT VT = TVal.getValueType();
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two CSELs to implement.
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
if (DAG.getTarget().Options.UnsafeFPMath) {
// Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
// "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
if (RHSVal && RHSVal->isZero()) {
ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
TVal = LHS;
else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
CFVal && CFVal->isZero() &&
FVal.getValueType() == LHS.getValueType())
FVal = LHS;
// Emit first, and possibly only, CSEL.
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
// If we need a second CSEL, emit it, using the output of the first as the
// RHS. We're effectively OR'ing the two CC's together.
if (CC2 != AArch64CC::AL) {
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
// Otherwise, return the output of the first CSEL.
return CS1;
/// Lower a SELECT_CC node by unpacking its operands (0: LHS, 1: RHS,
/// 2: true value, 3: false value, 4: condition code) and forwarding them to
/// the LowerSELECT_CC overload that takes the operands individually.
SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue TVal = Op.getOperand(2);
SDValue FVal = Op.getOperand(3);
SDLoc DL(Op);
return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
/// Lower a SELECT node (operand 0: condition, 1: true value, 2: false value).
///
/// Scalable-vector results become a VSELECT whose predicate is the condition
/// truncated to i1 and splatted. A condition that is the overflow result of an
/// overflow intrinsic is selected with a CSEL on the flags produced by
/// getAArch64XALUOOp. Everything else is forwarded to LowerSELECT_CC, using
/// the SETCC operands when the condition is a SETCC, and the condition paired
/// with a zero constant otherwise.
// NOTE(review): LHS and RHS are used below without a visible declaration, and
// CC is not visibly assigned on the non-SETCC path -- lines may be missing
// from this extract; confirm against the upstream file.
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
SelectionDAG &DAG) const {
SDValue CCVal = Op->getOperand(0);
SDValue TVal = Op->getOperand(1);
SDValue FVal = Op->getOperand(2);
SDLoc DL(Op);
EVT Ty = Op.getValueType();
// Scalable vectors: splat the i1 condition and select per lane.
if (Ty.isScalableVector()) {
SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
// instruction.
if (ISD::isOverflowIntrOpRes(CCVal)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
return SDValue();
AArch64CC::CondCode OFCC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
// This CCVal shadows the outer one and holds the AArch64 condition code.
SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
CCVal, Overflow);
// Lower it the same way as we would lower a SELECT_CC node.
ISD::CondCode CC;
if (CCVal.getOpcode() == ISD::SETCC) {
LHS = CCVal.getOperand(0);
RHS = CCVal.getOperand(1);
CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
} else {
LHS = CCVal;
RHS = DAG.getConstant(0, DL, CCVal.getValueType());
return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
/// Lower a JumpTable node to the address of the jump table. The large code
/// model on non-MachO targets uses getAddrLarge, the tiny code model uses
/// getAddrTiny, and every other case uses getAddr.
SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
SelectionDAG &DAG) const {
// Jump table entries as PC relative offsets. No additional tweaking
// is necessary here. Just get the address of the jump table.
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
return getAddrLarge(JT, DAG);
} else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
return getAddrTiny(JT, DAG);
return getAddr(JT, DAG);
/// Lower a BR_JT node (operand 0: chain, 1: jump table, 2: entry index) into a
/// JumpTableDest32 machine node followed by an indirect branch (BRIND) to the
/// first result of that node.
SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
SelectionDAG &DAG) const {
// Jump table entries as PC relative offsets. No additional tweaking
// is necessary here. Just get the address of the jump table.
SDLoc DL(Op);
SDValue JT = Op.getOperand(1);
SDValue Entry = Op.getOperand(2);
int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
// The machine node takes the table, the entry, and the table index as a
// target jump-table operand.
SDNode *Dest =
DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
SDValue(Dest, 0));
/// Lower a ConstantPool node to the address of the constant-pool entry. Under
/// the large code model MachO targets go through the GOT and other targets use
/// getAddrLarge; the tiny code model uses getAddrTiny; otherwise getAddr.
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
// Use the GOT for the large code model on iOS.
if (Subtarget->isTargetMachO()) {
return getGOT(CP, DAG);
return getAddrLarge(CP, DAG);
} else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
return getAddrTiny(CP, DAG);
} else {
return getAddr(CP, DAG);
/// Lower a BlockAddress node to the address of the block. The large code
/// model on non-MachO targets uses getAddrLarge, the tiny code model uses
/// getAddrTiny, and every other case uses getAddr.
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
return getAddrLarge(BA, DAG);
} else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
return getAddrTiny(BA, DAG);
return getAddr(BA, DAG);
/// Lower VASTART for Darwin: take the frame index of the variadic stack area,
/// zero-extend or truncate it to the in-memory pointer type, and store it to
/// the va_list address given by operand 1 (operand 0 is the chain).
// NOTE(review): several statements below end mid-expression (FuncInfo
// initializer, getFrameIndex and getStore arguments) -- continuation lines
// appear to be missing from this extract; confirm against the upstream file.
SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
SelectionDAG &DAG) const {
AArch64FunctionInfo *FuncInfo =
SDLoc DL(Op);
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
/// Lower VASTART for the Win64 calling convention: store a frame index to the
/// va_list address given by operand 1. The frame index is the varargs GPR
/// save area when its size is non-zero and the varargs stack area otherwise.
// NOTE(review): several statements below end mid-expression (FuncInfo
// initializer, getFrameIndex and getStore arguments) -- continuation lines
// appear to be missing from this extract; confirm against the upstream file.
SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
SelectionDAG &DAG) const {
AArch64FunctionInfo *FuncInfo =
SDLoc DL(Op);
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
? FuncInfo->getVarArgsGPRIndex()
: FuncInfo->getVarArgsStackIndex(),
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
/// Lower VASTART for the AAPCS va_list struct by storing each field:
///   offset  0: __stack   = frame index of the varargs stack area
///   offset  8: __gr_top  = GPR save area + GPRSize (only if GPRSize > 0)
///   offset 16: __vr_top  = FPR save area + FPRSize (only if FPRSize > 0)
///   offset 24: __gr_offs = -GPRSize
///   offset 28: __vr_offs = -FPRSize
/// All stores hang off the incoming chain and are joined with a TokenFactor.
// NOTE(review): the statements that wrap the __gr_offs and __vr_offs store
// arguments are not visible in this extract (they start mid-call); confirm
// against the upstream file.
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
SelectionDAG &DAG) const {
// The layout of the va_list struct is specified in the AArch64 Procedure Call
// Standard, section B.3.
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue VAList = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SmallVector<SDValue, 4> MemOps;
// void *__stack at offset 0
SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
MachinePointerInfo(SV), /* Alignment = */ 8));
// void *__gr_top at offset 8
int GPRSize = FuncInfo->getVarArgsGPRSize();
if (GPRSize > 0) {
SDValue GRTop, GRTopAddr;
GRTopAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));
GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
DAG.getConstant(GPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
MachinePointerInfo(SV, 8),
/* Alignment = */ 8));
// void *__vr_top at offset 16
int FPRSize = FuncInfo->getVarArgsFPRSize();
if (FPRSize > 0) {
SDValue VRTop, VRTopAddr;
VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(16, DL, PtrVT));
VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
DAG.getConstant(FPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
MachinePointerInfo(SV, 16),
/* Alignment = */ 8));
// int __gr_offs at offset 24
SDValue GROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
MachinePointerInfo(SV, 24), /* Alignment = */ 4));
// int __vr_offs at offset 28
SDValue VROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
MachinePointerInfo(SV, 28), /* Alignment = */ 4));
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
/// Dispatch VASTART lowering by ABI: the Win64 calling convention is checked
/// first, then Darwin targets; everything else uses the AAPCS lowering.
SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
return LowerWin64_VASTART(Op, DAG);
else if (Subtarget->isTargetDarwin())
return LowerDarwin_VASTART(Op, DAG);
return LowerAAPCS_VASTART(Op, DAG);
/// Lower VACOPY to a memcpy of the whole va_list object from operand 2 to
/// operand 1 (operand 0 is the chain; operands 3 and 4 carry the destination
/// and source IR values used for the memory operand info). The copy size is
/// one pointer on Darwin and Windows and 32 bytes otherwise; the pointer size
/// is 4 bytes for ILP32 and 8 bytes otherwise, and is also used as alignment.
SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
SelectionDAG &DAG) const {
// AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
// pointer.
SDLoc DL(Op);
unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
unsigned VaListSize = (Subtarget->isTargetDarwin() ||
Subtarget->isTargetWindows()) ? PtrSize : 32;
const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
DAG.getConstant(VaListSize, DL, MVT::i32),
Align(PtrSize), false, false, false,
MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
/// Lower VAARG (asserted to be Darwin-only): load the current va_list pointer
/// from Addr, round it up when the requested alignment exceeds the minimum
/// slot size, store the pointer advanced past the argument back to Addr, and
/// load the argument from the unadvanced pointer. Scalar floating-point types
/// other than f64 are loaded as f64 and rounded to VT.
SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget->isTargetDarwin() &&
"automatic va_arg instruction only works on Darwin");
const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(1);
MaybeAlign Align(Op.getConstantOperandVal(3));
// Minimum size of one argument slot: 4 bytes on ILP32, 8 bytes otherwise.
unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
auto PtrVT = getPointerTy(DAG.getDataLayout());
auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
SDValue VAList =
DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
Chain = VAList.getValue(1);
VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
// Round the pointer up to the requested alignment: (p + a - 1) & -a.
if (Align && *Align > MinSlotSize) {
VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(Align->value() - 1, DL, PtrVT));
VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
// Scalar integer and FP values smaller than 64 bits are implicitly extended
// up to 64 bits. At the very least, we have to increase the striding of the
// vaargs list to match this, and for FP values we need to introduce
// FP_ROUND nodes as well.
if (VT.isInteger() && !VT.isVector())
ArgSize = std::max(ArgSize, MinSlotSize);
bool NeedFPTrunc = false;
if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
ArgSize = 8;
NeedFPTrunc = true;
// Increment the pointer, VAList, to the next vaarg
SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(ArgSize, DL, PtrVT));
VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
// Store the incremented VAList to the legalized pointer
SDValue APStore =
DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
// Load the actual argument out of the pointer VAList
if (NeedFPTrunc) {
// Load the value as an f64.
SDValue WideFP =
DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
// Round the value down to the requested type VT.
SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
DAG.getIntPtrConstant(1, DL));
SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
// Merge the rounded value with the chain output of the load.
return DAG.getMergeValues(Ops, DL);
return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
/// Lower FRAMEADDR: start from the value of the FP register and, for each
/// level of the constant depth in operand 0, load the next frame address
/// through the current one. An AssertZext is applied for ILP32 targets.
// NOTE(review): the getLoad and AssertZext calls below end mid-argument-list
// and MFI is not visibly used -- lines appear to be missing from this
// extract; confirm against the upstream file.
SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
EVT VT = Op.getValueType();
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDValue FrameAddr =
DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
while (Depth--)
FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
if (Subtarget->isTargetILP32())
FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
return FrameAddr;
/// Lower SPONENTRY by creating a fixed stack object (size 4, offset 0) and
/// returning its frame index as a pointer-typed value.
SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
EVT VT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
int FI = MFI.CreateFixedObject(4, 0, false);
return DAG.getFrameIndex(FI, VT);
#include ""
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
/// Resolve a register name to a Register via MatchRegisterName. A match in
/// the range X1..X28 is kept only if the subtarget reports that register as
/// reserved; if no usable register results, a fatal "Invalid register name"
/// error is reported.
Register AArch64TargetLowering::
getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
Register Reg = MatchRegisterName(RegName);
if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
// isXRegisterReserved is queried with the DWARF register number.
unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
if (!Subtarget->isXRegisterReserved(DwarfRegNum))
Reg = 0;
if (Reg)
return Reg;
report_fatal_error(Twine("Invalid register name \""
+ StringRef(RegName) + "\"."));
/// Lower ADDROFRETURNADDR to the value of the FP register plus 8.
SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue FrameAddr =
DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
/// Lower RETURNADDR. For a non-zero depth (operand 0) the return address is
/// loaded from the frame address computed by LowerFRAMEADDR plus 8; for depth
/// zero it is read from LR, which is added as a function live-in.
// NOTE(review): the getLoad call below ends mid-argument-list and MFI is not
// visibly used -- lines appear to be missing from this extract; confirm
// against the upstream file.
SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT VT = Op.getValueType();
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
// Return LR, which contains the return address. Mark it an implicit live-in.
unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
/// LowerShiftRightParts - Lower SRA_PARTS or SRL_PARTS, which return two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
///
/// Operands are (low half, high half, shift amount). Both halves are computed
/// for the "normal" case (amount < VTBits) and the "big" case
/// (amount >= VTBits), and CSEL picks between them on (amount - VTBits) >= 0.
// NOTE(review): the first emitComparison call below ends mid-argument-list in
// this extract; confirm against the upstream file.
SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
// Arithmetic shift for SRA_PARTS, logical shift for SRL_PARTS.
unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
// Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
// is "undef". We wanted 0, so CSEL it directly.
SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
HiBitsForLo =
DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
HiBitsForLo, CCVal, Cmp);
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
SDValue LoForNormalShift =
DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
// From here on Cmp/CCVal test "ExtraShAmt >= 0", i.e. the big-shift case.
Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
dl, DAG);
CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
LoForNormalShift, CCVal, Cmp);
// AArch64 shifts larger than the register width are wrapped rather than
// clamped, so we can't just emit "hi >> x".
SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
// For a big shift the high half is the sign fill (SRA) or zero (SRL).
SDValue HiForBigShift =
Opc == ISD::SRA
? DAG.getNode(Opc, dl, VT, ShOpHi,
DAG.getConstant(VTBits - 1, dl, MVT::i64))
: DAG.getConstant(0, dl, VT);
SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
HiForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
///
/// Operands are (low half, high half, shift amount). Both halves are computed
/// for the "normal" case (amount < VTBits) and the "big" case
/// (amount >= VTBits), and CSEL picks between them on (amount - VTBits) >= 0.
// NOTE(review): the first emitComparison call below ends mid-argument-list in
// this extract; confirm against the upstream file.
SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
assert(Op.getOpcode() == ISD::SHL_PARTS);
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
// Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
// is "undef". We wanted 0, so CSEL it directly.
SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
LoBitsForHi =
DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
LoBitsForHi, CCVal, Cmp);
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
SDValue HiForNormalShift =
DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
// From here on Cmp/CCVal test "ExtraShAmt >= 0", i.e. the big-shift case.
Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
dl, DAG);
CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
HiForNormalShift, CCVal, Cmp);
// AArch64 shifts of larger than register sizes are wrapped rather than
// clamped, so we can't just emit "lo << a" if a is too big.
SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
LoForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
/// Report whether an offset may be folded into a global address node.
/// Always returns false for this target; GA is not inspected.
bool AArch64TargetLowering::isOffsetFoldingLegal(
const GlobalAddressSDNode *GA) const {
// Offsets are folded in the DAG combine rather than here so that we can
// intelligently choose an offset based on the uses.
return false;
/// Return true if the floating-point immediate Imm of type VT is considered
/// legal. f64, f32 and (with full FP16 support) f16 values are legal when they
/// have an FP immediate encoding or are +0.0. Failing that, f64/f32 values are
/// legal when expandMOVImm needs no more instructions than a limit that
/// depends on OptForSize and on whether the subtarget fuses literals.
// NOTE(review): the expandMOVImm call below ends mid-argument-list in this
// extract; confirm against the upstream file.
bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool OptForSize) const {
bool IsLegal = false;
// We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
// 16-bit case when target has full fp16 support.
// FIXME: We should be able to handle f128 as well with a clever lowering.
const APInt ImmInt = Imm.bitcastToAPInt();
if (VT == MVT::f64)
IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
else if (VT == MVT::f32)
IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
else if (VT == MVT::f16 && Subtarget->hasFullFP16())
IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
// TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
// generate that fmov.
// If we can not materialize in immediate field for fmov, check if the
// value can be encoded as the immediate operand of a logical instruction.
// The immediate value will be created with either MOVZ, MOVN, or ORR.
if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
// The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
// however the mov+fmov sequence is always better because of the reduced
// cache pressure. The timings are still the same if you consider
// movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
// movw+movk is fused). So by default we allow at most 2 instructions; the
// limit is 5 when the subtarget fuses literals and 1 when optimizing for
// size.
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
IsLegal = Insn.size() <= Limit;
LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
<< " imm value: "; Imm.dump(););
return IsLegal;
// AArch64 Optimization Hooks
/// Build an estimate node with the given Opcode applied to Operand.
///
/// Succeeds only when the subtarget has NEON and the operand type is one of
/// f64, v1f64, v2f64, f32, v1f32, v2f32 or v4f32; otherwise an empty SDValue
/// is returned. If ExtraSteps is Unspecified on entry it is replaced by the
/// default number of refinement steps (3 for f64 element types, 2 otherwise).
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
SDValue Operand, SelectionDAG &DAG,
int &ExtraSteps) {
EVT VT = Operand.getValueType();
if (ST->hasNEON() &&
(VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
VT == MVT::f32 || VT == MVT::v1f32 ||
VT == MVT::v2f32 || VT == MVT::v4f32)) {
if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
// For the reciprocal estimates, convergence is quadratic, so the number
// of digits is doubled after each iteration. In ARMv8, the accuracy of
// the initial estimate is 2^-8. Thus the number of extra steps to refine
// the result for float (23 mantissa bits) is 2 and for double (52
// mantissa bits) is 3.
ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
return SDValue();
/// Build an estimate of sqrt(Operand), or of 1/sqrt(Operand) when Reciprocal
/// is true, from an FRSQRTE estimate refined by ExtraSteps FRSQRTS iterations.
///
/// Runs only when Enabled is ReciprocalEstimate::Enabled, or is Unspecified
/// and the subtarget opts in via useRSqrt(); otherwise (or if getEstimate
/// declines the type) an empty SDValue is returned. In the non-reciprocal case
/// the estimate is multiplied by Operand and a select returns Operand itself
/// where Operand compares equal to 0.0. On success ExtraSteps is reset to 0.
// NOTE(review): the FMUL at the top of the loop and the getSetCCResultType
// call end mid-argument-list in this extract; confirm against upstream.
SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &ExtraSteps,
bool &UseOneConst,
bool Reciprocal) const {
if (Enabled == ReciprocalEstimate::Enabled ||
(Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
DAG, ExtraSteps)) {
SDLoc DL(Operand);
EVT VT = Operand.getValueType();
SDNodeFlags Flags;
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
if (!Reciprocal) {
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);
// sqrt(x) = x * (1/sqrt(x))
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
// Correct the result if the operand is 0.0.
Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
VT, Eq, Operand, Estimate);
ExtraSteps = 0;
return Estimate;
return SDValue();
/// Build an estimate of 1/Operand from an FRECPE estimate refined by
/// ExtraSteps FRECPS iterations.
///
/// Runs only when Enabled is ReciprocalEstimate::Enabled; otherwise (or if
/// getEstimate declines the type) an empty SDValue is returned. On success
/// ExtraSteps is reset to 0.
SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &ExtraSteps) const {
if (Enabled == ReciprocalEstimate::Enabled)
if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
DAG, ExtraSteps)) {
SDLoc DL(Operand);
EVT VT = Operand.getValueType();
SDNodeFlags Flags;
// Newton reciprocal iteration: E * (2 - X * E)
// AArch64 reciprocal iteration instruction: (2 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
Estimate, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
ExtraSteps = 0;
return Estimate;
return SDValue();
// AArch64 Inline Assembly Support
// Table of Constraints
// TODO: This is the current set of constraints supported by ARM for the
// compiler, not all of them may make sense.
// r - A general register
// w - An FP/SIMD register of some size in the range v0-v31
// x - An FP/SIMD register of some size in the range v0-v15
// I - Constant that can be used with an ADD instruction
// J - Constant that can be used with a SUB instruction
// K - Constant that can be used with a 32-bit logical instruction
// L - Constant that can be used with a 64-bit logical instruction
// M - Constant that can be used as a 32-bit MOV immediate
// N - Constant that can be used as a 64-bit MOV immediate
// Q - A memory reference with base register and no offset
// S - A symbolic address
// Y - Floating point constant zero
// Z - Integer constant zero
// Note that general register operands will be output using their 64-bit x
// register name, whatever the size of the variable, unless the asm operand
// is prefixed by the %w modifier. Floating-point and SIMD register operands
// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
// %q modifier.
/// Pick a concrete constraint string for the inline-asm "X" constraint.
/// Returns "r" when the subtarget has no FP support; otherwise "w" for
/// floating-point types and for 64- or 128-bit vectors, and "r" for
/// everything else.
const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
// At this point, we have to lower this constraint to something else, so we
// lower it to an "r" or "w". However, by doing this we will force the result
// to be in register, while the X constraint is much more permissive.
// Although we are correct (we are free to emit anything, without
// constraints), we might break use cases that would expect us to be more
// efficient and emit something else.
if (!Subtarget->hasFPARMv8())
return "r";
if (ConstraintVT.isFloatingPoint())
return "w";
if (ConstraintVT.isVector() &&
(ConstraintVT.getSizeInBits() == 64 ||
ConstraintVT.getSizeInBits() == 128))
return "w";
return "r";
/// Parsed form of a multi-letter predicate constraint string; produced by
/// parsePredicateConstraint.
// NOTE(review): the enumerator list is not visible in this extract; the code
// that follows refers to Invalid, Upa and Upl -- confirm against upstream.
enum PredicateConstraint {
/// Map the constraint strings "Upa" and "Upl" to their PredicateConstraint
/// values; any other string yields PredicateConstraint::Invalid.
static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
PredicateConstraint P = PredicateConstraint::Invalid;
if (Constraint == "Upa")
P = PredicateConstraint::Upa;
if (Constraint == "Upl")
P = PredicateConstraint::Upl;
return P;
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
///
/// Single-letter constraints are classified by the switch below; strings
/// accepted by parsePredicateConstraint are register-class constraints; and
/// anything left over is handed to TargetLowering::getConstraintType.
// NOTE(review): the return-type line and the tail of the predicate check are
// not visible in this extract; confirm against the upstream file.
AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'x':
case 'w':
case 'y':
return C_RegisterClass;
// An address with a single base register. Due to the way we
// currently handle addresses it is the same as 'r'.
case 'Q':
return C_Memory;
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'Y':
case 'Z':
return C_Immediate;
case 'z':
case 'S': // A symbolic address
return C_Other;
} else if (parsePredicateConstraint(Constraint) !=
return C_RegisterClass;
return TargetLowering::getConstraintType(Constraint);
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
///
/// Returns CW_Default when the operand has no IR value. 'x', 'w' and 'y' get
/// CW_Register for floating-point or vector operands, 'z' gets CW_Constant,
/// and 'U' gets CW_Register when the string parses as a predicate constraint.
// NOTE(review): the function-name line of the signature and the switch's
// default/break lines are not visible in this extract, so the fallthrough
// behaviour cannot be read from here; presumably other letters defer to
// TargetLowering::getSingleConstraintMatchWeight -- confirm against upstream.
AsmOperandInfo &info, const char *constraint) const {
ConstraintWeight weight = CW_Invalid;
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
case 'x':
case 'w':
case 'y':
if (type->isFloatingPointTy() || type->isVectorTy())
weight = CW_Register;
case 'z':
weight = CW_Constant;
case 'U':
if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
weight = CW_Register;
return weight;
// Translate an inline-asm register constraint string and a value type into a
// (physical register, register class) pair.
//
// Single-letter constraints are resolved by the first switch, predicate
// constraints by parsePredicateConstraint, "{cc}" maps to NZCV, and anything
// else is passed to TargetLowering::getRegForInlineAsmConstraint, with an
// extra "{vN}" fallback when that lookup finds no register class.
//
// NOTE(review): the line holding the qualified function name is not present in
// this listing, and neither are the `break;` statements and closing braces one
// would expect. Several `if (!Subtarget->hasFPARMv8())` lines below have no
// visible controlled statement. Confirm against the upstream file.
std::pair<unsigned, const TargetRegisterClass *>
const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
// 'r': the 64-bit common GPR class for 64-bit values, the 32-bit one otherwise.
case 'r':
if (VT.getSizeInBits() == 64)
return std::make_pair(0U, &AArch64::GPR64commonRegClass);
return std::make_pair(0U, &AArch64::GPR32commonRegClass);
// 'w': ZPR for scalable vectors, otherwise the FPR class matching the size of
// the value (16/32/64/128 bits).
case 'w':
if (!Subtarget->hasFPARMv8())
if (VT.isScalableVector())
return std::make_pair(0U, &AArch64::ZPRRegClass);
if (VT.getSizeInBits() == 16)
return std::make_pair(0U, &AArch64::FPR16RegClass);
if (VT.getSizeInBits() == 32)
return std::make_pair(0U, &AArch64::FPR32RegClass);
if (VT.getSizeInBits() == 64)
return std::make_pair(0U, &AArch64::FPR64RegClass);
if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &AArch64::FPR128RegClass);
// The instructions that this constraint is designed for can
// only take 128-bit registers so just use that regclass.
// 'x': ZPR_4b for scalable vectors, FPR128_lo for 128-bit values.
case 'x':
if (!Subtarget->hasFPARMv8())
if (VT.isScalableVector())
return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &AArch64::FPR128_loRegClass);
// 'y': ZPR_3b for scalable vectors.
case 'y':
if (!Subtarget->hasFPARMv8())
if (VT.isScalableVector())
return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
} else {
// Multi-character constraint: try the predicate-register forms. The Upl
// form selects PPR_3b, every other valid form selects PPR.
PredicateConstraint PC = parsePredicateConstraint(Constraint);
if (PC != PredicateConstraint::Invalid) {
bool restricted = (PC == PredicateConstraint::Upl);
return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
: std::make_pair(0U, &AArch64::PPRRegClass);
// "{cc}" (compared case-insensitively) names the NZCV register in class CCR.
if (StringRef("{cc}").equals_lower(Constraint))
return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass *> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
// Accept "{vN}" / "{vNN}" (4 or 5 characters, 'v' in either case) with a
// decimal register number in the range 0..31.
unsigned Size = Constraint.size();
if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
int RegNo;
// getAsInteger's result is stored in Failed and negated in the check below.
bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
if (!Failed && RegNo >= 0 && RegNo <= 31) {
// v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
// By default we'll emit v0-v31 for this unless there's a modifier where
// we'll emit the correct register as well.
if (VT != MVT::Other && VT.getSizeInBits() == 64) {
Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
Res.second = &AArch64::FPR64RegClass;
} else {
Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
Res.second = &AArch64::FPR128RegClass;
// Without FP support, reject a register class that is not a sub-class of
// GPR32all (plus a further condition).
// NOTE(review): the condition ends in `&&` with its final operand line not
// visible in this listing - confirm the full predicate upstream.
if (Res.second && !Subtarget->hasFPARMv8() &&
!AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
return std::make_pair(0U, nullptr);
return Res;
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
///
/// \param Op          The operand value to lower.
/// \param Constraint  The constraint string; only single-character
///                    constraints are examined here.
/// \param Ops         Output vector for the lowered operand.
/// \param DAG         DAG used to create the replacement nodes.
///
/// Constraints handled in this body: 'z' (zero register), 'S' (symbolic
/// address) and the immediate constraints 'I', 'J', 'K', 'L', 'M', 'N'.
/// The final line forwards to the TargetLowering implementation.
//
// NOTE(review): this listing shows almost none of the `return;`, `break;`,
// `else`, `default:` and closing-brace lines, so most `if` statements below
// appear without their controlled statement. The comments describe only what
// is visible; confirm exits and fallthrough against the upstream file.
void AArch64TargetLowering::LowerAsmOperandForConstraint(
SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
SDValue Result;
// Currently only support length 1 constraints.
if (Constraint.length() != 1)
char ConstraintLetter = Constraint[0];
switch (ConstraintLetter) {
// This set of constraints deal with valid constants for various instructions.
// Validate and return a target constant for them if we can.
case 'z': {
// 'z' maps to xzr or wzr so it needs an input of 0.
if (!isNullConstant(Op))
// XZR for an i64 operand; the WZR assignment follows.
if (Op.getValueType() == MVT::i64)
Result = DAG.getRegister(AArch64::XZR, MVT::i64);
Result = DAG.getRegister(AArch64::WZR, MVT::i32);
case 'S': {
// An absolute symbolic address or label reference.
// Global addresses, block addresses and external symbols are each turned
// into their Target* node counterpart.
if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
} else if (const BlockAddressSDNode *BA =
dyn_cast<BlockAddressSDNode>(Op)) {
Result =
DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
} else if (const ExternalSymbolSDNode *ES =
dyn_cast<ExternalSymbolSDNode>(Op)) {
Result =
DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0));
} else
// Immediate constraints: the operand has to be a ConstantSDNode.
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
// Grab the value and do some validation.
uint64_t CVal = C->getZExtValue();
switch (ConstraintLetter) {
// The I constraint applies only to simple ADD or SUB immediate operands:
// i.e. 0 to 4095 with optional shift by 12
// The J constraint applies only to ADD or SUB immediates that would be
// valid when negated, i.e. if [an add pattern] were to be output as a SUB
// instruction [or vice versa], in other words -1 to -4095 with optional
// left shift by 12.
case 'I':
if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
case 'J': {
// Validate the negated value, but emit the original sign-extended value.
uint64_t NVal = -C->getSExtValue();
if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
CVal = C->getSExtValue();
// The K and L constraints apply *only* to logical immediates, including
// what used to be the MOVI alias for ORR (though the MOVI alias has now
// been removed and MOV should be used). So these constraints have to
// distinguish between bit patterns that are valid 32-bit or 64-bit
// "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
// not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
// versa.
case 'K':
if (AArch64_AM::isLogicalImmediate(CVal, 32))
case 'L':
if (AArch64_AM::isLogicalImmediate(CVal, 64))
// The M and N constraints are a superset of K and L respectively, for use
// with the MOV (immediate) alias. As well as the logical immediates they
// also match 32 or 64-bit immediates that can be loaded either using a
// *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
// (M) or 64-bit 0x1234000000000000 (N) etc.
// As a note some of this code is liberally stolen from the asm parser.
case 'M': {
// Tests, in order: fits in 32 bits, is a 32-bit logical immediate, occupies a
// single 16-bit chunk, or its 32-bit complement occupies a single 16-bit chunk.
if (!isUInt<32>(CVal))
if (AArch64_AM::isLogicalImmediate(CVal, 32))
if ((CVal & 0xFFFF) == CVal)
if ((CVal & 0xFFFF0000ULL) == CVal)
uint64_t NCVal = ~(uint32_t)CVal;
if ((NCVal & 0xFFFFULL) == NCVal)
if ((NCVal & 0xFFFF0000ULL) == NCVal)
case 'N': {
// Same idea for 64 bits: a logical immediate, or the value / its complement
// occupying exactly one of the four 16-bit chunks.
if (AArch64_AM::isLogicalImmediate(CVal, 64))
if ((CVal & 0xFFFFULL) == CVal)
if ((CVal & 0xFFFF0000ULL) == CVal)
if ((CVal & 0xFFFF00000000ULL) == CVal)
if ((CVal & 0xFFFF000000000000ULL) == CVal)
uint64_t NCVal = ~CVal;
if ((NCVal & 0xFFFFULL) == NCVal)
if ((NCVal & 0xFFFF0000ULL) == NCVal)
if ((NCVal & 0xFFFF00000000ULL) == NCVal)
if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
// All assembler immediates are 64-bit integers.
Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
// A non-null Result means one of the cases above produced a node.
// NOTE(review): the statements inside this `if` are not visible here.
if (Result.getNode()) {
// Everything else is handled by the generic implementation.
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
// AArch64 Advanced SIMD Support
/// WidenVector - Given a value in the V64 register class, produce the
/// equivalent value in the V128 register class.
///
/// The result keeps the element type of \p V64Reg and has twice as many
/// elements: the input is inserted at index 0 of an UNDEF vector of the wide
/// type.
///
/// \param V64Reg  The narrow vector value to widen.
/// \param DAG     DAG used to create the INSERT_SUBVECTOR node.
/// \returns the INSERT_SUBVECTOR node of the widened type.
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
  EVT VT = V64Reg.getValueType();
  unsigned NarrowSize = VT.getVectorNumElements();
  MVT EltTy = VT.getVectorElementType().getSimpleVT();
  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
  SDLoc DL(V64Reg);

  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
                     V64Reg, DAG.getConstant(0, DL, MVT::i32));
}
/// getExtFactor - Determine the adjustment factor for the position when
/// generating an "extract from vector registers" instruction.
///
/// \param V  A vector-typed value.
/// \returns the size of one element of \p V in bytes (element bits / 8).
static unsigned getExtFactor(SDValue &V) {
  EVT EltType = V.getValueType().getVectorElementType();
  return EltType.getSizeInBits() / 8;
}
/// NarrowVector - Given a value in the V128 register class, produce the
/// equivalent value in the V64 register class.
///
/// The result keeps the element type of \p V128Reg and has half as many
/// elements; it is obtained by extracting the `dsub` sub-register.
///
/// \param V128Reg  The wide vector value to narrow.
/// \param DAG      DAG used to create the sub-register extract.
/// \returns the sub-register extract of the narrow type.
static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
  EVT VT = V128Reg.getValueType();
  unsigned WideSize = VT.getVectorNumElements();
  MVT EltTy = VT.getVectorElementType().getSimpleVT();
  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
  SDLoc DL(V128Reg);

  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
}
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
//
// Op must be a BUILD_VECTOR (asserted). The visible steps are:
//   1. collect the distinct source vectors of the EXTRACT_VECTOR_ELT operands
//      together with the min/max lane used from each;
//   2. give up when more than two sources are involved;
//   3. pick the smallest element type among result and sources and derive the
//      shuffle type from it;
//   4. pad, extract from, or EXT each source so its width matches the result;
//   5. bitcast sources to the shuffle type and rescale their window;
//   6. build the shuffle mask, check it with isShuffleMaskLegal, and emit a
//      vector shuffle followed by a bitcast back to the result type.
// Returns an empty SDValue() on every failure path.
//
// NOTE(review): closing braces, `continue;` statements, a few `LLVM_DEBUG(`
// opener lines and some call-argument lines are not present in this listing.
// Check exact control flow against the upstream file.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
// Per-source bookkeeping: the original vector, the lane range used from it,
// and the (possibly transformed) vector that will feed the shuffle.
struct ShuffleSourceInfo {
SDValue Vec;
unsigned MinElt;
unsigned MaxElt;
// We may insert some combination of BITCASTs and VEXT nodes to force Vec to
// be compatible with the shuffle we intend to construct. As a result
// ShuffleVec will be some sliding window into the original Vec.
SDValue ShuffleVec;
// Code should guarantee that element i in Vec starts at element "WindowBase
// + i * WindowScale in ShuffleVec".
int WindowBase;
int WindowScale;
// MinElt starts at the maximum unsigned value and MaxElt at 0 so the first
// min/max update below always takes the observed lane.
ShuffleSourceInfo(SDValue Vec)
: Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
// Equality against an SDValue lets find() below search Sources by vector.
bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
// First gather all vectors used as an immediate source for this BUILD_VECTOR
// node.
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(V.getOperand(1))) {
dbgs() << "Reshuffle failed: "
"a shuffle can only come from building a vector from "
"various elements of other vectors, provided their "
"indices are constant\n");
return SDValue();
// Add this element source to the list if it's not already there.
SDValue SourceVec = V.getOperand(0);
auto Source = find(Sources, SourceVec);
if (Source == Sources.end())
Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
// Update the minimum and maximum lane number seen.
unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
Source->MinElt = std::min(Source->MinElt, EltNo);
Source->MaxElt = std::max(Source->MaxElt, EltNo);
if (Sources.size() > 2) {
dbgs() << "Reshuffle failed: currently only do something sane when at "
"most two source vectors are involved\n");
return SDValue();
// Find out the smallest element size among result and two sources, and use
// it as element size to build the shuffle_vector.
EVT SmallestEltTy = VT.getVectorElementType();
for (auto &Source : Sources) {
EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
if (SrcEltTy.bitsLT(SmallestEltTy)) {
SmallestEltTy = SrcEltTy;
// ResMultiplier: how many shuffle lanes one result element occupies.
// NumElts is re-purposed here as the lane count of the shuffle type.
unsigned ResMultiplier =
VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
// If the source vector is too wide or too narrow, we may nevertheless be able
// to construct a compatible shuffle either by concatenating it with UNDEF or
// extracting a suitable range of elements.
for (auto &Src : Sources) {
EVT SrcVT = Src.ShuffleVec.getValueType();
if (SrcVT.getSizeInBits() == VT.getSizeInBits())
// This stage of the search produces a source with the same element type as
// the original, but with a total width matching the BUILD_VECTOR output.
EVT EltVT = SrcVT.getVectorElementType();
unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
// We can pad out the smaller vector for free, so if it's part of a
// shuffle...
Src.ShuffleVec =
DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
// From here on the source is exactly twice as wide as the result (asserted).
assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
return SDValue();
if (Src.MinElt >= NumSrcElts) {
// The extraction can just take the second half
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i64));
Src.WindowBase = -NumSrcElts;
} else if (Src.MaxElt < NumSrcElts) {
// The extraction can just take the first half
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i64));
} else {
// An actual VEXT is needed
SDValue VEXTSrc1 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i64));
SDValue VEXTSrc2 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i64));
// The EXT immediate is in bytes: first used lane times element size in bytes.
unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
DAG.getConstant(Imm, dl, MVT::i32));
Src.WindowBase = -Src.MinElt;
// Another possible incompatibility occurs from the vector element types. We
// can fix this by bitcasting the source vectors to the same type we intend
// for the shuffle.
for (auto &Src : Sources) {
EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
if (SrcEltTy == SmallestEltTy)
assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
// One original element now spans WindowScale shuffle lanes; rescale the base.
Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
Src.WindowBase *= Src.WindowScale;
// Final sanity check before we try to actually produce a shuffle.
LLVM_DEBUG(for (auto Src
: Sources)
assert(Src.ShuffleVec.getValueType() == ShuffleVT););
// The stars all align, our next step is to produce the mask for the shuffle.
// Every lane starts as -1 (undef) and is filled in per result element.
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
if (Entry.isUndef())
auto Src = find(Sources, Entry.getOperand(0));
int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
// EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
// trunc. So only std::min(SrcBits, DestBits) actually get defined in this
// segment.
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
int BitsDefined =
std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
int LanesDefined = BitsDefined / BitsPerShuffleLane;
// This source is expected to fill ResMultiplier lanes of the final shuffle,
// starting at the appropriate offset.
int *LaneMask = &Mask[i * ResMultiplier];
// Lanes of the second source are offset by NumElts in the shuffle mask.
int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
ExtractBase += NumElts * (Src - Sources.begin());
for (int j = 0; j < LanesDefined; ++j)
LaneMask[j] = ExtractBase + j;
// Final check before we try to produce nonsense...
if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
return SDValue();
// Operands default to UNDEF so a single-source shuffle is still well formed.
SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
for (unsigned i = 0; i < Sources.size(); ++i)
ShuffleOps[i] = Sources[i].ShuffleVec;
SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
ShuffleOps[1], Mask);
// Cast the shuffle result back to the type of the original BUILD_VECTOR.
SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
dbgs() << "Reshuffle, creating node: "; V.dump(););
return V;
// Check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same.
//
// \param M    Shuffle mask; negative entries are UNDEF.
// \param VT   Vector type whose element count bounds the scan.
// \param Imm  Receives M[0], the index of the first element, whenever M[0] is
//             not UNDEF (it is written before the remaining lanes are
//             checked).
// \returns true when every defined entry after the first continues the
//          sequence M[0], M[0]+1, ... modulo the number of elements.
static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();

  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element.  The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index.  If it wraps around, just follow it
    // back to index zero and keep going.
    ++ExpectedElt;
    if (ExpectedElt == NumElts)
      ExpectedElt = 0;

    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  return true;
}
// Check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are different.
//
// \param M           Shuffle mask; -1 entries are UNDEF.
// \param VT          Vector type of one shuffle input.
// \param ReverseEXT  Set to true when the two inputs have to be swapped. It is
//                    never cleared here, so the caller must initialise it.
// \param Imm         Receives the element index at which the extract starts,
//                    relative to the first input after any swap.
// \returns true when the defined mask entries form one run of consecutive
//          indices modulo 2 * NumElts.
static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
                      unsigned &Imm) {
  // Look for the first non-undef element.
  const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });

  // A mask made only of UNDEFs has no element to anchor the run on, and
  // dereferencing the end iterator below would be invalid.
  if (FirstRealElt == M.end())
    return false;

  // Benefit from APInt to handle overflow when calculating expected element.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
  // The following shuffle indices must be the successive elements after the
  // first real element.
  const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
      [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
  if (FirstWrongElt != M.end())
    return false;

  // The index of an EXT is the first element if it is not UNDEF.
  // Watch out for the beginning UNDEFs. The EXT index should be the expected
  // value of the first element.  E.g.
  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
  // ExpectedElt is the last mask index plus 1.
  Imm = ExpectedElt.getZExtValue();

  // There are two different cases requiring to reverse input vectors.
  // For example, for vector <4 x i32> we have the following cases,
  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
  // to reverse two input vectors.
  if (Imm < NumElts)
    ReverseEXT = true;
  else
    Imm -= NumElts; // Only rebase when the run starts in the second input.

  return true;
}
/// isREVMask - Check if a vector shuffle corresponds to a REV
/// instruction with the specified blocksize.  (The order of the elements
/// within each block of the vector is reversed.)
///
/// \param M          Shuffle mask; negative entries are UNDEF.
/// \param VT         Vector type being shuffled.
/// \param BlockSize  Block size in bits; must be 16, 32 or 64 (asserted).
/// \returns true when every defined mask entry is the mirror image of its
///          position within its block. Always false for 64-bit elements.
static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
         "Only possible block sizes for REV are: 16, 32, 64");

  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  // The first index of a reversed block is the block length minus one.
  unsigned BlockElts = M[0] + 1;
  // If the first shuffle index is UNDEF, be optimistic.
  if (M[0] < 0)
    BlockElts = BlockSize / EltSz;

  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
    return false;

  for (unsigned i = 0; i < NumElts; ++i) {
    if (M[i] < 0)
      continue; // ignore UNDEF indices
    // Expected index: start of the block plus the mirrored offset inside it.
    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
      return false;
  }

  return true;
}
/// isZIPMask - Check whether a two-input shuffle mask interleaves one half of
/// each input, e.g. <0, 4, 1, 5> for four elements.
///
/// \param M            Shuffle mask; negative entries are UNDEF.
/// \param VT           Vector type being shuffled; an odd element count fails.
/// \param WhichResult  Set to 0 when M[0] == 0, otherwise to 1; selects the
///                     starting index WhichResult * NumElts / 2.
/// \returns true when every defined pair (M[i], M[i+1]) equals
///          (Idx, Idx + NumElts) for the running index Idx.
static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;

  WhichResult = (M[0] == 0 ? 0 : 1);
  unsigned Idx = WhichResult * NumElts / 2;
  for (unsigned i = 0; i != NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
      return false;
    Idx += 1;
  }

  return true;
}
/// isUZPMask - Check whether a two-input shuffle mask selects every second
/// element of the concatenated inputs, e.g. <0, 2, 4, 6> or <1, 3, 5, 7>.
///
/// \param M            Shuffle mask; negative entries are UNDEF.
/// \param VT           Vector type being shuffled.
/// \param WhichResult  Set to 0 when M[0] == 0, otherwise to 1; it is the
///                     offset added to each expected index.
/// \returns true when every defined M[i] equals 2 * i + WhichResult.
static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i != NumElts; ++i) {
    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if ((unsigned)M[i] != 2 * i + WhichResult)
      return false;
  }

  return true;
}
/// isTRNMask - Check whether a two-input shuffle mask pairs the same lane of
/// both inputs, e.g. <0, 4, 2, 6> or <1, 5, 3, 7> for four elements.
///
/// \param M            Shuffle mask; negative entries are UNDEF.
/// \param VT           Vector type being shuffled; an odd element count fails.
/// \param WhichResult  Set to 0 when M[0] == 0, otherwise to 1; it is the
///                     offset added to each expected index.
/// \returns true when every defined pair (M[i], M[i+1]) equals
///          (i + WhichResult, i + NumElts + WhichResult).
static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;

  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i < NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
      return false;
  }

  return true;
}
/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
///
/// \param M            Shuffle mask; negative entries are UNDEF.
/// \param VT           Vector type being shuffled; an odd element count fails.
/// \param WhichResult  Set to 0 when M[0] == 0, otherwise to 1; selects the
///                     starting index WhichResult * NumElts / 2.
/// \returns true when both defined entries of every pair equal the running
///          index.
static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;

  WhichResult = (M[0] == 0 ? 0 : 1);
  unsigned Idx = WhichResult * NumElts / 2;
  for (unsigned i = 0; i != NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
      return false;
    Idx += 1;
  }

  return true;
}
/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
///
/// \param M            Shuffle mask; negative entries are UNDEF.
/// \param VT           Vector type being shuffled.
/// \param WhichResult  Set to 0 when M[0] == 0, otherwise to 1; it is the
///                     first expected index of each half.
/// \returns true when each half of the mask, taken on its own, has the
///          defined entries WhichResult, WhichResult + 2, ...
static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned Half = VT.getVectorNumElements() / 2;
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned j = 0; j != 2; ++j) {
    // The expected index restarts for the second half of the mask.
    unsigned Idx = WhichResult;
    for (unsigned i = 0; i != Half; ++i) {
      int MIdx = M[i + j * Half];
      if (MIdx >= 0 && (unsigned)MIdx != Idx)
        return false;
      Idx += 2;
    }
  }

  return true;
}
/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
///
/// \param M            Shuffle mask; negative entries are UNDEF.
/// \param VT           Vector type being shuffled; an odd element count fails.
/// \param WhichResult  Set to 0 when M[0] == 0, otherwise to 1; it is the
///                     offset added to each expected index.
/// \returns true when both defined entries of every pair equal
///          i + WhichResult.
static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;

  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i < NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
      return false;
  }

  return true;
}
/// isINSMask - Check whether a shuffle mask is the identity on one of the two
/// inputs except for exactly one lane, i.e. a single-lane insert.
///
/// \param M                 Shuffle mask; -1 entries are UNDEF and count as a
///                          match for both inputs.
/// \param NumInputElements  Element count of one input; must equal M.size().
/// \param DstIsLeft         Set to true when the left input is the
///                          destination, false when the right input is.
/// \param Anomaly           Set to the index of the one lane that does not
///                          come from the destination.
/// \returns true when one input matches in all lanes but one. The left input
///          is preferred when both qualify.
static bool isINSMask(ArrayRef<int> M, int NumInputElements,
                      bool &DstIsLeft, int &Anomaly) {
  if (M.size() != static_cast<size_t>(NumInputElements))
    return false;

  int NumLHSMatch = 0, NumRHSMatch = 0;
  int LastLHSMismatch = -1, LastRHSMismatch = -1;

  for (int i = 0; i < NumInputElements; ++i) {
    if (M[i] == -1) {
      // An UNDEF lane is compatible with either input.
      ++NumLHSMatch;
      ++NumRHSMatch;
      continue;
    }

    if (M[i] == i)
      ++NumLHSMatch;
    else
      LastLHSMismatch = i;

    if (M[i] == i + NumInputElements)
      ++NumRHSMatch;
    else
      LastRHSMismatch = i;
  }

  if (NumLHSMatch == NumInputElements - 1) {
    DstIsLeft = true;
    Anomaly = LastLHSMismatch;
    return true;
  } else if (NumRHSMatch == NumInputElements - 1) {
    DstIsLeft = false;
    Anomaly = LastRHSMismatch;
    return true;
  }

  return false;
}
/// isConcatMask - Check whether a shuffle mask describes the concatenation of
/// the low half of the first input with the low half of the second input.
///
/// \param Mask      Shuffle mask.
/// \param VT        Result vector type; anything but 128 bits fails.
/// \param SplitLHS  True when the first input is itself full width, so the
///                  second input's lanes start NumElts / 2 further along in
///                  the mask's index space.
/// \returns true when the first half of the mask is the identity and the
///          second half is I + SplitLHS * (NumElts / 2).
static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
  if (VT.getSizeInBits() != 128)
    return false;

  unsigned NumElts = VT.getVectorNumElements();

  for (int I = 0, E = NumElts / 2; I != E; I++) {
    if (Mask[I] != I)
      return false;
  }

  int Offset = NumElts / 2;
  for (int I = NumElts / 2, E = NumElts; I != E; I++) {
    if (Mask[I] != I + SplitLHS * Offset)
      return false;
  }

  return true;
}
/// tryFormConcatFromShuffle - Try to express a vector shuffle as a
/// CONCAT_VECTORS of the low halves of its two inputs.
///
/// \param Op   The shuffle node (cast to ShuffleVectorSDNode for its mask).
/// \param DAG  DAG used to create the new nodes.
/// \returns the CONCAT_VECTORS node, or an empty SDValue() when the element
///          types differ or the mask is not a concat mask.
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue V0 = Op.getOperand(0);
  SDValue V1 = Op.getOperand(1);
  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
    return SDValue();

  // A 128-bit first input has to be split before it can be concatenated.
  bool SplitV0 = V0.getValueSizeInBits() == 128;

  if (!isConcatMask(Mask, VT, SplitV0))
    return SDValue();

  // Each full-width input is reduced to its low half (subvector index 0).
  EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  if (SplitV0) {
    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
                     DAG.getConstant(0, DL, MVT::i64));
  }
  if (V1.getValueSizeInBits() == 128) {
    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
                     DAG.getConstant(0, DL, MVT::i64));
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
}
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
///
/// \param PFEntry  Packed table entry: bits 26..29 hold the operation number,
///                 bits 13..25 the id of the left operand and bits 0..12 the
///                 id of the right operand (see the three extractions below).
/// \param LHS      First shuffle input.
/// \param RHS      Second shuffle input.
/// \param DAG      DAG used to create the nodes.
/// \param dl       Debug location for the created nodes.
/// \returns the node computing the shuffle described by \p PFEntry; both
///          operands are built by recursing through PerfectShuffleTable.
//
// NOTE(review): this listing is missing lines. Only seven of the enumerators
// are visible although OP_VREV, OP_VDUP0..3 and OP_VEXT1..3 are used below,
// the declaration of OpLHS/OpRHS is absent, and the trailing argument lines of
// the UZP/ZIP/TRN getNode calls are absent. The numeric values of the
// enumerators depend on their full order, so confirm against upstream.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
enum {
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
OP_VUZPL, // VUZP, left result
OP_VUZPR, // VUZP, right result
OP_VZIPL, // VZIP, left result
OP_VZIPR, // VZIP, right result
OP_VTRNL, // VTRN, left result
OP_VTRNR // VTRN, right result
// OP_COPY terminates the recursion: the id is four lane numbers packed in
// base 9, with <0,1,2,3> meaning LHS and <4,5,6,7> meaning RHS (asserted).
if (OpNum == OP_COPY) {
if (LHSID == (1 * 9 + 2) * 9 + 3)
return LHS;
assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
return RHS;
// Build both operands first, then combine them according to OpNum.
OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
EVT VT = OpLHS.getValueType();
switch (OpNum) {
llvm_unreachable("Unknown shuffle opcode!");
case OP_VREV:
// VREV divides the vector in half and swaps within the half.
if (VT.getVectorElementType() == MVT::i32 ||
VT.getVectorElementType() == MVT::f32)
return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
// vrev <4 x i16> -> REV32
if (VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::f16 ||
VT.getVectorElementType() == MVT::bf16)
return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
// vrev <4 x i8> -> REV16
assert(VT.getVectorElementType() == MVT::i8);
return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
case OP_VDUP0:
case OP_VDUP1:
case OP_VDUP2:
case OP_VDUP3: {
// Pick the DUPLANE opcode that matches the element width.
EVT EltTy = VT.getVectorElementType();
unsigned Opcode;
if (EltTy == MVT::i8)
Opcode = AArch64ISD::DUPLANE8;
else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
Opcode = AArch64ISD::DUPLANE16;
else if (EltTy == MVT::i32 || EltTy == MVT::f32)
Opcode = AArch64ISD::DUPLANE32;
else if (EltTy == MVT::i64 || EltTy == MVT::f64)
Opcode = AArch64ISD::DUPLANE64;
llvm_unreachable("Invalid vector element type?");
// A 64-bit source is widened before the lane is duplicated.
if (VT.getSizeInBits() == 64)
OpLHS = WidenVector(OpLHS, DAG);
// The lane number is the distance of OpNum from OP_VDUP0.
SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
case OP_VEXT1:
case OP_VEXT2:
case OP_VEXT3: {
// EXT takes a byte offset: element count (1..3) times element size in bytes.
unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
DAG.getConstant(Imm, dl, MVT::i32));
// The left/right variants of UZP, ZIP and TRN map to the "1"/"2" node forms.
case OP_VUZPL:
return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
case OP_VUZPR:
return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
case OP_VZIPL:
return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
case OP_VZIPR:
return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
case OP_VTRNL:
return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
case OP_VTRNR:
return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
/// GenerateTBL - Lower a vector shuffle to a NEON table-lookup intrinsic
/// (tbl1 or tbl2) operating on bytes.
///
/// \param Op           The shuffle node; operands 0 and 1 are its inputs.
/// \param ShuffleMask  Element-level shuffle mask of \p Op.
/// \param DAG          DAG used to create the nodes.
/// \returns the intrinsic result bitcast back to the value type of \p Op.
static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
                           SelectionDAG &DAG) {
  // Check to see if we can use the TBL instruction.
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc DL(Op);

  EVT EltVT = Op.getValueType().getVectorElementType();
  unsigned BytesPerElt = EltVT.getSizeInBits() / 8;

  // Expand every element index into the byte indices it covers.
  SmallVector<SDValue, 8> TBLMask;
  for (int Val : ShuffleMask) {
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
      unsigned Offset = Byte + Val * BytesPerElt;
      TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
    }
  }

  // The index vector is v8i8 for 64-bit results and v16i8 for 128-bit ones.
  MVT IndexVT = MVT::v8i8;
  unsigned IndexLen = 8;
  if (Op.getValueSizeInBits() == 128) {
    IndexVT = MVT::v16i8;
    IndexLen = 16;
  }

  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);

  SDValue Shuffle;
  if (V2.getNode()->isUndef()) {
    // Single input: the table is V1, duplicated to 128 bits when narrow.
    if (IndexLen == 8)
      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
    Shuffle = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
        DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
        DAG.getBuildVector(IndexVT, DL,
                           makeArrayRef(TBLMask.data(), IndexLen)));
  } else {
    if (IndexLen == 8) {
      // Two narrow inputs fit into one 128-bit table.
      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
      Shuffle = DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
          DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
          DAG.getBuildVector(IndexVT, DL,
                             makeArrayRef(TBLMask.data(), IndexLen)));
    } else {
      // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
      // cannot currently represent the register constraints on the input
      // table registers.
      //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
      //                   DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
      //                   IndexLen));
      Shuffle = DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
          DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
          V2Cst, DAG.getBuildVector(IndexVT, DL,
                                    makeArrayRef(TBLMask.data(), IndexLen)));
    }
  }
  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
}
/// getDUPLANEOp - Select the DUPLANE node opcode that matches an element type.
///
/// \param EltType  Vector element type.
/// \returns DUPLANE8 for i8, DUPLANE16 for i16/f16/bf16, DUPLANE32 for
///          i32/f32 and DUPLANE64 for i64/f64. Any other type is unreachable.
static unsigned getDUPLANEOp(EVT EltType) {
  if (EltType == MVT::i8)
    return AArch64ISD::DUPLANE8;
  if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
    return AArch64ISD::DUPLANE16;
  if (EltType == MVT::i32 || EltType == MVT::f32)
    return AArch64ISD::DUPLANE32;
  if (EltType == MVT::i64 || EltType == MVT::f64)
    return AArch64ISD::DUPLANE64;

  llvm_unreachable("Invalid vector element type?");
}
SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT VT = Op.getValueType();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
// Convert shuffles that are directly supported on NEON to target-specific
// DAG nodes, instead of keeping them as shuffles and matching them again
// during code selection. This is more efficient and avoids the possibility
// of inconsistencies between legalization and selection.
ArrayRef<int> ShuffleMask = SVN->getMask();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
// If this is undef splat, generate it via "just" vdup, if possible.
if (Lane == -1)
Lane = 0;
if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
// Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
// constant. If so, we can just reference the lane's definition directly.
if (V1.getOpcode() == ISD::BUILD_VECTOR &&
return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
// Otherwise, duplicate from the lane of the input vector.
unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
// Try to eliminate a bitcasted extract subvector before a DUPLANE.
auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
// Match: dup (bitcast (extract_subv X, C)), LaneC
if (BitCast.getOpcode() != ISD::BITCAST ||
BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
return false;
// The extract index must align in the destination type. That may not
// happen if the bitcast is from narrow to wide type.
SDValue Extract = BitCast.getOperand(0);
unsigned ExtIdx = Extract.getConstantOperandVal(1);
unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
if (ExtIdxInBits % CastedEltBitWidth != 0)
return false;
// Update the lane value by offsetting with the scaled extract index.
LaneC += ExtIdxInBits / CastedEltBitWidth;
// Determine the casted vector type of the wide vector input.
// dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
// Examples:
// dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
// dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
unsigned SrcVecNumElts =
Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
return true;
if (getScaledOffsetDup(V1, Lane, CastVT)) {
V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0));
} else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
// The lane is incremented by the index of the extract.
// Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
Lane += V1.getConstantOperandVal(1);
V1 = V1.getOperand(0);
} else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
// The lane is decremented if we are splatting from the 2nd operand.
// Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
Lane -= Idx * VT.getVectorNumElements() / 2;
V1 = WidenVector(V1.getOperand(Idx), DAG);
} else if (VT.getSizeInBits() == 64) {
// Widen the operand to 128-bit register with undef.
V1 = WidenVector(V1, DAG);
return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
if (isREVMask(ShuffleMask, VT, 64))
return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
if (isREVMask(ShuffleMask, VT, 32))
return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
if (isREVMask(ShuffleMask, VT, 16))
return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
bool ReverseEXT = false;
unsigned Imm;
if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
if (ReverseEXT)
std::swap(V1, V2);
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
DAG.getConstant(Imm, dl, MVT::i32));
} else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
unsigned WhichResult;
if (isZIPMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
if (isUZPMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
if (isTRNMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
return Concat;
bool DstIsLeft;
int Anomaly;
int NumInputElements = V1.getValueType().getVectorNumElements();
if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
SDValue DstVec = DstIsLeft ? V1 : V2;
SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
SDValue SrcVec = V1;
int SrcLane = ShuffleMask[Anomaly];
if (SrcLane >= NumInputElements) {
SrcVec = V2;
SrcLane -= VT.getVectorNumElements();
SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
EVT ScalarVT = VT.getVectorElementType();
if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
ScalarVT = MVT::i32;
return DAG.getNode(
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
// If the shuffle is not directly supported and it has 4 elements, use
// the PerfectShuffle-generated table to synthesize it from other shuffles.
unsigned NumElts = VT.getVectorNumElements();
if (NumElts == 4) {
unsigned PFIndexes[4];
for (unsigned i = 0; i != 4; ++i) {
if (ShuffleMask[i] < 0)
PFIndexes[i] = 8;
PFIndexes[i] = ShuffleMask[i];
// Compute the index in the perfect shuffle table.
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
PFIndexes[2] * 9 + PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
if (Cost <= 4)
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
return GenerateTBL(Op, ShuffleMask, DAG);
// Lowers a SPLAT_VECTOR node. Integer splat values are resized with
// getAnyExtOrTrunc (to i32 for i8/i16/i32 elements, to i64 for i64 elements)
// and the result is emitted as an AArch64ISD::DUP of the splat value. i1
// elements are handled separately and return either a ptrue (constant one)
// or a whilelo intrinsic node. Unsupported element types are a fatal error.
// NOTE(review): no `break;`, `default:` or closing-brace lines are visible in
// this listing, so the exact case structure should be confirmed upstream.
SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT VT = Op.getValueType();
EVT ElemVT = VT.getScalarType();
SDValue SplatVal = Op.getOperand(0);
// Extend input splat value where needed to fit into a GPR (32b or 64b only)
// FPRs don't have this restriction.
switch (ElemVT.getSimpleVT().SimpleTy) {
case MVT::i1: {
// The only legal i1 vectors are SVE vectors, so we can use SVE-specific
// lowering code.
if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
// A splat of constant true is an all-lanes ptrue.
if (ConstVal->isOne())
return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
// TODO: Add special case for constant false
// The general case of i1. There isn't any natural way to do this,
// so we use some trickery with whilelo.
SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal,
SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl,
// Emit the whilelo intrinsic with operands (0, SplatVal).
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
DAG.getConstant(0, dl, MVT::i64), SplatVal);
case MVT::i8:
case MVT::i16:
case MVT::i32:
SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
case MVT::i64:
SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
case MVT::f16:
case MVT::bf16:
case MVT::f32:
case MVT::f64:
// Fine as is
report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
// Lowers a DUPQ-lane operation: operand 1 is the data vector and operand 2 is
// the 128-bit quadword index. Returns an empty SDValue when the result type
// is not a legal scalable vector of minimum size AArch64::SVEBitsPerBlock.
// A constant index in [0, 3] selects the DUP_ZZI_Q machine instruction; any
// other index is lowered to a TBL with a computed index vector.
SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
if (!isTypeLegal(VT) || !VT.isScalableVector())
return SDValue();
// Current lowering only supports the SVE-ACLE types.
if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
return SDValue();
// The DUPQ operation is independent of element type so normalise to i64s.
SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
SDValue Idx128 = Op.getOperand(2);
// DUPQ can be used when idx is in range.
auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
if (CIdx && (CIdx->getZExtValue() <= 3)) {
SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
SDNode *DUPQ =
DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
// Cast the nxv2i64 result back to the requested type.
return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
// The ACLE says this must produce the same result as:
// svtbl(data, svadd_x(svptrue_b64(),
// svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
// index * 2))
SDValue One = DAG.getConstant(1, DL, MVT::i64);
SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
// create the vector 0,1,0,1,...
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR,
DL, MVT::nxv2i64, Zero, One);
SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
// create the vector idx64,idx64+1,idx64,idx64+1,...
// (idx64 is the 128-bit index doubled, computed as Idx128 + Idx128.)
SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
// create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
// If BVN is a constant splat, replicates the splat pattern across the full
// width of the vector type: the splat bits are accumulated into CnstBits and
// (SplatBits ^ SplatUndef) into UndefBits. Returns true in that case and
// false when BVN is not a constant splat.
// The outputs are updated by shift-and-OR, so callers are expected to pass
// APInts of VT.getSizeInBits() bits (the callers in this file pass zero).
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
APInt &UndefBits) {
EVT VT = BVN->getValueType(0);
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
// Number of copies of the splat pattern needed to fill the vector.
unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
for (unsigned i = 0; i < NumSplats; ++i) {
// Make room for the next copy, then OR it into the low bits.
CnstBits <<= SplatBitSize;
UndefBits <<= SplatBitSize;
CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
return true;
return false;
// Try 64-bit splatted SIMD immediate.
// Succeeds only when the high and low 64 bits of Bits are equal and the low
// 64 bits form an AdvSIMD modified immediate of type 10. The NewOp node is
// built as v2i64 for 128-bit results and f64 otherwise, then NVCAST to the
// type of Op. Returns an empty SDValue when the immediate does not match.
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
// Replace the raw value with its encoded immediate form.
Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
SDLoc dl(Op);
SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
return SDValue();
// Try 32-bit splatted SIMD immediate.
// Succeeds only when the high and low 64 bits of Bits are equal and the low
// 64 bits match AdvSIMD modified-immediate type 1, 2, 3 or 4, which are paired
// here with shift operands 0, 8, 16 and 24 respectively. The NewOp node is
// built as v4i32 (128-bit results) or v2i32 and NVCAST to the type of Op.
// When LHS is non-null it is passed as the first operand of NewOp, ahead of
// the immediate and shift. Returns an empty SDValue when nothing matches.
// NOTE(review): the `else` separating the two `Mov =` assignments and the
// closing braces are not visible in this listing — confirm upstream.
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits,
const SDValue *LHS = nullptr) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
bool isAdvSIMDModImm = false;
uint64_t Shift;
// Each test records its result in isAdvSIMDModImm; on a match Value is
// replaced by the encoded immediate.
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
Shift = 0;
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
Shift = 8;
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
Shift = 16;
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
Shift = 24;
if (isAdvSIMDModImm) {
SDLoc dl(Op);
SDValue Mov;
// Three-operand form (LHS, imm, shift) when an LHS is supplied.
if (LHS)
Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
// Two-operand form (imm, shift).
Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
return SDValue();
// Try 16-bit splatted SIMD immediate.
// Succeeds only when the high and low 64 bits of Bits are equal and the low
// 64 bits match AdvSIMD modified-immediate type 5 or 6, paired here with
// shift operands 0 and 8 respectively. The NewOp node is built as v8i16
// (128-bit results) or v4i16 and NVCAST to the type of Op. When LHS is
// non-null it is passed as the first operand of NewOp. Returns an empty
// SDValue when nothing matches.
// NOTE(review): the `else` separating the two `Mov =` assignments and the
// closing braces are not visible in this listing — confirm upstream.
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits,
const SDValue *LHS = nullptr) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
bool isAdvSIMDModImm = false;
uint64_t Shift;
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
Shift = 0;
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
Shift = 8;
if (isAdvSIMDModImm) {
SDLoc dl(Op);
SDValue Mov;
// Three-operand form (LHS, imm, shift) when an LHS is supplied.
if (LHS)
Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
// Two-operand form (imm, shift).
Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
return SDValue();
// Try 32-bit splatted SIMD immediate with shifted ones.
// Succeeds only when the high and low 64 bits of Bits are equal and the low
// 64 bits match AdvSIMD modified-immediate type 7 or 8. The NewOp node is
// built as v4i32 (128-bit results) or v2i32 with (imm, shift) operands and
// NVCAST to the type of Op. Returns an empty SDValue when nothing matches.
// NOTE(review): the shift operands 264 and 272 presumably are pre-encoded
// shifter immediates rather than plain bit counts — confirm their meaning.
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
SelectionDAG &DAG, const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
bool isAdvSIMDModImm = false;
uint64_t Shift;
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
Shift = 264;
else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
Shift = 272;
if (isAdvSIMDModImm) {
SDLoc dl(Op);
SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
return SDValue();
// Try 8-bit splatted SIMD immediate.
// Succeeds only when the high and low 64 bits of Bits are equal and the low
// 64 bits match AdvSIMD modified-immediate type 9. The NewOp node is built
// as v16i8 (128-bit results) or v8i8 with the encoded immediate as its only
// operand and NVCAST to the type of Op. Returns an empty SDValue otherwise.
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
// Replace the raw value with its encoded immediate form.
Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
SDLoc dl(Op);
SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
return SDValue();
// Try FP splatted SIMD immediate.
// Succeeds only when the high and low 64 bits of Bits are equal and the low
// 64 bits match AdvSIMD modified-immediate type 11 (built as v4f32 or v2f32)
// or, for 128-bit results only, type 12 (built as v2f64). The NewOp node is
// then NVCAST to the type of Op. Returns an empty SDValue otherwise.
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
bool isWide = (VT.getSizeInBits() == 128);
// MovTy is only assigned on a successful match below.
MVT MovTy;
bool isAdvSIMDModImm = false;
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
// Type 12 is only attempted for 128-bit vectors.
else if (isWide &&
(isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
MovTy = MVT::v2f64;
if (isAdvSIMDModImm) {
SDLoc dl(Op);
SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
DAG.getConstant(Value, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
return SDValue();
// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, returned in reference arg
// ConstVal
// Returns false (leaving ConstVal untouched) when PotentialBVec is not a
// BuildVectorSDNode, when operand 0 is not a ConstantSDNode, or when any
// later operand is not that same node.
static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
uint64_t &ConstVal) {
BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
if (!Bvec)
return false;
ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
if (!FirstElt)
return false;
EVT VT = Bvec->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
// The comparison is on node pointers, not on constant values: every lane
// must be the very same ConstantSDNode as lane 0. A non-constant lane
// yields nullptr from dyn_cast and therefore also fails the test.
for (unsigned i = 1; i < NumElts; ++i)
if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
return false;
ConstVal = FirstElt->getZExtValue();
return true;
// Returns the intrinsic ID carried by node N, or Intrinsic::not_intrinsic.
// The ID is read from operand 0 (which must be a ConstantSDNode) and is only
// returned when it is below Intrinsic::num_intrinsics.
// NOTE(review): the `case`/`default:` labels of the switch on the opcode are
// not visible in this listing, so which opcodes reach the operand read cannot
// be determined from here — confirm upstream.
static unsigned getIntrinsicID(const SDNode *N) {
unsigned Opcode = N->getOpcode();
switch (Opcode) {
return Intrinsic::not_intrinsic;
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
if (IID < Intrinsic::num_intrinsics)
return IID;
return Intrinsic::not_intrinsic;
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTORs with constant element C1, C2 is a constant, and:
// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
// N is the OR node being examined. Returns the new VSLI/VSRI node on success
// and an empty SDValue whenever the pattern does not match.
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
// Only vector ORs are candidates.
if (!VT.isVector())
return SDValue();
SDLoc DL(N);
SDValue And;
SDValue Shift;
SDValue FirstOp = N->getOperand(0);
unsigned FirstOpc = FirstOp.getOpcode();
SDValue SecondOp = N->getOperand(1);
unsigned SecondOpc = SecondOp.getOpcode();
// Is one of the operands an AND or a BICi? The AND may have been optimised to
// a BICi in order to use an immediate instead of a register.
// Is the other operand an shl or lshr? This will have been turned into:
// AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
(SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
And = FirstOp;
Shift = SecondOp;
} else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
(FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
And = SecondOp;
Shift = FirstOp;
} else
return SDValue();
// Remember which form of mask and which shift direction was matched.
bool IsAnd = And.getOpcode() == ISD::AND;
bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
// Is the shift amount constant?
ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
if (!C2node)
return SDValue();
uint64_t C1;
if (IsAnd) {
// Is the and mask vector all constant?
if (!isAllConstantBuildVector(And.getOperand(1), C1))
return SDValue();
} else {
// Reconstruct the corresponding AND immediate from the two BICi immediates.
ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
assert(C1nodeImm && C1nodeShift);
// The AND mask is taken as the complement of (imm << shift).
C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
// Is C1 == ~(Ones(ElemSizeInBits) << C2) or
// C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
// how much one can shift elements of a particular size?
uint64_t C2 = C2node->getZExtValue();
unsigned ElemSizeInBits = VT.getScalarSizeInBits();
// A shift amount larger than the element width cannot match.
if (C2 > ElemSizeInBits)
return SDValue();
// Compare masks at element width: a right shift requires the top C2 bits
// set, a left shift requires the bottom C2 bits set.
APInt C1AsAPInt(ElemSizeInBits, C1);
APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
: APInt::getLowBitsSet(ElemSizeInBits, C2);
if (C1AsAPInt != RequiredC1)
return SDValue();
SDValue X = And.getOperand(0);
SDValue Y = Shift.getOperand(0);
unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
// The original shift-amount operand is reused as the insert's shift amount.
SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
LLVM_DEBUG(dbgs() << "into: \n");
return ResultSLI;
// Custom lowering for a vector OR. First tries to turn the OR into an
// SLI/SRI via tryLowerToSLI. Otherwise, if one operand is a constant-splat
// BUILD_VECTOR, tries to fold it into an AArch64ISD::ORRi immediate form
// (32-bit then 16-bit splat, using DefBits first and UndefBits second) with
// the other operand as LHS. Returns Op unchanged when nothing applies.
// NOTE(review): the initialiser of BVN continues on a line that is not
// visible in this listing — confirm upstream.
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
// Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
return Res;
EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
BuildVectorSDNode *BVN =
if (!BVN) {
// OR commutes, so try swapping the operands.
LHS = Op.getOperand(1);
BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
// Neither operand is a BUILD_VECTOR: keep the OR as it is.
if (!BVN)
return Op;
APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
if (resolveBuildVector(BVN, DefBits, UndefBits)) {
SDValue NewOp;
// First attempt: the constant bits as resolved.
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
DefBits, &LHS)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
DefBits, &LHS)))
return NewOp;
// Second attempt: the alternative bit pattern from UndefBits.
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
UndefBits, &LHS)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
UndefBits, &LHS)))
return NewOp;
// We can always fall back to a non-immediate OR.
return Op;
// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit element width.
// Op must be an ISD::BUILD_VECTOR. It is returned unchanged when its element
// type is floating point or wider than 16 bits; otherwise a new BUILD_VECTOR
// of the same type is created from the collected operands, with constant
// lanes rebuilt as i32 constants and undef lanes as i32 undef.
// NOTE(review): the statement appending each lane to Ops and the second
// argument of the LowBits constructor are not visible in this listing —
// confirm upstream.
static SDValue NormalizeBuildVector(SDValue Op,
SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
SDLoc dl(Op);
EVT VT = Op.getValueType();
EVT EltTy= VT.getVectorElementType();
// Nothing to normalise for FP elements or elements wider than 16 bits.
if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
return Op;
SmallVector<SDValue, 16> Ops;
for (SDValue Lane : Op->ops()) {
// For integer vectors, type legalization would have promoted the
// operands already. Otherwise, if Op is a floating-point splat
// (with operands cast to integers), then the only possibilities
// are constants and UNDEFs.
if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
// LowBits has the element's bit width; its zero-extended value becomes
// the new i32 constant lane.
APInt LowBits(EltTy.getSizeInBits(),
Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
} else if (Lane.getNode()->isUndef()) {
Lane = DAG.getUNDEF(MVT::i32);
} else {
assert(Lane.getValueType() == MVT::i32 &&
"Unexpected BUILD_VECTOR operand type");
return DAG.getBuildVector(VT, dl, Ops);
// Tries to materialise a constant-splat BUILD_VECTOR with a single AdvSIMD
// immediate node. Op must be a BuildVectorSDNode (enforced by cast<>). When
// resolveBuildVector succeeds, four bit patterns are tried in order:
// DefBits (MOVI/FMOV forms), ~DefBits (MVNI forms), UndefBits (MOVI/FMOV
// forms) and ~UndefBits (MVNI forms). Returns the first node produced, or an
// empty SDValue if no immediate form matches.
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
if (resolveBuildVector(BVN, DefBits, UndefBits)) {
SDValue NewOp;
// 1) The constant bits themselves.
if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
return NewOp;
// 2) The inverted constant bits, using the MVNI node kinds.
DefBits = ~DefBits;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
return NewOp;
// 3) The alternative pattern from UndefBits.
DefBits = UndefBits;
if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
return NewOp;
// 4) The inverted UndefBits pattern, using the MVNI node kinds.
DefBits = ~UndefBits;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
return NewOp;
return SDValue();
// Custom lowering for BUILD_VECTOR. After normalising the operands, the
// strategies visible below are tried in order: pass through all-zeros /
// all-ones integer splats, materialise a constant via ConstantBuildVector,
// SCALAR_TO_VECTOR when only lane 0 is defined, UZP1/UZP2 for even/odd
// extract patterns, DUP / DUPLANE for splats, splat-one-constant then insert
// the remaining lanes, ReconstructShuffle, and finally a chain of
// INSERT_VECTOR_ELT. Returning an empty SDValue requests the default
// expansion.
// NOTE(review): many structural lines (closing braces, `else`, `continue;`,
// `break;`, `LLVM_DEBUG(` openers and some call arguments) are not visible in
// this listing; the exact nesting must be confirmed against upstream.
SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
// Try to build a simple constant vector.
Op = NormalizeBuildVector(Op, DAG);
if (VT.isInteger()) {
// Certain vector constants, used to express things like logical NOT and
// arithmetic NEG, are passed through unmodified. This allows special
// patterns for these operations to match, which will lower these constants
// to whatever is proven necessary.
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
if (BVN->isConstant())
if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
unsigned BitSize = VT.getVectorElementType().getSizeInBits();
APInt Val(BitSize,
if (Val.isNullValue() || Val.isAllOnesValue())
return Op;
if (SDValue V = ConstantBuildVector(Op, DAG))
return V;
// Scan through the operands to find some interesting properties we can
// exploit:
// 1) If only one value is used, we can use a DUP, or
// 2) if only the low element is not undef, we can just insert that, or
// 3) if only one constant value is used (w/ some non-constant lanes),
// we can splat the constant value into the whole vector then fill
// in the non-constant lanes.
// 4) FIXME: If different constant values are used, but we can intelligently
// select the values we'll be overwriting for the non-constant
// lanes such that we can directly materialize the vector
// some other way (MOVI, e.g.), we can be sneaky.
// 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
SDLoc dl(Op);
unsigned NumElts = VT.getVectorNumElements();
// Properties gathered by the operand scan below; each starts optimistic and
// is cleared when an operand contradicts it.
bool isOnlyLowElement = true;
bool usesOnlyOneValue = true;
bool usesOnlyOneConstantValue = true;
bool isConstant = true;
bool AllLanesExtractElt = true;
unsigned NumConstantLanes = 0;
SDValue Value;
SDValue ConstantValue;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
AllLanesExtractElt = false;
if (V.isUndef())
if (i > 0)
isOnlyLowElement = false;
if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
isConstant = false;
if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
if (!ConstantValue.getNode())
ConstantValue = V;
else if (ConstantValue != V)
usesOnlyOneConstantValue = false;
if (!Value.getNode())
Value = V;
else if (V != Value)
usesOnlyOneValue = false;
// No operand was recorded in Value: the whole vector is undef.
if (!Value.getNode()) {
dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
return DAG.getUNDEF(VT);
// Convert BUILD_VECTOR where all elements but the lowest are undef into
// SCALAR_TO_VECTOR, except for when we have a single-element constant vector
// as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
if (isOnlyLowElement && !(NumElts == 1 && isa<ConstantSDNode>(Value))) {
LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
if (AllLanesExtractElt) {
SDNode *Vector = nullptr;
bool Even = false;
bool Odd = false;
// Check whether the extract elements match the Even pattern <0,2,4,...> or
// the Odd pattern <1,3,5,...>.
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
const SDNode *N = V.getNode();
if (!isa<ConstantSDNode>(N->getOperand(1)))
SDValue N0 = N->getOperand(0);
// All elements are extracted from the same vector.
if (!Vector) {
Vector = N0.getNode();
// Check that the type of EXTRACT_VECTOR_ELT matches the type of
if (VT.getVectorElementType() !=
} else if (Vector != N0.getNode()) {
Odd = false;
Even = false;
// Extracted values are either at Even indices <0,2,4,...> or at Odd
// indices <1,3,5,...>.
uint64_t Val = N->getConstantOperandVal(1);
if (Val == 2 * i) {
Even = true;
if (Val - 1 == 2 * i) {
Odd = true;
// Something does not match: abort.
Odd = false;
Even = false;
if (Even || Odd) {
// Split the source vector into its low half (offset 0) and high half
// (offset NumElts), each of the result type VT.
SDValue LHS =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
DAG.getConstant(0, dl, MVT::i64));
SDValue RHS =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
DAG.getConstant(NumElts, dl, MVT::i64));
if (Even && !Odd)
return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
if (Odd && !Even)
return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
// Use DUP for non-constant splats. For f32 constant splats, reduce to
// i32 and try again.
if (usesOnlyOneValue) {
if (!isConstant) {
if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Value.getValueType() != VT) {
dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
// This is actually a DUPLANExx operation, which keeps everything vectory.
SDValue Lane = Value.getOperand(1);
Value = Value.getOperand(0);
if (Value.getValueSizeInBits() == 64) {
dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
"widening it\n");
Value = WidenVector(Value, DAG);
unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
return DAG.getNode(Opcode, dl, VT, Value, Lane);
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
EVT EltTy = VT.getVectorElementType();
assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
EltTy == MVT::f64) && "Unsupported floating-point vector type");
dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
"BITCASTS, and try again\n");
// Rebuild the vector with same-width integer lanes and lower that instead.
MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
Val = LowerBUILD_VECTOR(Val, DAG);
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
// If there was only one constant value used and for more than one lane,
// start by splatting that value, then replace the non-constant lanes. This
// is better than the default, which will perform a separate initialization
// for each lane.
if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
// Firstly, try to materialize the splat constant.
SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
Val = ConstantBuildVector(Vec, DAG);
if (!Val) {
// Otherwise, materialize the constant and splat it.
Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
// Now insert the non-constant lanes.
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
// Note that type legalization likely mucked about with the VT of the
// source operand, so we may have to convert it here before inserting.
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
return Val;
// This will generate a load from the constant pool.
if (isConstant) {
dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
return SDValue();
// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
if (NumElts >= 4) {
if (SDValue shuffle = ReconstructShuffle(Op, DAG))
return shuffle;
// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
// know the default expansion would otherwise fall back on something even
// worse. For a vector with one or two non-undef values, that's
// scalar_to_vector for the elements followed by a shuffle (provided the
// shuffle is valid for the target) and materialization element by element
// on the stack followed by a load for everything else.
if (!isConstant && !usesOnlyOneValue) {
dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
SDValue Vec = DAG.getUNDEF(VT);
SDValue Op0 = Op.getOperand(0);
unsigned i = 0;
// Use SCALAR_TO_VECTOR for lane zero to
// a) Avoid a RMW dependency on the full vector register, and
// b) Allow the register coalescer to fold away the copy if the
// value is already in an S or D register, and we're forced to emit an
// INSERT_SUBREG that we can't fold anywhere.
// We also allow types like i8 and i16 which are illegal scalar but legal
// vector element types. After type-legalization the inserted value is
// extended (i32) and it is safe to cast them to the vector type by ignoring
// the upper bits of the lowest lane (e.g. v8i8, v4i16).
if (!Op0.isUndef()) {
LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
LLVM_DEBUG(if (i < NumElts) dbgs()
<< "Creating nodes for the other vector elements:\n";);
// Insert the remaining lanes one at a time into Vec.
for (; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
return Vec;
dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
"better alternative\n");
return SDValue();
// Custom lowering for INSERT_VECTOR_ELT. Returns an empty SDValue for a
// non-constant or out-of-range lane index and for vector types not listed
// below. The listed 128-bit types are returned unchanged; the listed 64-bit
// types are widened, the insertion is done on the wide vector, and the
// result is narrowed again.
SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
// Check for non-constant or out of range lane.
EVT VT = Op.getOperand(0).getValueType();
ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
VT == MVT::v8f16 || VT == MVT::v8bf16)
return Op;
// Anything that is not one of the handled 64-bit types is left to the
// caller.
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
VT != MVT::v4bf16)
return SDValue();
// For V64 types, we perform insertion by expanding the value
// to a V128 type and perform the insertion on that.
SDLoc DL(Op);
SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
EVT WideTy = WideVec.getValueType();
SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
Op.getOperand(1), Op.getOperand(2));
// Re-narrow the resultant vector.
return NarrowVector(Node, DAG);
// Custom lowering for EXTRACT_VECTOR_ELT. Returns an empty SDValue for a
// non-constant or out-of-range lane index and for vector types not listed
// below. The listed 128-bit types are returned unchanged; the listed 64-bit
// types are widened and the element is extracted from the wide vector, with
// i8/i16 element results produced as i32.
// NOTE(review): the return-type line before the qualified name and the final
// argument line of the last getNode call are not visible in this listing —
// confirm upstream.
AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
// Check for non-constant or out of range lane.
EVT VT = Op.getOperand(0).getValueType();
ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
VT == MVT::v8f16 || VT == MVT::v8bf16)
return Op;
// Anything that is not one of the handled 64-bit types is left to the
// caller.
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
VT != MVT::v4bf16)
return SDValue();
// For V64 types, we perform extraction by expanding the value
// to a V128 type and perform the extraction on that.
SDLoc DL(Op);
SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
EVT WideTy = WideVec.getValueType();
EVT ExtrTy = WideTy.getVectorElementType();
// Sub-32-bit integer elements are extracted as i32.
if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
ExtrTy = MVT::i32;
// For extractions, we just return the result directly.
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
// Custom lowering for EXTRACT_SUBVECTOR producing a fixed-length vector.
// Operand 1 must be a constant index (enforced by cast<>). Returns Op
// unchanged for the cases matched later in selection, and an empty SDValue
// otherwise:
//  - scalable input: only index 0 from a packed vector type is kept;
//  - fixed input: index 0 from a vector of at most 128 bits, or a 64-bit
//    result whose start offset (Idx * element size) is bit 64.
SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getValueType().isFixedLengthVector() &&
"Only cases that extract a fixed length vector are supported!");
EVT InVT = Op.getOperand(0).getValueType();
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
unsigned Size = Op.getValueSizeInBits();
if (InVT.isScalableVector()) {
// This will be matched by custom code during ISelDAGToDAG.
if (Idx == 0 && isPackedVectorType(InVT, DAG))
return Op;
return SDValue();
// This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
if (Idx == 0 && InVT.getSizeInBits() <= 128)
return Op;
// If this is extracting the upper 64-bits of a 128-bit vector, we match
// that directly.
if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64)
return Op;
return SDValue();
// Custom lowering for INSERT_SUBVECTOR into a scalable vector. Operand 1 is
// the inserted sub-vector and operand 2 its constant index (enforced by
// cast<>). Returns Op unchanged only when a fixed-length sub-vector that uses
// SVE is inserted at index 0 of an undef vector and isPackedVectorType holds
// for the sub-vector type; every other case returns an empty SDValue.
SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getValueType().isScalableVector() &&
"Only expect to lower inserts into scalable vectors!");
EVT InVT = Op.getOperand(1).getValueType();
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
// We don't have any patterns for scalable vector yet.
if (InVT.isScalableVector() || !useSVEForFixedLengthVectorVT(InVT))
return SDValue();
// This will be matched by custom code during ISelDAGToDAG.
if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
return Op;
return SDValue();
/// Report whether the shuffle mask \p M is accepted for vector type \p VT.
///
/// Fixed length types that need SVE are always rejected. Four-element 64-bit
/// and 128-bit vectors are first looked up in PerfectShuffleTable and accepted
/// when the recorded cost is at most 4. Otherwise the mask is accepted only if
/// it matches one of the patterns tested in the final return expression.
bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
// Currently no fixed length shuffles that require SVE are legal.
if (useSVEForFixedLengthVectorVT(VT))
return false;
if (VT.getVectorNumElements() == 4 &&
(VT.is128BitVector() || VT.is64BitVector())) {
unsigned PFIndexes[4];
for (unsigned i = 0; i != 4; ++i) {
// NOTE(review): no `else` is visible before the second assignment in this
// listing; presumably negative (undef) mask entries map to 8 -- confirm
// against the upstream file.
if (M[i] < 0)
PFIndexes[i] = 8;
PFIndexes[i] = M[i];
// Compute the index in the perfect shuffle table.
// The four entries are combined as the digits of a base-9 number.
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
PFIndexes[2] * 9 + PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
// The cost is read from the bits at position 30 and above of the entry.
unsigned Cost = (PFEntry >> 30);
if (Cost <= 4)
return true;
// Out-parameters required by the matchers below; their values are not used.
bool DummyBool;
int DummyInt;
unsigned DummyUnsigned;
return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
// isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
isZIPMask(M, VT, DummyUnsigned) ||
isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
isConcatMask(M, VT, VT.getSizeInBits() == 128));
/// getVShiftImm - Look through any bitcasts on \p Op and test whether the
/// underlying node is a build_vector whose elements all carry the same
/// constant, with a splat no wider than \p ElementBits.
///
/// \param [out] Cnt receives the sign-extended splat value on success and is
/// left untouched on failure.
/// \returns true when such a constant splat was found.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);

  BuildVectorSDNode *BuildVec = dyn_cast<BuildVectorSDNode>(Op.getNode());
  if (!BuildVec)
    return false;

  APInt SplatValue, UndefMask;
  unsigned SplatWidth;
  bool SawUndef;
  if (!BuildVec->isConstantSplat(SplatValue, UndefMask, SplatWidth, SawUndef,
                                 ElementBits))
    return false;

  // Reject splats that are wider than a single element.
  if (SplatWidth > ElementBits)
    return false;

  Cnt = SplatValue.getSExtValue();
  return true;
}
/// isVShiftLImm - Check if \p Op is a valid build_vector for the immediate
/// operand of a vector shift left operation. The splat value, returned in
/// \p Cnt, must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift (\p isLong).
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");

  const int64_t EltBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, EltBits, Cnt))
    return false;
  if (Cnt < 0)
    return false;

  // A long shift may use the full element width, so compare one less.
  const int64_t Compared = isLong ? Cnt - 1 : Cnt;
  return Compared < EltBits;
}
/// isVShiftRImm - Check if \p Op is a valid build_vector for the immediate
/// operand of a vector shift right operation. The splat value, returned in
/// \p Cnt, must be in the range:
///   1 <= Value <= ElementBits for a right shift; or
///   1 <= Value <= ElementBits / 2 when \p isNarrow is set.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");

  const int64_t EltBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, EltBits, Cnt))
    return false;
  if (Cnt < 1)
    return false;

  const int64_t UpperBound = isNarrow ? EltBits / 2 : EltBits;
  return Cnt <= UpperBound;
}
// Custom lowering for TRUNCATE.
//
// Results whose scalar type is i1 are rewritten as `(x & 1) != 0`. Other
// non-vector or scalable-vector results are returned unchanged, and inputs for
// which useSVEForFixedLengthVectorVT() holds are forwarded to
// LowerFixedLengthVectorTruncateToSVE(). What remains is the rounding halving
// add match described below; whenever that match fails, Op is returned
// unchanged.
//
// Attempt to form urhadd(OpA, OpB) from
// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)).
// The original form of this expression is
// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function
// is called the srl will have been lowered to AArch64ISD::VLSHR and the
// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)).
// This pass can also recognize a variant of this pattern that uses sign
// extension instead of zero extension and form a srhadd(OpA, OpB) from it.
SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
+ if (VT.getScalarType() == MVT::i1) {
+ // Lower i1 truncate to `(x & 1) != 0`.
+ SDLoc dl(Op);
+ EVT OpVT = Op.getOperand(0).getValueType();
+ SDValue Zero = DAG.getConstant(0, dl, OpVT);
+ SDValue One = DAG.getConstant(1, dl, OpVT);
+ SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
+ return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
+ }
if (!VT.isVector() || VT.isScalableVector())
return Op;
if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
// Since we are looking for a right shift by a constant value of 1 and we are
// operating on types at least 16 bits in length (sign/zero extended OpA and
// OpB, which are at least 8 bits), it follows that the truncate will always
// discard the shifted-in bit and therefore the right shift will be logical
// regardless of the signedness of OpA and OpB.
SDValue Shift = Op.getOperand(0);
if (Shift.getOpcode() != AArch64ISD::VLSHR)
return Op;
// Is the right shift using an immediate value of 1?
uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
if (ShiftAmount != 1)
return Op;
SDValue Sub = Shift->getOperand(0);
if (Sub.getOpcode() != ISD::SUB)
return Op;
SDValue Xor = Sub.getOperand(1);
if (Xor.getOpcode() != ISD::XOR)
return Op;
// OpA is the extended value under the XOR, OpB the minuend of the SUB.
SDValue ExtendOpA = Xor.getOperand(0);
SDValue ExtendOpB = Sub.getOperand(0);
unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
// Both operands must use the same kind of extension (zero or sign).
if (!(ExtendOpAOpc == ExtendOpBOpc &&
(ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
return Op;
// Is the result of the right shift being truncated to the same value type as
// the original operands, OpA and OpB?
SDValue OpA = ExtendOpA.getOperand(0);
SDValue OpB = ExtendOpB.getOperand(0);
EVT OpAVT = OpA.getValueType();
assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
return Op;
// Is the XOR using a constant amount of all ones in the right hand side?
uint64_t C;
if (!isAllConstantBuildVector(Xor.getOperand(1), C))
return Op;
// Compare at the width of the truncated element type.
unsigned ElemSizeInBits = VT.getScalarSizeInBits();
APInt CAsAPInt(ElemSizeInBits, C);
if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
return Op;
SDLoc DL(Op);
// Sign extension selects SRHADD, zero extension selects URHADD.
bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
unsigned RHADDOpc = IsSignExtend ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
SDValue ResultURHADD = DAG.getNode(RHADDOpc, DL, VT, OpA, OpB);
return ResultURHADD;
/// Custom lowering for vector SHL, SRA and SRL nodes.
///
/// \p Op is returned unchanged when its shift amount operand is not a vector.
/// Scalable vector types are forwarded to LowerToPredicatedOp(). For other
/// vectors, a shift amount accepted by isVShiftLImm()/isVShiftRImm() that is
/// smaller than the element size selects the immediate nodes VSHL, VASHR or
/// VLSHR; the remaining lines build the aarch64_neon_ushl / aarch64_neon_sshl
/// intrinsic forms, negating the amount for right shifts.
///
/// NOTE(review): several lines of this function (the `default:` label, the
/// heads of some getNode() calls and some operands) are not visible in this
/// listing -- confirm details against the upstream file.
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
int64_t Cnt;
if (!Op.getOperand(1).getValueType().isVector())
return Op;
unsigned EltSize = VT.getScalarSizeInBits();
switch (Op.getOpcode()) {
llvm_unreachable("unexpected shift opcode");
case ISD::SHL:
if (VT.isScalableVector())
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_MERGE_OP1);
// Left shift immediate
if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
DAG.getConstant(Cnt, DL, MVT::i32));
DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
Op.getOperand(0), Op.getOperand(1));
case ISD::SRA:
case ISD::SRL:
if (VT.isScalableVector()) {
unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_MERGE_OP1
return LowerToPredicatedOp(Op, DAG, Opc);
// Right shift immediate
if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
unsigned Opc =
(Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
DAG.getConstant(Cnt, DL, MVT::i32));
// Right shift register. Note, there is not a shift right register
// instruction, but the shift left register instruction takes a signed
// value, where negative numbers specify a right shift.
unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
: Intrinsic::aarch64_neon_ushl;
// negate the shift amount
SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
SDValue NegShiftLeft =
DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
return NegShiftLeft;
return SDValue();
/// Emit an AArch64 vector comparison node for \p LHS and \p RHS under
/// condition code \p CC, producing a value of type \p VT.
///
/// \p VT must have the same total size as the type of \p LHS. When \p RHS is
/// a build vector that resolveBuildVector() resolves to all-zero bits, the
/// compare-against-zero node variants (FCMEQz, CMGEz, ...) that take only
/// \p LHS are used where one is listed for the condition. NE is formed as NOT
/// of the EQ comparison. Conditions whose node only exists in the opposite
/// direction are emitted with the operands swapped.
///
/// \returns the comparison node, or an empty SDValue when none is emitted
/// (for example floating point LT when \p NoNans is false).
///
/// NOTE(review): the `default:` labels, `else` keywords and closing braces of
/// this function are not visible in this listing -- confirm against upstream.
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
AArch64CC::CondCode CC, bool NoNans, EVT VT,
const SDLoc &dl, SelectionDAG &DAG) {
EVT SrcVT = LHS.getValueType();
assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
"function only supposed to emit natural comparisons");
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
APInt CnstBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
// True only for a constant RHS whose resolved bits are all zero.
bool IsZero = IsCnst && (CnstBits == 0);
// Floating point element types use the FCM* nodes.
if (SrcVT.getVectorElementType().isFloatingPoint()) {
switch (CC) {
return SDValue();
case AArch64CC::NE: {
SDValue Fcmeq;
if (IsZero)
Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
case AArch64CC::EQ:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
case AArch64CC::GE:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
case AArch64CC::GT:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
case AArch64CC::LS:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
case AArch64CC::LT:
if (!NoNans)
return SDValue();
// If we ignore NaNs then we can use the MI implementation.
case AArch64CC::MI:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
// Integer element types use the CM* nodes.
switch (CC) {
return SDValue();
case AArch64CC::NE: {
SDValue Cmeq;
if (IsZero)
Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
case AArch64CC::EQ:
if (IsZero)
return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
case AArch64CC::GE:
if (IsZero)
return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
case AArch64CC::GT:
if (IsZero)
return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
case AArch64CC::LE:
if (IsZero)
return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
case AArch64CC::LS:
return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
case AArch64CC::LO:
return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
case AArch64CC::LT:
if (IsZero)
return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
case AArch64CC::HI:
return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
case AArch64CC::HS:
return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
/// Custom lowering for vector SETCC.
///
/// Scalable vector results: floating point compares are returned unchanged
/// and all others go through LowerToPredicatedOp() with SETCC_MERGE_ZERO.
/// Integer element compares map the condition with changeIntCCToAArch64CC()
/// and emit a single comparison. Floating point compares may need two
/// comparisons OR'ed together plus a final inversion, as reported by
/// changeVectorFPCCToAArch64CC().
///
/// \returns the lowered value, or an empty SDValue when a required comparison
/// cannot be emitted or an f16 vector without full FP16 support does not have
/// four elements.
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
SelectionDAG &DAG) const {
if (Op.getValueType().isScalableVector()) {
if (Op.getOperand(0).getValueType().isFloatingPoint())
return Op;
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
// The comparison is emitted on an integer vector with the operands' layout.
EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
SDLoc dl(Op);
if (LHS.getValueType().getVectorElementType().isInteger()) {
assert(LHS.getValueType() == RHS.getValueType());
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
SDValue Cmp =
EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
const bool FullFP16 =
static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
// Make v4f16 (only) fcmp operations utilise vector instructions
// v8f16 support will be a little more complicated
if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
if (LHS.getValueType().getVectorNumElements() == 4) {
// Widen both operands to v4f32 and compare there instead.
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
DAG.ReplaceAllUsesWith(Op, NewSetcc);
CmpVT = MVT::v4i32;
} else
return SDValue();
assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
LHS.getValueType().getVectorElementType() != MVT::f128);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
AArch64CC::CondCode CC1, CC2;
bool ShouldInvert;
changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
SDValue Cmp =
EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
if (!Cmp.getNode())
return SDValue();
// A second condition other than AL means both results must be OR'ed.
if (CC2 != AArch64CC::AL) {
SDValue Cmp2 =
EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
if (!Cmp2.getNode())
return SDValue();
Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
if (ShouldInvert)
Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
return Cmp;
/// Build the across-vector node \p Op over the vector operand of \p ScalarOp
/// and return element 0 of that node, typed as \p ScalarOp's value type.
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
                                  SelectionDAG &DAG) {
  SDValue Vector = ScalarOp.getOperand(0);

  // The reduction node keeps the vector type of its input.
  SDValue Reduced = DAG.getNode(Op, DL, Vector.getSimpleValueType(), Vector);

  SDValue LaneZero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(),
                     Reduced, LaneZero);
}
/// Custom lowering for vector reduction nodes.
///
/// Integer reductions are built by getReductionSDNode() using the UADDV,
/// SMAXV, SMINV, UMAXV and UMINV nodes. The fmax/fmin forms assert that the
/// node carries the no-NaNs flag and emit the aarch64_neon_fmaxnmv /
/// aarch64_neon_fminnmv intrinsics. Any other opcode is unreachable.
///
/// NOTE(review): the case labels selecting each return, and the trailing
/// operands of the intrinsic nodes, are not visible in this listing --
/// confirm against the upstream file.
SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
switch (Op.getOpcode()) {
return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
llvm_unreachable("Unhandled reduction");
/// Rewrite ATOMIC_LOAD_SUB as ATOMIC_LOAD_ADD of the negated operand.
///
/// \returns the replacement atomic node, or an empty SDValue when the
/// subtarget does not have LSE.
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
SelectionDAG &DAG) const {
auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
if (!Subtarget.hasLSE())
return SDValue();
// LSE has an atomic load-add instruction, but not a load-sub.
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
SDValue RHS = Op.getOperand(2);
AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
// Negate the operand: x - RHS == x + (0 - RHS).
RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
// NOTE(review): the final argument line of this call is not visible in this
// listing -- confirm against the upstream file.
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
Op.getOperand(0), Op.getOperand(1), RHS,
/// Rewrite ATOMIC_LOAD_AND as ATOMIC_LOAD_CLR of the bitwise-inverted operand.
///
/// \returns the replacement atomic node, or an empty SDValue when the
/// subtarget does not have LSE.
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
SelectionDAG &DAG) const {
auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
if (!Subtarget.hasLSE())
return SDValue();
// LSE has an atomic load-clear instruction, but not a load-and.
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
SDValue RHS = Op.getOperand(2);
AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
// Invert the operand by XOR-ing it with all ones.
RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
// NOTE(review): the final argument line of this call is not visible in this
// listing -- confirm against the upstream file.
return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
Op.getOperand(0), Op.getOperand(1), RHS,
/// Emit the "__chkstk" call used for a dynamic stack allocation on Windows.
///
/// \p Size is shifted right by 4, copied into X15 and passed to the call;
/// afterwards \p Size is replaced by that shifted value shifted left by 4
/// again.
///
/// \param Chain the incoming chain.
/// \param [in,out] Size the allocation size; updated as described above.
/// \returns the chain produced by the call node.
SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
if (Subtarget->hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
// The value placed in X15 is Size >> 4.
Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
DAG.getConstant(4, dl, MVT::i64));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
// The call consumes the glue result of the register copy above.
Chain =
DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
DAG.getRegisterMask(Mask), Chain.getValue(1));
// To match the actual intent better, we should read the output from X15 here
// again (instead of potentially spilling it to the stack), but rereading Size
// from X15 here doesn't work at -O0, since it thinks that X15 is undefined
// here.
Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
DAG.getConstant(4, dl, MVT::i64));
return Chain;
/// Custom lowering for DYNAMIC_STACKALLOC; asserts a Windows target.
///
/// When the function has the "no-stack-arg-probe" attribute, SP is decreased
/// by the requested size (and masked to the alignment when one is set)
/// directly. Otherwise the same adjustment is wrapped in CALLSEQ_START /
/// CALLSEQ_END and preceded by LowerWindowsDYNAMIC_STACKALLOC().
///
/// \returns the merged pair {new SP value, chain}.
///
/// NOTE(review): the return type line and the initializer of `Align` are not
/// visible in this listing -- confirm against the upstream file.
AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetWindows() &&
"Only Windows alloca probing supported");
SDLoc dl(Op);
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
MaybeAlign Align =
EVT VT = Node->getValueType(0);
// Probe-free path: adjust SP without calling the stack probe.
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
"no-stack-arg-probe")) {
SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
// AND with the negated alignment value.
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
// Probing path: the probe call may update Size before SP is adjusted.
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
/// Custom lowering for a VSCALE node whose result type is not i64: build the
/// node as i64, with the multiplier sign-extended to 64 bits when narrower,
/// and zero-extend or truncate the result to the requested type.
SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
                                           SelectionDAG &DAG) const {
  const EVT ResultVT = Op.getValueType();
  assert(ResultVT != MVT::i64 && "Expected illegal VSCALE node");

  SDLoc DL(Op);
  const APInt Multiplier =
      cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
  SDValue VScale64 = DAG.getVScale(DL, MVT::i64, Multiplier.sextOrSelf(64));
  return DAG.getZExtOrTrunc(VScale64, DL, ResultVT);
}
/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
///
/// \tparam NumVecs number of vectors stored by the intrinsic.
/// \param [out] Info receives memVT, ptrVal (the last call argument), a zero
/// offset and the MOStore flag.
/// \returns true unconditionally.
template <unsigned NumVecs>
static bool setInfoSVEStN(AArch64TargetLowering::IntrinsicInfo &Info,
const CallInst &CI) {
// Retrieve EC from first vector argument.
const EVT VT = EVT::getEVT(CI.getArgOperand(0)->getType());
ElementCount EC = VT.getVectorElementCount();
// NOTE(review): the matching `#endif` is not visible in this listing.
#ifndef NDEBUG
// Check the assumption that all input vectors are the same type.
for (unsigned I = 0; I < NumVecs; ++I)
assert(VT == EVT::getEVT(CI.getArgOperand(I)->getType()) &&
"Invalid type.");
// memVT is `NumVecs * VT`.
Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
EC * NumVecs);
Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1);
Info.offset = 0;
Info.flags = MachineMemOperand::MOStore;
return true;
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
///
/// Besides the NEON ldN/stN families, this also fills \p Info for the SVE
/// st2/st3/st4 and ldnt1/stnt1 intrinsics and for the exclusive load/store
/// intrinsics (ldxr/ldaxr/stxr/stlxr and their pair forms).
///
/// \returns true when \p Info was filled in for \p Intrinsic; the final
/// `return false` covers the remaining intrinsics.
///
/// NOTE(review): `break;`, `default:` and closing-brace lines are not visible
/// in this listing -- confirm control flow against the upstream file.
bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
auto &DL = I.getModule()->getDataLayout();
switch (Intrinsic) {
case Intrinsic::aarch64_sve_st2:
return setInfoSVEStN<2>(Info, I);
case Intrinsic::aarch64_sve_st3:
return setInfoSVEStN<3>(Info, I);
case Intrinsic::aarch64_sve_st4:
return setInfoSVEStN<4>(Info, I);
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
case Intrinsic::aarch64_neon_ld1x2:
case Intrinsic::aarch64_neon_ld1x3:
case Intrinsic::aarch64_neon_ld1x4:
case Intrinsic::aarch64_neon_ld2lane:
case Intrinsic::aarch64_neon_ld3lane:
case Intrinsic::aarch64_neon_ld4lane:
case Intrinsic::aarch64_neon_ld2r:
case Intrinsic::aarch64_neon_ld3r:
case Intrinsic::aarch64_neon_ld4r: {
// Conservatively set memVT to the entire set of vectors loaded.
// The result size is expressed as a vector of i64 elements.
uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
// The pointer is the last call argument.
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
// volatile loads with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOLoad;
return true;
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4:
case Intrinsic::aarch64_neon_st1x2:
case Intrinsic::aarch64_neon_st1x3:
case Intrinsic::aarch64_neon_st1x4:
case Intrinsic::aarch64_neon_st2lane:
case Intrinsic::aarch64_neon_st3lane:
case Intrinsic::aarch64_neon_st4lane: {
// Conservatively set memVT to the entire set of vectors stored.
unsigned NumElts = 0;
for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
Type *ArgTy = I.getArgOperand(ArgI)->getType();
// NOTE(review): a statement appears to be missing after this `if` in this
// listing (presumably the loop stops at the first non-vector argument and
// only vector arguments are summed) -- confirm against the upstream file.
if (!ArgTy->isVectorTy())
NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
// volatile stores with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOStore;
return true;
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
// Memory type and alignment come from the pointee type of argument 0.
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
case Intrinsic::aarch64_stlxr:
case Intrinsic::aarch64_stxr: {
// Memory type and alignment come from the pointee type of argument 1.
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
case Intrinsic::aarch64_ldaxp:
case Intrinsic::aarch64_ldxp:
// Pair forms are recorded as one 16-byte aligned i128 access.
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = Align(16);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
case Intrinsic::aarch64_stlxp:
case Intrinsic::aarch64_stxp:
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
Info.align = Align(16);
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
case Intrinsic::aarch64_sve_ldnt1: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
// The memory type is the type of the loaded result.
Info.memVT = MVT::getVT(I.getType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOLoad;
if (Intrinsic == Intrinsic::aarch64_sve_ldnt1)
Info.flags |= MachineMemOperand::MONonTemporal;
return true;
case Intrinsic::aarch64_sve_stnt1: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
// The memory type is the type of the stored value (operand 0).
Info.memVT = MVT::getVT(I.getOperand(0)->getType());
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOStore;
if (Intrinsic == Intrinsic::aarch64_sve_stnt1)
Info.flags |= MachineMemOperand::MONonTemporal;
return true;
return false;
/// Decide whether the load \p Load should be narrowed to \p NewVT.
///
/// The generic TargetLoweringBase answer is honoured first. Narrowing is then
/// always accepted for extending loads, and rejected for plain loads whose
/// address is `base + (index << log2(load size in bytes))`, because that shift
/// folds into the addressing mode only while it matches the loaded width.
///
/// \param Load  the memory node being considered; must be a MemSDNode.
/// \param ExtTy the extension kind of the load.
/// \param NewVT the proposed narrower type.
/// \returns true if reducing the load width is acceptable.
bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                                  ISD::LoadExtType ExtTy,
                                                  EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
    return false;

  // If we're reducing the load width in order to avoid having to use an extra
  // instruction to do extension then it's probably a good idea.
  if (ExtTy != ISD::NON_EXTLOAD)
    return true;

  // Don't reduce load width if it would prevent us from combining a shift into
  // the offset.
  // The node is dereferenced unconditionally below, so use the asserting
  // cast<> rather than a dyn_cast<> whose null result would go unchecked.
  MemSDNode *Mem = cast<MemSDNode>(Load);
  const SDValue &Base = Mem->getBasePtr();
  if (Base.getOpcode() == ISD::ADD &&
      Base.getOperand(1).getOpcode() == ISD::SHL &&
      Base.getOperand(1).hasOneUse() &&
      Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
    // The shift can be combined if it matches the size of the value being
    // loaded (and so reducing the width would make it not match).
    uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
    uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits() / 8;
    if (ShiftAmount == Log2_32(LoadBytes))
      return false;
  }

  // We have no reason to disallow reducing the load width, so allow it.
  return true;
}
// Truncation between two integer IR types is free whenever the source type is
// wider than the destination (e.g. from a 64-bit GPR to a 32-bit GPR).
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
  if (!BothIntegers)
    return false;

  const unsigned SrcBits = Ty1->getPrimitiveSizeInBits();
  const unsigned DstBits = Ty2->getPrimitiveSizeInBits();
  return DstBits < SrcBits;
}
// EVT flavour: truncation is free between two scalar integer types when the
// source \p VT1 is wider than the destination \p VT2.
bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector())
    return false;
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;

  const unsigned SrcBits = VT1.getSizeInBits();
  const unsigned DstBits = VT2.getSizeInBits();
  return DstBits < SrcBits;
}
/// Check if it is profitable to hoist instruction in then/else to if.
/// Not profitable if I and its user can form a FMA instruction
/// because we prefer FMSUB/FMADD.
///
/// Hoisting is always reported profitable unless \p I is an FMul with exactly
/// one use and that user is an FSub or FAdd.
///
/// NOTE(review): the end of the final return expression is not visible in
/// this listing -- confirm against the upstream file.
bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
if (I->getOpcode() != Instruction::FMul)
return true;
if (!I->hasOneUse())
return true;
Instruction *User = I->user_back();
if (User &&
!(User->getOpcode() == Instruction::FSub ||
User->getOpcode() == Instruction::FAdd))
return true;
const TargetOptions &Options = getTargetMachine().Options;
const Function *F = I->getFunction();
const DataLayout &DL = F->getParent()->getDataLayout();
Type *Ty = User->getOperand(0)->getType();
// Not profitable when FMA is both faster and legal/custom for this type and
// the fusion options permit it.
return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
(Options.AllowFPOpFusion == FPOpFusion::Fast ||
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
// 64-bit GPR, so a zero extension is free exactly for i32 -> i64.
bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
  if (!BothIntegers)
    return false;

  const unsigned SrcBits = Ty1->getPrimitiveSizeInBits();
  const unsigned DstBits = Ty2->getPrimitiveSizeInBits();
  if (SrcBits != 32)
    return false;
  return DstBits == 64;
}
// EVT flavour: a zero extension is free only between scalar integer types,
// and only from 32 bits to 64 bits.
bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector())
    return false;
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;

  const unsigned SrcBits = VT1.getSizeInBits();
  const unsigned DstBits = VT2.getSizeInBits();
  if (SrcBits != 32)
    return false;
  return DstBits == 64;
}
// SDValue flavour: in addition to the type-based rule, zero-extending the
// result of a LOAD is free when both types are simple scalar integers and the
// loaded type is at most 32 bits wide.
bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  const EVT SrcVT = Val.getValueType();
  if (isZExtFree(SrcVT, VT2))
    return true;

  // Beyond the type-based rule, only loads qualify.
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
  auto IsSimpleScalarInt = [](EVT VT) {
    return VT.isSimple() && !VT.isVector() && VT.isInteger();
  };
  return IsSimpleScalarInt(SrcVT) && IsSimpleScalarInt(VT2) &&
         SrcVT.getSizeInBits() <= 32;
}
/// Report whether the extension instruction \p Ext is free for its uses.
///
/// Floating point extensions and vector-typed extensions are never free. For
/// the rest, each use of \p Ext is inspected: the visible cases are a shift
/// left (which must have a constant amount), a GetElementPtr (whose implied
/// scaling shift must be between 1 and 4 inclusive) and a Trunc (which must
/// not simply undo the extension).
///
/// NOTE(review): `break;`, `default:` and closing-brace lines are not visible
/// in this listing, so the exact per-case control flow should be confirmed
/// against the upstream file.
bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
if (isa<FPExtInst>(Ext))
return false;
// Vector types are not free.
if (Ext->getType()->isVectorTy())
return false;
for (const Use &U : Ext->uses()) {
// The extension is free if we can fold it with a left shift in an
// addressing mode or an arithmetic operation: add, sub, and cmp.
// Is there a shift?
const Instruction *Instr = cast<Instruction>(U.getUser());
// Is this a constant shift?
switch (Instr->getOpcode()) {
case Instruction::Shl:
if (!isa<ConstantInt>(Instr->getOperand(1)))
return false;
case Instruction::GetElementPtr: {
gep_type_iterator GTI = gep_type_begin(Instr);
auto &DL = Ext->getModule()->getDataLayout();
// Advance to the GEP index that this use feeds.
std::advance(GTI, U.getOperandNo()-1);
Type *IdxTy = GTI.getIndexedType();
// This extension will end up with a shift because of the scaling factor.
// 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
// Get the shift amount based on the scaling factor:
// log2(sizeof(IdxTy)) - log2(8).
uint64_t ShiftAmt =
countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
// Is the constant foldable in the shift of the addressing mode?
// I.e., shift amount is between 1 and 4 inclusive.
if (ShiftAmt == 0 || ShiftAmt > 4)
return false;
case Instruction::Trunc:
// Check if this is a noop.
// trunc(sext ty1 to ty2) to ty1.
if (Instr->getType() == Ext->getOperand(0)->getType())
return false;
// At this point we can use the bfm family, so this extension is free
// for that use.
return true;
/// Return true when \p Op1 and \p Op2 are both single-source shufflevectors
/// (second operand undef) that extract the same half -- either the lower or
/// the upper one -- of the elements of their source vector.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
  // True when Wide has exactly twice as many bits as Narrow.
  auto HasTwiceTheBits = [](Value *Wide, Value *Narrow) {
    auto *WideTy = Wide->getType();
    auto *NarrowTy = Narrow->getType();
    return WideTy->getPrimitiveSizeInBits().getFixedSize() ==
           2 * NarrowTy->getPrimitiveSizeInBits().getFixedSize();
  };

  // True when Wide has exactly twice as many elements as Narrow.
  auto HasTwiceTheElts = [](Value *Wide, Value *Narrow) {
    auto *WideVecTy = cast<FixedVectorType>(Wide->getType());
    auto *NarrowVecTy = cast<FixedVectorType>(Narrow->getType());
    return WideVecTy->getNumElements() == 2 * NarrowVecTy->getNumElements();
  };

  ArrayRef<int> Mask1, Mask2;
  Value *Src1, *Src2;
  if (!match(Op1, m_Shuffle(m_Value(Src1), m_Undef(), m_Mask(Mask1))))
    return false;
  if (!match(Op2, m_Shuffle(m_Value(Src2), m_Undef(), m_Mask(Mask2))))
    return false;

  // Check that the operands are half as wide as the result and we extract
  // half of the elements of the input vectors.
  if (!HasTwiceTheBits(Src1, Op1) || !HasTwiceTheBits(Src2, Op2))
    return false;
  if (!HasTwiceTheElts(Src1, Op1) || !HasTwiceTheElts(Src2, Op2))
    return false;

  // Check the mask extracts either the lower or upper half of vector
  // elements.
  int Start1 = -1;
  int Start2 = -1;
  const int SrcElts =
      cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
  if (!ShuffleVectorInst::isExtractSubvectorMask(Mask1, SrcElts, Start1))
    return false;
  if (!ShuffleVectorInst::isExtractSubvectorMask(Mask2, SrcElts, Start2))
    return false;
  if (Start1 != Start2)
    return false;

  return Start1 == 0 || Start2 == (SrcElts / 2);
}
/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
/// of the vector elements.
///
/// Both values must match a zext or sext, and the helper lambda requires the
/// result scalar width to be exactly twice the operand scalar width.
///
/// NOTE(review): the last line of the condition (presumably the same width
/// check applied to Ext2) is not visible in this listing -- confirm against
/// the upstream file.
static bool areExtractExts(Value *Ext1, Value *Ext2) {
auto areExtDoubled = [](Instruction *Ext) {
return Ext->getType()->getScalarSizeInBits() ==
2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
!match(Ext2, m_ZExtOrSExt(m_Value())) ||
!areExtDoubled(cast<Instruction>(Ext1)) ||
return false;
return true;
/// Check if \p Op could be used with the vmull_high_p64 intrinsic, i.e. it is
/// an extractelement of lane 1 from a fixed vector with exactly two elements.
static bool isOperandOfVmullHighP64(Value *Op) {
  Value *SrcVector = nullptr;
  ConstantInt *Lane = nullptr;
  if (!match(Op, m_ExtractElt(m_Value(SrcVector), m_ConstantInt(Lane))))
    return false;

  // Only the high lane qualifies.
  if (!(Lane->getValue() == 1))
    return false;

  if (!isa<FixedVectorType>(SrcVector->getType()))
    return false;
  return cast<FixedVectorType>(SrcVector->getType())->getNumElements() == 2;
}
/// Check if both \p Op1 and \p Op2 satisfy isOperandOfVmullHighP64(), so that
/// the pair could be used with the vmull_high_p64 intrinsic.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
  if (!isOperandOfVmullHighP64(Op1))
    return false;
  return isOperandOfVmullHighP64(Op2);
}
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
///
/// Only vector-typed instructions are considered. The visible cases are the
/// aarch64_neon_umull and aarch64_neon_pmull64 intrinsics and the Add/Sub
/// instructions whose operands satisfy areExtractExts().
///
/// NOTE(review): no writes to \p Ops, and several other lines of this
/// function, are visible in this listing -- confirm against the upstream file.
bool AArch64TargetLowering::shouldSinkOperands(
Instruction *I, SmallVectorImpl<Use *> &Ops) const {
if (!I->getType()->isVectorTy())
return false;
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
case Intrinsic::aarch64_neon_umull:
// Both operands must extract the same half of their source vectors.
if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
return false;
return true;
case Intrinsic::aarch64_neon_pmull64:
if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
return false;
return true;
return false;
switch (I->getOpcode()) {
case Instruction::Sub:
case Instruction::Add: {
if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
return false;
// If the exts' operands extract either the lower or upper elements, we
// can sink them too.
auto Ext1 = cast<Instruction>(I->getOperand(0));
auto Ext2 = cast<Instruction>(I->getOperand(1));
if (areExtractShuffleVectors(Ext1, Ext2)) {
return true;
return false;
return false;
/// Report whether \p LoadedType is a type for which a paired load is
/// available: a simple integer or floating-point type of 32 or 64 bits.
///
/// \param LoadedType       The type being loaded.
/// \param RequiredAligment Set to Align(1) once the type has passed the
///                         simple integer/FP check (it is left untouched on
///                         the early `return false` path).
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
Align &RequiredAligment) const {
// Reject extended (non-simple) types and anything that is neither integer
// nor floating point.
if (!LoadedType.isSimple() ||
(!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
return false;
// Cyclone supports unaligned accesses.
RequiredAligment = Align(1);
unsigned NumBits = LoadedType.getSizeInBits();
return NumBits == 32 || NumBits == 64;
/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
///
/// The result is the bit size of \p VecTy divided by 128, rounded up.
// NOTE(review): the return-type line of this definition is missing from the
// extract - restore from upstream.
AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
const DataLayout &DL) const {
// Ceiling division by 128 bits.
return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
/// Return target-specific memory-operand flags for \p I: MOStridedAccess when
/// the subtarget's processor family is Falkor and \p I carries
/// FALKOR_STRIDED_ACCESS_MD metadata, MachineMemOperand::MONone otherwise.
// NOTE(review): the return-type line of this definition is missing from the
// extract - restore from upstream.
AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
return MOStridedAccess;
return MachineMemOperand::MONone;
/// Decide whether \p VecTy can be used as the per-lane vector type of an
/// interleaved access.
///
/// Requires at least two elements, an element size of 8, 16, 32 or 64 bits,
/// and a total size of either 64 bits or a multiple of 128 bits.
/// \p VecTy is cast to FixedVectorType, so it must be a fixed-width vector.
bool AArch64TargetLowering::isLegalInterleavedAccessType(
VectorType *VecTy, const DataLayout &DL) const {
unsigned VecSize = DL.getTypeSizeInBits(VecTy);
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
// Ensure the number of vector elements is greater than 1.
if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
return false;
// Ensure the element type is legal.
if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
return false;
// Ensure the total vector size is 64 or a multiple of 128. Types larger than
// 128 will be split into multiple interleaved accesses.
return VecSize == 64 || VecSize % 128 == 0;
/// Lower an interleaved load into a ldN intrinsic.
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
/// Into:
/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
///
/// \param LI       The wide load being replaced.
/// \param Shuffles The de-interleaving shufflevectors that use \p LI.
/// \param Indices  For each shuffle, the index of the ldN result it takes.
/// \param Factor   Interleave factor; asserted to be in
///                 [2, getMaxSupportedInterleaveFactor()].
/// \returns false, before any IR is created, when NEON is unavailable or the
///          shuffle type is not a legal interleaved access type; true
///          otherwise.
// NOTE(review): several continuation lines and all closing braces of this
// function are missing from the extract (see the dangling expressions below);
// restore them from upstream before building.
bool AArch64TargetLowering::lowerInterleavedLoad(
LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
const DataLayout &DL = LI->getModule()->getDataLayout();
// All shuffles are assumed to share the type of the first one.
VectorType *VTy = Shuffles[0]->getType();
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL))
return false;
unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);
auto *FVTy = cast<FixedVectorType>(VTy);
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
Type *EltTy = FVTy->getElementType();
if (EltTy->isPointerTy())
FVTy =
FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
// New instructions are inserted immediately before the original load.
IRBuilder<> Builder(LI);
// The base address of the load.
Value *BaseAddr = LI->getPointerOperand();
if (NumLoads > 1) {
// If we're going to generate more than one load, reset the sub-vector type
// to something legal.
FVTy = FixedVectorType::get(FVTy->getElementType(),
FVTy->getNumElements() / NumLoads);
// We will compute the pointer operand of each load from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
// NOTE(review): the arguments of this CreateBitCast call are missing.
BaseAddr = Builder.CreateBitCast(
Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
Type *Tys[2] = {FVTy, PtrTy};
// Indexed by Factor - 2.
// NOTE(review): the remaining initializers of this table are missing.
static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
Function *LdNFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
// Holds sub-vectors extracted from the load intrinsic return values. The
// sub-vectors are associated with the shufflevector instructions they will
// replace.
DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
FVTy->getNumElements() * Factor);
CallInst *LdN = Builder.CreateCall(
LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
// Extract and store the sub-vectors returned by the load intrinsic.
for (unsigned i = 0; i < Shuffles.size(); i++) {
ShuffleVectorInst *SVI = Shuffles[i];
unsigned Index = Indices[i];
Value *SubVec = Builder.CreateExtractValue(LdN, Index);
// Convert the integer vector to pointer vector if the element is pointer.
// NOTE(review): the end of this call and the statement recording SubVec in
// SubVecs are missing from the extract.
if (EltTy->isPointerTy())
SubVec = Builder.CreateIntToPtr(
SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
// Replace uses of the shufflevector instructions with the sub-vectors
// returned by the load intrinsic. If a shufflevector instruction is
// associated with more than one sub-vector, those sub-vectors will be
// concatenated into a single wide vector.
for (ShuffleVectorInst *SVI : Shuffles) {
auto &SubVec = SubVecs[SVI];
// NOTE(review): the statement that uses WideVec (presumably replacing the
// uses of SVI) is missing from the extract.
auto *WideVec =
SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
return true;
/// Lower an interleaved store into a stN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// st3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
///
/// \param SI     The wide store being replaced.
/// \param SVI    The re-interleaving shufflevector feeding \p SI.
/// \param Factor Interleave factor; asserted to be in
///               [2, getMaxSupportedInterleaveFactor()].
/// \returns false, before any IR is created, when NEON is unavailable or the
///          per-lane vector type is not a legal interleaved access type; true
///          otherwise.
///
/// NOTE(review): the extracted text of this function had lost its closing
/// braces and several continuation lines; those were restored to match the
/// upstream LLVM implementation. Confirm against the original commit.
bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                                  ShuffleVectorInst *SVI,
                                                  unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  auto *VecTy = cast<FixedVectorType>(SVI->getType());
  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

  unsigned LaneLen = VecTy->getNumElements() / Factor;
  Type *EltTy = VecTy->getElementType();
  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  // New instructions are inserted immediately before the original store.
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);
    unsigned NumOpElts =
        cast<FixedVectorType>(Op0->getType())->getNumElements();

    // Convert to the corresponding integer vector.
    auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);

    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
  }

  auto Mask = SVI->getShuffleMask();

  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
  Type *Tys[2] = {SubVecTy, PtrTy};
  // Indexed by Factor - 2.
  static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
                                             Intrinsic::aarch64_neon_st3,
                                             Intrinsic::aarch64_neon_st4};
  Function *StNFunc =
      Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    SmallVector<Value *, 5> Ops;

    // Mask position of the first element handled by this store.
    const unsigned StoreBase = StoreCount * LaneLen * Factor;

    // Split the shufflevector operands into sub vectors for the new stN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreBase + i;
      if (Mask[IdxI] >= 0) {
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
      } else {
        // The first element of lane i is undef: derive the lane's start from
        // the first defined element of the same lane within this store.
        // Element j of lane i lives at mask position StoreBase + j * Factor + i
        // and holds (start + j). The previous indexing added the per-store
        // base twice (scaled by Factor), which read past the end of the mask
        // for every store after the first.
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreBase + j * Factor + i;
          if (Mask[IdxJ] >= 0) {
            StartMask = Mask[IdxJ] - j;
            break;
          }
        }
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask
        Ops.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
      }
    }

    // If we are generating more than one store, we compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
                                            BaseAddr, LaneLen * Factor);

    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
    Builder.CreateCall(StNFunc, Ops);
  }
  return true;
}
// Lower an SVE structured load intrinsic returning a tuple type to target
// specific intrinsic taking the same input but returning a multi-result value
// of the split tuple type.
// E.g. Lowering an LD3:
// call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
// <vscale x 4 x i1> %pred,
// <vscale x 4 x i32>* %addr)
// Output DAG:
// t0: ch = EntryToken
// t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
// t4: i64,ch = CopyFromReg t0, Register:i64 %1
// t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
// t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
// This is called pre-legalization to avoid widening/splitting issues with
// non-power-of-2 tuple types used for LD3, such as nxv12i32.
//
// Intrinsic: one of the aarch64_sve_ld2/ld3/ld4 intrinsic IDs.
// LoadOps:   operands forwarded unchanged to the structured-load node.
// VT:        the scalable tuple vector type of the result.
// Returns a CONCAT_VECTORS of the N results of the structured-load node.
SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
ArrayRef<SDValue> LoadOps,
EVT VT, SelectionDAG &DAG,
const SDLoc &DL) const {
assert(VT.isScalableVector() && "Can only lower scalable vectors");
unsigned N, Opcode;
// Maps each supported intrinsic to {number of tuple parts, node opcode}.
// NOTE(review): operator[] default-inserts {0, 0} for an ID that is not in
// the table, which would make the modulo in the assert below divide by zero;
// callers presumably only pass ld2/ld3/ld4 - confirm.
static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
{Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
{Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
{Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
assert(VT.getVectorElementCount().Min % N == 0 &&
"invalid tuple vector type!");
// Type of one tuple part: same element type, 1/N of the element count.
EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
VT.getVectorElementCount() / N);
// The node produces N parts followed by the chain.
SmallVector<EVT, 5> VTs(N, SplitVT);
VTs.push_back(MVT::Other); // Chain
SDVTList NodeTys = DAG.getVTList(VTs);
SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
// Gather the N part results (the chain result is not included).
SmallVector<SDValue, 4> PseudoLoadOps;
for (unsigned I = 0; I < N; ++I)
PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
/// Choose the value type to use when expanding the memory operation \p Op.
///
/// Preference order, as coded below: v2i64 (non-small memset with NEON),
/// f128 (FP available, not a small memset), i64 (size >= 8), i32 (size >= 4);
/// MVT::Other when none applies. Each candidate must also pass the alignment
/// check performed by AlignmentIsAcceptable.
EVT AArch64TargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {
// NOTE(review): the initializer of CanImplicitFloat is missing from this
// extract (presumably derived from FuncAttributes) - restore from upstream.
bool CanImplicitFloat =
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
// taken one instruction to materialize the v2i64 zero and one store (with
// restrictive addressing mode). Just do i64 stores.
bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
// Accept VT when Op is aligned to AlignCheck, or otherwise when misaligned
// accesses of VT are allowed.
// NOTE(review): the tail of this lambda (the last operand of && and the
// closing brace) is missing from the extract.
auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
if (Op.isAligned(AlignCheck))
return true;
bool Fast;
return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
&Fast) &&
if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
AlignmentIsAcceptable(MVT::v2i64, Align(16)))
return MVT::v2i64;
if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
return MVT::f128;
if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
return MVT::i64;
if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
return MVT::i32;
return MVT::Other;
/// LLT counterpart of getOptimalMemOpType: choose the low-level type to use
/// when expanding the memory operation \p Op.
///
/// Preference order, as coded below: <2 x s64> (non-small memset with NEON),
/// s128 (FP available, not a small memset), s64 (size >= 8), s32 (size >= 4);
/// a default-constructed LLT when none applies.
LLT AArch64TargetLowering::getOptimalMemOpLLT(
const MemOp &Op, const AttributeList &FuncAttributes) const {
// NOTE(review): the initializer of CanImplicitFloat is missing from this
// extract (presumably derived from FuncAttributes) - restore from upstream.
bool CanImplicitFloat =
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
// taken one instruction to materialize the v2i64 zero and one store (with
// restrictive addressing mode). Just do i64 stores.
bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
// Accept VT when Op is aligned to AlignCheck, or otherwise when misaligned
// accesses of VT are allowed.
// NOTE(review): the tail of this lambda (the last operand of && and the
// closing brace) is missing from the extract.
auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
if (Op.isAligned(AlignCheck))
return true;
bool Fast;
return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
&Fast) &&
if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
AlignmentIsAcceptable(MVT::v2i64, Align(16)))
return LLT::vector(2, 64);
if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
return LLT::scalar(128);
if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
return LLT::scalar(64);
if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
return LLT::scalar(32);
return LLT();
// 12-bit optionally shifted immediates are legal for adds.
//
// Returns true when |Immed| either fits in the low 12 bits, or has its low
// 12 bits clear and fits in 24 bits (i.e. a 12-bit value shifted left by 12).
// INT64_MIN is rejected up front because std::abs on it would overflow.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
if (Immed == std::numeric_limits<int64_t>::min()) {
LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
<< ": avoid UB for INT64_MIN\n");
return false;
// Same encoding for add/sub, just flip the sign.
Immed = std::abs(Immed);
// Either an unshifted 12-bit value, or a 12-bit value shifted left by 12.
bool IsLegal = ((Immed >> 12) == 0 ||
((Immed & 0xfff) == 0 && Immed >> 24 == 0));
LLVM_DEBUG(dbgs() << "Is " << Immed
<< " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
return IsLegal;
// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
//
// Simply forwards to isLegalAddImmediate.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
return isLegalAddImmediate(Immed);
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
///
/// \p AS and \p I are not consulted by the code visible here.
bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS, Instruction *I) const {
// AArch64 has five basic addressing modes:
// reg
// reg + 9-bit signed offset
// reg + SIZE_IN_BYTES * 12-bit unsigned offset
// reg1 + reg2
// reg + SIZE_IN_BYTES * reg
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
// No reg+reg+imm addressing.
if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
return false;
// FIXME: Update this method to support scalable addressing modes.
// Scalable vectors: only a plain base register is accepted.
if (isa<ScalableVectorType>(Ty))
return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
// check reg + imm case:
// i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
// NumBytes stays 0 for unsized types and for types whose bit size is not a
// power of two; that disables the size-scaled forms checked below.
uint64_t NumBytes = 0;
if (Ty->isSized()) {
uint64_t NumBits = DL.getTypeSizeInBits(Ty);
NumBytes = NumBits / 8;
if (!isPowerOf2_64(NumBits))
NumBytes = 0;
if (!AM.Scale) {
int64_t Offset = AM.BaseOffs;
// 9-bit signed offset
if (isInt<9>(Offset))
return true;
// 12-bit unsigned offset
// shift is only used after the NumBytes != 0 test in the condition below.
unsigned shift = Log2_64(NumBytes);
if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
// Must be a multiple of NumBytes (NumBytes is a power of 2)
(Offset >> shift) << shift == Offset)
return true;
return false;
// Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
/// Target hook that unconditionally returns true for AArch64.
bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
// Consider splitting large offset of struct or array.
return true;
/// Return the cost of the scaling factor used by addressing mode \p AM.
///
/// \returns -1 when \p AM is not a legal addressing mode for \p Ty; otherwise
///          1 when AM.Scale is neither 0 nor 1, and 0 when it is 0 or 1.
int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
// Scaling factors are not free at all.
// Operands | Rt Latency
// -------------------------------------------
// Rt, [Xn, Xm] | 4
// -------------------------------------------
// Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
// Rt, [Xn, Wm, <extend> #imm] |
if (isLegalAddressingMode(DL, AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1 if
// it is not equal to 0 or 1.
// The boolean result converts to the int cost 0 or 1.
return AM.Scale != 0 && AM.Scale != 1;
return -1;
/// EVT overload: returns true when the scalar type of \p VT is f32 or f64.
/// Non-simple scalar types return false. \p MF is not consulted.
// NOTE(review): the switch's default label and closing braces are missing
// from this extract; the trailing `return false` handles all other types.
bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
const MachineFunction &MF, EVT VT) const {
// Vectors are judged by their element type.
VT = VT.getScalarType();
if (!VT.isSimple())
return false;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
case MVT::f64:
return true;
return false;
/// IR-type overload: returns true when the scalar type of \p Ty is float or
/// double, false otherwise. \p F is not consulted.
// NOTE(review): the switch's default label and closing braces are missing
// from this extract.
bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
Type *Ty) const {
switch (Ty->getScalarType()->getTypeID()) {
case Type::FloatTyID:
case Type::DoubleTyID:
return true;
return false;
/// Return a pointer to a static, zero-terminated list of scratch registers
/// (X16, X17 and LR). The calling convention argument is ignored.
const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
// LR is a callee-save register, but we must treat it as clobbered by any call
// site. Hence we include LR in the scratch registers, which are in turn added
// as implicit-defs for stackmaps and patchpoints.
static const MCPhysReg ScratchRegs[] = {
AArch64::X16, AArch64::X17, AArch64::LR, 0
return ScratchRegs;
/// Decide whether the shift node \p N may be commuted with its first operand.
///
/// Returns false for the AND-with-mask-of-SRL pattern described below (so it
/// can still be selected as UBFX) and true otherwise. \p Level is not
/// consulted by the code visible here.
// NOTE(review): the return-type line and the last operand of the inner
// condition are missing from this extract - restore from upstream.
AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const {
// From here on N refers to the shift's first operand.
N = N->getOperand(0).getNode();
EVT VT = N->getValueType(0);
// If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
// it with shift to let it be lowered to UBFX.
if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
isa<ConstantSDNode>(N->getOperand(1))) {
uint64_t TruncMask = N->getConstantOperandVal(1);
if (isMask_64(TruncMask) &&
N->getOperand(0).getOpcode() == ISD::SRL &&
return false;
return true;
/// Decide whether the constant \p Imm of type \p Ty should be materialized
/// with immediate instructions rather than loaded.
///
/// Returns true for zero, for logical immediates, and for values whose
/// highest significant 16-bit chunk (after inverting negative values) has an
/// index below 3. Types with a primitive size of 0 return false.
bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0)
return false;
int64_t Val = Imm.getSExtValue();
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
return true;
// Work on the bitwise complement of negative values.
if ((int64_t)Val < 0)
Val = ~Val;
// Keep only the low 32 bits for 32-bit types.
if (BitSize == 32)
Val &= (1LL << 32) - 1;
// Shift is the index of the 16-bit chunk that holds the highest set bit.
// NOTE(review): when Val is 0 here (original value -1), and assuming
// countLeadingZeros(0) yields 64, `63 - LZ` wraps as unsigned and the
// function returns false - confirm that this is intended.
unsigned LZ = countLeadingZeros((uint64_t)Val);
unsigned Shift = (63 - LZ) / 16;
// MOVZ is free so return true for one or fewer MOVK.
return Shift < 3;
/// Report whether extracting a \p ResVT subvector at element \p Index is
/// cheap: EXTRACT_SUBVECTOR must be legal or custom for \p ResVT, and
/// \p Index must be 0 or equal to the element count of \p ResVT.
/// \p SrcVT is not consulted.
bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
return false;
return (Index == 0 || Index == ResVT.getVectorNumElements());
/// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
/// cmge X, X, #0
///
/// \returns the CMGEz node on success, or an empty SDValue when NEON is
///          unavailable, the type is not a vector, or the pattern does not
///          match.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
if (!Subtarget->hasNEON() || !VT.isVector())
return SDValue();
// There must be a shift right algebraic before the xor, and the xor must be a
// 'not' operation.
SDValue Shift = N->getOperand(0);
SDValue Ones = N->getOperand(1);
// NOTE(review): the last operand of this condition (presumably the check
// that Ones is an all-ones vector) is missing from this extract.
if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
return SDValue();
// The shift should be smearing the sign bit across each vector element.
auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
return SDValue();
return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
// Generate SUBS and CSEL for integer abs.
//
// Returns the CSEL node when N matches the pattern below, or an empty SDValue
// otherwise.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
// Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
// and change it to SUB and CSEL.
if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
// The shift amount must be a constant equal to the bit width minus one.
if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
// Neg = 0 - <operand>.
// NOTE(review): the final argument of this getNode call is missing from the
// extract (presumably X, i.e. N0.getOperand(0)).
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
// Generate SUBS & CSEL.
SDValue Cmp =
DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
N0.getOperand(0), DAG.getConstant(0, DL, VT));
// Select X when the PL condition holds on the SUBS flags (result 1 of Cmp),
// Neg otherwise.
return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
SDValue(Cmp.getNode(), 1));
return SDValue();
/// DAG combine for XOR nodes: after operation legalization has started, try
/// foldVectorXorShiftIntoCmp first and fall back to performIntegerAbsCombine.
/// Returns an empty SDValue before legalize-ops.
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
return Cmp;
return performIntegerAbsCombine(N, DAG);
/// Expand a signed division of \p N by the power-of-two constant \p Divisor
/// (or its negation) into compare, add, conditional select and arithmetic
/// shift, negating the result for negative divisors.
///
/// \returns SDValue(N, 0) when integer division is cheap for the type, an
///          empty SDValue when the type is not i32/i64 or the divisor is not
///          +/- a power of two, and the expanded value otherwise.
/// \p Created is not written by the code visible here.
// NOTE(review): the return-type line of this definition, and possibly the
// statements that record new nodes in Created, are missing from this extract.
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const {
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
// fold (sdiv X, pow2)
EVT VT = N->getValueType(0);
if ((VT != MVT::i32 && VT != MVT::i64) ||
!(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
// log2 of the divisor's magnitude.
unsigned Lg2 = Divisor.countTrailingZeros();
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
// Add (N0 < 0) ? Pow2 - 1 : 0;
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
// Divide by pow2.
SDValue SRA =
DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
// If we're dividing by a positive value, we're done. Otherwise, we must
// negate the result.
if (Divisor.isNonNegative())
return SRA;
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
/// Return true when \p S is one of the SVE element-count intrinsics
/// (cntb, cnth, cntw, cntd), false otherwise.
// NOTE(review): the switch's default label and closing braces are missing
// from this extract.
static bool IsSVECntIntrinsic(SDValue S) {
switch(getIntrinsicID(S.getNode())) {
case Intrinsic::aarch64_sve_cntb:
case Intrinsic::aarch64_sve_cnth:
case Intrinsic::aarch64_sve_cntw:
case Intrinsic::aarch64_sve_cntd:
return true;
return false;
/// DAG combine for MUL by a constant: rewrite multiplications by
/// 2^N +/- 1, their negations, and (2^N + 1) * 2^M into shift/add/sub
/// sequences.
///
/// Returns an empty SDValue before legalize-ops, when the right-hand side is
/// not a constant, or when no rewrite applies. \p Subtarget is not consulted
/// by the code visible here.
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
// The below optimizations require a constant RHS.
if (!isa<ConstantSDNode>(N->getOperand(1)))
return SDValue();
SDValue N0 = N->getOperand(0);
ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
const APInt &ConstValue = C->getAPIntValue();
// Allow the scaling to be folded into the `cnt` instruction by preventing
// the scaling to be obscured here. This makes it easier to pattern match.
// NOTE(review): the end of this condition (the check applied to the
// TRUNCATE's operand) is missing from this extract.
if (IsSVECntIntrinsic(N0) ||
(N0->getOpcode() == ISD::TRUNCATE &&
if (ConstValue.sge(1) && ConstValue.sle(16))
return SDValue();
// Multiplication of a power of two plus/minus one can be done more
// cheaply as shift+add/sub. For now, this is true unilaterally. If
// future CPUs have a cheaper MADD instruction, this may need to be
// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
// 64-bit is 5 cycles, so this is always a win.
// More aggressively, some multiplications N0 * C can be lowered to
// shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
// e.g. 6=3*2=(2+1)*2.
// TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
// which equals to (1+2)*16-(1+2).
// TrailingZeroes is used to test if the mul can be lowered to
// shift+add+shift.
unsigned TrailingZeroes = ConstValue.countTrailingZeros();
if (TrailingZeroes) {
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into smul or umul.
if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
isZeroExtended(N0.getNode(), DAG)))
return SDValue();
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into madd or msub.
if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
N->use_begin()->getOpcode() == ISD::SUB))
return SDValue();
// Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
// and shift+add+shift.
APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
// Both are assigned on every path that does not return early below.
unsigned ShiftAmt, AddSubOpc;
// Is the shifted value the LHS operand of the add/sub?
bool ShiftValUseIsN0 = true;
// Do we need to negate the result?
bool NegateResult = false;
if (ConstValue.isNonNegative()) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
// (mul x, 2^N - 1) => (sub (shl x, N), x)
// (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
APInt SCVMinus1 = ShiftedConstValue - 1;
APInt CVPlus1 = ConstValue + 1;
if (SCVMinus1.isPowerOf2()) {
ShiftAmt = SCVMinus1.logBase2();
AddSubOpc = ISD::ADD;
} else if (CVPlus1.isPowerOf2()) {
ShiftAmt = CVPlus1.logBase2();
AddSubOpc = ISD::SUB;
} else
return SDValue();
} else {
// (mul x, -(2^N - 1)) => (sub x, (shl x, N))
// (mul x, -(2^N + 1)) => - (add (shl x, N), x)
APInt CVNegPlus1 = -ConstValue + 1;
APInt CVNegMinus1 = -ConstValue - 1;
if (CVNegPlus1.isPowerOf2()) {
ShiftAmt = CVNegPlus1.logBase2();
AddSubOpc = ISD::SUB;
ShiftValUseIsN0 = false;
} else if (CVNegMinus1.isPowerOf2()) {
ShiftAmt = CVNegMinus1.logBase2();
AddSubOpc = ISD::ADD;
NegateResult = true;
} else
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
DAG.getConstant(ShiftAmt, DL, MVT::i64));
// Order the add/sub operands according to ShiftValUseIsN0.
SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
assert(!(NegateResult && TrailingZeroes) &&
"NegateResult and TrailingZeroes cannot both be true for now.");
// Negate the result.
if (NegateResult)
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
// Shift the result.
if (TrailingZeroes)
return DAG.getNode(ISD::SHL, DL, VT, Res,
DAG.getConstant(TrailingZeroes, DL, MVT::i64));
return Res;
/// Fold a unary operation applied to AND(SETCC, constant-vector) into an AND
/// of the comparison with the unary operation applied to the constant.
///
/// Returns the rewritten value, or an empty SDValue when \p N is not a vector
/// operation of the expected shape.
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
SelectionDAG &DAG) {
// Take advantage of vector comparisons producing 0 or -1 in each lane to
// optimize away operation when it's from a constant.
// The general transformation is:
// UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
// AND(VECTOR_CMP(x,y), constant2)
// constant2 = UNARYOP(constant)
// Early exit if this isn't a vector operation, the operand of the
// unary operation isn't a bitwise AND, or if the sizes of the operations
// aren't the same.
EVT VT = N->getValueType(0);
if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
return SDValue();
// Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
if (BuildVectorSDNode *BV =
dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
// Everything checks out. Build up the new and improved node.
SDLoc DL(N);
EVT IntVT = BV->getValueType(0);
// Create a new constant of the appropriate type for the transformed
// DAG.
// This applies N's own opcode to the constant build vector.
SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
N->getOperand(0)->getOperand(0), MaskConst);
SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
return Res;
return SDValue();
/// DAG combine for integer-to-floating-point conversions.
///
/// First tries performVectorCompareAndMaskUnaryOpCombine. Otherwise, for f32
/// or f64 results whose source has the same bit width, a single-use,
/// non-volatile normal load feeding the conversion is replaced by a load of
/// the FP type followed by an AArch64ISD::SITOF/UITOF node (NEON required).
/// Returns an empty SDValue when nothing applies.
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
// First try to optimize away the conversion when it's conditionally from
// a constant. Vectors only.
if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
return Res;
EVT VT = N->getValueType(0);
if (VT != MVT::f32 && VT != MVT::f64)
return SDValue();
// Only optimize when the source and destination types have the same width.
if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
return SDValue();
// If the result of an integer load is only used by an integer-to-float
// conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
// This eliminates an "integer-to-vector-move" UOP and improves throughput.
SDValue N0 = N->getOperand(0);
if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
// Do not change the width of a volatile load.
!cast<LoadSDNode>(N0)->isVolatile()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
// Re-issue the load with the FP result type, same chain and address.
// NOTE(review): the final argument(s) of this getLoad call are missing from
// the extract.
SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
LN0->getPointerInfo(), LN0->getAlignment(),
// Make sure successors of the original load stay after it by updating them
// to use the new Chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
// Signed conversions map to SITOF, everything else to UITOF.
unsigned Opcode =
(N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
return DAG.getNode(Opcode, SDLoc(N), VT, Load);
return SDValue();
/// Fold a floating-point multiply by power of two into floating-point to
/// fixed-point conversion.
///
/// Requires NEON, a simple vector FMUL operand whose second operand is a
/// build vector, 32- or 64-bit float elements, and 16-, 32- or 64-bit integer
/// elements no wider than the float elements. Returns an empty SDValue when
/// any requirement fails.
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
if (!N->getValueType(0).isSimple())
return SDValue();
SDValue Op = N->getOperand(0);
if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
Op.getOpcode() != ISD::FMUL)
return SDValue();
SDValue ConstVec = Op->getOperand(1);
if (!isa<BuildVectorSDNode>(ConstVec))
return SDValue();
MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
uint32_t FloatBits = FloatTy.getSizeInBits();
if (FloatBits != 32 && FloatBits != 64)
return SDValue();
MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
uint32_t IntBits = IntTy.getSizeInBits();
if (IntBits != 16 && IntBits != 32 && IntBits != 64)
return SDValue();
// Avoid conversions where iN is larger than the float (e.g., float -> i64).
if (IntBits > FloatBits)
return SDValue();
BitVector UndefElements;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
// C is the log2 of the splatted power-of-two multiplier; values of -1, 0 or
// above Bits are rejected.
int32_t Bits = IntBits == 64 ? 64 : 32;
int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
if (C == -1 || C == 0 || C > Bits)
return SDValue();
// Pick the integer vector type with the same lane count and lane width as
// the float vector.
// NOTE(review): the `default:` label and the `break;` statements of this
// switch are missing from the extract - restore from upstream.
MVT ResTy;
unsigned NumLanes = Op.getValueType().getVectorNumElements();
switch (NumLanes) {
return SDValue();
case 2:
ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
case 4:
ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
return SDValue();
assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
"Illegal vector type after legalization");
SDLoc DL(N);
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
: Intrinsic::aarch64_neon_vcvtfp2fxu;
// NOTE(review): the start of this initializer (the node-construction call
// that takes the arguments below) is missing from the extract.
SDValue FixConv =
DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
// We can handle smaller integers by generating an extra trunc.
if (IntBits < FloatBits)
FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
return FixConv;
/// Fold a floating-point divide by power of two into fixed-point to
/// floating-point conversion.
///
/// \p N is an FDIV whose first operand is a vector [SU]INT_TO_FP and whose
/// second operand is a BUILD_VECTOR splat of a power of two.
/// \returns the aarch64_neon_vcvtfx[su]2fp intrinsic node (with the integer
/// input extended when it is narrower than the float element), or an empty
/// SDValue when the pattern does not apply.
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned Opc = Op->getOpcode();
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      !Op.getOperand(0).getValueType().isSimple() ||
      (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  int32_t IntBits = IntTy.getSizeInBits();
  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  int32_t FloatBits = FloatTy.getSizeInBits();
  if (FloatBits != 32 && FloatBits != 64)
    return SDValue();

  // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
  if (IntBits > FloatBits)
    return SDValue();

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
  // Reject a failed match (-1), a divide by 1 (0), and out-of-range shifts.
  if (C == -1 || C == 0 || C > FloatBits)
    return SDValue();

  // Pick the integer vector type that matches the float vector lane-for-lane.
  MVT ResTy;
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  switch (NumLanes) {
  default:
    return SDValue();
  case 2:
    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
    break;
  case 4:
    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
    break;
  }

  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
    return SDValue();

  SDLoc DL(N);
  SDValue ConvInput = Op.getOperand(0);
  bool IsSigned = Opc == ISD::SINT_TO_FP;
  // Widen a narrower integer input to the float element width first.
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                            ResTy, ConvInput);

  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
                                      : Intrinsic::aarch64_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
                     DAG.getConstant(C, DL, MVT::i32));
}
/// An EXTR instruction is made up of two shifts, ORed together. This helper
/// searches for and classifies those shifts.
///
/// \param N           Candidate shift node.
/// \param Src         [out] The value being shifted.
/// \param ShiftAmount [out] The constant shift amount.
/// \param FromHi      [out] True for SRL, false for SHL.
/// \returns true when \p N is an SHL or SRL by a constant amount; the
/// out-parameters Src and ShiftAmount are only written in that case.
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
                         bool &FromHi) {
  if (N.getOpcode() == ISD::SHL)
    FromHi = false;
  else if (N.getOpcode() == ISD::SRL)
    FromHi = true;
  else
    return false;

  // Only constant shift amounts can be encoded.
  if (!isa<ConstantSDNode>(N.getOperand(1)))
    return false;

  ShiftAmount = N->getConstantOperandVal(1);
  Src = N->getOperand(0);
  return true;
}
/// EXTR instruction extracts a contiguous chunk of bits from two existing
/// registers viewed as a high/low pair. This function looks for the pattern:
/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
/// with an EXTR. Can't quite be done in TableGen because the two immediates
/// aren't independent.
///
/// \returns the EXTR node, or an empty SDValue when \p N (an i32/i64 OR) does
/// not match.
static SDValue tryCombineToEXTR(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  assert(N->getOpcode() == ISD::OR && "Unexpected root");

  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDValue LHS;
  uint32_t ShiftLHS = 0;
  bool LHSFromHi = false;
  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
    return SDValue();

  SDValue RHS;
  uint32_t ShiftRHS = 0;
  bool RHSFromHi = false;
  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
    return SDValue();

  // If they're both trying to come from the high part of the register, they're
  // not really an EXTR.
  if (LHSFromHi == RHSFromHi)
    return SDValue();

  // The two shift amounts must together cover exactly one register width.
  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
    return SDValue();

  // Normalize so that LHS is the SHL half and RHS is the SRL half.
  if (LHSFromHi) {
    std::swap(LHS, RHS);
    std::swap(ShiftLHS, ShiftRHS);
  }

  return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
                     DAG.getConstant(ShiftRHS, DL, MVT::i64));
}
/// Try to turn (or (and A, Mask), (and B, ~Mask)) on vectors into an
/// AArch64ISD::BSP node when Mask and ~Mask are constant BUILD_VECTORs whose
/// elements are bitwise complements of each other.
///
/// \returns the BSP node, or an empty SDValue when \p N does not match.
static SDValue tryCombineToBSL(SDNode *N,
                               TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::AND)
    return SDValue();

  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() != ISD::AND)
    return SDValue();

  // We only have to look for constant vectors here since the general, variable
  // case can be handled in TableGen.
  unsigned Bits = VT.getScalarSizeInBits();
  uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
  // Try every pairing of one operand from each AND as the mask candidates.
  for (int i = 1; i >= 0; --i)
    for (int j = 1; j >= 0; --j) {
      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
      if (!BVN0 || !BVN1)
        continue;

      // Every lane of BVN0 must be the complement of the same lane of BVN1.
      bool FoundMatch = true;
      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
        if (!CN0 || !CN1 ||
            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
          FoundMatch = false;
          break;
        }
      }

      if (FoundMatch)
        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
                           N0->getOperand(1 - i), N1->getOperand(1 - j));
    }

  return SDValue();
}
/// DAG combine entry point for OR nodes: on legal types, tries the EXTR fold
/// first and then the BSL fold.
///
/// \returns the first successful replacement, or an empty SDValue.
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                const AArch64Subtarget *Subtarget) {
  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (SDValue Res = tryCombineToEXTR(N, DCI))
    return Res;

  if (SDValue Res = tryCombineToBSL(N, DCI))
    return Res;

  return SDValue();
}
/// Returns true when \p N is a DUP or SPLAT_VECTOR of a constant equal to the
/// all-ones mask for the element type of \p MemVT (0xff for i8, 0xffff for
/// i16, 0xffffffff for i32). Any other element type yields false.
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
  if (!MemVT.getVectorElementType().isSimple())
    return false;

  uint64_t MaskForTy = 0ull;
  switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
  case MVT::i8:
    MaskForTy = 0xffull;
    break;
  case MVT::i16:
    MaskForTy = 0xffffull;
    break;
  case MVT::i32:
    MaskForTy = 0xffffffffull;
    break;
  default:
    return false;
  }

  if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
    if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
      return Op0->getAPIntValue().getLimitedValue() == MaskForTy;

  return false;
}
// Combine for AND on scalable vectors. Two visible cases: an AND of an
// unsigned unpack (UUNPKHI/UUNPKLO) with a DUP mask, and an AND of an SVE load
// with a splat mask that the load's memory type already implies.
// NOTE(review): this text appears to have lost lines in extraction (no closing
// braces, see the notes below); do not treat it as compilable as-is.
static SDValue performSVEAndCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SelectionDAG &DAG = DCI.DAG;
SDValue Src = N->getOperand(0);
unsigned Opc = Src->getOpcode();
// Zero/any extend of an unsigned unpack
if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
SDValue UnpkOp = Src->getOperand(0);
SDValue Dup = N->getOperand(1);
if (Dup.getOpcode() != AArch64ISD::DUP)
return SDValue();
SDLoc DL(N);
// NOTE(review): C is the result of dyn_cast and is dereferenced on the next
// line without a null check; a DUP of a non-constant would crash here.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
uint64_t ExtVal = C->getZExtValue();
// If the mask is fully covered by the unpack, we don't need to push
// a new AND onto the operand
EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
(ExtVal == 0xFFFF && EltTy == MVT::i16) ||
(ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
return Src;
// Truncate to prevent a DUP with an over wide constant
APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
// Otherwise, make sure we propagate the AND to the operand
// of the unpack
// NOTE(review): the getNode call below has no result-type argument between DL
// and the constant; presumably a line (the unpack operand's type?) was
// dropped — confirm against the upstream file.
Dup = DAG.getNode(AArch64ISD::DUP, DL,
DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
SDValue And = DAG.getNode(ISD::AND, DL,
UnpkOp->getValueType(0), UnpkOp, Dup);
return DAG.getNode(Opc, DL, N->getValueType(0), And);
SDValue Mask = N->getOperand(1);
// Only fold into the load when this AND is its sole user.
if (!Src.hasOneUse())
return SDValue();
// SVE load instructions perform an implicit zero-extend, which makes them
// perfect candidates for combining.
// NOTE(review): MemVT is assigned below but no declaration is visible in this
// function, and the two assignments are not separated by case labels or
// breaks — the operand-4 group's labels and the default label look missing.
switch (Opc) {
case AArch64ISD::LD1_MERGE_ZERO:
MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
return SDValue();
// If the mask is exactly the all-ones value for the loaded element type, the
// AND is redundant and the load result can be used directly.
if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
return Src;
return SDValue();
/// DAG combine entry point for vector AND nodes. Scalable vectors are
/// delegated to performSVEAndCombine; for fixed vectors with a BUILD_VECTOR
/// right-hand operand it tries to select a BIC-immediate form.
///
/// \returns the replacement node, or an empty SDValue when nothing applies.
static SDValue performANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (VT.isScalableVector())
    return performSVEAndCombine(N, DCI);

  // The mask is expected on the right-hand side (operand 0 is passed on as
  // the value being masked).
  BuildVectorSDNode *BVN =
      dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
  if (!BVN)
    return SDValue();

  // AND does not accept an immediate, so check if we can use a BIC immediate
  // instruction instead. We do this here instead of using a (and x, (mvni imm))
  // pattern in isel, because some immediates may be lowered to the preferred
  // (and x, (movi imm)) form, even though an mvni representation also exists.
  APInt DefBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  if (resolveBuildVector(BVN, DefBits, UndefBits)) {
    SDValue NewOp;

    // BIC clears the bits set in its immediate, so encode the inverted mask.
    DefBits = ~DefBits;
    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
                                    DefBits, &LHS)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
                                    DefBits, &LHS)))
      return NewOp;

    // Retry with the alternative resolution of undefined lanes.
    UndefBits = ~UndefBits;
    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
                                    UndefBits, &LHS)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
                                    UndefBits, &LHS)))
      return NewOp;
  }

  return SDValue();
}
/// DAG combine for scalar SRL: rewrites a half-width right shift of a BSWAP
/// into a rotate when the upper half of the swapped value is known zero.
///
/// \returns the ROTR node, or an empty SDValue when \p N does not match.
static SDValue performSRLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
  // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
  // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() == ISD::BSWAP) {
    SDLoc DL(N);
    SDValue N1 = N->getOperand(1);
    SDValue N00 = N0.getOperand(0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
      uint64_t ShiftAmt = C->getZExtValue();
      if (VT == MVT::i32 && ShiftAmt == 16 &&
          DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
        return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
      if (VT == MVT::i64 && ShiftAmt == 32 &&
          DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
        return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
    }
  }
  return SDValue();
}
/// DAG combine for CONCAT_VECTORS. Handles, in order:
///  1. concat of two truncates with an illegal intermediate type,
///  2. (after legalization) concat of two [us]rhadds of matching low/high
///     subvector extracts,
///  3. concat of a value with itself (a 2-element splat),
///  4. moving a bitcast off the right-hand operand.
///
/// \returns the replacement node, or an empty SDValue when nothing applies.
static SDValue performConcatVectorsCombine(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           SelectionDAG &DAG) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
  unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();

  // Optimize concat_vectors of truncated vectors, where the intermediate
  // type is illegal, to avoid said illegality,  e.g.,
  //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
  //                          (v2i16 (truncate (v2i64)))))
  // ->
  //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
  //                                    (v4i32 (bitcast (v2i64))),
  //                                    <0, 2, 4, 6>)))
  // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
  // on both input and result type, so we might generate worse code.
  // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
  if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
      N1Opc == ISD::TRUNCATE) {
    SDValue N00 = N0->getOperand(0);
    SDValue N10 = N1->getOperand(0);
    EVT N00VT = N00.getValueType();

    if (N00VT == N10.getValueType() &&
        (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
        N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
      MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
      // Select the even-numbered MidVT lanes from the two bitcast inputs.
      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
      for (size_t i = 0; i < Mask.size(); ++i)
        Mask[i] = i * 2;
      return DAG.getNode(ISD::TRUNCATE, dl, VT,
                         DAG.getVectorShuffle(
                             MidVT, dl,
                             DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
                             DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
    }
  }

  // Wait 'til after everything is legalized to try this. That way we have
  // legal vector types and such.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // Optimise concat_vectors of two [us]rhadds that use extracted subvectors
  // from the same original vectors. Combine these into a single [us]rhadd that
  // operates on the two original vectors. Example:
  //  (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
  //                                        extract_subvector (v16i8 OpB,
  //                                        <0>))),
  //                         (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
  //                                        extract_subvector (v16i8 OpB,
  //                                        <8>)))))
  // ->
  //  (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
  if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
      (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) {
    SDValue N00 = N0->getOperand(0);
    SDValue N01 = N0->getOperand(1);
    SDValue N10 = N1->getOperand(0);
    SDValue N11 = N1->getOperand(1);

    EVT N00VT = N00.getValueType();
    EVT N10VT = N10.getValueType();

    if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
      SDValue N00Source = N00->getOperand(0);
      SDValue N01Source = N01->getOperand(0);
      SDValue N10Source = N10->getOperand(0);
      SDValue N11Source = N11->getOperand(0);

      if (N00Source == N10Source && N01Source == N11Source &&
          N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
        assert(N0.getValueType() == N1.getValueType());

        uint64_t N00Index = N00.getConstantOperandVal(1);
        uint64_t N01Index = N01.getConstantOperandVal(1);
        uint64_t N10Index = N10.getConstantOperandVal(1);
        uint64_t N11Index = N11.getConstantOperandVal(1);

        // First half must extract from lane 0, second half from the midpoint.
        if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
            N10Index == N00VT.getVectorNumElements())
          return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
      }
    }
  }

  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
  // canonicalise to that.
  if (N0 == N1 && VT.getVectorNumElements() == 2) {
    assert(VT.getScalarSizeInBits() == 64);
    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
                       DAG.getConstant(0, dl, MVT::i64));
  }

  // Canonicalise concat_vectors so that the right-hand vector has as few
  // bit-casts as possible before its real operation. The primary matching
  // destination for these operations will be the narrowing "2" instructions,
  // which depend on the operation being performed on this right-hand vector.
  // For example,
  //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
  // becomes
  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))

  if (N1Opc != ISD::BITCAST)
    return SDValue();
  SDValue RHS = N1->getOperand(0);
  MVT RHSTy = RHS.getValueType().getSimpleVT();
  // If the RHS is not a vector, this is not the pattern we're looking for.
  if (!RHSTy.isVector())
    return SDValue();

  LLVM_DEBUG(
      dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");

  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
                                  RHSTy.getVectorNumElements() * 2);
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
                                 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
                                 RHS));
}
/// Rewrites a scalar fixed-point conversion intrinsic whose input is a lane
/// extract into a vector conversion followed by a lane extract.
///
/// \p N is the intrinsic node: operand 0 is the intrinsic ID, operand 1 the
/// value to convert, operand 2 the shift amount.
/// \returns the EXTRACT_VECTOR_ELT of the vector conversion, or an empty
/// SDValue when operand 1 is not a lane extract or legalization is pending.
static SDValue tryCombineFixedPointConvert(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           SelectionDAG &DAG) {
  // Wait until after everything is legalized to try this. That way we have
  // legal vector types and such.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();
  // Transform a scalar conversion of a value from a lane extract into a
  // lane extract of a vector conversion. E.g., from foo1 to foo2:
  // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
  // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
  //
  // The second form interacts better with instruction selection and the
  // register allocator to avoid cross-class register copies that aren't
  // coalescable due to a lane reference.

  // Check the operand and see if it originates from a lane extract.
  SDValue Op1 = N->getOperand(1);
  if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    // Yep, no additional predication needed. Perform the transform.
    SDValue IID = N->getOperand(0);
    SDValue Shift = N->getOperand(2);
    SDValue Vec = Op1.getOperand(0);
    SDValue Lane = Op1.getOperand(1);
    EVT ResTy = N->getValueType(0);
    EVT VecResTy;
    SDLoc DL(N);

    // The vector width should be 128 bits by the time we get here, even
    // if it started as 64 bits (the extract_vector handling will have
    // done so).
    assert(Vec.getValueSizeInBits() == 128 &&
           "unexpected vector size on extract_vector_elt!");
    if (Vec.getValueType() == MVT::v4i32)
      VecResTy = MVT::v4f32;
    else if (Vec.getValueType() == MVT::v2i64)
      VecResTy = MVT::v2f64;
    else
      llvm_unreachable("unexpected vector type!");

    SDValue Convert =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
  }
  return SDValue();
}
// AArch64 high-vector "long" operations are formed by performing the non-high
// version on an extract_subvector of each operand which gets the high half:
//
//  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
//
// However, there are cases which don't have an extract_high explicitly, but
// have another operation that can be made compatible with one for free. For
// example:
//
//  (dupv64 scalar) --> (extract_high (dup128 scalar))
//
// This routine does the actual conversion of such DUPs, once outer routines
// have determined that everything else is in order.
// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
// similarly here.
//
// Returns the EXTRACT_SUBVECTOR of the widened node, or an empty SDValue when
// N is not one of the supported opcodes or is not a 64-bit vector.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
  switch (N.getOpcode()) {
  case AArch64ISD::DUP:
  case AArch64ISD::DUPLANE8:
  case AArch64ISD::DUPLANE16:
  case AArch64ISD::DUPLANE32:
  case AArch64ISD::DUPLANE64:
  case AArch64ISD::MOVI:
  case AArch64ISD::MOVIshift:
  case AArch64ISD::MOVIedit:
  case AArch64ISD::MOVImsl:
  case AArch64ISD::MVNIshift:
  case AArch64ISD::MVNImsl:
    break;
  default:
    // FMOV could be supported, but isn't very useful, as it would only occur
    // if you passed a bitcast' floating point immediate to an eligible long
    // integer op (addl, smull, ...).
    return SDValue();
  }

  MVT NarrowTy = N.getSimpleValueType();
  if (!NarrowTy.is64BitVector())
    return SDValue();

  // Rebuild the same node at twice the lane count, then take its high half.
  MVT ElementTy = NarrowTy.getVectorElementType();
  unsigned NumElems = NarrowTy.getVectorNumElements();
  MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);

  SDLoc dl(N);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
                     DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
                     DAG.getConstant(NumElems, dl, MVT::i64));
}
/// Returns true when \p N, after looking through at most one BITCAST, is an
/// EXTRACT_SUBVECTOR whose start index equals half the source vector's
/// element count (i.e. it extracts the high half).
static bool isEssentiallyExtractHighSubvector(SDValue N) {
  if (N.getOpcode() == ISD::BITCAST)
    N = N.getOperand(0);
  if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
    return false;
  return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
         N.getOperand(0).getValueType().getVectorNumElements() / 2;
}
/// Helper structure to keep track of ISD::SET_CC operands.
struct GenericSetCCInfo {
  const SDValue *Opnd0; ///< First compared operand.
  const SDValue *Opnd1; ///< Second compared operand.
  ISD::CondCode CC;     ///< Generic condition code of the comparison.
};
/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
struct AArch64SetCCInfo {
  const SDValue *Cmp;     ///< The flag-producing compare operand of the CSEL.
  AArch64CC::CondCode CC; ///< AArch64 condition code tested by the CSEL.
};
/// Helper structure to keep track of SetCC information.
/// Exactly one member is meaningful at a time.
union SetCCInfo {
  GenericSetCCInfo Generic;
  AArch64SetCCInfo AArch64;
};
/// Helper structure to be able to read SetCC information.  If the IsAArch64
/// field is set to true, Info is an AArch64SetCCInfo, otherwise Info is a
/// GenericSetCCInfo.
struct SetCCInfoAndKind {
  SetCCInfo Info;
  bool IsAArch64;
};
/// Check whether or not \p Op is a SET_CC operation, either a generic or
/// an
/// AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
/// \post SetCCInfo is meaningful only when this function returns true.
/// \return True when Op is a kind of SET_CC operation.
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
  // If this is a setcc, this is straight forward.
  if (Op.getOpcode() == ISD::SETCC) {
    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    SetCCInfo.IsAArch64 = false;
    return true;
  }
  // Otherwise, check if this is a matching csel instruction.
  // In other words:
  // - csel 1, 0, cc
  // - csel 0, 1, !cc
  if (Op.getOpcode() != AArch64ISD::CSEL)
    return false;
  // Set the information about the operands.
  // TODO: we want the operands of the Cmp not the csel
  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
  SetCCInfo.IsAArch64 = true;
  // CSEL operands are (true value, false value, condition code, flags).
  SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // Check that the operands matches the constraints:
  // (1) Both operands must be constants.
  // (2) One must be 1 and the other must be 0.
  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));

  // Check (1).
  if (!TValue || !FValue)
    return false;

  // Check (2).
  if (!TValue->isOne()) {
    // Update the comparison when we are interested in !cc.
    std::swap(TValue, FValue);
    SetCCInfo.Info.AArch64.CC =
        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
  }
  return TValue->isOne() && FValue->isNullValue();
}
// Returns true if Op is setcc or zext of setcc. On success, Info describes the
// underlying setcc as filled in by isSetCC.
static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
  if (isSetCC(Op, Info))
    return true;
  return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
    isSetCC(Op->getOperand(0), Info));
}
// The folding we want to perform is:
// (add x, [zext] (setcc cc ...) )
//   -->
// (csel x, (add x, 1), !cc ...)
//
// The latter will get matched to a CSINC instruction.
//
// Returns the CSEL node, or an empty SDValue when neither operand of the ADD
// is a (zero-extended) setcc or the comparison is not on i32/i64.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
  SDValue LHS = Op->getOperand(0);
  SDValue RHS = Op->getOperand(1);
  SetCCInfoAndKind InfoAndKind;

  // If neither operand is a SET_CC, give up.
  // After this block LHS is the setcc side and RHS is the plain addend.
  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
    std::swap(LHS, RHS);
    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
      return SDValue();
  }

  // FIXME: This could be generalized to work for FP comparisons.
  EVT CmpVT = InfoAndKind.IsAArch64
                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
    return SDValue();

  SDValue CCVal;
  SDValue Cmp;
  SDLoc dl(Op);
  if (InfoAndKind.IsAArch64) {
    // Already lowered: reuse the existing compare with the inverted condition.
    CCVal = DAG.getConstant(
        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
        MVT::i32);
    Cmp = *InfoAndKind.Info.AArch64.Cmp;
  } else
    // Generic setcc: emit an AArch64 compare for the inverted condition;
    // CCVal is passed to getAArch64Cmp, which is expected to set it.
    Cmp = getAArch64Cmp(
        *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
        ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
        dl);

  EVT VT = Op->getValueType(0);
  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
}
// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs. They are normally matched by
// patterns like:
//
// (add (zeroext (extract_high LHS)),
//      (zeroext (extract_high RHS)))
// -> uaddl2 vD, vN, vM
//
// However, if one of the extracts is something like a duplicate, this
// instruction can still be used profitably. This function puts the DAG into a
// more appropriate form for those patterns to trigger.
//
// For non-128-bit result types an ADD is instead handed to
// performSetccAddFolding. Returns an empty SDValue when nothing applies.
static SDValue performAddSubLongCombine(SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        SelectionDAG &DAG) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  MVT VT = N->getSimpleValueType(0);
  if (!VT.is128BitVector()) {
    if (N->getOpcode() == ISD::ADD)
      return performSetccAddFolding(N, DAG);
    return SDValue();
  }

  // Make sure both branches are extended in the same way.
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
      LHS.getOpcode() != RHS.getOpcode())
    return SDValue();

  unsigned ExtType = LHS.getOpcode();

  // It's not worth doing if at least one of the inputs isn't already an
  // extract, but we don't know which it'll be so we have to try both.
  if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
    if (!RHS.getNode())
      return SDValue();

    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
  } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
    if (!LHS.getNode())
      return SDValue();

    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
  }

  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
}
// Massage DAGs which we can use the high-half "long" operations on into
// something isel will recognize better. E.g.
//
// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
//   (aarch64_neon_umull (extract_high (v2i64 vec)))
//                     (extract_high (v2i64 (dup128 scalar)))))
//
// N is the intrinsic node (operand 0 = intrinsic ID, operands 1 and 2 = the
// two 64-bit vector inputs). Returns the rebuilt intrinsic node, or an empty
// SDValue when the DUP-like side cannot be widened or legalization is pending.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       SelectionDAG &DAG) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  assert(LHS.getValueType().is64BitVector() &&
         RHS.getValueType().is64BitVector() &&
         "unexpected shape for long operation");

  // Either node could be a DUP, but it's not worth doing both of them (you'd
  // just as well use the non-high version) so look for a corresponding extract
  // operation on the other "wing".
  if (isEssentiallyExtractHighSubvector(LHS)) {
    RHS = tryExtendDUPToExtractHigh(RHS, DAG);
    if (!RHS.getNode())
      return SDValue();
  } else if (isEssentiallyExtractHighSubvector(RHS)) {
    LHS = tryExtendDUPToExtractHigh(LHS, DAG);
    if (!LHS.getNode())
      return SDValue();
  }

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), LHS, RHS);
}
/// Converts a NEON shift intrinsic with a constant (scalar or splat) shift
/// amount into the matching immediate-shift node.
///
/// \param IID  The shift intrinsic ID; must be one of the cases handled below.
/// \param N    The intrinsic node (operand 1 = value, operand 2 = amount).
/// \returns the immediate-form node, or an empty SDValue when the amount is
/// not a constant or is out of range for the chosen opcode.
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
  unsigned ElemBits = ElemTy.getSizeInBits();

  int64_t ShiftAmount;
  if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
    APInt SplatValue, SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                              HasAnyUndefs, ElemBits) ||
        SplatBitSize != ElemBits)
      return SDValue();

    ShiftAmount = SplatValue.getSExtValue();
  } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
    ShiftAmount = CVN->getSExtValue();
  } else
    return SDValue();

  unsigned Opcode;
  bool IsRightShift;
  switch (IID) {
  default:
    llvm_unreachable("Unknown shift intrinsic");
  case Intrinsic::aarch64_neon_sqshl:
    Opcode = AArch64ISD::SQSHL_I;
    IsRightShift = false;
    break;
  case Intrinsic::aarch64_neon_uqshl:
    Opcode = AArch64ISD::UQSHL_I;
    IsRightShift = false;
    break;
  case Intrinsic::aarch64_neon_srshl:
    Opcode = AArch64ISD::SRSHR_I;
    IsRightShift = true;
    break;
  case Intrinsic::aarch64_neon_urshl:
    Opcode = AArch64ISD::URSHR_I;
    IsRightShift = true;
    break;
  case Intrinsic::aarch64_neon_sqshlu:
    Opcode = AArch64ISD::SQSHLU_I;
    IsRightShift = false;
    break;
  case Intrinsic::aarch64_neon_sshl:
  case Intrinsic::aarch64_neon_ushl:
    // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
    // left shift for positive shift amounts. Below, we only replace the current
    // node with VSHL, if this condition is met.
    Opcode = AArch64ISD::VSHL;
    IsRightShift = false;
    break;
  }

  // Right-shift forms take a negative amount in the intrinsic; negate it for
  // the immediate node.
  if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
    SDLoc dl(N);
    return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
                       DAG.getConstant(-ShiftAmount, dl, MVT::i32));
  } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
    SDLoc dl(N);
    return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
                       DAG.getConstant(ShiftAmount, dl, MVT::i32));
  }

  return SDValue();
}
// The CRC32[BH] instructions ignore the high bits of their data operand. Since
// the intrinsics must be legal and take an i32, this means there's almost
// certainly going to be a zext in the DAG which we can eliminate.
//
// Mask is the constant the data operand must be ANDed with for the fold to
// apply. Returns the intrinsic rebuilt on the un-masked data, or an empty
// SDValue when operand 2 is not (and X, Mask).
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
  SDValue AndN = N->getOperand(2);
  if (AndN.getOpcode() != ISD::AND)
    return SDValue();

  ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
  if (!CMask || CMask->getZExtValue() != Mask)
    return SDValue();

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
                     N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
}
/// Replaces an across-lanes intrinsic with node \p Opc applied to the
/// intrinsic's vector operand (operand 1, keeping its vector type), followed
/// by an extract of lane 0 as the scalar result.
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
                                           SelectionDAG &DAG) {
  SDLoc dl(N);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
                     DAG.getNode(Opc, dl,
                                 N->getOperand(1).getValueType(),
                                 N->getOperand(1)),
                     DAG.getConstant(0, dl, MVT::i64));
}
/// Lowers an SVE integer reduction intrinsic to node \p Opc plus an extract of
/// lane 0.
///
/// \p N is the intrinsic node (operand 1 = predicate, operand 2 = data).
/// \returns the extracted scalar, or an empty SDValue when the data element
/// type is not a scalar integer, the result type is not i8/i16/i32/i64, or the
/// data type is not legal.
static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc,
                                    SelectionDAG &DAG) {
  SDLoc dl(N);
  LLVMContext &Ctx = *DAG.getContext();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  EVT VT = N->getValueType(0);
  SDValue Pred = N->getOperand(1);
  SDValue Data = N->getOperand(2);
  EVT DataVT = Data.getValueType();

  if (DataVT.getVectorElementType().isScalarInteger() &&
      (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64)) {
    if (!TLI.isTypeLegal(DataVT))
      return SDValue();

    // The reduction node produces a NEON-sized vector of the result type.
    EVT OutputVT = EVT::getVectorVT(Ctx, VT,
      AArch64::NeonBitsPerVector / VT.getSizeInBits());
    SDValue Reduce = DAG.getNode(Opc, dl, OutputVT, Pred, Data);
    SDValue Zero = DAG.getConstant(0, dl, MVT::i64);
    SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Reduce, Zero);

    return Result;
  }

  return SDValue();
}
/// Lowers the SVE index intrinsic to AArch64ISD::INDEX_VECTOR. i8 and i16
/// scalar operands are any-extended to i32 first.
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op1 = N->getOperand(1);
  SDValue Op2 = N->getOperand(2);
  EVT ScalarTy = Op1.getValueType();

  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) {
    Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
    Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
  }

  return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0),
                     Op1, Op2);
}
/// Lowers the SVE predicated dup intrinsic to AArch64ISD::DUP_MERGE_PASSTHRU.
/// Intrinsic operands are (id, passthru, predicate, scalar); the node takes
/// them as (predicate, scalar, passthru). i8/i16 scalars are any-extended to
/// i32.
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
  SDLoc dl(N);
  SDValue Scalar = N->getOperand(3);
  EVT ScalarTy = Scalar.getValueType();

  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
    Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);

  SDValue Passthru = N->getOperand(1);
  SDValue Pred = N->getOperand(2);
  return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
                     Pred, Scalar, Passthru);
}
/// Lowers the SVE ext intrinsic to AArch64ISD::EXT on byte vectors: both
/// vector operands are bitcast to a scalable i8 vector and the element index
/// is scaled to a byte offset.
///
/// \returns the result bitcast back to the original type, or an empty SDValue
/// when the type's known minimum size is not AArch64::SVEBitsPerBlock.
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
  SDLoc dl(N);
  LLVMContext &Ctx = *DAG.getContext();
  EVT VT = N->getValueType(0);

  assert(VT.isScalableVector() && "Expected a scalable vector.");

  // Current lowering only supports the SVE-ACLE types.
  if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
    return SDValue();

  unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
  unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
  EVT ByteVT = EVT::getVectorVT(Ctx, MVT::i8, { ByteSize, true });

  // Convert everything to the domain of EXT (i.e bytes).
  SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
  SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
  SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
                            DAG.getConstant(ElemSize, dl, MVT::i32));

  SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
  return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
}
/// Converts an SVE wide-compare intrinsic whose comparator is a DUP or
/// SPLAT_VECTOR of a small constant into SETCC_MERGE_ZERO against a splat of
/// that immediate.
///
/// \param N  The intrinsic node (operand 1 = predicate, operand 2 = compared
///           vector, operand 3 = comparator).
/// \param CC Condition code to attach to the resulting SETCC_MERGE_ZERO.
/// \returns the new node, or an empty SDValue when the comparator is not a
/// constant splat or the constant is outside the accepted range
/// ([-16, 15] for signed, [0, 127] for unsigned comparisons).
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        SelectionDAG &DAG) {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SDValue Comparator = N->getOperand(3);
  if (Comparator.getOpcode() == AArch64ISD::DUP ||
      Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
    unsigned IID = getIntrinsicID(N);
    EVT VT = N->getValueType(0);
    EVT CmpVT = N->getOperand(2).getValueType();
    SDValue Pred = N->getOperand(1);
    SDValue Imm;
    SDLoc DL(N);

    switch (IID) {
    default:
      llvm_unreachable("Called with wrong intrinsic!");

    // Signed comparisons
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide: {
      if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
        int64_t ImmVal = CN->getSExtValue();
        if (ImmVal >= -16 && ImmVal <= 15)
          Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
        else
          return SDValue();
      }
      break;
    }
    // Unsigned comparisons
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide:  {
      if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
        uint64_t ImmVal = CN->getZExtValue();
        if (ImmVal <= 127)
          Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
        else
          return SDValue();
      }
      break;
    }
    }

    // Imm stays empty when the splatted value was not a constant.
    if (!Imm)
      return SDValue();

    SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
                       N->getOperand(2), Splat, DAG.getCondCode(CC));
  }

  return SDValue();
}
/// Emit a PTEST of \p Op governed by \p Pg and convert the resulting flags
/// into an integer of type \p VT according to \p Cond.
///
/// The flags feed a CSEL between the constants 0 and 1 (built in the legal
/// type that \p VT transforms to); the CSEL result is then zero-extended or
/// truncated to \p VT.
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
                        AArch64CC::CondCode Cond) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  SDLoc DL(Op);
  assert(Op.getValueType().isScalableVector() &&
         TLI.isTypeLegal(Op.getValueType()) &&
         "Expected legal scalable vector type!");

  // Ensure target specific opcodes are using legal type.
  EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
  SDValue TVal = DAG.getConstant(1, DL, OutVT);
  SDValue FVal = DAG.getConstant(0, DL, OutVT);

  // Set condition code (CC) flags.
  SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op);

  // Convert CC to integer based on requested condition.
  // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
  SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
  SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
  return DAG.getZExtOrTrunc(Res, DL, VT);
}
/// Rewrite an SVE floating-point reduction intrinsic as the predicated
/// reduction node \p Opc followed by an extract of lane 0.
///
/// Operand 1 of \p N is the governing predicate and operand 2 is the vector
/// to reduce.
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
                                     SelectionDAG &DAG) {
  SDLoc DL(N);

  SDValue Pred = N->getOperand(1);
  SDValue VecToReduce = N->getOperand(2);

  EVT ReduceVT = VecToReduce.getValueType();
  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);

  // SVE reductions set the whole vector register with the first element
  // containing the reduction result, which we'll now extract.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
                     Zero);
}
/// Rewrite an ordered SVE floating-point reduction intrinsic as the predicated
/// node \p Opc followed by an extract of lane 0.
///
/// Operand 1 of \p N is the governing predicate, operand 2 the scalar initial
/// value and operand 3 the vector to reduce. The initial value is inserted
/// into lane 0 of an otherwise undefined vector before being passed to \p Opc.
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
                                            SelectionDAG &DAG) {
  SDLoc DL(N);

  SDValue Pred = N->getOperand(1);
  SDValue InitVal = N->getOperand(2);
  SDValue VecToReduce = N->getOperand(3);
  EVT ReduceVT = VecToReduce.getValueType();

  // Ordered reductions use the first lane of the result vector as the
  // reduction's initial value.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
                        DAG.getUNDEF(ReduceVT), InitVal, Zero);

  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);

  // SVE reductions set the whole vector register with the first element
  // containing the reduction result, which we'll now extract.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
                     Zero);
}
/// DAG combine for intrinsic nodes: dispatches on the intrinsic ID of N and
/// returns the replacement built by the matching case, or an empty SDValue.
/// The Subtarget parameter is not referenced in the lines visible here.
// NOTE(review): this extract appears to have lost lines (closing braces,
// `default`/`break` statements and the trailing arguments of a few calls).
// Confirm against the upstream file before relying on the control flow shown.
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
unsigned IID = getIntrinsicID(N);
switch (IID) {
// NEON fixed-point conversions.
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
// NEON across-lanes integer reductions map onto AArch64ISD nodes.
case Intrinsic::aarch64_neon_saddv:
return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
case Intrinsic::aarch64_neon_uaddv:
return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
case Intrinsic::aarch64_neon_sminv:
return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
case Intrinsic::aarch64_neon_uminv:
return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
case Intrinsic::aarch64_neon_smaxv:
return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
case Intrinsic::aarch64_neon_umaxv:
return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
// NEON floating-point min/max intrinsics become generic ISD nodes.
case Intrinsic::aarch64_neon_fmax:
return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fmin:
return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fmaxnm:
return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fminnm:
return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
// NEON long multiplies.
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull:
case Intrinsic::aarch64_neon_pmull:
case Intrinsic::aarch64_neon_sqdmull:
return tryCombineLongOpWithDup(IID, N, DCI, DAG);
// NEON shifts.
case Intrinsic::aarch64_neon_sqshl:
case Intrinsic::aarch64_neon_uqshl:
case Intrinsic::aarch64_neon_sqshlu:
case Intrinsic::aarch64_neon_srshl:
case Intrinsic::aarch64_neon_urshl:
case Intrinsic::aarch64_neon_sshl:
case Intrinsic::aarch64_neon_ushl:
return tryCombineShiftImm(IID, N, DAG);
// CRC32 on bytes / halfwords: the first argument is the mask 0xff / 0xffff.
case Intrinsic::aarch64_crc32b:
case Intrinsic::aarch64_crc32cb:
return tryCombineCRC32(0xff, N, DAG);
case Intrinsic::aarch64_crc32h:
case Intrinsic::aarch64_crc32ch:
return tryCombineCRC32(0xffff, N, DAG);
// SVE integer reductions use the *_PRED AArch64ISD nodes.
case Intrinsic::aarch64_sve_smaxv:
return LowerSVEIntReduction(N, AArch64ISD::SMAXV_PRED, DAG);
case Intrinsic::aarch64_sve_umaxv:
return LowerSVEIntReduction(N, AArch64ISD::UMAXV_PRED, DAG);
case Intrinsic::aarch64_sve_sminv:
return LowerSVEIntReduction(N, AArch64ISD::SMINV_PRED, DAG);
case Intrinsic::aarch64_sve_uminv:
return LowerSVEIntReduction(N, AArch64ISD::UMINV_PRED, DAG);
case Intrinsic::aarch64_sve_orv:
return LowerSVEIntReduction(N, AArch64ISD::ORV_PRED, DAG);
case Intrinsic::aarch64_sve_eorv:
return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG);
case Intrinsic::aarch64_sve_andv:
return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG);
case Intrinsic::aarch64_sve_index:
return LowerSVEIntrinsicIndex(N, DAG);
case Intrinsic::aarch64_sve_dup:
return LowerSVEIntrinsicDUP(N, DAG);
case Intrinsic::aarch64_sve_dup_x:
return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
// NOTE(review): the remaining argument(s) of the SPLAT_VECTOR call above are
// not present in this extract.
case Intrinsic::aarch64_sve_ext:
return LowerSVEIntrinsicEXT(N, DAG);
// SVE min/max and shifts forward operands 1-3 to the *_MERGE_OP1 nodes.
case Intrinsic::aarch64_sve_smin:
return DAG.getNode(AArch64ISD::SMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_umin:
return DAG.getNode(AArch64ISD::UMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_smax:
return DAG.getNode(AArch64ISD::SMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_umax:
return DAG.getNode(AArch64ISD::UMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_lsl:
return DAG.getNode(AArch64ISD::SHL_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_lsr:
return DAG.getNode(AArch64ISD::SRL_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_asr:
return DAG.getNode(AArch64ISD::SRA_MERGE_OP1, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
// SVE compares: only rewritten to SETCC_MERGE_ZERO when operand 2 is not a
// floating-point vector.
// NOTE(review): no `break` is visible after each guarded return below, so as
// written a floating-point operand would fall through into the following
// cases. Presumably `break;` lines were dropped by the extraction; confirm.
case Intrinsic::aarch64_sve_cmphs:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
N->getValueType(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
case Intrinsic::aarch64_sve_cmphi:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
N->getValueType(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
case Intrinsic::aarch64_sve_cmpge:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
N->getValueType(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), DAG.getCondCode(ISD::SETGE));
case Intrinsic::aarch64_sve_cmpgt:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
N->getValueType(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), DAG.getCondCode(ISD::SETGT));
case Intrinsic::aarch64_sve_cmpeq:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
N->getValueType(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
case Intrinsic::aarch64_sve_cmpne:
if (!N->getOperand(2).getValueType().isFloatingPoint())
return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
N->getValueType(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), DAG.getCondCode(ISD::SETNE));
// SVE floating-point reductions.
case Intrinsic::aarch64_sve_fadda:
return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
case Intrinsic::aarch64_sve_faddv:
return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
case Intrinsic::aarch64_sve_fmaxnmv:
return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
case Intrinsic::aarch64_sve_fmaxv:
return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
case Intrinsic::aarch64_sve_fminnmv:
return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
case Intrinsic::aarch64_sve_fminv:
return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
case Intrinsic::aarch64_sve_sel:
return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2), N->getOperand(3));
// SVE wide compares: each intrinsic is paired with its ISD condition code.
case Intrinsic::aarch64_sve_cmpeq_wide:
return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
case Intrinsic::aarch64_sve_cmpne_wide:
return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
case Intrinsic::aarch64_sve_cmpge_wide:
return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
case Intrinsic::aarch64_sve_cmpgt_wide:
return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
case Intrinsic::aarch64_sve_cmplt_wide:
return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
case Intrinsic::aarch64_sve_cmple_wide:
return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
case Intrinsic::aarch64_sve_cmphs_wide:
return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
case Intrinsic::aarch64_sve_cmphi_wide:
return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
case Intrinsic::aarch64_sve_cmplo_wide:
return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
case Intrinsic::aarch64_sve_cmpls_wide:
return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
// SVE predicate tests.
// NOTE(review): the final (condition-code) argument of each getPTest call
// below is not present in this extract.
case Intrinsic::aarch64_sve_ptest_any:
return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
case Intrinsic::aarch64_sve_ptest_first:
return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
case Intrinsic::aarch64_sve_ptest_last:
return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
// No combine applies.
return SDValue();
/// DAG combine for extend nodes. Two things happen in the visible code:
///  1. after operation legalization has started, a zero-extend of a NEON
///     sabd/uabd intrinsic is retried through tryCombineLongOpWithDup;
///  2. before operation legalization, an extend from a 64-bit source to an
///     illegal vector result type is performed in two steps (widen each
///     element to twice its size, then split and extend each half).
// NOTE(review): several lines appear to have been dropped from this extract
// (closing braces, the end of the ZERO_EXTEND call, the LoVT/HiVT
// declarations and the statements that define Lo/Hi). Confirm against the
// upstream file before editing the code.
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
// we can convert that DUP into another extract_high (of a bigger DUP), which
// helps the backend to decide that an sabdl2 would be useful, saving a real
// extract_high operation.
if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
SDNode *ABDNode = N->getOperand(0).getNode();
unsigned IID = getIntrinsicID(ABDNode);
if (IID == Intrinsic::aarch64_neon_sabd ||
IID == Intrinsic::aarch64_neon_uabd) {
SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
// The helper found nothing to rewrite.
if (!NewABD.getNode())
return SDValue();
// NOTE(review): the operand of this ZERO_EXTEND is cut off in the extract.
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
// This is effectively a custom type legalization for AArch64.
// Type legalization will split an extend of a small, legal, type to a larger
// illegal type by first splitting the destination type, often creating
// illegal source types, which then get legalized in isel-confusing ways,
// leading to really terrible codegen. E.g.,
// %result = v8i32 sext v8i8 %value
// becomes
// %losrc = extract_subreg %value, ...
// %hisrc = extract_subreg %value, ...
// %lo = v4i32 sext v4i8 %losrc
// %hi = v4i32 sext v4i8 %hisrc
// Things go rapidly downhill from there.
// For AArch64, the [sz]ext vector instructions can only go up one element
// size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
// take two instructions.
// This implies that the most efficient way to do the extend from v8i8
// to two v4i32 values is to first extend the v8i8 to v8i16, then do
// the normal splitting to happen for the v8i16->v8i32.
// This is pre-legalization to catch some cases where the default
// type legalization will create ill-tempered code.
if (!DCI.isBeforeLegalizeOps())
return SDValue();
// We're only interested in cleaning things up for non-legal vector types
// here. If both the source and destination are legal, things will just
// work naturally without any fiddling.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT ResVT = N->getValueType(0);
if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
return SDValue();
// If the vector type isn't a simple VT, it's beyond the scope of what
// we're worried about here. Let legalization do its thing and hope for
// the best.
SDValue Src = N->getOperand(0);
EVT SrcVT = Src->getValueType(0);
if (!ResVT.isSimple() || !SrcVT.isSimple())
return SDValue();
// If the source VT is a 64-bit fixed or scalable vector, we can play games
// and get the better results we want.
if (SrcVT.getSizeInBits().getKnownMinSize() != 64)
return SDValue();
// First step: extend to a vector with the same element count but elements
// twice as wide, using the same extend opcode as N.
unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
ElementCount SrcEC = SrcVT.getVectorElementCount();
SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), SrcEC);
SDLoc DL(N);
Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
// Now split the rest of the operation into two halves, each with a 64
// bit source.
SDValue Lo, Hi;
// NOTE(review): LoVT and HiVT are not declared in the visible lines.
LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext());
// NOTE(review): the InNVT initialiser and the two statements that assign
// Lo and Hi are truncated below; only their index constants (0 and
// InNVT.getVectorMinNumElements()) survive.
EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
DAG.getConstant(0, DL, MVT::i64));
DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64));
// Second step: extend each half to the half-width result type.
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
// Now combine the parts back together so we still have a single result
// like the combiner expects.
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
/// Replace the vector store \p St by \p NumVecElts consecutive scalar stores
/// of \p SplatVal.
///
/// The first store reuses the original base pointer; each following store
/// addresses base + constant offset, with the alignment reduced to what the
/// offset still guarantees. The stores are chained one after another.
///
/// \param DAG        Selection DAG used to build the stores.
/// \param St         The (non-truncating) vector store being replaced.
/// \param SplatVal   Scalar value to store into every element slot.
/// \param NumVecElts Number of scalar stores to emit; must be at least 1.
/// \returns the last store created (the end of the new chain).
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
                               SDValue SplatVal, unsigned NumVecElts) {
  assert(!St.isTruncatingStore() && "cannot split truncating vector store");
  // The loop below pre-decrements NumVecElts; zero would wrap around.
  assert(NumVecElts > 0 && "expected at least one element to store");
  unsigned OrigAlignment = St.getAlignment();
  unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;

  // Create scalar stores. This is at least as good as the code sequence for a
  // split unaligned store which is a dup.s, ext.b, and two stores.
  // Most of the time the three stores should be replaced by store pair
  // instructions (stp).
  SDLoc DL(&St);
  SDValue BasePtr = St.getBasePtr();
  uint64_t BaseOffset = 0;

  const MachinePointerInfo &PtrInfo = St.getPointerInfo();
  SDValue NewST1 =
      DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
                   OrigAlignment, St.getMemOperand()->getFlags());

  // As this is in ISel, this add will not be merged later, which may degrade
  // results; fold an existing constant offset into the offsets built below.
  if (BasePtr->getOpcode() == ISD::ADD &&
      isa<ConstantSDNode>(BasePtr->getOperand(1))) {
    BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
    BasePtr = BasePtr->getOperand(0);
  }

  unsigned Offset = EltOffset;
  while (--NumVecElts) {
    unsigned Alignment = MinAlign(OrigAlignment, Offset);
    SDValue OffsetPtr =
        DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                    DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
                          PtrInfo.getWithOffset(Offset), Alignment,
                          St.getMemOperand()->getFlags());
    Offset += EltOffset;
  }
  return NewST1;
}
// Returns an SVE type that ContentTy can be trivially sign or zero extended
// into. The container keeps the element count of ContentTy and uses the
// integer element width that fills a full SVE block for that count
// (nxv2 -> i64, nxv4 -> i32, nxv8 -> i16, nxv16 -> i8). Any other type is
// treated as unreachable.
static MVT getSVEContainerType(EVT ContentTy) {
  assert(ContentTy.isSimple() && "No SVE containers for extended types");

  switch (ContentTy.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("No known SVE container for this MVT type");
  case MVT::nxv2i8:
  case MVT::nxv2i16:
  case MVT::nxv2i32:
  case MVT::nxv2i64:
  case MVT::nxv2f32:
  case MVT::nxv2f64:
    return MVT::nxv2i64;
  case MVT::nxv4i8:
  case MVT::nxv4i16:
  case MVT::nxv4i32:
  case MVT::nxv4f32:
    return MVT::nxv4i32;
  case MVT::nxv8i8:
  case MVT::nxv8i16:
  case MVT::nxv8f16:
  case MVT::nxv8bf16:
    return MVT::nxv8i16;
  case MVT::nxv16i8:
    return MVT::nxv16i8;
  }
}
/// Rewrite an SVE LD1-style intrinsic as the target node \p Opc.
///
/// Integer results are loaded into their SVE container type (see
/// getSVEContainerType) and truncated back to the requested type afterwards;
/// the requested type is passed to the node as its last operand.
///
/// \returns the merged {value, chain} pair, or an empty SDValue when the
///          result type is wider than one SVE block.
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
    return SDValue();

  EVT ContainerVT = VT;
  if (ContainerVT.isInteger())
    ContainerVT = getSVEContainerType(ContainerVT);

  SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
  SDValue Ops[] = { N->getOperand(0), // Chain
                    N->getOperand(2), // Pg
                    N->getOperand(3), // Base
                    DAG.getValueType(VT) };

  SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
  SDValue LoadChain = SDValue(Load.getNode(), 1);

  // Narrow the container result back down to the type the intrinsic returns.
  if (ContainerVT.isInteger() && (VT != ContainerVT))
    Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));

  return DAG.getMergeValues({ Load, LoadChain }, DL);
}
/// Rewrite an SVE LDNT1 intrinsic as a masked load. Operand 2 of N is used as
/// the mask and operand 3 as the base address; the pass-through value is a
/// zero constant. Floating-point results are loaded as the equivalent integer
/// type and bitcast back.
/// Returns an empty SDValue for nxv8bf16 when the subtarget lacks BF16.
// NOTE(review): the trailing arguments of the getMaskedLoad call and the
// closing braces are missing from this extract; confirm against upstream.
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
EVT PtrTy = N->getOperand(3).getValueType();
// bfloat vectors are only handled when the subtarget reports BF16 support.
if (VT == MVT::nxv8bf16 &&
!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
return SDValue();
// Load floating-point data through the same-sized integer type.
EVT LoadVT = VT;
if (VT.isFloatingPoint())
LoadVT = VT.changeTypeToInteger();
auto *MINode = cast<MemIntrinsicSDNode>(N);
SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
// The offset operand is undef of the pointer type.
SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
MINode->getOperand(3), DAG.getUNDEF(PtrTy),
MINode->getOperand(2), PassThru,
MINode->getMemoryVT(), MINode->getMemOperand(),
// Bitcast the integer load back to the floating-point result type and
// return it together with the load's chain (result 1).
if (VT.isFloatingPoint()) {
SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
return DAG.getMergeValues(Ops, DL);
return L;
/// Rewrite an SVE replicating-load intrinsic as the target node \p Opcode
/// (LD1RQ_MERGE_ZERO or LD1RO_MERGE_ZERO).
///
/// Floating-point results are loaded as the equivalent integer type and
/// bitcast back to the requested type.
///
/// \returns the merged {value, chain} pair, or an empty SDValue for nxv8bf16
///          when the subtarget lacks BF16 support.
template <unsigned Opcode>
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
  static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
                    Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
                "Unsupported opcode.");
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  if (VT == MVT::nxv8bf16 &&
      !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
    return SDValue();

  EVT LoadVT = VT;
  if (VT.isFloatingPoint())
    LoadVT = VT.changeTypeToInteger();

  SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
  SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
  SDValue LoadChain = SDValue(Load.getNode(), 1);

  if (VT.isFloatingPoint())
    Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));

  return DAG.getMergeValues({Load, LoadChain}, DL);
}
/// Rewrite an SVE ST1 intrinsic as an AArch64ISD::ST1_PRED node. Operand 2 of
/// N is the data, operand 3 the predicate and operand 4 the base address. The
/// data is converted to its SVE container type (bitcast for floating point,
/// any-extend otherwise).
/// Returns an empty SDValue for nxv8bf16 when the subtarget lacks BF16.
// NOTE(review): this extract appears to have lost an `else` before the second
// SrcNew assignment and part of the Ops initialiser (SrcNew and InputVT are
// computed but not referenced in the visible lines); confirm against upstream.
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Data = N->getOperand(2);
EVT DataVT = Data.getValueType();
EVT HwSrcVt = getSVEContainerType(DataVT);
SDValue InputVT = DAG.getValueType(DataVT);
// bfloat vectors are only handled when the subtarget reports BF16 support.
if (DataVT == MVT::nxv8bf16 &&
!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
return SDValue();
// For floating-point data the value type operand is the container type.
if (DataVT.isFloatingPoint())
InputVT = DAG.getValueType(HwSrcVt);
SDValue SrcNew;
if (Data.getValueType().isFloatingPoint())
SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
SDValue Ops[] = { N->getOperand(0), // Chain
N->getOperand(4), // Base
N->getOperand(3), // Pg
return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
/// Rewrite an SVE STNT1 intrinsic as an unindexed, non-truncating,
/// non-compressing masked store.
///
/// Operand 2 of \p N is the data, operand 3 the mask and operand 4 the base
/// address. Floating-point data is bitcast to the equivalent integer type
/// before being stored.
///
/// \returns the masked store, or an empty SDValue for nxv8bf16 when the
///          subtarget lacks BF16 support.
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);

  SDValue Data = N->getOperand(2);
  EVT DataVT = Data.getValueType();
  EVT PtrTy = N->getOperand(4).getValueType();

  if (DataVT == MVT::nxv8bf16 &&
      !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
    return SDValue();

  if (DataVT.isFloatingPoint())
    Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);

  auto *MINode = cast<MemIntrinsicSDNode>(N);
  // The offset operand is undef of the pointer type.
  return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
                            DAG.getUNDEF(PtrTy), MINode->getOperand(3),
                            MINode->getMemoryVT(), MINode->getMemOperand(),
                            ISD::UNINDEXED, false, false);
}
/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
/// load store optimizer pass will merge them to store pair stores. This should
/// be better than a movi to create the vector zero followed by a vector store
/// if the zero constant is not re-used, since one instruction and one register
/// live range will be removed.
///
/// For example, the final generated code should be:
///
///   stp xzr, xzr, [x0]
///
/// instead of:
///
///   movi v0.2d, #0
///   str q0, [x0]
///
/// \returns the last scalar store created, or an empty SDValue when the store
///          does not qualify.
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
  SDValue StVal = St.getValue();
  EVT VT = StVal.getValueType();

  // Avoid scalarizing zero splat stores for scalable vectors.
  if (VT.isScalableVector())
    return SDValue();

  // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
  // 2, 3 or 4 i32 elements.
  int NumVecElts = VT.getVectorNumElements();
  if (!(((NumVecElts == 2 || NumVecElts == 3) &&
         VT.getVectorElementType().getSizeInBits() == 64) ||
        ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
         VT.getVectorElementType().getSizeInBits() == 32)))
    return SDValue();

  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // If the zero constant has more than one use then the vector store could be
  // better since the constant mov will be amortized and stp q instructions
  // should be able to be formed.
  if (!StVal.hasOneUse())
    return SDValue();

  // If the store is truncating then it's going down to i16 or smaller, which
  // means it can be implemented in a single store anyway.
  if (St.isTruncatingStore())
    return SDValue();

  // If the immediate offset of the address operand is too large for the stp
  // instruction, then bail out.
  if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
    if (Offset < -512 || Offset > 504)
      return SDValue();
  }

  // Every element of the BUILD_VECTOR must be an integer or FP zero.
  for (int I = 0; I < NumVecElts; ++I) {
    SDValue EltVal = StVal.getOperand(I);
    if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
      return SDValue();
  }

  // Use a CopyFromReg WZR/XZR here to prevent
  // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
  SDLoc DL(&St);
  unsigned ZeroReg;
  EVT ZeroVT;
  if (VT.getVectorElementType().getSizeInBits() == 32) {
    ZeroReg = AArch64::WZR;
    ZeroVT = MVT::i32;
  } else {
    ZeroReg = AArch64::XZR;
    ZeroVT = MVT::i64;
  }
  SDValue SplatVal =
      DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
/// value. The load store optimizer pass will merge them to store pair stores.
/// This has better performance than a splat of the scalar followed by a split
/// vector store. Even if the stores are not merged it is four stores vs a dup,
/// followed by an ext.b and two stores.
///
/// The splat is recognised as a chain of INSERT_VECTOR_ELT nodes that insert
/// the same value at constant indices covering every lane of a 2- or
/// 4-element, non floating-point vector.
///
/// \returns the last scalar store created, or an empty SDValue when the stored
///          value is not such a splat.
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
  SDValue StVal = St.getValue();
  EVT VT = StVal.getValueType();

  // Don't replace floating point stores, they possibly won't be transformed to
  // stp because of the store pair suppress pass.
  if (VT.isFloatingPoint())
    return SDValue();

  // We can express a splat as store pair(s) for 2 or 4 elements.
  unsigned NumVecElts = VT.getVectorNumElements();
  if (NumVecElts != 4 && NumVecElts != 2)
    return SDValue();

  // If the store is truncating then it's going down to i16 or smaller, which
  // means it can be implemented in a single store anyway.
  if (St.isTruncatingStore())
    return SDValue();

  // Check that this is a splat.
  // Make sure that each of the relevant vector element locations are inserted
  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
  std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
  SDValue SplatVal;
  for (unsigned I = 0; I < NumVecElts; ++I) {
    // Check for insert vector elements.
    if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
      return SDValue();

    // Check that same value is inserted at each vector element.
    if (I == 0)
      SplatVal = StVal.getOperand(1);
    else if (StVal.getOperand(1) != SplatVal)
      return SDValue();

    // Check insert element index.
    ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
    if (!CIndex)
      return SDValue();
    uint64_t IndexVal = CIndex->getZExtValue();
    if (IndexVal >= NumVecElts)
      return SDValue();
    // Record that this lane has been written; without this the coverage check
    // after the loop could never succeed.
    IndexNotInserted.reset(IndexVal);

    // Walk to the vector this insert was applied to.
    StVal = StVal.getOperand(0);
  }
  // Check that all vector element locations were inserted to.
  if (IndexNotInserted.any())
    return SDValue();

  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
/// Try to replace a fixed-length vector store by cheaper stores.
///
/// In order, this:
///  - turns a store of a zero splat into scalar zero-register stores;
///  - on subtargets where misaligned 128-bit stores are slow (and not at
///    minsize), turns an under-aligned 128-bit store either into scalar
///    stores of a splatted value or into two stores of the vector halves.
///
/// \returns the replacement store, or an empty SDValue when the store is
///          volatile, indexed, not a fixed-length vector, or does not qualify.
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                           SelectionDAG &DAG,
                           const AArch64Subtarget *Subtarget) {
  StoreSDNode *S = cast<StoreSDNode>(N);
  if (S->isVolatile() || S->isIndexed())
    return SDValue();

  SDValue StVal = S->getValue();
  EVT VT = StVal.getValueType();

  if (!VT.isFixedLengthVector())
    return SDValue();

  // If we get a splat of zeros, convert this vector store to a store of
  // scalars. They will be merged into store pairs of xzr thereby removing one
  // instruction and one register.
  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
    return ReplacedZeroSplat;

  // FIXME: The logic for deciding if an unaligned store should be split should
  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
  // a call to that function here.

  if (!Subtarget->isMisaligned128StoreSlow())
    return SDValue();

  // Don't split at -Oz.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
  // those up regresses performance on micro-benchmarks and olden/bh.
  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
    return SDValue();

  // Split unaligned 16B stores. They are terrible for performance.
  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
  // extensions can use this to mark that it does not want splitting to happen
  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
  if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
      S->getAlignment() <= 2)
    return SDValue();

  // If we get a splat of a scalar convert this vector store to a store of
  // scalars. They will be merged into store pairs thereby removing two
  // instructions.
  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
    return ReplacedSplat;

  SDLoc DL(S);

  // Split VT into two.
  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  unsigned NumElts = HalfVT.getVectorNumElements();
  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                   DAG.getConstant(0, DL, MVT::i64));
  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                   DAG.getConstant(NumElts, DL, MVT::i64));
  SDValue BasePtr = S->getBasePtr();
  SDValue NewST1 =
      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
                   S->getAlignment(), S->getMemOperand()->getFlags());
  // VT is 128 bits wide here, so the upper half starts 8 bytes in.
  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                  DAG.getConstant(8, DL, MVT::i64));
  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
                      S->getPointerInfo(), S->getAlignment(),
                      S->getMemOperand()->getFlags());
}
/// Target-specific DAG combine function for post-increment LD1 (lane) and
/// post-increment LD1R.
/// N is the node consuming a scalar load (operand 1 when IsLaneOp, operand 0
/// otherwise). When the load address also feeds an ADD, the load, the
/// insert/dup and the ADD are replaced by a single LD1LANEpost / LD1DUPpost
/// node via DCI.CombineTo.
// NOTE(review): many lines appear to have been dropped from this extract
// (closing braces, `continue`/`break` statements after the loop guards, the
// worklist setup, part of the Ops list and the trailing arguments of
// getMemIntrinsicNode). Confirm against the upstream file before editing.
static SDValue performPostLD1Combine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
bool IsLaneOp) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
if (VT.isScalableVector())
return SDValue();
// The load is operand 1 of a lane insert and operand 0 otherwise.
unsigned LoadIdx = IsLaneOp ? 1 : 0;
SDNode *LD = N->getOperand(LoadIdx).getNode();
// If it is not LOAD, can not do such combine.
if (LD->getOpcode() != ISD::LOAD)
return SDValue();
// The vector lane must be a constant in the LD1LANE opcode.
SDValue Lane;
if (IsLaneOp) {
Lane = N->getOperand(2);
auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
EVT MemVT = LoadSDN->getMemoryVT();
// Check if memory operand is the same type as the vector element.
if (MemVT != VT.getVectorElementType())
return SDValue();
// Check if there are other uses. If so, do not combine as it will introduce
// an extra load.
for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
++UI) {
// NOTE(review): the statement guarded by this `if` (presumably a `continue`)
// is missing from the extract.
if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
if (*UI != N)
return SDValue();
SDValue Addr = LD->getOperand(1);
SDValue Vector = N->getOperand(0);
// Search for a use of the address operand that is an increment.
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
Addr.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
// Only ADD users of this particular result of Addr are candidates.
if (User->getOpcode() != ISD::ADD
|| UI.getUse().getResNo() != Addr.getResNo())
// If the increment is a constant, it must match the memory ref size.
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
uint32_t IncVal = CInc->getZExtValue();
unsigned NumBytes = VT.getScalarSizeInBits() / 8;
if (IncVal != NumBytes)
// A matching constant increment is represented by the XZR register.
Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
// To avoid cycle construction make sure that neither the load nor the add
// are predecessors to each other or the Vector.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
SDNode::hasPredecessorHelper(User, Visited, Worklist))
// Operands of the new post-increment node.
SmallVector<SDValue, 8> Ops;
Ops.push_back(LD->getOperand(0)); // Chain
if (IsLaneOp) {
Ops.push_back(Vector); // The vector to be inserted
Ops.push_back(Lane); // The lane to be inserted in the vector
// Results of the new node: loaded vector, written-back address, chain.
EVT Tys[3] = { VT, MVT::i64, MVT::Other };
SDVTList SDTys = DAG.getVTList(Tys);
unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
// Update the uses.
SDValue NewResults[] = {
SDValue(LD, 0), // The result of load
SDValue(UpdN.getNode(), 2) // Chain
DCI.CombineTo(LD, NewResults);
DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
return SDValue();
/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
/// address translation.
/// Only the low 56 bits of the 64-bit address are marked as demanded when
/// calling SimplifyDemandedBits. Returns true when that call reports a
/// simplification, false otherwise.
// NOTE(review): the end of the TLO constructor call, any statement that
// commits the simplification, and the closing braces are missing from this
// extract; confirm against the upstream file.
static bool performTBISimplification(SDValue Addr,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// Bits [55:0] are demanded; the top byte (bits [63:56]) is not.
APInt DemandedMask = APInt::getLowBitsSet(64, 56);
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
return true;
return false;
/// DAG combine for store nodes.
///
/// First tries to split or scalarize the store (see splitStores). Otherwise,
/// on subtargets that ignore the top address byte, tries to simplify the
/// address operand (operand 2) accordingly.
///
/// \returns the replacement store, \p N itself when only its address was
///          simplified in place, or an empty SDValue when nothing changed.
static SDValue performSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   SelectionDAG &DAG,
                                   const AArch64Subtarget *Subtarget) {
  if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
    return Split;

  if (Subtarget->supportsAddressTopByteIgnored() &&
      performTBISimplification(N->getOperand(2), DCI, DAG))
    return SDValue(N, 0);

  return SDValue();
}
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
///
/// Looks for an ISD::ADD user of the intrinsic's address operand and rebuilds
/// the intrinsic as the matching post-indexed AArch64ISD node, whose extra i64
/// result stands in for that ADD.
///
/// \returns an empty SDValue on every visible path; the replacements are
/// carried out through DCI.CombineTo.
///
/// NOTE(review): this listing appears to have lost lines (statements guarded
/// by several `if`s, some `else` branches, the tail of the
/// getMemIntrinsicNode call and the closing braces are not visible) --
/// confirm control flow against the upstream file.
static SDValue performNEONPostLDSTCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
// The address is taken from the last operand of the intrinsic node.
unsigned AddrOpIdx = N->getNumOperands() - 1;
SDValue Addr = N->getOperand(AddrOpIdx);
// Search for a use of the address operand that is an increment.
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
if (User->getOpcode() != ISD::ADD ||
UI.getUse().getResNo() != Addr.getResNo())
// Check that the add is independent of the load/store. Otherwise, folding
// it would create a cycle.
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
SDNode::hasPredecessorHelper(User, Visited, Worklist))
// Find the new opcode for the updating load/store.
bool IsStore = false;
bool IsLaneOp = false;
bool IsDupOp = false;
unsigned NewOpc = 0;
unsigned NumVecs = 0;
// Operand 1 holds the intrinsic ID as a constant.
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
// Map each intrinsic to its post-indexed opcode and record how many vectors
// it transfers and whether it is a store, a lane op or a dup op.
switch (IntNo) {
default: llvm_unreachable("unexpected intrinsic for Neon base update");
case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
NumVecs = 2; break;
case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
NumVecs = 3; break;
case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
NumVecs = 4; break;
case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
NumVecs = 2; IsStore = true; break;
case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
NumVecs = 3; IsStore = true; break;
case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
NumVecs = 4; IsStore = true; break;
case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
NumVecs = 2; break;
case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
NumVecs = 3; break;
case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
NumVecs = 4; break;
case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
NumVecs = 2; IsStore = true; break;
case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
NumVecs = 3; IsStore = true; break;
case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
NumVecs = 4; IsStore = true; break;
case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
NumVecs = 2; IsDupOp = true; break;
case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
NumVecs = 3; IsDupOp = true; break;
case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
NumVecs = 4; IsDupOp = true; break;
case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
NumVecs = 2; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
NumVecs = 3; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
NumVecs = 4; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
NumVecs = 2; IsStore = true; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
NumVecs = 3; IsStore = true; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
NumVecs = 4; IsStore = true; IsLaneOp = true; break;
// Vector type: taken from operand 2 for stores and from result 0 otherwise
// (NOTE(review): the `else` separating the two assignments is not visible).
EVT VecTy;
if (IsStore)
VecTy = N->getOperand(2).getValueType();
VecTy = N->getValueType(0);
// If the increment is a constant, it must match the memory ref size.
// The increment is whichever ADD operand is not the address itself.
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
uint32_t IncVal = CInc->getZExtValue();
unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
// Lane and dup forms access one element per vector, not the whole vector.
if (IsLaneOp || IsDupOp)
NumBytes /= VecTy.getVectorNumElements();
if (IncVal != NumBytes)
Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
SmallVector<SDValue, 8> Ops;
Ops.push_back(N->getOperand(0)); // Incoming chain
// Load lane and store have vector list as input.
if (IsLaneOp || IsStore)
for (unsigned i = 2; i < AddrOpIdx; ++i)
Ops.push_back(Addr); // Base register
// Return Types.
// Layout: NumResultVecs vector results, then the i64 write-back register,
// then the chain.
EVT Tys[6];
unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
unsigned n;
for (n = 0; n < NumResultVecs; ++n)
Tys[n] = VecTy;
Tys[n++] = MVT::i64; // Type of write back register
Tys[n] = MVT::Other; // Type of the chain
SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
// Update the uses.
// N is replaced by the vector results plus the chain (result index
// NumResultVecs + 1); the ADD user is replaced by the write-back register
// (result index NumResultVecs).
std::vector<SDValue> NewResults;
for (unsigned i = 0; i < NumResultVecs; ++i) {
NewResults.push_back(SDValue(UpdN.getNode(), i));
NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
DCI.CombineTo(N, NewResults);
DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
return SDValue();
// Checks to see if the value is the prescribed width and returns information
// about its extension mode.
//
// V       - value to inspect; its opcode selects which check is applied.
// width   - expected width in bits; the visible comparisons accept 8 (with
//           MVT::i8) and 16 (with MVT::i16).
// ExtType - out-parameter; for ISD::LOAD it receives the load's extension
//           type.
//
// NOTE(review): this listing appears to have lost lines (no `default:` label
// precedes the first `return false;`, no assignment to ExtType is visible in
// the AssertSext/AssertZext cases, and closing braces are missing) -- confirm
// against the upstream file.
bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
switch(V.getNode()->getOpcode()) {
return false;
case ISD::LOAD: {
// A load qualifies when its memory type is exactly the requested width.
LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
|| (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
ExtType = LoadNode->getExtensionType();
return true;
return false;
case ISD::AssertSext: {
// Operand 1 of the assert node carries the asserted type.
VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
if ((TypeNode->getVT() == MVT::i8 && width == 8)
|| (TypeNode->getVT() == MVT::i16 && width == 16)) {
return true;
return false;
case ISD::AssertZext: {
// Operand 1 of the assert node carries the asserted type.
VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
if ((TypeNode->getVT() == MVT::i8 && width == 8)
|| (TypeNode->getVT() == MVT::i16 && width == 16)) {
return true;
return false;
case ISD::Constant:
case ISD::TargetConstant: {
// A constant qualifies when its magnitude is below 2^(width - 1).
return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
1LL << (width - 1);
return true;
// This function does a whole lot of voodoo to determine if the tests are
// equivalent without and with a mask. Essentially what happens is that given a
// DAG resembling:
// +-------------+ +-------------+ +-------------+ +-------------+
// | Input | | AddConstant | | CompConstant| | CC |
// +-------------+ +-------------+ +-------------+ +-------------+
// | | | |
// V V | +----------+
// +-------------+ +----+ | |
// | ADD | |0xff| | |
// +-------------+ +----+ | |
// | | | |
// V V | |
// +-------------+ | |
// | AND | | |
// +-------------+ | |
// | | |
// +-----+ | |
// | | |
// V V V
// +-------------+
// | CMP |
// +-------------+
// The AND node may be safely removed for some combinations of inputs. In
// particular we need to take into account the extension type of the Input,
// the exact values of AddConstant, CompConstant, and CC, along with the nominal
// width of the input (this can work for any width inputs, the above graph is
// specific to 8 bits).
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
// problem was simplified by working with 4 bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
// patterns present in both extensions (0,7). For every distinct set of
// AddConstant and CompConstants bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true for
// all 16 distinct bit patterns of the current extension type of Input (w0).
// sub w8, w0, w1
// and w10, w8, #0x0f
// cmp w8, w2
// cset w9, AArch64CC
// cmp w10, w2
// cset w11, AArch64CC
// cmp w9, w11
// cset w0, eq
// ret
// Since the above function shows when the outputs are equivalent it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compiles. The equations below were written
// in a test harness that confirmed they gave equivalent outputs to the above
// function for all inputs, so they can be used to determine if the removal is
// legal instead.
// isEquivalentMaskless() is the code for testing if the AND can be removed,
// factored out of the DAG recognition as the DAG can take several forms.
//
// CC           - AArch64CC condition code consumed by the user of the compare.
// width        - nominal bit width of the masked value.
// ExtType      - extension mode of the input; for ISD::SEXTLOAD the add
//                constant is displaced by half the range so that a single set
//                of equations covers both extension kinds.
// AddConstant  - constant added to the input before masking.
// CompConstant - constant the sum is compared against.
//
// Returns true when the comparison yields the same result with and without
// the mask. Every condition-code group is decided solely by its own
// equations: a group whose equations do not hold yields false and never falls
// through into the equations of the next group.
static bool isEquivalentMaskless(unsigned CC, unsigned width,
                                 ISD::LoadExtType ExtType, int AddConstant,
                                 int CompConstant) {
  // By being careful about our equations and only writing them in terms of
  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
  // make them generally applicable to all bit widths.
  int MaxUInt = (1 << width);

  // For the purposes of these comparisons sign extending the type is
  // equivalent to zero extending the add and displacing it by half the integer
  // width. Provided we are careful and make sure our equations are valid over
  // the whole range we can just adjust the input and avoid writing equations
  // for sign extended inputs.
  if (ExtType == ISD::SEXTLOAD)
    AddConstant -= (1 << (width - 1));

  switch (CC) {
  case AArch64CC::LE:
  case AArch64CC::GT:
    return (AddConstant == 0) ||
           (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
           (AddConstant >= 0 && CompConstant < 0) ||
           (AddConstant <= 0 && CompConstant <= 0 &&
            CompConstant < AddConstant);
  case AArch64CC::LT:
  case AArch64CC::GE:
    return (AddConstant == 0) ||
           (AddConstant >= 0 && CompConstant <= 0) ||
           (AddConstant <= 0 && CompConstant <= 0 &&
            CompConstant <= AddConstant);
  case AArch64CC::HI:
  case AArch64CC::LS:
    return (AddConstant >= 0 && CompConstant < 0) ||
           (AddConstant <= 0 && CompConstant >= -1 &&
            CompConstant < AddConstant + MaxUInt);
  case AArch64CC::PL:
  case AArch64CC::MI:
    return (AddConstant == 0) ||
           (AddConstant > 0 && CompConstant <= 0) ||
           (AddConstant < 0 && CompConstant <= AddConstant);
  case AArch64CC::LO:
  case AArch64CC::HS:
    return (AddConstant >= 0 && CompConstant <= 0) ||
           (AddConstant <= 0 && CompConstant >= 0 &&
            CompConstant <= AddConstant + MaxUInt);
  case AArch64CC::EQ:
  case AArch64CC::NE:
    return (AddConstant > 0 && CompConstant < 0) ||
           (AddConstant < 0 && CompConstant >= 0 &&
            CompConstant < AddConstant + MaxUInt) ||
           (AddConstant >= 0 && CompConstant >= 0 &&
            CompConstant >= AddConstant) ||
           (AddConstant <= 0 && CompConstant < 0 &&
            CompConstant < AddConstant);
  case AArch64CC::VS:
  case AArch64CC::VC:
  case AArch64CC::AL:
  case AArch64CC::NV:
    // These conditions are accepted unconditionally.
    return true;
  case AArch64CC::Invalid:
    break;
  }

  // Invalid or unrecognised condition code: keep the mask.
  return false;
}
// Tries to drop a redundant AND mask feeding the SUBS that produces the
// condition consumed by N.
//
// N        - node consuming a condition code and a flag-producing compare.
// DCI      - combiner context (not referenced in the visible lines).
// DAG      - DAG used to build the replacement compare.
// CCIndex  - operand index on N of the condition-code constant.
// CmpIndex - operand index on N of the compare; only AArch64ISD::SUBS is
//            handled.
//
// Returns SDValue(N, 0) after all uses of the SUBS have been replaced by the
// maskless compare, or an empty SDValue when the pattern does not match.
//
// NOTE(review): this listing appears to have lost lines (the second half of
// the isa<ConstantSDNode> condition, the remaining arguments of the
// isEquivalentMaskless and getVTList calls, and closing braces are not
// visible) -- confirm against the upstream file.
SDValue performCONDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG, unsigned CCIndex,
unsigned CmpIndex) {
unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
unsigned CondOpcode = SubsNode->getOpcode();
if (CondOpcode != AArch64ISD::SUBS)
return SDValue();
// There is a SUBS feeding this condition. Is it fed by a mask we can
// use?
SDNode *AndNode = SubsNode->getOperand(0).getNode();
unsigned MaskBits = 0;
if (AndNode->getOpcode() != ISD::AND)
return SDValue();
// Only the masks 0xff (8 bits) and 0xffff (16 bits) are recognised.
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
uint32_t CNV = CN->getZExtValue();
if (CNV == 255)
MaskBits = 8;
else if (CNV == 65535)
MaskBits = 16;
if (!MaskBits)
return SDValue();
// The masked value must itself be an ADD.
SDValue AddValue = AndNode->getOperand(0);
if (AddValue.getOpcode() != ISD::ADD)
return SDValue();
// The basic dag structure is correct, grab the inputs and validate them.
SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
SDValue SubsInputValue = SubsNode->getOperand(1);
// The mask is present and the provenance of all the values is a smaller type,
// lets see if the mask is superfluous.
if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
return SDValue();
// All three inputs must fit in MaskBits; ExtType is filled in by the checks.
ISD::LoadExtType ExtType;
if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
!checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
!checkValueWidth(AddInputValue1, MaskBits, ExtType) )
return SDValue();
if(!isEquivalentMaskless(CC, MaskBits, ExtType,
return SDValue();
// The AND is not necessary, remove it.
// Rebuild the compare directly on the ADD result and redirect all users of
// the old SUBS to it.
SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
return SDValue(N, 0);
// Optimize compare with zero and branch.
//
// When the branch condition is EQ or NE on the flags of an ADDS/SUBS whose
// other operand is the constant zero, the compare is folded into the branch:
// EQ becomes AArch64ISD::CBZ and NE becomes AArch64ISD::CBNZ on the non-zero
// operand.
//
// Operands of N: 0 chain, 1 destination, 2 condition code, 3 compare.
//
// Always returns an empty SDValue; a successful fold is installed through
// DCI.CombineTo.
static SDValue performBRCONDCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
  // will not be produced, as they are conditional branch instructions that do
  // not set flags.
  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
    return SDValue();

  // Give the mask-removal combine a chance first and continue with its result.
  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
    N = NV.getNode();

  SDValue Chain = N->getOperand(0);
  SDValue Dest = N->getOperand(1);
  SDValue CCVal = N->getOperand(2);
  SDValue Cmp = N->getOperand(3);

  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
  unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
    return SDValue();

  unsigned CmpOpc = Cmp.getOpcode();
  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
    return SDValue();

  // Only attempt folding if there is only one use of the flag and no use of the
  // value.
  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
    return SDValue();

  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);

  assert(LHS.getValueType() == RHS.getValueType() &&
         "Expected the value type to be the same for both operands!");
  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
    return SDValue();

  // Canonicalise so that a zero operand, if any, ends up on the right.
  if (isNullConstant(LHS))
    std::swap(LHS, RHS);

  if (!isNullConstant(RHS))
    return SDValue();

  // Shifted operands are left alone.
  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
      LHS.getOpcode() == ISD::SRL)
    return SDValue();

  // Fold the compare into the branch instruction. The opcode is chosen
  // exactly once so the EQ form can never be overwritten by the NE form.
  const unsigned BrOpc =
      (CC == AArch64CC::EQ) ? AArch64ISD::CBZ : AArch64ISD::CBNZ;
  SDValue BR = DAG.getNode(BrOpc, SDLoc(N), MVT::Other, Chain, LHS, Dest);

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, BR, false);

  return SDValue();
}
// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
// as well as whether the test should be inverted. This code is required to
// catch these cases (as opposed to standard dag combines) because
// AArch64ISD::TBZ is matched during legalization.
//
// Op     - value whose bit is being tested.
// Bit    - in/out: index of the tested bit; updated when looking through
//          shifts.
// Invert - in/out: toggled when looking through an XOR that flips the tested
//          bit.
// DAG    - passed through to the recursive calls.
//
// Returns the value that should be tested instead of Op, or Op itself when no
// simplification applies (including every opcode not listed in the switch).
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
                                 SelectionDAG &DAG) {
  if (!Op->hasOneUse())
    return Op;

  // We don't handle undef/constant-fold cases below, as they should have
  // already been taken care of (e.g. and of 0, test of undefined shifted bits,
  // etc.)

  // (tbz (trunc x), b) -> (tbz x, b)
  // This case is just here to enable more of the below cases to be caught.
  if (Op->getOpcode() == ISD::TRUNCATE &&
      Bit < Op->getValueType(0).getSizeInBits()) {
    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  }

  // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
  if (Op->getOpcode() == ISD::ANY_EXTEND &&
      Bit < Op->getOperand(0).getValueSizeInBits()) {
    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  }

  // Everything below handles binary nodes with a constant second operand.
  if (Op->getNumOperands() != 2)
    return Op;

  auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
  if (!C)
    return Op;

  switch (Op->getOpcode()) {
  default:
    // Unhandled opcode: test the original value.
    return Op;

  // (tbz (and x, m), b) -> (tbz x, b)
  case ISD::AND:
    if ((C->getZExtValue() >> Bit) & 1)
      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
    return Op;

  // (tbz (shl x, c), b) -> (tbz x, b-c)
  case ISD::SHL:
    if (C->getZExtValue() <= Bit &&
        (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
      Bit = Bit - C->getZExtValue();
      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
    }
    return Op;

  // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
  case ISD::SRA:
    Bit = Bit + C->getZExtValue();
    if (Bit >= Op->getValueType(0).getSizeInBits())
      Bit = Op->getValueType(0).getSizeInBits() - 1;
    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);

  // (tbz (srl x, c), b) -> (tbz x, b+c)
  case ISD::SRL:
    if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
      Bit = Bit + C->getZExtValue();
      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
    }
    return Op;

  // (tbz (xor x, -1), b) -> (tbnz x, b)
  case ISD::XOR:
    if ((C->getZExtValue() >> Bit) & 1)
      Invert = !Invert;
    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  }
}
// Optimize test single bit zero/non-zero and branch.
//
// Operands of N: 0 chain, 1 value under test, 2 bit index, 3 destination.
// The tested value is run through getTestBitOperand(); if that yields a
// different value, a new TBZ/TBNZ is built on it, with the opcode swapped
// when the helper asked for the test to be inverted.
//
// Returns the replacement branch, or an empty SDValue when nothing changed.
static SDValue performTBZCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 SelectionDAG &DAG) {
  unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
  bool Invert = false;
  SDValue OldSrc = N->getOperand(1);
  SDValue SimplifiedSrc = getTestBitOperand(OldSrc, Bit, Invert, DAG);

  // Nothing was looked through: leave the node alone.
  if (OldSrc == SimplifiedSrc)
    return SDValue();

  unsigned NewOpc = N->getOpcode();
  if (Invert) {
    // Swap TBZ <-> TBNZ.
    const bool IsTBZ = (NewOpc == AArch64ISD::TBZ);
    if (!IsTBZ)
      assert(NewOpc == AArch64ISD::TBNZ);
    NewOpc = IsTBZ ? AArch64ISD::TBNZ : AArch64ISD::TBZ;
  }

  SDLoc DL(N);
  SDValue BitIdx = DAG.getConstant(Bit, DL, MVT::i64);
  return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), SimplifiedSrc,
                     BitIdx, N->getOperand(3));
}
// vselect (v1i1 setcc) ->
// vselect (v1iXX setcc) (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
//
// Returns the rebuilt VSELECT, or an empty SDValue when the condition is not
// a single-element i1 SETCC or the result and compared types differ in size.
//
// NOTE(review): this listing appears to have lost lines (the final argument
// of the getSetCC call and the closing brace are not visible) -- confirm
// against the upstream file.
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
EVT CCVT = N0.getValueType();
// Only a SETCC producing a one-element vector of i1 is of interest.
if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
CCVT.getVectorElementType() != MVT::i1)
return SDValue();
EVT ResVT = N->getValueType(0);
EVT CmpVT = N0.getOperand(0).getValueType();
// Only combine when the result type is of the same size as the compared
// operands.
if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
return SDValue();
SDValue IfTrue = N->getOperand(1);
SDValue IfFalse = N->getOperand(2);
// Re-issue the compare with an integer result type derived from the compared
// operand type.
SDValue SetCC =
DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
N0.getOperand(0), N0.getOperand(1),
return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
IfTrue, IfFalse);
/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
/// the compare-mask instructions rather than going via NZCV, even if LHS and
/// RHS are really scalar. This replaces any scalar setcc in the above pattern
/// with a vector one followed by a DUP shuffle on the result.
///
/// \returns the new select, or an empty SDValue when the condition is not a
/// SETCC, the compared type is i1, the result is not a vector, or the derived
/// compare type does not match the result size.
///
/// NOTE(review): this listing appears to have lost lines (the second half of
/// the legality assert and the closing brace are not visible) -- confirm
/// against the upstream file.
static SDValue performSelectCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
SDValue N0 = N->getOperand(0);
EVT ResVT = N->getValueType(0);
if (N0.getOpcode() != ISD::SETCC)
return SDValue();
// Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
// scalar SetCCResultType. We also don't expect vectors, because we assume
// that selects fed by vector SETCCs are canonicalized to VSELECT.
assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
"Scalar-SETCC feeding SELECT has unexpected result type!");
// If NumMaskElts == 0, the comparison is larger than select result. The
// largest real NEON comparison is 64-bits per lane, which means the result is
// at most 32-bits and an illegal vector. Just bail out for now.
EVT SrcVT = N0.getOperand(0).getValueType();
// Don't try to do this optimization when the setcc itself has i1 operands.
// There are no legal vectors of i1, so this would be pointless.
if (SrcVT == MVT::i1)
return SDValue();
int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
if (!ResVT.isVector() || NumMaskElts == 0)
return SDValue();
// Widen the scalar compared type to a vector with NumMaskElts lanes and
// derive the integer mask type from it.
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
// Also bail out if the vector CCVT isn't the same size as ResVT.
// This can happen if the SETCC operand size doesn't divide the ResVT size
// (e.g., f64 vs v3f32).
if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
return SDValue();
// Make sure we didn't create illegal types, if we're not supposed to.
assert(DCI.isBeforeLegalize() ||
// First perform a vector comparison, where lane 0 is the one we're interested
// in.
SDLoc DL(N0);
SDValue LHS =
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
SDValue RHS =
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
// Now duplicate the comparison mask we want across all other lanes.
// An all-zero shuffle mask selects lane 0 for every output lane.
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
// Reinterpret the mask with the integer form of the result type.
Mask = DAG.getNode(ISD::BITCAST, DL,
ResVT.changeVectorElementTypeToInteger(), Mask);
return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
/// Get rid of unnecessary NVCASTs (that don't change the type).
///
/// \returns the cast's operand when its type already equals the result type,
/// otherwise an empty SDValue.
static SDValue performNVCASTCombine(SDNode *N) {
  SDValue Src = N->getOperand(0);
  const bool IsNoOpCast = N->getValueType(0) == Src.getValueType();
  return IsNoOpCast ? Src : SDValue();
}
// If all users of the globaladdr are of the form (globaladdr + constant), find
// the smallest constant, fold it into the globaladdr's offset and rewrite the
// globaladdr as (globaladdr + constant) - constant.
//
// Returns the rewritten (SUB (globaladdr + MinOffset), MinOffset) value, or
// an empty SDValue when any precondition below fails.
//
// NOTE(review): this listing appears to have lost lines (the right-hand side
// of the ClassifyGlobalReference comparison, the definition of DL and the
// closing braces are not visible) -- confirm against the upstream file.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget,
const TargetMachine &TM) {
auto *GN = cast<GlobalAddressSDNode>(N);
if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
return SDValue();
// Start from the largest uint64_t so the first constant seen becomes the
// running minimum.
uint64_t MinOffset = -1ull;
for (SDNode *N : GN->uses()) {
// Every user must be an ADD with a constant on one side.
if (N->getOpcode() != ISD::ADD)
return SDValue();
auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
if (!C)
C = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!C)
return SDValue();
MinOffset = std::min(MinOffset, C->getZExtValue());
uint64_t Offset = MinOffset + GN->getOffset();
// Require that the new offset is larger than the existing one. Otherwise, we
// can end up oscillating between two possible DAGs, for example,
// (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
if (Offset <= uint64_t(GN->getOffset()))
return SDValue();
// Check whether folding this offset is legal. It must not go out of bounds of
// the referenced object to avoid violating the code model, and must be
// smaller than 2^21 because this is the largest offset expressible in all
// object formats.
// This check also prevents us from folding negative offsets, which will end
// up being treated in the same way as large positive ones. They could also
// cause code model violations, and aren't really common enough to matter.
if (Offset >= (1 << 21))
return SDValue();
const GlobalValue *GV = GN->getGlobal();
Type *T = GV->getValueType();
// The offset must stay within the allocation size of the global's type.
if (!T->isSized() ||
Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
return SDValue();
SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
DAG.getConstant(MinOffset, DL, MVT::i64));
// Turns the vector of indices into a vector of byte offsets by scaling Offset
// by (BitWidth / 8).
//
// The scaling is done as a left shift by log2(BitWidth / 8), splatted across
// an nxv2i64 vector. Offset must be a scalable vector (asserted).
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
                                          SDLoc DL, unsigned BitWidth) {
  assert(Offset.getValueType().isScalableVector() &&
         "This method is only for scalable vectors of offsets");

  const unsigned BytesPerElt = BitWidth / 8;
  SDValue ShiftAmt = DAG.getConstant(Log2_32(BytesPerElt), DL, MVT::i64);
  SDValue ShiftVec =
      DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, ShiftAmt);
  return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, ShiftVec);
}
/// Check if the value of \p OffsetInBytes can be used as an immediate for
/// the gather load/prefetch and scatter store instructions with vector base and
/// immediate offset addressing mode:
///     [<Zn>.[S|D]{, #<imm>}]
/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
///
/// \param OffsetInBytes byte offset to validate. Taken as a 64-bit value so
///        that a wide constant is judged on its full value rather than on its
///        low 32 bits.
/// \param ScalarSizeInBytes size in bytes of one accessed element.
/// \returns true if the offset is a multiple of the element size and the
///          multiplier is in [0, 31]; false otherwise, including when
///          \p ScalarSizeInBytes is zero.
inline static bool isValidImmForSVEVecImmAddrMode(uint64_t OffsetInBytes,
                                                  unsigned ScalarSizeInBytes) {
  // A zero element size cannot scale an immediate (and would divide by zero).
  if (ScalarSizeInBytes == 0)
    return false;

  // The immediate is not a multiple of the scalar size.
  if (OffsetInBytes % ScalarSizeInBytes != 0)
    return false;

  // The immediate is in range only for multipliers 0..31.
  return OffsetInBytes / ScalarSizeInBytes <= 31;
}
/// Check if the value of \p Offset represents a valid immediate for the SVE
/// gather load/prefetch and scatter store instructions with vector base and
/// immediate offset addressing mode:
///     [<Zn>.[S|D]{, #<imm>}]
/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
///
/// \returns false when \p Offset is not a ConstantSDNode; otherwise the
/// result of the integer overload applied to the constant's zero-extended
/// value.
static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
                                           unsigned ScalarSizeInBytes) {
  // Only a constant node can be encoded as an immediate.
  ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(Offset.getNode());
  if (!ConstOffset)
    return false;

  return isValidImmForSVEVecImmAddrMode(ConstOffset->getZExtValue(),
                                        ScalarSizeInBytes);
}
/// Rewrites an SVE scatter-store node \p N as a node with opcode \p Opcode
/// (which may be adjusted below depending on the addressing mode).
///
/// Operands read from \p N: 0 chain, 2 source data, 3 predicate, 4 base,
/// 5 offset.
///
/// \param OnlyPackedOffsets when false, an nxv2i32 offset vector is extended
///        to nxv2i64 before use.
/// \returns the new store node, or an empty SDValue when the source does not
///          fit an SVE register, an unsupported FP type is used, or the base
///          or offset type is not legal.
///
/// NOTE(review): this listing appears to have lost lines (`else` branches of
/// the opcode and SrcNew selection, the tail of the Ops initializer and the
/// closing braces are not visible) -- confirm against the upstream file.
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
unsigned Opcode,
bool OnlyPackedOffsets = true) {
const SDValue Src = N->getOperand(2);
const EVT SrcVT = Src->getValueType(0);
assert(SrcVT.isScalableVector() &&
"Scatter stores are only possible for SVE vectors");
SDLoc DL(N);
MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
// Make sure that source data will fit into an SVE register
if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
return SDValue();
// For FPs, ACLE only supports _packed_ single and double precision types.
if (SrcElVT.isFloatingPoint())
if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
return SDValue();
// Depending on the addressing mode, this is either a pointer or a vector of
// pointers (that fits into one register)
SDValue Base = N->getOperand(4);
// Depending on the addressing mode, this is either a single offset or a
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(5);
// For "scalar + vector of indices", just scale the indices. This only
// applies to non-temporal scatters because there's no instruction that takes
// indices.
if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
Offset =
getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
Opcode = AArch64ISD::SSTNT1_PRED;
// In the case of non-temporal scatter stores there's only one SVE instruction
// per data-size: "scalar + vector", i.e.
// * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
std::swap(Base, Offset);
// SST1_IMM requires that the offset is an immediate that is:
// * a multiple of #SizeInBytes,
// * in the range [0, 31 x #SizeInBytes],
// where #SizeInBytes is the size in bytes of the stored items. For
// immediates outside that range and non-immediate scalar offsets use SST1 or
// SST1_UXTW instead.
if (Opcode == AArch64ISD::SST1_IMM_PRED) {
if (!isValidImmForSVEVecImmAddrMode(Offset,
SrcVT.getScalarSizeInBits() / 8)) {
if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
Opcode = AArch64ISD::SST1_UXTW_PRED;
Opcode = AArch64ISD::SST1_PRED;
std::swap(Base, Offset);
auto &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(Base.getValueType()))
return SDValue();
// Some scatter store variants allow unpacked offsets, but only as nxv2i32
// vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
// nxv2i64. Legalize accordingly.
if (!OnlyPackedOffsets &&
Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
if (!TLI.isTypeLegal(Offset.getValueType()))
return SDValue();
// Source value type that is representable in hardware
EVT HwSrcVt = getSVEContainerType(SrcVT);
// Keep the original type of the input data to store - this is needed to be
// able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
// FP values we want the integer equivalent, so just use HwSrcVt.
SDValue InputVT = DAG.getValueType(SrcVT);
if (SrcVT.isFloatingPoint())
InputVT = DAG.getValueType(HwSrcVt);
// The store produces only a chain.
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue SrcNew;
// Bring the source data into the hardware container type: bitcast for FP,
// any-extend otherwise.
if (Src.getValueType().isFloatingPoint())
SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
SDValue Ops[] = {N->getOperand(0), // Chain
N->getOperand(3), // Pg
return DAG.getNode(Opcode, DL, VTs, Ops);
/// Rewrites an SVE gather-load node \p N as a node with opcode \p Opcode
/// (which may be adjusted below depending on the addressing mode).
///
/// Operands read from \p N: 0 chain, 2 predicate, 3 base, 4 offset.
///
/// \param OnlyPackedOffsets when false, an nxv2i32 offset vector is extended
///        to nxv2i64 before use.
/// \returns a merge of the loaded value (truncated or bitcast back to the
///          original result type where needed) and the load's chain, or an
///          empty SDValue when the result does not fit an SVE register or the
///          base type is not legal.
///
/// NOTE(review): this listing appears to have lost lines (the tails of the
/// getScaledOffsetForBitWidth call, of the GLDNT1 condition and of both
/// opcode-selection expressions, plus closing braces, are not visible) --
/// confirm against the upstream file.
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
unsigned Opcode,
bool OnlyPackedOffsets = true) {
const EVT RetVT = N->getValueType(0);
assert(RetVT.isScalableVector() &&
"Gather loads are only possible for SVE vectors");
SDLoc DL(N);
// Make sure that the loaded data will fit into an SVE register
if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
return SDValue();
// Depending on the addressing mode, this is either a pointer or a vector of
// pointers (that fits into one register)
SDValue Base = N->getOperand(3);
// Depending on the addressing mode, this is either a single offset or a
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(4);
// For "scalar + vector of indices", just scale the indices. This only
// applies to non-temporal gathers because there's no instruction that takes
// indices.
if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
// In the case of non-temporal gather loads there's only one SVE instruction
// per data-size: "scalar + vector", i.e.
// * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
std::swap(Base, Offset);
// GLD{FF}1_IMM requires that the offset is an immediate that is:
// * a multiple of #SizeInBytes,
// * in the range [0, 31 x #SizeInBytes],
// where #SizeInBytes is the size in bytes of the loaded items. For
// immediates outside that range and non-immediate scalar offsets use
// a different opcode, selected below.
if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
if (!isValidImmForSVEVecImmAddrMode(Offset,
RetVT.getScalarSizeInBits() / 8)) {
if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
std::swap(Base, Offset);
auto &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(Base.getValueType()))
return SDValue();
// Some gather load variants allow unpacked offsets, but only as nxv2i32
// vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
// nxv2i64. Legalize accordingly.
if (!OnlyPackedOffsets &&
Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
// Return value type that is representable in hardware
EVT HwRetVt = getSVEContainerType(RetVT);
// Keep the original output value type around - this is needed to be able to
// select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
// values we want the integer equivalent, so just use HwRetVT.
SDValue OutVT = DAG.getValueType(RetVT);
if (RetVT.isFloatingPoint())
OutVT = DAG.getValueType(HwRetVt);
// The new node produces the loaded value in the container type plus a chain.
SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
SDValue Ops[] = {N->getOperand(0), // Chain
N->getOperand(2), // Pg
Base, Offset, OutVT};
SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
// Capture the chain now: Load is reassigned to a converted value below.
SDValue LoadChain = SDValue(Load.getNode(), 1);
// Integer results narrower than the container are truncated back.
if (RetVT.isInteger() && (RetVT != HwRetVt))
Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
// If the original return value was FP, bitcast accordingly. Doing it here
// means that we can avoid adding TableGen patterns for FPs.
if (RetVT.isFloatingPoint())
Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
return DAG.getMergeValues({Load, LoadChain}, DL);
/// Combines a SIGN_EXTEND_INREG node \p N with its source operand.
///
/// Two patterns are visible here:
///  * the source is an unsigned unpack (UUNPKHI/UUNPKLO): the sign extension
///    is pushed onto the unpack's operand and a node with opcode SOpc is
///    built on top of it;
///  * the source is one of the load nodes selected by the switch below: a new
///    node with opcode NewOpc is built from the load's operands and replaces
///    both \p N and the load.
///
/// \returns the new unpack node, SDValue(N, 0) after a load was replaced, or
/// an empty SDValue when the combine does not apply (including before
/// operation legalization).
///
/// NOTE(review): this listing appears to have lost lines (the second half of
/// the SOpc selection, the element-type argument of getVectorVT, most of the
/// switch cases with their NewOpc assignments, the body of the operand-copy
/// loop and closing braces are not visible) -- confirm against the upstream
/// file.
static SDValue
performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDLoc DL(N);
SDValue Src = N->getOperand(0);
unsigned Opc = Src->getOpcode();
// Sign extend of an unsigned unpack -> signed unpack
if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
// Push the sign extend to the operand of the unpack
// This is necessary where, for example, the operand of the unpack
// is another unpack:
// 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
// ->
// 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
// ->
// 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
SDValue ExtOp = Src->getOperand(0);
auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
EVT EltTy = VT.getVectorElementType();
assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
"Sign extending from an invalid type");
// The unpack's operand has twice as many elements as its result.
EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
VT.getVectorElementCount() * 2);
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
ExtOp, DAG.getValueType(ExtVT));
return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
// SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
// for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
unsigned NewOpc;
// Index of the operand holding the memory VT; 4 unless a case overrides it.
unsigned MemVTOpNum = 4;
switch (Opc) {
case AArch64ISD::LD1_MERGE_ZERO:
MemVTOpNum = 3;
MemVTOpNum = 3;
MemVTOpNum = 3;
return SDValue();
// Only fold when the extension source type equals the load's memory type and
// the load has no other user.
EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
return SDValue();
EVT DstVT = N->getValueType(0);
SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
SmallVector<SDValue, 5> Ops;
for (unsigned I = 0; I < Src->getNumOperands(); ++I)
SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
// Replace the extension with the new load's value, and the old load with the
// new load's value and chain.
DCI.CombineTo(N, ExtLoad);
DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
// Return N so it doesn't get rechecked
return SDValue(N, 0);
/// Legalize the gather prefetch (scalar + vector addressing mode) when the
/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
/// != nxv2i32) do not need legalization.
///
/// Returns an empty SDValue when operand 4 (the offset) is not nxv2i32;
/// otherwise returns a new node with the same opcode and operands, except that
/// the offset has been any-extended to nxv2i64.
/// NOTE(review): the closing brace is missing from this listing.
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
const unsigned OffsetPos = 4;
SDValue Offset = N->getOperand(OffsetPos);
// Not an unpacked vector, bail out.
if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
return SDValue();
// Extend the unpacked offset vector to 64-bit lanes.
SDLoc DL(N);
Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
// Replace the offset operand with the 64-bit one.
Ops[OffsetPos] = Offset;
return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
/// Combines a node carrying the intrinsic
/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
/// sve gather prefetch instruction with vector plus immediate addressing mode.
///
/// \p ScalarSizeInBytes is forwarded to isValidImmForSVEVecImmAddrMode together
/// with operand 4. Returns an empty SDValue when that check succeeds (nothing
/// to do).
/// NOTE(review): the tail of the DAG.getConstant(...) call and the closing
/// brace are missing from this listing.
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
unsigned ScalarSizeInBytes) {
const unsigned ImmPos = 4, OffsetPos = 3;
// No need to combine the node if the immediate is valid...
if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
return SDValue();
// ...otherwise swap the offset base with the offset...
SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
std::swap(Ops[ImmPos], Ops[OffsetPos]);
// ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
// `aarch64_sve_prfb_gather_uxtw_index`.
SDLoc DL(N);
// Operand 1 holds the intrinsic ID.
Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
// Target DAG-combine entry point: dispatches on N's opcode to the matching
// perform*Combine helper. For intrinsic nodes it dispatches a second time on
// the intrinsic ID, which is read from operand 1. The final
// `return SDValue();` is the "no combine" result.
// NOTE(review): this listing has visibly lost many lines (the `default:` and
// several `case` labels, `break;` statements, closing braces and the tails of
// multi-line calls), so several `return` statements below appear without their
// label. Confirm against the upstream file before changing any code.
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
case ISD::ADD:
case ISD::SUB:
return performAddSubLongCombine(N, DCI, DAG);
case ISD::XOR:
return performXorCombine(N, DAG, DCI, Subtarget);
case ISD::MUL:
return performMulCombine(N, DAG, DCI, Subtarget);
return performIntToFpCombine(N, DAG, Subtarget);
return performFpToIntCombine(N, DAG, DCI, Subtarget);
case ISD::FDIV:
return performFDivCombine(N, DAG, DCI, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
case ISD::AND:
return performANDCombine(N, DCI);
case ISD::SRL:
return performSRLCombine(N, DCI);
return performIntrinsicCombine(N, DCI, Subtarget);
return performExtendCombine(N, DCI, DAG);
return performSignExtendInRegCombine(N, DCI, DAG);
return performConcatVectorsCombine(N, DCI, DAG);
return performSelectCombine(N, DCI);
return performVSelectCombine(N, DCI.DAG);
case ISD::LOAD:
if (performTBISimplification(N->getOperand(1), DCI, DAG))
return SDValue(N, 0);
case ISD::STORE:
return performSTORECombine(N, DCI, DAG, Subtarget);
case AArch64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);
case AArch64ISD::TBNZ:
case AArch64ISD::TBZ:
return performTBZCombine(N, DCI, DAG);
case AArch64ISD::CSEL:
return performCONDCombine(N, DCI, DAG, 2, 3);
case AArch64ISD::DUP:
return performPostLD1Combine(N, DCI, false);
case AArch64ISD::NVCAST:
return performNVCASTCombine(N);
return performPostLD1Combine(N, DCI, true);
// Intrinsic nodes: operand 1 is the intrinsic ID.
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
return legalizeSVEGatherPrefetchOffsVec(N, DAG);
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
case Intrinsic::aarch64_neon_ld1x2:
case Intrinsic::aarch64_neon_ld1x3:
case Intrinsic::aarch64_neon_ld1x4:
case Intrinsic::aarch64_neon_ld2lane:
case Intrinsic::aarch64_neon_ld3lane:
case Intrinsic::aarch64_neon_ld4lane:
case Intrinsic::aarch64_neon_ld2r:
case Intrinsic::aarch64_neon_ld3r:
case Intrinsic::aarch64_neon_ld4r:
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4:
case Intrinsic::aarch64_neon_st1x2:
case Intrinsic::aarch64_neon_st1x3:
case Intrinsic::aarch64_neon_st1x4:
case Intrinsic::aarch64_neon_st2lane:
case Intrinsic::aarch64_neon_st3lane:
case Intrinsic::aarch64_neon_st4lane:
return performNEONPostLDSTCombine(N, DCI, DAG);
case Intrinsic::aarch64_sve_ldnt1:
return performLDNT1Combine(N, DAG);
case Intrinsic::aarch64_sve_ld1rq:
return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
case Intrinsic::aarch64_sve_ld1ro:
return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldnt1_gather:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldnt1_gather_index:
return performGatherLoadCombine(N, DAG,
case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1:
return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldnf1:
return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldff1:
return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
case Intrinsic::aarch64_sve_st1:
return performST1Combine(N, DAG);
case Intrinsic::aarch64_sve_stnt1:
return performSTNT1Combine(N, DAG);
case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
case Intrinsic::aarch64_sve_stnt1_scatter:
return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
case Intrinsic::aarch64_sve_stnt1_scatter_index:
return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
case Intrinsic::aarch64_sve_ld1_gather:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_index:
return performGatherLoadCombine(N, DAG,
case Intrinsic::aarch64_sve_ld1_gather_sxtw:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
case Intrinsic::aarch64_sve_ld1_gather_uxtw:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
return performGatherLoadCombine(N, DAG,
case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
return performGatherLoadCombine(N, DAG,
case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldff1_gather:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ldff1_gather_index:
return performGatherLoadCombine(N, DAG,
case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
return performGatherLoadCombine(N, DAG,
case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
return performGatherLoadCombine(N, DAG,
case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
return performGatherLoadCombine(N, DAG,
case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
return performGatherLoadCombine(N, DAG,
case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
return performGatherLoadCombine(N, DAG,
case Intrinsic::aarch64_sve_st1_scatter:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
case Intrinsic::aarch64_sve_st1_scatter_index:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
case Intrinsic::aarch64_sve_st1_scatter_sxtw:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
case Intrinsic::aarch64_sve_st1_scatter_uxtw:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
return performScatterStoreCombine(N, DAG,
case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
return performScatterStoreCombine(N, DAG,
case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
case Intrinsic::aarch64_sve_tuple_get: {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Src1 = N->getOperand(2);
SDValue Idx = N->getOperand(3);
uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
EVT ResVT = N->getValueType(0);
uint64_t NumLanes = ResVT.getVectorElementCount().Min;
// The constant below (tuple index * minimum lane count of the result type)
// is the start lane of the requested element.
SDValue Val =
DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32));
return DAG.getMergeValues({Val, Chain}, DL);
case Intrinsic::aarch64_sve_tuple_set: {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Tuple = N->getOperand(2);
SDValue Idx = N->getOperand(3);
SDValue Vec = N->getOperand(4);
EVT TupleVT = Tuple.getValueType();
uint64_t TupleLanes = TupleVT.getVectorElementCount().Min;
uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min;
// The tuple must consist of a whole number of vectors of Vec's size.
if ((TupleLanes % NumLanes) != 0)
report_fatal_error("invalid tuple vector!");
uint64_t NumVecs = TupleLanes / NumLanes;
// Rebuild the tuple vector by vector; the non-selected vectors are extracted
// from the old tuple.
SmallVector<SDValue, 4> Opnds;
for (unsigned I = 0; I < NumVecs; ++I) {
if (I == IdxConst)
else {
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Vec.getValueType(), Tuple,
DAG.getConstant(I * NumLanes, DL, MVT::i32)));
SDValue Concat =
DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds);
return DAG.getMergeValues({Concat, Chain}, DL);
case Intrinsic::aarch64_sve_tuple_create2:
case Intrinsic::aarch64_sve_tuple_create3:
case Intrinsic::aarch64_sve_tuple_create4: {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SmallVector<SDValue, 4> Opnds;
// Operands 2.. are the vectors to concatenate.
for (unsigned I = 2; I < N->getNumOperands(); ++I)
EVT VT = Opnds[0].getValueType();
EVT EltVT = VT.getVectorElementType();
// The result has the same element type, with the element count multiplied
// by the number of input vectors.
EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
VT.getVectorElementCount() *
(N->getNumOperands() - 2));
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
return DAG.getMergeValues({Concat, Chain}, DL);
case Intrinsic::aarch64_sve_ld2:
case Intrinsic::aarch64_sve_ld3:
case Intrinsic::aarch64_sve_ld4: {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
SDValue Mask = N->getOperand(2);
SDValue BasePtr = N->getOperand(3);
SDValue LoadOps[] = {Chain, Mask, BasePtr};
unsigned IntrinsicID =
SDValue Result =
LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
return DAG.getMergeValues({Result, Chain}, DL);
case ISD::GlobalAddress:
return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
return SDValue();
// Check if the return value is used as only a return value, as otherwise
// we can't perform a tail-call. In particular, we need to check for
// target ISD nodes that are returns and any other "odd" constructs
// that the generic analysis code won't necessarily catch.
//
// On success, Chain is updated to the chain to use for the tail call (the
// CopyToReg's input chain when the single user is a CopyToReg).
// NOTE(review): the right-hand side of the glue comparison and several
// closing braces are missing from this listing.
bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
SDValue &Chain) const {
// Require a single result with exactly one use.
if (N->getNumValues() != 1)
return false;
if (!N->hasNUsesOfValue(1, 0))
return false;
SDValue TCChain = Chain;
SDNode *Copy = *N->use_begin();
if (Copy->getOpcode() == ISD::CopyToReg) {
// If the copy has a glue operand, we conservatively assume it isn't safe to
// perform a tail call.
if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
return false;
TCChain = Copy->getOperand(0);
} else if (Copy->getOpcode() != ISD::FP_EXTEND)
return false;
// Every user of the copy must be a RET_FLAG node, and there must be at least
// one such user.
bool HasRet = false;
for (SDNode *Node : Copy->uses()) {
if (Node->getOpcode() != AArch64ISD::RET_FLAG)
return false;
HasRet = true;
if (!HasRet)
return false;
Chain = TCChain;
return true;
// Return whether an instruction can potentially be optimized to a tail
// call. This will cause the optimizers to attempt to move, or duplicate,
// return instructions to help enable tail call optimizations for this
// instruction.
// The answer is simply whether the call instruction is marked as a tail call.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
return CI->isTailCall();
// Shared helper for the pre-/post-indexed address hooks. Accepts only an
// ADD or SUB node whose second operand is a constant that fits a signed 9-bit
// immediate (after negation for SUB). On success sets Base to operand 0,
// Offset to operand 1, IsInc to true for ADD, and returns true.
// NOTE(review): AM is not assigned in the visible lines, and closing braces
// are missing from this listing.
bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
bool &IsInc,
SelectionDAG &DAG) const {
if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
return false;
Base = Op->getOperand(0);
// All of the indexed addressing mode instructions take a signed
// 9 bit immediate offset.
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
int64_t RHSC = RHS->getSExtValue();
if (Op->getOpcode() == ISD::SUB)
// Negate through uint64_t -- presumably to avoid signed-overflow UB for
// the most negative value; confirm.
RHSC = -(uint64_t)RHSC;
if (!isInt<9>(RHSC))
return false;
IsInc = (Op->getOpcode() == ISD::ADD);
Offset = Op->getOperand(1);
return true;
return false;
// Pre-indexed addressing hook: for a load or store node N, decomposes its base
// pointer with getIndexedAddressParts and reports whether that succeeded.
// Any other node kind is rejected.
// NOTE(review): the declaration of VT, any assignment to AM, and closing
// braces are missing from this listing.
bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const {
SDValue Ptr;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
} else
return false;
bool IsInc;
if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
return false;
return true;
// Post-indexed addressing hook: N is the load/store and Op is the candidate
// address-update node. Succeeds only if Op decomposes via
// getIndexedAddressParts and its base is exactly the memory node's pointer.
// NOTE(review): the declaration of VT, any assignment to AM, and closing
// braces are missing from this listing.
bool AArch64TargetLowering::getPostIndexedAddressParts(
SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
SDValue Ptr;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
} else
return false;
bool IsInc;
if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
return false;
// Post-indexing updates the base, so it's not a valid transform
// if that's not the same as the load's pointer.
if (Ptr != Base)
return false;
return true;
// Custom result replacement for a bitcast whose result is i16 and whose
// operand is f16 or bf16: the operand is inserted into the hsub subregister of
// an undefined wider value, bitcast to i32, and truncated to i16, which is
// pushed onto Results.
// NOTE(review): the statement guarded by the type check, the tail of the
// SDValue(...) expression and the closing brace are missing from this listing.
static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Op = N->getOperand(0);
if (N->getValueType(0) != MVT::i16 ||
(Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16))
Op = SDValue(
DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
DAG.getUNDEF(MVT::i32), Op,
DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
// Custom result replacement for an across-vector reduction: splits operand 0
// into low and high halves, combines the halves element-wise with InterOp, and
// then applies AcrossOp to that half-width value.
// NOTE(review): the declaration of LoVT/HiVT, the statement that stores
// SplitVal into Results, and the closing brace are missing from this listing.
static void ReplaceReductionResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG, unsigned InterOp,
unsigned AcrossOp) {
SDValue Lo, Hi;
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
// Splits a 128-bit integer value N into two i64 values and returns them as
// (low half, high half): the low half is a truncate of N, the high half is a
// truncate of N logically shifted right by 64.
// NOTE(review): the closing brace is missing from this listing.
static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
DAG.getNode(ISD::SRL, DL, MVT::i128, N,
DAG.getConstant(64, DL, MVT::i64)));
return std::make_pair(Lo, Hi);
// Custom result replacement for extracting the low or high half of a scalable
// integer vector: emits UUNPKLO (index 0) or UUNPKHI (index == result's
// minimum element count) to produce a half with widened elements, then
// truncates it back to the result type and pushes that onto Results.
// NOTE(review): the statements guarded by the four bail-out checks below
// (presumably bare returns) and the closing brace are missing from this
// listing.
void AArch64TargetLowering::ReplaceExtractSubVectorResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
// Common code will handle these just fine.
if (!InVT.isScalableVector() || !InVT.isInteger())
SDLoc DL(N);
EVT VT = N->getValueType(0);
// The following checks bail if this is not a halving operation.
ElementCount ResEC = VT.getVectorElementCount();
if (InVT.getVectorElementCount().Min != (ResEC.Min * 2))
auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!CIndex)
unsigned Index = CIndex->getZExtValue();
if ((Index != 0) && (Index != ResEC.Min))
unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
// Create an even/odd pair of X registers holding integer value V.
// V is split into a low i64 (any-extend-or-truncate of V) and a high i64
// (V >> 64), the halves are swapped on big-endian targets, and the result is a
// REG_SEQUENCE in the XSeqPairsClass register class with the halves placed in
// the sube64 and subo64 subregisters.
// NOTE(review): the closing brace is missing from this listing.
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
SDLoc dl(V.getNode());
SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
SDValue VHi = DAG.getAnyExtOrTrunc(
DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
dl, MVT::i64);
if (DAG.getDataLayout().isBigEndian())
std::swap (VLo, VHi);
SDValue RegClass =
DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
return SDValue(
DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
// Custom result replacement for a 128-bit atomic compare-and-swap.
//  * With LSE: builds a CASP* machine node (variant chosen from the memory
//    ordering) whose compare and store values are packed into X register
//    pairs, then extracts the two 64-bit halves and rebuilds an i128.
//  * Without LSE: emits the CMP_SWAP_128 pseudo on the split 64-bit halves.
// In both cases the i128 value and the output chain are pushed onto Results.
// NOTE(review): this listing has visibly lost lines (the end of the Ops
// initializer, `break;`/`default:` in the ordering switch, the opening of the
// first Results.push_back call, the early return closing the LSE path, and
// closing braces).
static void ReplaceCMP_SWAP_128Results(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
assert(N->getValueType(0) == MVT::i128 &&
"AtomicCmpSwap on types less than 128 should be legal");
if (Subtarget->hasLSE()) {
// LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
// so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
SDValue Ops[] = {
createGPRPairNode(DAG, N->getOperand(2)), // Compare value
createGPRPairNode(DAG, N->getOperand(3)), // Store value
N->getOperand(1), // Ptr
N->getOperand(0), // Chain in
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
// Pick the CASP variant matching the atomic ordering.
unsigned Opcode;
switch (MemOp->getOrdering()) {
case AtomicOrdering::Monotonic:
Opcode = AArch64::CASPX;
case AtomicOrdering::Acquire:
Opcode = AArch64::CASPAX;
case AtomicOrdering::Release:
Opcode = AArch64::CASPLX;
case AtomicOrdering::AcquireRelease:
case AtomicOrdering::SequentiallyConsistent:
Opcode = AArch64::CASPALX;
llvm_unreachable("Unexpected ordering!");
MachineSDNode *CmpSwap = DAG.getMachineNode(
Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
DAG.setNodeMemRefs(CmpSwap, {MemOp});
// The subregister holding the low half depends on endianness.
unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
if (DAG.getDataLayout().isBigEndian())
std::swap(SubReg1, SubReg2);
SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
SDValue(CmpSwap, 0));
SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
SDValue(CmpSwap, 0));
DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
Results.push_back(SDValue(CmpSwap, 1)); // Chain out
// Non-LSE path: operate on the split 64-bit halves via the pseudo.
auto Desired = splitInt128(N->getOperand(2), DAG);
auto New = splitInt128(N->getOperand(3), DAG);
SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
New.first, New.second, N->getOperand(0)};
SDNode *CmpSwap = DAG.getMachineNode(
AArch64::CMP_SWAP_128, SDLoc(N),
DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
// Results 0 and 1 of the pseudo are the two i64 halves; result 3 is the
// chain.
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
Results.push_back(SDValue(CmpSwap, 3));
// Custom type-legalization hook: dispatches on N's opcode and pushes the
// replacement value(s) for N onto Results. Leaving Results empty hands the
// node back to the generic code (see the comment in the conversion case).
// NOTE(review): this listing has visibly lost many lines (`default:` and
// several `case` labels, `return;` statements, the tail of the
// getMemIntrinsicNode call, closing braces), so several statements below
// appear without their label. Confirm against the upstream file.
void AArch64TargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
llvm_unreachable("Don't know how to custom expand this");
ReplaceBITCASTResults(N, Results, DAG);
Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
case ISD::CTPOP:
Results.push_back(LowerCTPOP(SDValue(N, 0), DAG));
// Across-vector reductions: split in half, combine the halves with the
// element-wise op, then reduce the half-width vector.
case AArch64ISD::SADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
case AArch64ISD::UADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
case AArch64ISD::SMINV:
ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
case AArch64ISD::UMINV:
ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
case AArch64ISD::SMAXV:
ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
case AArch64ISD::UMAXV:
ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
// Let normal code take care of it by not adding anything to Results.
ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
case ISD::LOAD: {
assert(SDValue(N, 0).getValueType() == MVT::i128 &&
"unexpected load's value type");
LoadSDNode *LoadNode = cast<LoadSDNode>(N);
if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
// Non-volatile loads are optimized later in AArch64's load/store
// optimizer.
// Otherwise (volatile i128 load): emit an LDP producing two i64 values and
// a chain, then rebuild the i128 with BUILD_PAIR.
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::LDP, SDLoc(N),
DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
{LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
Result.getValue(0), Result.getValue(1));
Results.append({Pair, Result.getValue(2) /* Chain */});
ReplaceExtractSubVectorResults(N, Results, DAG);
// Intrinsics with an i8/i16 result: compute the value as i32 using the
// corresponding AArch64ISD node, then truncate to the original type.
// Operand 0 is the intrinsic ID.
EVT VT = N->getValueType(0);
assert((VT == MVT::i8 || VT == MVT::i16) &&
"custom lowering for unexpected type");
ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
switch (IntID) {
case Intrinsic::aarch64_sve_clasta_n: {
SDLoc DL(N);
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
N->getOperand(1), Op2, N->getOperand(3));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
case Intrinsic::aarch64_sve_clastb_n: {
SDLoc DL(N);
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
N->getOperand(1), Op2, N->getOperand(3));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
case Intrinsic::aarch64_sve_lasta: {
SDLoc DL(N);
auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
N->getOperand(1), N->getOperand(2));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
case Intrinsic::aarch64_sve_lastb: {
SDLoc DL(N);
auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
N->getOperand(1), N->getOperand(2));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
// Returns true on every target except Android and Fuchsia, which defer to the
// TargetLowering default.
// NOTE(review): the closing brace is missing from this listing.
bool AArch64TargetLowering::useLoadStackGuardNode() const {
if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
return TargetLowering::useLoadStackGuardNode();
return true;
// Returns the minimum number of FDIVs sharing a divisor at which they are
// rewritten as FMULs by the reciprocal (see the comment in the body).
// NOTE(review): the closing brace is missing from this listing.
unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal if there are three or more FDIVs.
return 3;
// Chooses the type-legalization action for vector type VT: the single-element
// types listed below are widened; everything else uses the
// TargetLoweringBase default.
// NOTE(review): the return-type line of this definition and its closing brace
// are missing from this listing.
AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
// During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
// v4i16, v2i32 instead of to promote.
if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
VT == MVT::v1f32)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
// Only stores whose value is exactly 128 bits wide are expanded in IR.
// NOTE(review): the closing brace is missing from this listing.
bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
return Size == 128;
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
// Only loads of exactly 128 bits are expanded (to an LL/SC sequence); all
// other sizes are left alone.
// NOTE(review): the return-type line of this definition and its closing brace
// are missing from this listing.
AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
unsigned Size = LI->getType()->getPrimitiveSizeInBits();
return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
// For the real atomic operations, we have ldxr/stxr up to 128 bits,
// NOTE(review): the sentence above is cut off in this listing, as are the
// return-type line of this definition and its closing brace.
//
// Decision order as written below:
//  * floating-point RMW operations -> CmpXChg expansion;
//  * wider than 128 bits           -> None (no IR expansion);
//  * Nand                          -> LLSC;
//  * LSE available and < 128 bits  -> None;
//  * everything else               -> LLSC.
AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
if (AI->isFloatingPointOperation())
return AtomicExpansionKind::CmpXChg;
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
if (Size > 128) return AtomicExpansionKind::None;
// Nand not supported in LSE.
if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC;
// Leave 128 bits to LLSC.
return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC;
// NOTE(review): the line(s) introducing this definition are missing from this
// listing; from the parameter type and the comments in the body it is
// presumably the cmpxchg counterpart of the shouldExpandAtomic*InIR hooks --
// confirm against the upstream file. The closing brace is missing as well.
//
// Returns None when LSE is available or when compiling at -O0; otherwise
// requests LL/SC expansion.
AtomicCmpXchgInst *AI) const {
// If subtarget has LSE, leave cmpxchg intact for codegen.
if (Subtarget->hasLSE())
return AtomicExpansionKind::None;
// At -O0, fast-regalloc cannot cope with the live vregs necessary to
// implement cmpxchg without spilling. If the address being exchanged is also
// on the stack and close enough to the spill slot, this can lead to a
// situation where the monitor always gets cleared and the atomic operation
// can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
return AtomicExpansionKind::None;
return AtomicExpansionKind::LLSC;
// Emits IR for a load-exclusive from Addr and returns the loaded value. The
// acquire form of the intrinsic is used when Ord is acquire or stronger.
//  * 128-bit pointee: calls ldxp/ldaxp on an i8* and reassembles the value as
//    lo | (hi << 64).
//  * Otherwise: calls ldxr/ldaxr overloaded on the pointer type, truncates the
//    result to an integer of the pointee's bit size, and bitcasts it to the
//    pointee type.
// NOTE(review): closing braces are missing from this listing.
Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
bool IsAcquire = isAcquireOrStronger(Ord);
// Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
// intrinsic must return {i64, i64} and we have to recombine them into a
// single i128 here.
if (ValTy->getPrimitiveSizeInBits() == 128) {
Intrinsic::ID Int =
IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
Function *Ldxr = Intrinsic::getDeclaration(M, Int);
Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
return Builder.CreateOr(
Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
Type *Tys[] = { Addr->getType() };
Intrinsic::ID Int =
IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
Type *EltTy = cast<PointerType>(Addr->getType())->getElementType();
const DataLayout &DL = M->getDataLayout();
IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy));
Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
return Builder.CreateBitCast(Trunc, EltTy);
// Emits a call to the aarch64_clrex intrinsic at the builder's insertion
// point.
// NOTE(review): the closing brace is missing from this listing.
void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
IRBuilder<> &Builder) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
// Emits IR for a store-exclusive of Val to Addr and returns the intrinsic
// call. The release form of the intrinsic is used when Ord is release or
// stronger.
//  * 128-bit value: split into low/high i64 halves and passed to stxp/stlxp
//    together with Addr cast to i8*.
//  * Otherwise: Val is bitcast to an integer of its own bit size and passed to
//    stxr/stlxr overloaded on the pointer type.
// NOTE(review): the argument list of the final CreateCall is cut off in this
// listing, and closing braces are missing.
Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Value *Val, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
bool IsRelease = isReleaseOrStronger(Ord);
// Since the intrinsics must have legal type, the i128 intrinsics take two
// parameters: "i64, i64". We must marshal Val into the appropriate form
// before the call.
if (Val->getType()->getPrimitiveSizeInBits() == 128) {
Intrinsic::ID Int =
IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
Function *Stxr = Intrinsic::getDeclaration(M, Int);
Type *Int64Ty = Type::getInt64Ty(M->getContext());
Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
Intrinsic::ID Int =
IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
Type *Tys[] = { Addr->getType() };
Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
const DataLayout &DL = M->getDataLayout();
IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
Val = Builder.CreateBitCast(Val, IntValTy);
return Builder.CreateCall(Stxr,
Val, Stxr->getFunctionType()->getParamType(0)),
// Returns true exactly when Ty is an array type; CallConv and isVarArg are
// not consulted.
// NOTE(review): the closing brace is missing from this listing.
bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
return Ty->isArrayTy();
// Always returns false, regardless of context or value type.
// NOTE(review): the closing brace is missing from this listing.
bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
EVT) const {
return false;
// Builds IR that calls the thread_pointer intrinsic, applies an i8 GEP to the
// result, and pointer-casts that address.
// NOTE(review): the rest of the GEP/cast argument lists (including where
// Offset is used and the destination type) and the closing brace are missing
// from this listing.
static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Function *ThreadPointerFunc =
Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
return IRB.CreatePointerCast(
IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
// Returns the IR value locating the stack-protector guard. Android and Fuchsia
// use fixed offsets from the thread pointer; every other target defers to the
// TargetLowering default.
// NOTE(review): the Android comment below is cut off in this listing, and the
// closing brace is missing.
Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
// Android provides a fixed TLS slot for the stack cookie. See the definition
if (Subtarget->isTargetAndroid())
return UseTlsOffset(IRB, 0x28);
// Fuchsia is similar.
// <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
if (Subtarget->isTargetFuchsia())
return UseTlsOffset(IRB, -0x10);
return TargetLowering::getIRStackGuard(IRB);
// Inserts the stack-protector declarations into module M. In an MSVC
// environment this declares `__security_check_cookie` (returning void) and
// marks its first parameter InReg.
// NOTE(review): this listing has visibly lost lines (the statement declaring
// the security-cookie global, the remaining arguments of getOrInsertFunction,
// the non-MSVC path, and closing braces).
void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
// MSVC CRT has a global variable holding security cookie.
// MSVC CRT has a function to validate security cookie.
FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
"__security_check_cookie", Type::getVoidTy(M.getContext()),
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
F->addAttribute(1, Attribute::AttrKind::InReg);
// Returns the stack-guard value used by SelectionDAG: the `__security_cookie`
// global in an MSVC environment, otherwise the TargetLowering default.
// NOTE(review): the closing brace is missing from this listing.
Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
// MSVC CRT has a global variable holding security cookie.
if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
return M.getGlobalVariable("__security_cookie");
return TargetLowering::getSDagStackGuard(M);
// Returns the function that validates the stack guard:
// `__security_check_cookie` in an MSVC environment, otherwise the
// TargetLowering default.
// NOTE(review): the closing brace is missing from this listing.
Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
return M.getFunction("__security_check_cookie");
return TargetLowering::getSSPStackGuardCheck(M);
// Returns the IR value locating the SafeStack (unsafe stack) pointer. Android
// and Fuchsia use fixed offsets from the thread pointer; every other target
// defers to the TargetLowering default.
// NOTE(review): the Android comment below is cut off in this listing, and the
// closing brace is missing.
Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
if (Subtarget->isTargetAndroid())
return UseTlsOffset(IRB, 0x48);
// Fuchsia is similar.
// <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
if (Subtarget->isTargetFuchsia())
return UseTlsOffset(IRB, -0x8);
return TargetLowering::getSafeStackPointerLocation(IRB);
// Returns true only when operand 1 of AndI is a ConstantInt whose value is a
// power of two, i.e. the 'and' masks exactly one bit.
bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
const Instruction &AndI) const {
// Only sink 'and' mask to cmp use block if it is masking a single bit, since
// this is likely to fold the and/cmp/br into a single tbz instruction. It
// may be beneficial to sink in other cases, but we would have to check that
// the cmp would not get folded into the br to form a cbz for these to be
// beneficial.
ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
// A non-constant mask cannot be shown to be a single bit.
if (!Mask)
return false;
return Mask->getValue().isPowerOf2();
bool AArch64TargetLowering::
SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
unsigned OldShiftOpcode, unsigned NewShiftOpcode,
SelectionDAG &DAG) const {
// Does baseline recommend not to perform the fold by default?
if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
return false;
// Else, if this is a vector shift, prefer 'shl'.
return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
SDNode *N) const {
if (DAG.getMachineFunction().getFunction().hasMinSize() &&
!Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
return false;
return true;
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
// Update IsSplitCSR in AArch64FunctionInfo.
AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
void AArch64TargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (AArch64::GPR64RegClass.contains(*I))
RC = &AArch64::GPR64RegClass;
else if (AArch64::FPR64RegClass.contains(*I))
RC = &AArch64::FPR64RegClass;
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
// Reports whether an integer division of type VT should be treated as cheap:
// true only when Attr carries the MinSize function attribute and VT is not a
// vector type.
bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on AArch64 is expensive. However, when aggressively
// optimizing for code size, we prefer to use a div instruction, as it is
// usually smaller than the alternative sequence.
// The exception to this is vector division. Since AArch64 doesn't have vector
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
return OptSize && !VT.isVector();
bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
// We want inc-of-add for scalars and sub-of-not for vectors.
return VT.isScalarInteger();
bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
return getPointerTy(DL).getSizeInBits();
return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
// Unlike X86, we let frame lowering assign offsets to all catch objects.
bool AArch64TargetLowering::needsFixedCatchObjects() const {
return false;
bool AArch64TargetLowering::shouldLocalize(
const MachineInstr &MI, const TargetTransformInfo *TTI) const {
switch (MI.getOpcode()) {
case TargetOpcode::G_GLOBAL_VALUE: {
// On Darwin, TLS global vars get selected into function calls, which
// we don't want localized, as they can get moved into the middle of
// another call sequence.
const GlobalValue &GV = *MI.getOperand(1).getGlobal();
if (GV.isThreadLocal() && Subtarget->isTargetMachO())
return false;
// If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
// localizable.
case AArch64::ADRP:
case AArch64::G_ADD_LOW:
return true;
return TargetLoweringBase::shouldLocalize(MI, TTI);
bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
if (isa<ScalableVectorType>(Inst.getType()))
return true;
for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
return true;
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
+ if (isa<ScalableVectorType>(AI->getAllocatedType()))
+ return true;
+ }
return false;
// Return the largest legal scalable vector type that matches VT's element type.
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
assert(VT.isFixedLengthVector() &&
DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
"Expected legal fixed length vector!");
switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
llvm_unreachable("unexpected element type for SVE container");
case MVT::i8:
return EVT(MVT::nxv16i8);
case MVT::i16:
return EVT(MVT::nxv8i16);
case MVT::i32:
return EVT(MVT::nxv4i32);
case MVT::i64:
return EVT(MVT::nxv2i64);
case MVT::f16:
return EVT(MVT::nxv8f16);
case MVT::f32:
return EVT(MVT::nxv4f32);
case MVT::f64:
return EVT(MVT::nxv2f64);
// Return a PTRUE with active lanes corresponding to the extent of VT.
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
assert(VT.isFixedLengthVector() &&
DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
"Expected legal fixed length vector!");
int PgPattern;
switch (VT.getVectorNumElements()) {
llvm_unreachable("unexpected element count for SVE predicate");
case 1:
PgPattern = AArch64SVEPredPattern::vl1;
case 2:
PgPattern = AArch64SVEPredPattern::vl2;
case 4:
PgPattern = AArch64SVEPredPattern::vl4;
case 8:
PgPattern = AArch64SVEPredPattern::vl8;
case 16:
PgPattern = AArch64SVEPredPattern::vl16;
case 32:
PgPattern = AArch64SVEPredPattern::vl32;
case 64:
PgPattern = AArch64SVEPredPattern::vl64;
case 128:
PgPattern = AArch64SVEPredPattern::vl128;
case 256:
PgPattern = AArch64SVEPredPattern::vl256;
// TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
// use AArch64SVEPredPattern::all, which can enable the use of unpredicated
// variants of instructions when available.
switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
llvm_unreachable("unexpected element type for SVE predicate");
case MVT::i8:
MaskVT = MVT::nxv16i1;
case MVT::i16:
case MVT::f16:
MaskVT = MVT::nxv8i1;
case MVT::i32:
case MVT::f32:
MaskVT = MVT::nxv4i1;
case MVT::i64:
case MVT::f64:
MaskVT = MVT::nxv2i1;
return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
DAG.getTargetConstant(PgPattern, DL, MVT::i64));
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
"Expected legal scalable vector!");
auto PredTy = VT.changeVectorElementType(MVT::i1);
return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
if (VT.isFixedLengthVector())
return getPredicateForFixedLengthVector(DAG, DL, VT);
return getPredicateForScalableVector(DAG, DL, VT);
// Grow V to consume an entire SVE register.
// VT must be a scalable vector type and V a fixed length vector (both are
// asserted). The result is an INSERT_SUBVECTOR of V into an undef vector of
// type VT, using a zero i64 constant as the index operand.
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
assert(VT.isScalableVector() &&
"Expected to convert into a scalable vector!");
assert(V.getValueType().isFixedLengthVector() &&
"Expected a fixed length vector operand!");
SDLoc DL(V);
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
// Shrink V so it's just big enough to maintain a VT's worth of data.
// VT must be a fixed length vector type and V a scalable vector (both are
// asserted). The result is an EXTRACT_SUBVECTOR of type VT taken from V,
// using a zero i64 constant as the index operand.
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
assert(VT.isFixedLengthVector() &&
"Expected to convert into a fixed length vector!");
assert(V.getValueType().isScalableVector() &&
"Expected a scalable vector operand!");
SDLoc DL(V);
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
// Convert all fixed length vector loads larger than NEON to masked_loads.
SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
SDValue Op, SelectionDAG &DAG) const {
auto Load = cast<LoadSDNode>(Op);
SDLoc DL(Op);
EVT VT = Op.getValueType();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
auto NewLoad = DAG.getMaskedLoad(
ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
auto Result = convertFromScalableVector(DAG, VT, NewLoad);
SDValue MergedValues[2] = {Result, Load->getChain()};
return DAG.getMergeValues(MergedValues, DL);
// Convert all fixed length vector stores larger than NEON to masked_stores.
SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
SDValue Op, SelectionDAG &DAG) const {
auto Store = cast<StoreSDNode>(Op);
SDLoc DL(Op);
EVT VT = Store->getValue().getValueType();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
return DAG.getMaskedStore(
Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
Store->getMemOperand(), Store->getAddressingMode(),
SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
SDLoc DL(Op);
SDValue Val = Op.getOperand(0);
EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
Val = convertToScalableVector(DAG, ContainerVT, Val);
// Repeatedly truncate Val until the result is of the desired element type.
switch (ContainerVT.getSimpleVT().SimpleTy) {
llvm_unreachable("unimplemented container type");
case MVT::nxv2i64:
Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
if (VT.getVectorElementType() == MVT::i32)
case MVT::nxv4i32:
Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
if (VT.getVectorElementType() == MVT::i16)
case MVT::nxv8i16:
Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
return convertFromScalableVector(DAG, VT, Val);
SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
SelectionDAG &DAG,
unsigned NewOp) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
auto Pg = getPredicateForVector(DAG, DL, VT);
if (useSVEForFixedLengthVectorVT(VT)) {
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
// Create list of operands by converting existing ones to scalable types.
SmallVector<SDValue, 4> Operands = {Pg};
for (const SDValue &V : Op->op_values()) {
if (isa<CondCodeSDNode>(V)) {
assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
"Only fixed length vectors are supported!");
Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
return convertFromScalableVector(DAG, VT, ScalableRes);
assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
SmallVector<SDValue, 4> Operands = {Pg};
for (const SDValue &V : Op->op_values()) {
assert((isa<CondCodeSDNode>(V) || V.getValueType().isScalableVector()) &&
"Only scalable vectors are supported!");
return DAG.getNode(NewOp, DL, VT, Operands);
diff --git a/llvm/lib/Target/AArch64/ b/llvm/lib/Target/AArch64/
index 6df7970f4d82..4f4ba692c2db 100644
--- a/llvm/lib/Target/AArch64/
+++ b/llvm/lib/Target/AArch64/
@@ -1,11233 +1,11236 @@
//===- - AArch64 Instruction Formats --*- tblgen -*-===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Describe AArch64 instructions format here
// Format specifies the encoding used by the instruction. This is part of the
// ad-hoc solution used to emit machine instruction encodings by our machine
// code emitter.
class Format<bits<2> val> {
bits<2> Value = val;
def PseudoFrm : Format<0>;
def NormalFrm : Format<1>; // Do we need any others?
// Enum describing whether an instruction is
// destructive in its first source operand.
class DestructiveInstTypeEnum<bits<4> val> {
bits<4> Value = val;
def NotDestructive : DestructiveInstTypeEnum<0>;
// Destructive in its first operand and can be MOVPRFX'd, but has no other
// special properties.
def DestructiveOther : DestructiveInstTypeEnum<1>;
def DestructiveUnary : DestructiveInstTypeEnum<2>;
def DestructiveBinaryImm : DestructiveInstTypeEnum<3>;
def DestructiveBinaryShImmUnpred : DestructiveInstTypeEnum<4>;
def DestructiveBinary : DestructiveInstTypeEnum<5>;
def DestructiveBinaryComm : DestructiveInstTypeEnum<6>;
def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>;
def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>;
class FalseLanesEnum<bits<2> val> {
bits<2> Value = val;
def FalseLanesNone : FalseLanesEnum<0>;
def FalseLanesZero : FalseLanesEnum<1>;
def FalseLanesUndef : FalseLanesEnum<2>;
// AArch64 Instruction Format
class AArch64Inst<Format f, string cstr> : Instruction {
field bits<32> Inst; // Instruction encoding.
// Mask of bits that cause an encoding to be UNPREDICTABLE.
// If a bit is set, then if the corresponding bit in the
// target encoding differs from its value in the "Inst" field,
// the instruction is UNPREDICTABLE (SoftFail in abstract parlance).
field bits<32> Unpredictable = 0;
// SoftFail is the generic name for this field, but we alias it so
// as to make it more obvious what it means in ARM-land.
field bits<32> SoftFail = Unpredictable;
let Namespace = "AArch64";
Format F = f;
bits<2> Form = F.Value;
// Defaults
FalseLanesEnum FalseLanes = FalseLanesNone;
DestructiveInstTypeEnum DestructiveInstType = NotDestructive;
ElementSizeEnum ElementSize = ElementSizeNone;
let TSFlags{8-7} = FalseLanes.Value;
let TSFlags{6-3} = DestructiveInstType.Value;
let TSFlags{2-0} = ElementSize.Value;
let Pattern = [];
let Constraints = cstr;
class InstSubst<string Asm, dag Result, bit EmitPriority = 0>
: InstAlias<Asm, Result, EmitPriority>, Requires<[UseNegativeImmediates]>;
// Pseudo instructions (don't have encoding information)
class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
: AArch64Inst<PseudoFrm, cstr> {
dag OutOperandList = oops;
dag InOperandList = iops;
let Pattern = pattern;
let isCodeGenOnly = 1;
let isPseudo = 1;
// Real instructions (have encoding information)
class EncodedI<string cstr, list<dag> pattern> : AArch64Inst<NormalFrm, cstr> {
let Pattern = pattern;
let Size = 4;
// Normal instructions
class I<dag oops, dag iops, string asm, string operands, string cstr,
list<dag> pattern>
: EncodedI<cstr, pattern> {
dag OutOperandList = oops;
dag InOperandList = iops;
let AsmString = !strconcat(asm, operands);
class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>;
class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
class UnOpFrag<dag res> : PatFrag<(ops node:$LHS), res>;
// Helper fragment for an extract of the high portion of a 128-bit vector.
def extract_high_v16i8 :
UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>;
def extract_high_v8i16 :
UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>;
def extract_high_v4i32 :
UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>;
def extract_high_v2i64 :
UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>;
// Asm Operand Classes.
// Shifter operand for arithmetic shifted encodings.
def ShifterOperand : AsmOperandClass {
let Name = "Shifter";
// Shifter operand for mov immediate encodings.
def MovImm32ShifterOperand : AsmOperandClass {
let SuperClasses = [ShifterOperand];
let Name = "MovImm32Shifter";
let RenderMethod = "addShifterOperands";
let DiagnosticType = "InvalidMovImm32Shift";
def MovImm64ShifterOperand : AsmOperandClass {
let SuperClasses = [ShifterOperand];
let Name = "MovImm64Shifter";
let RenderMethod = "addShifterOperands";
let DiagnosticType = "InvalidMovImm64Shift";
// Shifter operand for arithmetic register shifted encodings.
class ArithmeticShifterOperand<int width> : AsmOperandClass {
let SuperClasses = [ShifterOperand];
let Name = "ArithmeticShifter" # width;
let PredicateMethod = "isArithmeticShifter<" # width # ">";
let RenderMethod = "addShifterOperands";
let DiagnosticType = "AddSubRegShift" # width;
def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>;
def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>;
// Shifter operand for logical register shifted encodings.
class LogicalShifterOperand<int width> : AsmOperandClass {
let SuperClasses = [ShifterOperand];
let Name = "LogicalShifter" # width;
let PredicateMethod = "isLogicalShifter<" # width # ">";
let RenderMethod = "addShifterOperands";
let DiagnosticType = "AddSubRegShift" # width;
def LogicalShifterOperand32 : LogicalShifterOperand<32>;
def LogicalShifterOperand64 : LogicalShifterOperand<64>;
// Shifter operand for logical vector 128/64-bit shifted encodings.
def LogicalVecShifterOperand : AsmOperandClass {
let SuperClasses = [ShifterOperand];
let Name = "LogicalVecShifter";
let RenderMethod = "addShifterOperands";
def LogicalVecHalfWordShifterOperand : AsmOperandClass {
let SuperClasses = [LogicalVecShifterOperand];
let Name = "LogicalVecHalfWordShifter";
let RenderMethod = "addShifterOperands";
// The "MSL" shifter on the vector MOVI instruction.
def MoveVecShifterOperand : AsmOperandClass {
let SuperClasses = [ShifterOperand];
let Name = "MoveVecShifter";
let RenderMethod = "addShifterOperands";
// Extend operand for arithmetic encodings.
def ExtendOperand : AsmOperandClass {
let Name = "Extend";
let DiagnosticType = "AddSubRegExtendLarge";
def ExtendOperand64 : AsmOperandClass {
let SuperClasses = [ExtendOperand];
let Name = "Extend64";
let DiagnosticType = "AddSubRegExtendSmall";
// 'extend' that's a lsl of a 64-bit register.
def ExtendOperandLSL64 : AsmOperandClass {
let SuperClasses = [ExtendOperand];
let Name = "ExtendLSL64";
let RenderMethod = "addExtend64Operands";
let DiagnosticType = "AddSubRegExtendLarge";
// 8-bit floating-point immediate encodings.
def FPImmOperand : AsmOperandClass {
let Name = "FPImm";
let ParserMethod = "tryParseFPImm<true>";
let DiagnosticType = "InvalidFPImm";
def CondCode : AsmOperandClass {
let Name = "CondCode";
let DiagnosticType = "InvalidCondCode";
// A 32-bit register parsed as 64-bit
def GPR32as64Operand : AsmOperandClass {
let Name = "GPR32as64";
let ParserMethod =
"tryParseGPROperand<false, RegConstraintEqualityTy::EqualsSubReg>";
def GPR32as64 : RegisterOperand<GPR32> {
let ParserMatchClass = GPR32as64Operand;
// A 64-bit register parsed as 32-bit
def GPR64as32Operand : AsmOperandClass {
let Name = "GPR64as32";
let ParserMethod =
"tryParseGPROperand<false, RegConstraintEqualityTy::EqualsSuperReg>";
def GPR64as32 : RegisterOperand<GPR64, "printGPR64as32"> {
let ParserMatchClass = GPR64as32Operand;
// 8-bit immediate for AdvSIMD where 64-bit values of the form:
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
// are encoded as the eight bit value 'abcdefgh'.
def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; }
class UImmScaledMemoryIndexed<int Width, int Scale> : AsmOperandClass {
let Name = "UImm" # Width # "s" # Scale;
let DiagnosticType = "InvalidMemoryIndexed" # Scale # "UImm" # Width;
let RenderMethod = "addImmScaledOperands<" # Scale # ">";
let PredicateMethod = "isUImmScaled<" # Width # ", " # Scale # ">";
class SImmScaledMemoryIndexed<int Width, int Scale> : AsmOperandClass {
let Name = "SImm" # Width # "s" # Scale;
let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm" # Width;
let RenderMethod = "addImmScaledOperands<" # Scale # ">";
let PredicateMethod = "isSImmScaled<" # Width # ", " # Scale # ">";
// Operand Definitions.
// ADR[P] instruction labels.
def AdrpOperand : AsmOperandClass {
let Name = "AdrpLabel";
let ParserMethod = "tryParseAdrpLabel";
let DiagnosticType = "InvalidLabel";
def adrplabel : Operand<i64> {
let EncoderMethod = "getAdrLabelOpValue";
let PrintMethod = "printAdrpLabel";
let ParserMatchClass = AdrpOperand;
def AdrOperand : AsmOperandClass {
let Name = "AdrLabel";
let ParserMethod = "tryParseAdrLabel";
let DiagnosticType = "InvalidLabel";
def adrlabel : Operand<i64> {
let EncoderMethod = "getAdrLabelOpValue";
let ParserMatchClass = AdrOperand;
class SImmOperand<int width> : AsmOperandClass {
let Name = "SImm" # width;
let DiagnosticType = "InvalidMemoryIndexedSImm" # width;
let RenderMethod = "addImmOperands";
let PredicateMethod = "isSImm<" # width # ">";
class AsmImmRange<int Low, int High> : AsmOperandClass {
let Name = "Imm" # Low # "_" # High;
let DiagnosticType = "InvalidImm" # Low # "_" # High;
let RenderMethod = "addImmOperands";
let PredicateMethod = "isImmInRange<" # Low # "," # High # ">";
// Authenticated loads for v8.3 can have scaled 10-bit immediate offsets.
def SImm10s8Operand : SImmScaledMemoryIndexed<10, 8>;
def simm10Scaled : Operand<i64> {
let ParserMatchClass = SImm10s8Operand;
let DecoderMethod = "DecodeSImm<10>";
let PrintMethod = "printImmScale<8>";
def simm9s16 : Operand<i64> {
let ParserMatchClass = SImmScaledMemoryIndexed<9, 16>;
let DecoderMethod = "DecodeSImm<9>";
let PrintMethod = "printImmScale<16>";
// uimm6 predicate - True if the immediate is in the range [0, 63].
def UImm6Operand : AsmOperandClass {
let Name = "UImm6";
let DiagnosticType = "InvalidImm0_63";
def uimm6 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
let ParserMatchClass = UImm6Operand;
def uimm16 : Operand<i16>, ImmLeaf<i16, [{return Imm >= 0 && Imm < 65536;}]>{
let ParserMatchClass = AsmImmRange<0, 65535>;
def SImm9Operand : SImmOperand<9>;
def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
let ParserMatchClass = SImm9Operand;
let DecoderMethod = "DecodeSImm<9>";
def SImm8Operand : SImmOperand<8>;
// simm8 predicate - True if the immediate is in the signed 8-bit range
// [-128, 127]. The upper bound is exclusive at 128, mirroring simm9 (< 256),
// simm6_32b (< 32) and simm5_* (< 16); the previous bound of "< 127" wrongly
// rejected the value 127.
def simm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -128 && Imm < 128; }]> {
let ParserMatchClass = SImm8Operand;
let DecoderMethod = "DecodeSImm<8>";
def SImm6Operand : SImmOperand<6>;
def simm6_32b : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -32 && Imm < 32; }]> {
let ParserMatchClass = SImm6Operand;
let DecoderMethod = "DecodeSImm<6>";
def SImm5Operand : SImmOperand<5>;
def simm5_64b : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -16 && Imm < 16; }]> {
let ParserMatchClass = SImm5Operand;
let DecoderMethod = "DecodeSImm<5>";
def simm5_32b : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -16 && Imm < 16; }]> {
let ParserMatchClass = SImm5Operand;
let DecoderMethod = "DecodeSImm<5>";
def simm5_8b : Operand<i32>, ImmLeaf<i32, [{ return (int8_t)Imm >= -16 && (int8_t)Imm < 16; }]> {
let ParserMatchClass = SImm5Operand;
let DecoderMethod = "DecodeSImm<5>";
let PrintMethod = "printSImm<8>";
def simm5_16b : Operand<i32>, ImmLeaf<i32, [{ return (int16_t)Imm >= -16 && (int16_t)Imm < 16; }]> {
let ParserMatchClass = SImm5Operand;
let DecoderMethod = "DecodeSImm<5>";
let PrintMethod = "printSImm<16>";
// simm7sN predicate - True if the immediate is a multiple of N in the range
// [-64 * N, 63 * N].
def SImm7s4Operand : SImmScaledMemoryIndexed<7, 4>;
def SImm7s8Operand : SImmScaledMemoryIndexed<7, 8>;
def SImm7s16Operand : SImmScaledMemoryIndexed<7, 16>;
def simm7s4 : Operand<i32> {
let ParserMatchClass = SImm7s4Operand;
let PrintMethod = "printImmScale<4>";
def simm7s8 : Operand<i32> {
let ParserMatchClass = SImm7s8Operand;
let PrintMethod = "printImmScale<8>";
def simm7s16 : Operand<i32> {
let ParserMatchClass = SImm7s16Operand;
let PrintMethod = "printImmScale<16>";
def am_sve_fi : ComplexPattern<i64, 2, "SelectAddrModeFrameIndexSVE", []>;
def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>;
def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>;
def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>;
def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
def UImmS1XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64);
def UImmS2XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
def UImmS4XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() / 4, SDLoc(N), MVT::i64);
def UImmS8XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() / 8, SDLoc(N), MVT::i64);
// uimm5sN predicate - True if the immediate is a multiple of N in the range
// [0 * N, 31 * N].
def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>;
def UImm5s4Operand : UImmScaledMemoryIndexed<5, 4>;
def UImm5s8Operand : UImmScaledMemoryIndexed<5, 8>;
def uimm5s2 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }],
UImmS2XForm> {
let ParserMatchClass = UImm5s2Operand;
let PrintMethod = "printImmScale<2>";
def uimm5s4 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }],
UImmS4XForm> {
let ParserMatchClass = UImm5s4Operand;
let PrintMethod = "printImmScale<4>";
def uimm5s8 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }],
UImmS8XForm> {
let ParserMatchClass = UImm5s8Operand;
let PrintMethod = "printImmScale<8>";
// tuimm5sN predicate - similar to uimm5sN, but uses TImmLeaf (TargetConstant)
// instead of ImmLeaf (Constant)
def tuimm5s2 : Operand<i64>, TImmLeaf<i64,
[{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }],
UImmS2XForm> {
let ParserMatchClass = UImm5s2Operand;
let PrintMethod = "printImmScale<2>";
def tuimm5s4 : Operand<i64>, TImmLeaf<i64,
[{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }],
UImmS4XForm> {
let ParserMatchClass = UImm5s4Operand;
let PrintMethod = "printImmScale<4>";
def tuimm5s8 : Operand<i64>, TImmLeaf<i64,
[{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }],
UImmS8XForm> {
let ParserMatchClass = UImm5s8Operand;
let PrintMethod = "printImmScale<8>";
// uimm6sN predicate - True if the immediate is a multiple of N in the range
// [0 * N, 63 * N].
def UImm6s1Operand : UImmScaledMemoryIndexed<6, 1>;
def UImm6s2Operand : UImmScaledMemoryIndexed<6, 2>;
def UImm6s4Operand : UImmScaledMemoryIndexed<6, 4>;
def UImm6s8Operand : UImmScaledMemoryIndexed<6, 8>;
def UImm6s16Operand : UImmScaledMemoryIndexed<6, 16>;
def uimm6s1 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
let ParserMatchClass = UImm6s1Operand;
def uimm6s2 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (64*2) && ((Imm % 2) == 0); }]> {
let PrintMethod = "printImmScale<2>";
let ParserMatchClass = UImm6s2Operand;
def uimm6s4 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (64*4) && ((Imm % 4) == 0); }]> {
let PrintMethod = "printImmScale<4>";
let ParserMatchClass = UImm6s4Operand;
def uimm6s8 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (64*8) && ((Imm % 8) == 0); }]> {
let PrintMethod = "printImmScale<8>";
let ParserMatchClass = UImm6s8Operand;
def uimm6s16 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >= 0 && Imm < (64*16) && ((Imm % 16) == 0); }]> {
let PrintMethod = "printImmScale<16>";
let ParserMatchClass = UImm6s16Operand;
def SImmS2XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue() / 2, SDLoc(N), MVT::i64);
def SImmS3XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue() / 3, SDLoc(N), MVT::i64);
def SImmS4XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue() / 4, SDLoc(N), MVT::i64);
def SImmS16XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64);
+def SImmS32XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() / 32, SDLoc(N), MVT::i64);
// simm6sN predicate - True if the immediate is a multiple of N in the range
// [-32 * N, 31 * N].
def SImm6s1Operand : SImmScaledMemoryIndexed<6, 1>;
def simm6s1 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -32 && Imm < 32; }]> {
let ParserMatchClass = SImm6s1Operand;
let DecoderMethod = "DecodeSImm<6>";
// simm4sN predicate - True if the immediate is a multiple of N in the range
// [ -8* N, 7 * N].
def SImm4s1Operand : SImmScaledMemoryIndexed<4, 1>;
def SImm4s2Operand : SImmScaledMemoryIndexed<4, 2>;
def SImm4s3Operand : SImmScaledMemoryIndexed<4, 3>;
def SImm4s4Operand : SImmScaledMemoryIndexed<4, 4>;
def SImm4s16Operand : SImmScaledMemoryIndexed<4, 16>;
def SImm4s32Operand : SImmScaledMemoryIndexed<4, 32>;
def simm4s1 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >=-8 && Imm <= 7; }]> {
let ParserMatchClass = SImm4s1Operand;
let DecoderMethod = "DecodeSImm<4>";
def simm4s2 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }], SImmS2XForm> {
let PrintMethod = "printImmScale<2>";
let ParserMatchClass = SImm4s2Operand;
let DecoderMethod = "DecodeSImm<4>";
def simm4s3 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }], SImmS3XForm> {
let PrintMethod = "printImmScale<3>";
let ParserMatchClass = SImm4s3Operand;
let DecoderMethod = "DecodeSImm<4>";
def simm4s4 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }], SImmS4XForm> {
let PrintMethod = "printImmScale<4>";
let ParserMatchClass = SImm4s4Operand;
let DecoderMethod = "DecodeSImm<4>";
def simm4s16 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }], SImmS16XForm> {
let PrintMethod = "printImmScale<16>";
let ParserMatchClass = SImm4s16Operand;
let DecoderMethod = "DecodeSImm<4>";
def simm4s32 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> {
+[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }], SImmS32XForm> {
let PrintMethod = "printImmScale<32>";
let ParserMatchClass = SImm4s32Operand;
let DecoderMethod = "DecodeSImm<4>";
def Imm1_8Operand : AsmImmRange<1, 8>;
def Imm1_16Operand : AsmImmRange<1, 16>;
def Imm1_32Operand : AsmImmRange<1, 32>;
def Imm1_64Operand : AsmImmRange<1, 64>;
// Asm operand class for branch targets, parameterized by N. The name and the
// predicate method ("isBranchTarget<N>") are derived from N; the diagnostic
// type is "InvalidLabel".
class BranchTarget<int N> : AsmOperandClass {
let Name = "BranchTarget" # N;
let DiagnosticType = "InvalidLabel";
let PredicateMethod = "isBranchTarget<" # N # ">";
// Same as BranchTarget<N>, but named "PCRelLabel" # N.
class PCRelLabel<int N> : BranchTarget<N> {
let Name = "PCRelLabel" # N;
// Concrete operand classes for N = 14, 26 (branch targets) and 19 (PC-relative
// label).
def BranchTarget14Operand : BranchTarget<14>;
def BranchTarget26Operand : BranchTarget<26>;
def PCRelLabel19Operand : PCRelLabel<19>;
// MovWSymbolG3..G0: for each group an AsmOperandClass rendered with
// addImmOperands, plus an i32 operand (movw_symbol_gN) that uses that class as
// its ParserMatchClass.
def MovWSymbolG3AsmOperand : AsmOperandClass {
let Name = "MovWSymbolG3";
let RenderMethod = "addImmOperands";
def movw_symbol_g3 : Operand<i32> {
let ParserMatchClass = MovWSymbolG3AsmOperand;
def MovWSymbolG2AsmOperand : AsmOperandClass {
let Name = "MovWSymbolG2";
let RenderMethod = "addImmOperands";
def movw_symbol_g2 : Operand<i32> {
let ParserMatchClass = MovWSymbolG2AsmOperand;
def MovWSymbolG1AsmOperand : AsmOperandClass {
let Name = "MovWSymbolG1";
let RenderMethod = "addImmOperands";
def movw_symbol_g1 : Operand<i32> {
let ParserMatchClass = MovWSymbolG1AsmOperand;
def MovWSymbolG0AsmOperand : AsmOperandClass {
let Name = "MovWSymbolG0";
let RenderMethod = "addImmOperands";
def movw_symbol_g0 : Operand<i32> {
let ParserMatchClass = MovWSymbolG0AsmOperand;
// Fixed-point scale operand of floating-point type FloatVT, selected through
// the SelectCVTFixedPosOperand<32> complex pattern (roots: fpimm, ld).
// Encoded with getFixedPointScaleOpValue, decoded with
// DecodeFixedPointScaleImm32, and parsed as an Imm1_32Operand.
class fixedpoint_i32<ValueType FloatVT>
: Operand<FloatVT>,
ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<32>", [fpimm, ld]> {
let EncoderMethod = "getFixedPointScaleOpValue";
let DecoderMethod = "DecodeFixedPointScaleImm32";
let ParserMatchClass = Imm1_32Operand;
// 64-bit counterpart of fixedpoint_i32: uses SelectCVTFixedPosOperand<64>,
// DecodeFixedPointScaleImm64 and Imm1_64Operand.
class fixedpoint_i64<ValueType FloatVT>
: Operand<FloatVT>,
ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<64>", [fpimm, ld]> {
let EncoderMethod = "getFixedPointScaleOpValue";
let DecoderMethod = "DecodeFixedPointScaleImm64";
let ParserMatchClass = Imm1_64Operand;
// Instantiations for f16/f32/f64 with the _i32 and _i64 variants.
def fixedpoint_f16_i32 : fixedpoint_i32<f16>;
def fixedpoint_f32_i32 : fixedpoint_i32<f32>;
def fixedpoint_f64_i32 : fixedpoint_i32<f64>;
def fixedpoint_f16_i64 : fixedpoint_i64<f16>;
def fixedpoint_f32_i64 : fixedpoint_i64<f32>;
def fixedpoint_f64_i64 : fixedpoint_i64<f64>;
// Vector right-shift amounts (Constant / ImmLeaf form). Each predicate
// compares the immediate as uint32_t against an exclusive upper bound and
// requires it to be non-zero.
// vecshiftR8 - True if the immediate is in the range [1,8].
def vecshiftR8 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
}]> {
let EncoderMethod = "getVecShiftR8OpValue";
let DecoderMethod = "DecodeVecShiftR8Imm";
let ParserMatchClass = Imm1_8Operand;
// vecshiftR16 - True if the immediate is in the range [1,16].
def vecshiftR16 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
}]> {
let EncoderMethod = "getVecShiftR16OpValue";
let DecoderMethod = "DecodeVecShiftR16Imm";
let ParserMatchClass = Imm1_16Operand;
// vecshiftR16Narrow - range [1,8]; shares the R16 encoder but decodes with
// DecodeVecShiftR16ImmNarrow.
def vecshiftR16Narrow : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
}]> {
let EncoderMethod = "getVecShiftR16OpValue";
let DecoderMethod = "DecodeVecShiftR16ImmNarrow";
let ParserMatchClass = Imm1_8Operand;
// vecshiftR32 - True if the immediate is in the range [1,32].
def vecshiftR32 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
}]> {
let EncoderMethod = "getVecShiftR32OpValue";
let DecoderMethod = "DecodeVecShiftR32Imm";
let ParserMatchClass = Imm1_32Operand;
// vecshiftR32Narrow - range [1,16]; shares the R32 encoder but decodes with
// DecodeVecShiftR32ImmNarrow.
def vecshiftR32Narrow : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
}]> {
let EncoderMethod = "getVecShiftR32OpValue";
let DecoderMethod = "DecodeVecShiftR32ImmNarrow";
let ParserMatchClass = Imm1_16Operand;
// vecshiftR64 - True if the immediate is in the range [1,64].
def vecshiftR64 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65);
}]> {
let EncoderMethod = "getVecShiftR64OpValue";
let DecoderMethod = "DecodeVecShiftR64Imm";
let ParserMatchClass = Imm1_64Operand;
// vecshiftR64Narrow - range [1,32]; shares the R64 encoder but decodes with
// DecodeVecShiftR64ImmNarrow.
def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
}]> {
let EncoderMethod = "getVecShiftR64OpValue";
let DecoderMethod = "DecodeVecShiftR64ImmNarrow";
let ParserMatchClass = Imm1_32Operand;
// Same as vecshiftR#N, but use TargetConstant (TImmLeaf) instead of Constant
// (ImmLeaf). Predicates, encoders, decoders and parser classes are identical
// to the corresponding vecshiftR#N definitions.
// tvecshiftR8 - True if the immediate is in the range [1,8].
def tvecshiftR8 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
}]> {
let EncoderMethod = "getVecShiftR8OpValue";
let DecoderMethod = "DecodeVecShiftR8Imm";
let ParserMatchClass = Imm1_8Operand;
// tvecshiftR16 - True if the immediate is in the range [1,16].
def tvecshiftR16 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
}]> {
let EncoderMethod = "getVecShiftR16OpValue";
let DecoderMethod = "DecodeVecShiftR16Imm";
let ParserMatchClass = Imm1_16Operand;
// tvecshiftR32 - True if the immediate is in the range [1,32].
def tvecshiftR32 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
}]> {
let EncoderMethod = "getVecShiftR32OpValue";
let DecoderMethod = "DecodeVecShiftR32Imm";
let ParserMatchClass = Imm1_32Operand;
// tvecshiftR64 - True if the immediate is in the range [1,64].
def tvecshiftR64 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65);
}]> {
let EncoderMethod = "getVecShiftR64OpValue";
let DecoderMethod = "DecodeVecShiftR64Imm";
let ParserMatchClass = Imm1_64Operand;
// AsmImmRange instantiations with lower bound 0 and upper bounds 1/7/15/31/63.
// They serve as ParserMatchClass for the left-shift and imm0_N operands in
// this file.
def Imm0_1Operand : AsmImmRange<0, 1>;
def Imm0_7Operand : AsmImmRange<0, 7>;
def Imm0_15Operand : AsmImmRange<0, 15>;
def Imm0_31Operand : AsmImmRange<0, 31>;
def Imm0_63Operand : AsmImmRange<0, 63>;
// Vector left-shift amounts (Constant / ImmLeaf form). Each predicate compares
// the immediate as uint32_t against an exclusive upper bound, so negative
// values are rejected.
// vecshiftL8 - True if the immediate is in the range [0,7].
def vecshiftL8 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) < 8);
}]> {
let EncoderMethod = "getVecShiftL8OpValue";
let DecoderMethod = "DecodeVecShiftL8Imm";
let ParserMatchClass = Imm0_7Operand;
// vecshiftL16 - True if the immediate is in the range [0,15].
def vecshiftL16 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) < 16);
}]> {
let EncoderMethod = "getVecShiftL16OpValue";
let DecoderMethod = "DecodeVecShiftL16Imm";
let ParserMatchClass = Imm0_15Operand;
// vecshiftL32 - True if the immediate is in the range [0,31].
def vecshiftL32 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) < 32);
}]> {
let EncoderMethod = "getVecShiftL32OpValue";
let DecoderMethod = "DecodeVecShiftL32Imm";
let ParserMatchClass = Imm0_31Operand;
// vecshiftL64 - True if the immediate is in the range [0,63].
def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{
return (((uint32_t)Imm) < 64);
}]> {
let EncoderMethod = "getVecShiftL64OpValue";
let DecoderMethod = "DecodeVecShiftL64Imm";
let ParserMatchClass = Imm0_63Operand;
// Same as vecshiftL#N, but use TargetConstant (TImmLeaf) instead of Constant
// (ImmLeaf). Predicates, encoders, decoders and parser classes are identical
// to the corresponding vecshiftL#N definitions.
// tvecshiftL8 - True if the immediate is in the range [0,7].
def tvecshiftL8 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) < 8);
}]> {
let EncoderMethod = "getVecShiftL8OpValue";
let DecoderMethod = "DecodeVecShiftL8Imm";
let ParserMatchClass = Imm0_7Operand;
// tvecshiftL16 - True if the immediate is in the range [0,15].
def tvecshiftL16 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) < 16);
}]> {
let EncoderMethod = "getVecShiftL16OpValue";
let DecoderMethod = "DecodeVecShiftL16Imm";
let ParserMatchClass = Imm0_15Operand;
// tvecshiftL32 - True if the immediate is in the range [0,31].
def tvecshiftL32 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) < 32);
}]> {
let EncoderMethod = "getVecShiftL32OpValue";
let DecoderMethod = "DecodeVecShiftL32Imm";
let ParserMatchClass = Imm0_31Operand;
// tvecshiftL64 - True if the immediate is in the range [0,63].
def tvecshiftL64 : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) < 64);
}]> {
let EncoderMethod = "getVecShiftL64OpValue";
let DecoderMethod = "DecodeVecShiftL64Imm";
let ParserMatchClass = Imm0_63Operand;
// Crazy immediate formats used by 32-bit and 64-bit logical immediate
// instructions for splatting repeating bit patterns across the immediate.
//
// The two SDNodeXForms below re-encode a matched constant with
// AArch64_AM::encodeLogicalImmediate (register size 32 or 64) and emit the
// encoding as an i32 target constant. The gi_* definitions name the custom
// operand renderers renderLogicalImm32 / renderLogicalImm64.
def logical_imm32_XFORM : SDNodeXForm<imm, [{
uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 32);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
def logical_imm64_XFORM : SDNodeXForm<imm, [{
uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 64);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
def gi_logical_imm32_XFORM : GICustomOperandRenderer<"renderLogicalImm32">,
def gi_logical_imm64_XFORM : GICustomOperandRenderer<"renderLogicalImm64">,
// Asm operand classes for logical immediates. The plain and "Not" variants
// share the isLogicalImm<T> predicate and differ only in the render method
// (addLogicalImmOperands vs. addLogicalImmNotOperands).
let DiagnosticType = "LogicalSecondSource" in {
def LogicalImm32Operand : AsmOperandClass {
let Name = "LogicalImm32";
let PredicateMethod = "isLogicalImm<int32_t>";
let RenderMethod = "addLogicalImmOperands<int32_t>";
def LogicalImm64Operand : AsmOperandClass {
let Name = "LogicalImm64";
let PredicateMethod = "isLogicalImm<int64_t>";
let RenderMethod = "addLogicalImmOperands<int64_t>";
def LogicalImm32NotOperand : AsmOperandClass {
let Name = "LogicalImm32Not";
let PredicateMethod = "isLogicalImm<int32_t>";
let RenderMethod = "addLogicalImmNotOperands<int32_t>";
def LogicalImm64NotOperand : AsmOperandClass {
let Name = "LogicalImm64Not";
let PredicateMethod = "isLogicalImm<int64_t>";
let RenderMethod = "addLogicalImmNotOperands<int64_t>";
// logical_imm32 / logical_imm64 - True if AArch64_AM::isLogicalImmediate
// accepts the zero-extended value for the given register size; the matched
// value is transformed with the corresponding logical_immNN_XFORM.
def logical_imm32 : Operand<i32>, IntImmLeaf<i32, [{
return AArch64_AM::isLogicalImmediate(Imm.getZExtValue(), 32);
}], logical_imm32_XFORM> {
let PrintMethod = "printLogicalImm<int32_t>";
let ParserMatchClass = LogicalImm32Operand;
def logical_imm64 : Operand<i64>, IntImmLeaf<i64, [{
return AArch64_AM::isLogicalImmediate(Imm.getZExtValue(), 64);
}], logical_imm64_XFORM> {
let PrintMethod = "printLogicalImm<int64_t>";
let ParserMatchClass = LogicalImm64Operand;
// Operands parsed with the "Not" operand classes; no selection predicate is
// attached here.
def logical_imm32_not : Operand<i32> {
let ParserMatchClass = LogicalImm32NotOperand;
def logical_imm64_not : Operand<i64> {
let ParserMatchClass = LogicalImm64NotOperand;
// iXX_imm0_65535 predicates - True if the immediate is in the range [0,65535].
// Both operands are TImmLeaf (TargetConstant) predicates, share the
// AsmImmRange<0, 65535> parser class and are printed with printImmHex. The
// unsigned cast makes negative values fail the comparison.
let ParserMatchClass = AsmImmRange<0, 65535>, PrintMethod = "printImmHex" in {
def i32_imm0_65535 : Operand<i32>, TImmLeaf<i32, [{
return ((uint32_t)Imm) < 65536;
def i64_imm0_65535 : Operand<i64>, TImmLeaf<i64, [{
return ((uint64_t)Imm) < 65536;
// imm0_255 predicate - True if the immediate is in the range [0,255].
def Imm0_255Operand : AsmImmRange<0,255>;
def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 256;
}]> {
let ParserMatchClass = Imm0_255Operand;
let PrintMethod = "printImm";
// imm0_127 predicate - True if the immediate is in the range [0,127]
def Imm0_127Operand : AsmImmRange<0, 127>;
def imm0_127 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 128;
}]> {
let ParserMatchClass = Imm0_127Operand;
let PrintMethod = "printImm";
// imm0_127_64b predicate - i64 variant of imm0_127; same range, parser class
// and print method.
def imm0_127_64b : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 128;
}]> {
let ParserMatchClass = Imm0_127Operand;
let PrintMethod = "printImm";
// NOTE: These imm0_N operands have to be of type i64 because i64 is the size
// for all shift-amounts.
// All predicates below cast the immediate to an unsigned type before the
// comparison, so negative values are rejected.
// imm0_63 predicate - True if the immediate is in the range [0,63]
def imm0_63 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 64;
}]> {
let ParserMatchClass = Imm0_63Operand;
// imm0_31 predicate - True if the immediate is in the range [0,31]
def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 32;
}]> {
let ParserMatchClass = Imm0_31Operand;
// timm0_31 predicate - same as imm0_31, but use TargetConstant (TImmLeaf)
// instead of Constant (ImmLeaf)
def timm0_31 : Operand<i64>, TImmLeaf<i64, [{
return ((uint64_t)Imm) < 32;
}]> {
let ParserMatchClass = Imm0_31Operand;
// imm32_0_31 predicate - True if the 32-bit immediate is in the range [0,31]
def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{
return ((uint64_t)Imm) < 32;
}]> {
let ParserMatchClass = Imm0_31Operand;
// imm0_1 predicate - True if the immediate is in the range [0,1]
def imm0_1 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 2;
}]> {
let ParserMatchClass = Imm0_1Operand;
// imm0_15 predicate - True if the immediate is in the range [0,15]
def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 16;
}]> {
let ParserMatchClass = Imm0_15Operand;
// imm0_7 predicate - True if the immediate is in the range [0,7]
def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
return ((uint64_t)Imm) < 8;
}]> {
let ParserMatchClass = Imm0_7Operand;
// imm32_0_7 predicate - True if the 32-bit immediate is in the range [0,7].
// Unlike its neighbours this one is a TImmLeaf (TargetConstant).
def imm32_0_7 : Operand<i32>, TImmLeaf<i32, [{
return ((uint32_t)Imm) < 8;
}]> {
let ParserMatchClass = Imm0_7Operand;
// imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15]
def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 16;
}]> {
let ParserMatchClass = Imm0_15Operand;
// An arithmetic shifter operand:
// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
// {5-0} - imm6
// The parser class is looked up by name: "ArithmeticShifterOperand" # width.
class arith_shift<ValueType Ty, int width> : Operand<Ty> {
let PrintMethod = "printShifter";
let ParserMatchClass = !cast<AsmOperandClass>(
"ArithmeticShifterOperand" # width);
def arith_shift32 : arith_shift<i32, 32>;
def arith_shift64 : arith_shift<i64, 64>;
// A register together with an arithmetic shift. Matched as a two-result
// complex pattern by SelectArithShiftedRegister; the machine operands are the
// register class followed by the arith_shift operand of the same width.
class arith_shifted_reg<ValueType Ty, RegisterClass regclass, int width>
: Operand<Ty>,
ComplexPattern<Ty, 2, "SelectArithShiftedRegister", []> {
let PrintMethod = "printShiftedRegister";
let MIOperandInfo = (ops regclass, !cast<Operand>("arith_shift" # width));
def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32, 32>;
def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64, 64>;
// GlobalISel matchers for the same operands (selectArithShiftedRegister).
def gi_arith_shifted_reg32 :
GIComplexOperandMatcher<s32, "selectArithShiftedRegister">,
def gi_arith_shifted_reg64 :
GIComplexOperandMatcher<s64, "selectArithShiftedRegister">,
// A logical shifter operand:
// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror
// {5-0} - imm6
// The parser class is looked up by name: "LogicalShifterOperand" # width.
class logical_shift<int width> : Operand<i32> {
let PrintMethod = "printShifter";
let ParserMatchClass = !cast<AsmOperandClass>(
"LogicalShifterOperand" # width);
def logical_shift32 : logical_shift<32>;
def logical_shift64 : logical_shift<64>;
// A register together with a logical shift. Matched as a two-result complex
// pattern by SelectLogicalShiftedRegister; the machine operands are the
// register class followed by the given shift operand.
class logical_shifted_reg<ValueType Ty, RegisterClass regclass, Operand shiftop>
: Operand<Ty>,
ComplexPattern<Ty, 2, "SelectLogicalShiftedRegister", []> {
let PrintMethod = "printShiftedRegister";
let MIOperandInfo = (ops regclass, shiftop);
def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32, logical_shift32>;
def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64, logical_shift64>;
// GlobalISel matchers for the same operands (selectLogicalShiftedRegister).
def gi_logical_shifted_reg32 :
GIComplexOperandMatcher<s32, "selectLogicalShiftedRegister">,
def gi_logical_shifted_reg64 :
GIComplexOperandMatcher<s64, "selectLogicalShiftedRegister">,
// A logical vector shifter operand:
// {7-6} - shift type: 00 = lsl
// {5-0} - imm6: #0, #8, #16, or #24
// Printed with printShifter and encoded with getVecShifterOpValue.
def logical_vec_shift : Operand<i32> {
let PrintMethod = "printShifter";
let EncoderMethod = "getVecShifterOpValue";
let ParserMatchClass = LogicalVecShifterOperand;
// A logical vector half-word shifter operand:
// {7-6} - shift type: 00 = lsl
// {5-0} - imm6: #0 or #8
// Shares the print and encoder methods of logical_vec_shift.
def logical_vec_hw_shift : Operand<i32> {
let PrintMethod = "printShifter";
let EncoderMethod = "getVecShifterOpValue";
let ParserMatchClass = LogicalVecHalfWordShifterOperand;
// A vector move shifter operand:
// {0} - imm1: #8 or #16
// Encoded with its own method, getMoveVecShifterOpValue.
def move_vec_shift : Operand<i32> {
let PrintMethod = "printShifter";
let EncoderMethod = "getMoveVecShifterOpValue";
let ParserMatchClass = MoveVecShifterOperand;
// Asm operand classes for ADD/SUB immediates. Both are parsed with
// tryParseImmWithOptionalShift; the "Neg" variant renders through
// addImmNegWithOptionalShiftOperands<12> instead of
// addImmWithOptionalShiftOperands<12>.
let DiagnosticType = "AddSubSecondSource" in {
def AddSubImmOperand : AsmOperandClass {
let Name = "AddSubImm";
let ParserMethod = "tryParseImmWithOptionalShift";
let RenderMethod = "addImmWithOptionalShiftOperands<12>";
def AddSubImmNegOperand : AsmOperandClass {
let Name = "AddSubImmNeg";
let ParserMethod = "tryParseImmWithOptionalShift";
let RenderMethod = "addImmNegWithOptionalShiftOperands<12>";
// An ADD/SUB immediate shifter operand:
// second operand:
// {7-6} - shift type: 00 = lsl
// {5-0} - imm6: #0 or #12
// Selected by the SelectArithImmed complex pattern (root: imm); consists of
// two i32 machine operands.
class addsub_shifted_imm<ValueType Ty>
: Operand<Ty>, ComplexPattern<Ty, 2, "SelectArithImmed", [imm]> {
let PrintMethod = "printAddSubImm";
let EncoderMethod = "getAddSubImmOpValue";
let ParserMatchClass = AddSubImmOperand;
let MIOperandInfo = (ops i32imm, i32imm);
// Assembler-only counterpart using AddSubImmNegOperand: it has no selection
// pattern and no PrintMethod, only the encoder and the parser class.
class addsub_shifted_imm_neg<ValueType Ty>
: Operand<Ty> {
let EncoderMethod = "getAddSubImmOpValue";
let ParserMatchClass = AddSubImmNegOperand;
let MIOperandInfo = (ops i32imm, i32imm);
def addsub_shifted_imm32 : addsub_shifted_imm<i32>;
def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
def addsub_shifted_imm32_neg : addsub_shifted_imm_neg<i32>;
def addsub_shifted_imm64_neg : addsub_shifted_imm_neg<i64>;
// GlobalISel matchers for addsub_shifted_imm (selectArithImmed).
def gi_addsub_shifted_imm32 :
GIComplexOperandMatcher<s32, "selectArithImmed">,
def gi_addsub_shifted_imm64 :
GIComplexOperandMatcher<s64, "selectArithImmed">,
// Same operand layout as addsub_shifted_imm, but selected by
// SelectNegArithImmed.
class neg_addsub_shifted_imm<ValueType Ty>
: Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
let PrintMethod = "printAddSubImm";
let EncoderMethod = "getAddSubImmOpValue";
let ParserMatchClass = AddSubImmOperand;
let MIOperandInfo = (ops i32imm, i32imm);
def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>;
def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>;
// GlobalISel matchers for neg_addsub_shifted_imm (selectNegArithImmed).
def gi_neg_addsub_shifted_imm32 :
GIComplexOperandMatcher<s32, "selectNegArithImmed">,
def gi_neg_addsub_shifted_imm64 :
GIComplexOperandMatcher<s64, "selectNegArithImmed">,
// An extend operand:
// {5-3} - extend type
// {2-0} - imm3
// arith_extend and arith_extend64 differ only in their parser class
// (ExtendOperand vs. ExtendOperand64).
def arith_extend : Operand<i32> {
let PrintMethod = "printArithExtend";
let ParserMatchClass = ExtendOperand;
def arith_extend64 : Operand<i32> {
let PrintMethod = "printArithExtend";
let ParserMatchClass = ExtendOperand64;
// 'extend' that's a lsl of a 64-bit register.
def arith_extendlsl64 : Operand<i32> {
let PrintMethod = "printArithExtend";
let ParserMatchClass = ExtendOperandLSL64;
// A GPR32 register with an extend, matched as a two-result complex pattern by
// SelectArithExtendedRegister. The "32to64" class pairs the GPR32 with
// arith_extend64 instead of arith_extend.
class arith_extended_reg32<ValueType Ty> : Operand<Ty>,
ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
let PrintMethod = "printExtendedRegister";
let MIOperandInfo = (ops GPR32, arith_extend);
class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
let PrintMethod = "printExtendedRegister";
let MIOperandInfo = (ops GPR32, arith_extend64);
// Instantiations, each followed by its GlobalISel matcher
// (selectArithExtendedRegister).
def arith_extended_reg32_i32 : arith_extended_reg32<i32>;
def gi_arith_extended_reg32_i32 :
GIComplexOperandMatcher<s32, "selectArithExtendedRegister">,
def arith_extended_reg32_i64 : arith_extended_reg32<i64>;
def gi_arith_extended_reg32_i64 :
GIComplexOperandMatcher<s64, "selectArithExtendedRegister">,
def arith_extended_reg32to64_i64 : arith_extended_reg32to64<i64>;
def gi_arith_extended_reg32to64_i64 :
GIComplexOperandMatcher<s64, "selectArithExtendedRegister">,
// Floating-point immediate.
// fpimm16/32/64 match an FP constant for which AArch64_AM::getFPnnImm does not
// return -1; the attached transform emits that function's result as an i32
// target constant.
def fpimm16 : Operand<f16>,
FPImmLeaf<f16, [{
return AArch64_AM::getFP16Imm(Imm) != -1;
}], SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = AArch64_AM::getFP16Imm(InVal);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = FPImmOperand;
let PrintMethod = "printFPImmOperand";
def fpimm32 : Operand<f32>,
FPImmLeaf<f32, [{
return AArch64_AM::getFP32Imm(Imm) != -1;
}], SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = AArch64_AM::getFP32Imm(InVal);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = FPImmOperand;
let PrintMethod = "printFPImmOperand";
def fpimm64 : Operand<f64>,
FPImmLeaf<f64, [{
return AArch64_AM::getFP64Imm(Imm) != -1;
}], SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = AArch64_AM::getFP64Imm(InVal);
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = FPImmOperand;
let PrintMethod = "printFPImmOperand";
// fpimm8 - i32 operand that reuses the FP immediate parser class and print
// method; no selection predicate is attached.
def fpimm8 : Operand<i32> {
let ParserMatchClass = FPImmOperand;
let PrintMethod = "printFPImmOperand";
// fpimm0 - True if the FP constant (of any FP type) is exactly +0.0.
def fpimm0 : FPImmLeaf<fAny, [{
return Imm.isExactlyValue(+0.0);
// Vector lane operands
// Asm operand class for a lane index in [Min, Max]. The name, diagnostic type
// and predicate ("isVectorIndex<Min, Max>") are all derived from the bounds
// and the optional NamePrefix.
class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass {
let Name = NamePrefix # "IndexRange" # Min # "_" # Max;
let DiagnosticType = "Invalid" # Name;
let PredicateMethod = "isVectorIndex<" # Min # ", " # Max # ">";
let RenderMethod = "addVectorIndexOperands";
// Operand of type ty parsed with operand class mc and printed with
// printVectorIndex.
class AsmVectorIndexOpnd<ValueType ty, AsmOperandClass mc>
: Operand<ty> {
let ParserMatchClass = mc;
let PrintMethod = "printVectorIndex";
// Defines two operands sharing the predicate pred: the unsuffixed one is an
// ImmLeaf (Constant), the "_timm" one a TImmLeaf (TargetConstant).
multiclass VectorIndex<ValueType ty, AsmOperandClass mc, code pred> {
def "" : AsmVectorIndexOpnd<ty, mc>, ImmLeaf<ty, pred>;
def _timm : AsmVectorIndexOpnd<ty, mc>, TImmLeaf<ty, pred>;
def VectorIndex1Operand : AsmVectorIndex<1, 1>;
def VectorIndexBOperand : AsmVectorIndex<0, 15>;
def VectorIndexHOperand : AsmVectorIndex<0, 7>;
def VectorIndexSOperand : AsmVectorIndex<0, 3>;
def VectorIndexDOperand : AsmVectorIndex<0, 1>;
// i64 lane-index operands: exactly 1, or [0,15] / [0,7] / [0,3] / [0,1] for
// the B/H/S/D variants.
defm VectorIndex1 : VectorIndex<i64, VectorIndex1Operand,
[{ return ((uint64_t)Imm) == 1; }]>;
defm VectorIndexB : VectorIndex<i64, VectorIndexBOperand,
[{ return ((uint64_t)Imm) < 16; }]>;
defm VectorIndexH : VectorIndex<i64, VectorIndexHOperand,
[{ return ((uint64_t)Imm) < 8; }]>;
defm VectorIndexS : VectorIndex<i64, VectorIndexSOperand,
[{ return ((uint64_t)Imm) < 4; }]>;
defm VectorIndexD : VectorIndex<i64, VectorIndexDOperand,
[{ return ((uint64_t)Imm) < 2; }]>;
// i32 variants of the operands above ("32b" suffix), with the same ranges and
// operand classes.
defm VectorIndex132b : VectorIndex<i32, VectorIndex1Operand,
[{ return ((uint64_t)Imm) == 1; }]>;
defm VectorIndexB32b : VectorIndex<i32, VectorIndexBOperand,
[{ return ((uint64_t)Imm) < 16; }]>;
defm VectorIndexH32b : VectorIndex<i32, VectorIndexHOperand,
[{ return ((uint64_t)Imm) < 8; }]>;
defm VectorIndexS32b : VectorIndex<i32, VectorIndexSOperand,
[{ return ((uint64_t)Imm) < 4; }]>;
defm VectorIndexD32b : VectorIndex<i32, VectorIndexDOperand,
[{ return ((uint64_t)Imm) < 2; }]>;
// SVE element-index operand classes (name prefix "SVE") with upper bounds
// 63/31/15/7/3 for the B/H/S/D/Q variants, and the i64 operands using them.
def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">;
def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">;
def SVEVectorIndexExtDupSOperand : AsmVectorIndex<0, 15, "SVE">;
def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">;
def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">;
defm sve_elm_idx_extdup_b
: VectorIndex<i64, SVEVectorIndexExtDupBOperand,
[{ return ((uint64_t)Imm) < 64; }]>;
defm sve_elm_idx_extdup_h
: VectorIndex<i64, SVEVectorIndexExtDupHOperand,
[{ return ((uint64_t)Imm) < 32; }]>;
defm sve_elm_idx_extdup_s
: VectorIndex<i64, SVEVectorIndexExtDupSOperand,
[{ return ((uint64_t)Imm) < 16; }]>;
defm sve_elm_idx_extdup_d
: VectorIndex<i64, SVEVectorIndexExtDupDOperand,
[{ return ((uint64_t)Imm) < 8; }]>;
defm sve_elm_idx_extdup_q
: VectorIndex<i64, SVEVectorIndexExtDupQOperand,
[{ return ((uint64_t)Imm) < 4; }]>;
// 8-bit immediate for AdvSIMD where 64-bit values of the form:
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
// are encoded as the eight bit value 'abcdefgh'.
// The predicate is AArch64_AM::isAdvSIMDModImmType10 on an f64 constant; the
// transform emits AArch64_AM::encodeAdvSIMDModImmType10's result as an i32
// target constant.
def simdimmtype10 : Operand<i32>,
FPImmLeaf<f64, [{
return AArch64_AM::isAdvSIMDModImmType10(
}], SDNodeXForm<fpimm, [{
APFloat InVal = N->getValueAPF();
uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF()
return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = SIMDImmType10Operand;
let PrintMethod = "printSIMDType10Operand";
// System management
// Base encoding for system instruction operands.
// Fixes Inst{31-22} to 0b1101010100 and places the L bit in Inst{21}.
// Defaults to no load/store but with side effects.
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands,
list<dag> pattern = []>
: I<oops, iops, asm, operands, "", pattern> {
let Inst{31-22} = 0b1101010100;
let Inst{21} = L;
// System instructions which do not have an Rt register.
// No outputs; the Rt field Inst{4-0} is fixed to 0b11111.
class SimpleSystemI<bit L, dag iops, string asm, string operands,
list<dag> pattern = []>
: BaseSystemI<L, (outs), iops, asm, operands, pattern> {
let Inst{4-0} = 0b11111;
// System instructions which have an Rt register.
// Rt is encoded in Inst{4-0}; scheduled as WriteSys.
class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
: BaseSystemI<L, oops, iops, asm, operands>,
Sched<[WriteSys]> {
bits<5> Rt;
let Inst{4-0} = Rt;
// System instructions for transactional memory extension
// Fixes Inst{20-12} to 0b000110011, encodes CRm in Inst{11-8} and op2 in
// Inst{7-5}. Marked as both loading and storing; DecoderMethod is cleared.
class TMBaseSystemI<bit L, bits<4> CRm, bits<3> op2, dag oops, dag iops,
string asm, string operands, list<dag> pattern>
: BaseSystemI<L, oops, iops, asm, operands, pattern>,
Sched<[WriteSys]> {
let Inst{20-12} = 0b000110011;
let Inst{11-8} = CRm;
let Inst{7-5} = op2;
let DecoderMethod = "";
let mayLoad = 1;
let mayStore = 1;
// System instructions for transactional memory - single input operand
// L = 1, op2 = 0b011; $Rt is a GPR64 output encoded in Inst{4-0}.
class TMSystemI<bits<4> CRm, string asm, list<dag> pattern>
: TMBaseSystemI<0b1, CRm, 0b011,
(outs GPR64:$Rt), (ins), asm, "\t$Rt", pattern> {
bits<5> Rt;
let Inst{4-0} = Rt;
// System instructions for transactional memory - no operand
// L = 0, op2 = 0b011; Inst{4-0} is fixed to 0b11111.
class TMSystemINoOperand<bits<4> CRm, string asm, list<dag> pattern>
: TMBaseSystemI<0b0, CRm, 0b011, (outs), (ins), asm, "", pattern> {
let Inst{4-0} = 0b11111;
// System instructions for exit from transactions
// Not derived from BaseSystemI: Inst{31-24} = 0b11010100, op1 in Inst{23-21},
// a 16-bit immediate in Inst{20-5}, and Inst{4-0} = 0.
class TMSystemException<bits<3> op1, string asm, list<dag> pattern>
: I<(outs), (ins i64_imm0_65535:$imm), asm, "\t$imm", "", pattern>,
Sched<[WriteSys]> {
bits<16> imm;
let Inst{31-24} = 0b11010100;
let Inst{23-21} = op1;
let Inst{20-5} = imm;
let Inst{4-0} = 0b00000;
// Hint instructions that take both a CRm and a 3-bit immediate.
// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
// model patterns with sufficiently fine granularity
// The operand is a single 7-bit immediate (imm0_127) placed in Inst{11-5}; the
// selection pattern is the int_aarch64_hint intrinsic.
let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in
class HintI<string mnemonic>
: SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#"\t$imm", "",
[(int_aarch64_hint imm0_127:$imm)]>,
Sched<[WriteHint]> {
bits <7> imm;
let Inst{20-12} = 0b000110010;
let Inst{11-5} = imm;
// System instructions taking a single literal operand which encodes into
// CRm. op2 differentiates the opcodes.
// Barrier operand: parsed with tryParseBarrierOperand and printed with
// printBarrierOption.
def BarrierAsmOperand : AsmOperandClass {
let Name = "Barrier";
let ParserMethod = "tryParseBarrierOperand";
def barrier_op : Operand<i32> {
let PrintMethod = "printBarrierOption";
let ParserMatchClass = BarrierAsmOperand;
// Inst{20-12} = 0b000110011, CRm in Inst{11-8}, opc in Inst{7-5}; scheduled as
// WriteBarrier.
class CRmSystemI<Operand crmtype, bits<3> opc, string asm,
list<dag> pattern = []>
: SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm", pattern>,
Sched<[WriteBarrier]> {
bits<4> CRm;
let Inst{20-12} = 0b000110011;
let Inst{11-8} = CRm;
let Inst{7-5} = opc;
// Operand-less system instruction: CRm defaults to 0b0011, op2 goes in
// Inst{7-5}, and the remaining bits are fixed. The Sched list is empty.
class SystemNoOperands<bits<3> op2, string asm, list<dag> pattern = []>
: SimpleSystemI<0, (ins), asm, "", pattern>,
Sched<[]> {
bits<4> CRm;
let CRm = 0b0011;
let Inst{31-12} = 0b11010101000000110010;
let Inst{11-8} = CRm;
let Inst{7-5} = op2;
let Inst{4-0} = 0b11111;
// MRS/MSR system instructions. These have different operand classes because
// a different subset of registers can be accessed through each instruction.
// Both operand classes are parsed with tryParseSysReg; they differ in name and
// diagnostic type.
def MRSSystemRegisterOperand : AsmOperandClass {
let Name = "MRSSystemRegister";
let ParserMethod = "tryParseSysReg";
let DiagnosticType = "MRS";
// concatenation of op0, op1, CRn, CRm, op2. 16-bit immediate.
def mrs_sysreg_op : Operand<i32> {
let ParserMatchClass = MRSSystemRegisterOperand;
let DecoderMethod = "DecodeMRSSystemRegister";
let PrintMethod = "printMRSSystemRegister";
def MSRSystemRegisterOperand : AsmOperandClass {
let Name = "MSRSystemRegister";
let ParserMethod = "tryParseSysReg";
let DiagnosticType = "MSR";
// MSR counterpart of mrs_sysreg_op, with its own decoder and print methods.
def msr_sysreg_op : Operand<i32> {
let ParserMatchClass = MSRSystemRegisterOperand;
let DecoderMethod = "DecodeMSRSystemRegister";
let PrintMethod = "printMSRSystemRegister";
// PSB hint operand: parsed with tryParsePSBHint and printed with
// printPSBHintOp. The MCOperandPredicate accepts only immediates for which
// AArch64PSBHint::lookupPSBByEncoding finds an entry.
def PSBHintOperand : AsmOperandClass {
let Name = "PSBHint";
let ParserMethod = "tryParsePSBHint";
def psbhint_op : Operand<i32> {
let ParserMatchClass = PSBHintOperand;
let PrintMethod = "printPSBHintOp";
let MCOperandPredicate = [{
// Check, if operand is valid, to fix exhaustive aliasing in disassembly.
// "psb" is an alias to "hint" only for certain values of CRm:Op2 fields.
if (!MCOp.isImm())
return false;
return AArch64PSBHint::lookupPSBByEncoding(MCOp.getImm()) != nullptr;
// BTI hint operand: parsed with tryParseBTIHint and printed with
// printBTIHintOp. The MCOperandPredicate XORs the immediate with 32 and shifts
// it right by one before calling AArch64BTIHint::lookupBTIByEncoding.
def BTIHintOperand : AsmOperandClass {
let Name = "BTIHint";
let ParserMethod = "tryParseBTIHint";
def btihint_op : Operand<i32> {
let ParserMatchClass = BTIHintOperand;
let PrintMethod = "printBTIHintOp";
let MCOperandPredicate = [{
// "bti" is an alias to "hint" only for certain values of CRm:Op2 fields.
if (!MCOp.isImm())
return false;
return AArch64BTIHint::lookupBTIByEncoding((MCOp.getImm() ^ 32) >> 1) != nullptr;
// MRS: read system register $systemreg into GPR64 $Rt (L = 1). The 16-bit
// system register encoding occupies Inst{20-5}.
class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
"mrs", "\t$Rt, $systemreg"> {
bits<16> systemreg;
let Inst{20-5} = systemreg;
// FIXME: Some of these def NZCV, others don't. Best way to model that?
// Explicitly modeling each of the system register as a register class
// would do it, but feels like overkill at this point.
// MSR: write GPR64 $Rt to system register $systemreg (L = 0), same field
// layout as MRSI.
class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt),
"msr", "\t$systemreg, $Rt"> {
bits<16> systemreg;
let Inst{20-5} = systemreg;
// PSTATE field operand for the form that takes a 4-bit immediate (0-15).
// Parsed with tryParseSysReg and printed with printSystemPStateField.
def SystemPStateFieldWithImm0_15Operand : AsmOperandClass {
let Name = "SystemPStateFieldWithImm0_15";
let ParserMethod = "tryParseSysReg";
def pstatefield4_op : Operand<i32> {
let ParserMatchClass = SystemPStateFieldWithImm0_15Operand;
let PrintMethod = "printSystemPStateField";
// Instructions to modify PSTATE, no input reg
// Defines NZCV; fixes Inst{20-19} to 0b00 and Inst{15-12} to 0b0100.
let Defs = [NZCV] in
class PstateWriteSimple<dag iops, string asm, string operands>
: SimpleSystemI<0, iops, asm, operands> {
let Inst{20-19} = 0b00;
let Inst{15-12} = 0b0100;
// "msr <pstatefield>, #imm" with a 4-bit immediate in Inst{11-8}. The 6-bit
// pstatefield is split: bits {5-3} go to Inst{18-16}, bits {2-0} to Inst{7-5}.
class MSRpstateImm0_15
: PstateWriteSimple<(ins pstatefield4_op:$pstatefield, imm0_15:$imm), "msr",
"\t$pstatefield, $imm">,
Sched<[WriteSys]> {
bits<6> pstatefield;
bits<4> imm;
let Inst{18-16} = pstatefield{5-3};
let Inst{11-8} = imm;
let Inst{7-5} = pstatefield{2-0};
let DecoderMethod = "DecodeSystemPStateInstruction";
// MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns
// Fail the decoder should attempt to decode the instruction as MSRI.
let hasCompleteDecoder = 0;
// PSTATE field operand for the form that takes a 1-bit immediate (0-1).
def SystemPStateFieldWithImm0_1Operand : AsmOperandClass {
let Name = "SystemPStateFieldWithImm0_1";
let ParserMethod = "tryParseSysReg";
def pstatefield1_op : Operand<i32> {
let ParserMatchClass = SystemPStateFieldWithImm0_1Operand;
let PrintMethod = "printSystemPStateField";
// "msr <pstatefield>, #imm" with a 1-bit immediate in Inst{8}; Inst{11-9} is
// fixed to 0b000. The pstatefield split matches MSRpstateImm0_15.
class MSRpstateImm0_1
: PstateWriteSimple<(ins pstatefield1_op:$pstatefield, imm0_1:$imm), "msr",
"\t$pstatefield, $imm">,
Sched<[WriteSys]> {
bits<6> pstatefield;
bit imm;
let Inst{18-16} = pstatefield{5-3};
let Inst{11-9} = 0b000;
let Inst{8} = imm;
let Inst{7-5} = pstatefield{2-0};
let DecoderMethod = "DecodeSystemPStateInstruction";
// MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns
// Fail the decoder should attempt to decode the instruction as MSRI.
let hasCompleteDecoder = 0;
// SYS and SYSL generic system instructions.
// Cn/Cm operand: parsed with tryParseSysCROperand and printed with
// printSysCROperand.
def SysCRAsmOperand : AsmOperandClass {
let Name = "SysCR";
let ParserMethod = "tryParseSysCROperand";
def sys_cr_op : Operand<i32> {
let PrintMethod = "printSysCROperand";
let ParserMatchClass = SysCRAsmOperand;
// System instruction with $Rt as the last assembly operand. Fields:
// Inst{20-19} = 0b01, op1 in {18-16}, Cn in {15-12}, Cm in {11-8},
// op2 in {7-5}.
class SystemXtI<bit L, string asm>
: RtSystemI<L, (outs),
(ins imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2, GPR64:$Rt),
asm, "\t$op1, $Cn, $Cm, $op2, $Rt"> {
bits<3> op1;
bits<4> Cn;
bits<4> Cm;
bits<3> op2;
let Inst{20-19} = 0b01;
let Inst{18-16} = op1;
let Inst{15-12} = Cn;
let Inst{11-8} = Cm;
let Inst{7-5} = op2;
// Same encoding layout as SystemXtI, but $Rt is the first assembly operand.
// Note that $Rt is still listed in the (ins) dag here, with empty (outs).
class SystemLXtI<bit L, string asm>
: RtSystemI<L, (outs),
(ins GPR64:$Rt, imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2),
asm, "\t$Rt, $op1, $Cn, $Cm, $op2"> {
bits<3> op1;
bits<4> Cn;
bits<4> Cm;
bits<3> op2;
let Inst{20-19} = 0b01;
let Inst{18-16} = op1;
let Inst{15-12} = Cn;
let Inst{11-8} = Cm;
let Inst{7-5} = op2;
// Branch (register) instructions:
// case opc of
// 0001 blr
// 0000 br
// 0101 dret
// 0100 eret
// 0010 ret
// otherwise UNDEFINED
// Common encoding: Inst{31-25} = 0b1101011, opc in Inst{24-21}, and fixed
// values in Inst{20-10} and Inst{4-0}. Inst{9-5} is left to subclasses.
class BaseBranchReg<bits<4> opc, dag oops, dag iops, string asm,
string operands, list<dag> pattern>
: I<oops, iops, asm, operands, "", pattern>, Sched<[WriteBrReg]> {
let Inst{31-25} = 0b1101011;
let Inst{24-21} = opc;
let Inst{20-16} = 0b11111;
let Inst{15-10} = 0b000000;
let Inst{4-0} = 0b00000;
// Branch to the address in GPR64 $Rn, encoded in Inst{9-5}.
class BranchReg<bits<4> opc, string asm, list<dag> pattern>
: BaseBranchReg<opc, (outs), (ins GPR64:$Rn), asm, "\t$Rn", pattern> {
bits<5> Rn;
let Inst{9-5} = Rn;
// Operand-less return-like instruction: Inst{9-5} is fixed to 0b11111 and the
// instruction is flagged isReturn with side effects.
let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in
class SpecialReturn<bits<4> opc, string asm>
: BaseBranchReg<opc, (outs), (ins), asm, "", []> {
let Inst{9-5} = 0b11111;
// Load of register class RC from the address in $Rn, written "$Rt, [$Rn]".
// sz occupies Inst{31-30}, Rn Inst{9-5} and Rt Inst{4-0}; Inst{29-10} is
// fixed. No selection pattern and an empty Sched list.
let mayLoad = 1 in
class RCPCLoad<bits<2> sz, string asm, RegisterClass RC>
: I<(outs RC:$Rt), (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]", "", []>,
Sched<[]> {
bits<5> Rn;
bits<5> Rt;
let Inst{31-30} = sz;
let Inst{29-10} = 0b11100010111111110000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
// Common base for the Auth* instruction formats below. Sets isAuthenticated,
// places M in Inst{10}, and fixes Inst{31-25}, Inst{20-11} and (by default)
// Inst{4-0} = 0b11111.
class AuthBase<bits<1> M, dag oops, dag iops, string asm, string operands,
list<dag> pattern>
: I<oops, iops, asm, operands, "", pattern>, Sched<[]> {
let isAuthenticated = 1;
let Inst{31-25} = 0b1101011;
let Inst{20-11} = 0b1111100001;
let Inst{10} = M;
let Inst{4-0} = 0b11111;
// Two-register form: Rn in Inst{9-5} and Rm in Inst{4-0} (overriding the base
// default); Inst{24-22} = 0b100 and op in Inst{21}.
class AuthBranchTwoOperands<bits<1> op, bits<1> M, string asm>
: AuthBase<M, (outs), (ins GPR64:$Rn, GPR64sp:$Rm), asm, "\t$Rn, $Rm", []> {
bits<5> Rn;
bits<5> Rm;
let Inst{24-22} = 0b100;
let Inst{21} = op;
let Inst{9-5} = Rn;
let Inst{4-0} = Rm;
// One-register form: Rn in Inst{9-5}, Inst{24} = 0 and opc in Inst{23-21}.
class AuthOneOperand<bits<3> opc, bits<1> M, string asm>
: AuthBase<M, (outs), (ins GPR64:$Rn), asm, "\t$Rn", []> {
bits<5> Rn;
let Inst{24} = 0;
let Inst{23-21} = opc;
let Inst{9-5} = Rn;
// Operand-less form that uses LR and SP: Inst{24} = 0, op in Inst{23-21}, and
// Inst{9-0} all ones.
let Uses = [LR,SP] in
class AuthReturn<bits<3> op, bits<1> M, string asm>
: AuthBase<M, (outs), (ins), asm, "", []> {
let Inst{24} = 0;
let Inst{23-21} = op;
let Inst{9-0} = 0b1111111111;
// BaseAuthLoad: base encoding for the authenticated loads instantiated by the
// AuthLoad multiclass. The 10-bit `offset` is split across the encoding:
// offset{9} goes to bit 22 and offset{8-0} to bits 20-12. `M` is bit 23 and
// `W` is bit 11 (AuthLoad passes W=0 for "indexed" and W=1 for "writeback").
// Decoding is delegated to DecodeAuthLoadInstruction.
// NOTE(review): the `opr` template parameter is not referenced in the visible
// body; callers already use it inside `iops`.
let mayLoad = 1 in
class BaseAuthLoad<bit M, bit W, dag oops, dag iops, string asm,
string operands, string cstr, Operand opr>
: I<oops, iops, asm, operands, cstr, []>, Sched<[]> {
bits<10> offset;
bits<5> Rn;
bits<5> Rt;
let isAuthenticated = 1;
let Inst{31-24} = 0b11111000;
let Inst{23} = M;
let Inst{22} = offset{9};
let Inst{21} = 1;
let Inst{20-12} = offset{8-0};
let Inst{11} = W;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeAuthLoadInstruction";
// AuthLoad: emits two BaseAuthLoad instructions plus two assembler aliases.
//   <NAME>indexed   - W=0, syntax "[$Rn, $offset]".
//   <NAME>writeback - W=1, syntax "[$Rn, $offset]!"; adds a $wback output that
//                     is tied to $Rn and marked earlyclobber.
// The aliases accept the same syntaxes with the offset omitted and supply 0
// for it.
multiclass AuthLoad<bit M, string asm, Operand opr> {
def indexed : BaseAuthLoad<M, 0, (outs GPR64:$Rt),
(ins GPR64sp:$Rn, opr:$offset),
asm, "\t$Rt, [$Rn, $offset]", "", opr>;
def writeback : BaseAuthLoad<M, 1, (outs GPR64sp:$wback, GPR64:$Rt),
(ins GPR64sp:$Rn, opr:$offset),
asm, "\t$Rt, [$Rn, $offset]!",
"$Rn = $wback,@earlyclobber $wback", opr>;
// Offset-less spelling of the indexed form.
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "indexed") GPR64:$Rt, GPR64sp:$Rn, 0)>;
// Offset-less spelling of the writeback form.
def : InstAlias<asm # "\t$Rt, [$wback]!",
(!cast<Instruction>(NAME # "writeback") GPR64sp:$wback, GPR64:$Rt, 0), 0>;
// Conditional branch instruction.
// Condition code.
// 4-bit immediate. Pretty-printed as <cc>
// Parsed through the CondCode match class and printed with printCondCode.
def ccode : Operand<i32> {
let PrintMethod = "printCondCode";
let ParserMatchClass = CondCode;
// inv_ccode: condition-code operand printed with printInverseCondCode. It
// shares the CondCode parser class with `ccode`.
// NOTE(review): the terminator of the MCOperandPredicate code fragment is not
// present in this copy - confirm against upstream.
def inv_ccode : Operand<i32> {
// AL and NV are invalid in the aliases which use inv_ccode
let PrintMethod = "printInverseCondCode";
let ParserMatchClass = CondCode;
// Accept only immediate operands whose value is neither AL nor NV.
let MCOperandPredicate = [{
return MCOp.isImm() &&
MCOp.getImm() != AArch64CC::AL &&
MCOp.getImm() != AArch64CC::NV;
// Conditional branch target. 19-bit immediate. The low two bits of the target
// offset are implied zero and so are not part of the immediate.
// PC-relative operand (OPERAND_PCREL) with dedicated encoder, decoder, printer
// and parser hooks; used by BranchCond and BaseCmpBranch.
def am_brcond : Operand<OtherVT> {
let EncoderMethod = "getCondBranchTargetOpValue";
let DecoderMethod = "DecodePCRelLabel19";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = PCRelLabel19Operand;
let OperandType = "OPERAND_PCREL";
// BranchCond: conditional branch printed as "b.<cond>\t<target>". Selected
// from AArch64brcond and reads NZCV. It is flagged as a branch and a
// terminator. Encoding: fixed bits 31-24, 19-bit target in bits 23-5, bit 4
// zero, 4-bit condition in bits 3-0.
class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target),
"b", ".$cond\t$target", "",
[(AArch64brcond bb:$target, imm:$cond, NZCV)]>,
Sched<[WriteBr]> {
let isBranch = 1;
let isTerminator = 1;
let Uses = [NZCV];
bits<4> cond;
bits<19> target;
let Inst{31-24} = 0b01010100;
let Inst{23-5} = target;
let Inst{4} = 0;
let Inst{3-0} = cond;
// Compare-and-branch instructions.
// BaseCmpBranch: takes a register $Rt and a 19-bit branch target and is
// selected from `node`. `op` goes in bit 24, the target in bits 23-5 and Rt
// in bits 4-0. Bit 31 is not set here; the CmpBranch multiclass sets it per
// register width.
class BaseCmpBranch<RegisterClass regtype, bit op, string asm, SDNode node>
: I<(outs), (ins regtype:$Rt, am_brcond:$target),
asm, "\t$Rt, $target", "",
[(node regtype:$Rt, bb:$target)]>,
Sched<[WriteBr]> {
let isBranch = 1;
let isTerminator = 1;
bits<5> Rt;
bits<19> target;
let Inst{30-25} = 0b011010;
let Inst{24} = op;
let Inst{23-5} = target;
let Inst{4-0} = Rt;
// CmpBranch: instantiates BaseCmpBranch twice - "W" on GPR32 with bit 31 = 0
// and "X" on GPR64 with bit 31 = 1.
multiclass CmpBranch<bit op, string asm, SDNode node> {
def W : BaseCmpBranch<GPR32, op, asm, node> {
let Inst{31} = 0;
def X : BaseCmpBranch<GPR64, op, asm, node> {
let Inst{31} = 1;
// Test-bit-and-branch instructions.
// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of
// the target offset are implied zero and so are not part of the immediate.
// PC-relative operand; unlike am_brcond it sets no DecoderMethod of its own
// (BaseTestBranch sets one at instruction level).
def am_tbrcond : Operand<OtherVT> {
let EncoderMethod = "getTestBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = BranchTarget14Operand;
let OperandType = "OPERAND_PCREL";
// AsmOperand classes to emit (or not) special diagnostics
// TBZImm0_31Operand: matches immediates in [0,31]; no DiagnosticType is set.
def TBZImm0_31Operand : AsmOperandClass {
let Name = "TBZImm0_31";
let PredicateMethod = "isImmInRange<0,31>";
let RenderMethod = "addImmOperands";
// TBZImm32_63Operand: matches immediates in [32,63] and reports mismatches
// with the "InvalidImm0_63" diagnostic. Its Name is "Imm32_63" (no TBZ
// prefix, unlike the class above).
def TBZImm32_63Operand : AsmOperandClass {
let Name = "Imm32_63";
let PredicateMethod = "isImmInRange<32,63>";
let DiagnosticType = "InvalidImm0_63";
let RenderMethod = "addImmOperands";
// tbz_imm0_31: i64 immediate operand whose ImmLeaf predicate accepts values
// that are below 32 after a cast to uint32_t. The assembler match class is
// supplied by the instantiation so the diagnostic behaviour can vary.
class tbz_imm0_31<AsmOperandClass matcher> : Operand<i64>, ImmLeaf<i64, [{
return (((uint32_t)Imm) < 32);
}]> {
let ParserMatchClass = matcher;
// Two instantiations of tbz_imm0_31 that differ only in the parser match
// class: Imm0_31Operand for "_diag" and TBZImm0_31Operand for "_nodiag".
def tbz_imm0_31_diag : tbz_imm0_31<Imm0_31Operand>;
def tbz_imm0_31_nodiag : tbz_imm0_31<TBZImm0_31Operand>;
// tbz_imm32_63: i64 immediate operand accepting values in 32..63 (after a
// cast to uint32_t); parsed through TBZImm32_63Operand.
def tbz_imm32_63 : Operand<i64>, ImmLeaf<i64, [{
return (((uint32_t)Imm) > 31) && (((uint32_t)Imm) < 64);
}]> {
let ParserMatchClass = TBZImm32_63Operand;
// BaseTestBranch: takes a register, a bit number and a 14-bit branch target.
// `bit_off` is declared 6 bits wide but only bit_off{4-0} is placed here, in
// bits 23-19. Bit 31 is set by the TestBranch multiclass. `op` goes in bit 24,
// the target in bits 18-5 and Rt in bits 4-0. Decoding uses
// DecodeTestAndBranch.
class BaseTestBranch<RegisterClass regtype, Operand immtype,
bit op, string asm, SDNode node>
: I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target),
asm, "\t$Rt, $bit_off, $target", "",
[(node regtype:$Rt, immtype:$bit_off, bb:$target)]>,
Sched<[WriteBr]> {
let isBranch = 1;
let isTerminator = 1;
bits<5> Rt;
bits<6> bit_off;
bits<14> target;
let Inst{30-25} = 0b011011;
let Inst{24} = op;
let Inst{23-19} = bit_off{4-0};
let Inst{18-5} = target;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeTestAndBranch";
// TestBranch: instantiates BaseTestBranch as "W" (GPR32, bit numbers 0-31,
// bit 31 = 0) and "X" (GPR64, bit numbers 32-63, bit 31 = 1). It also routes
// the "64-bit register with bit number 0-31" case to the W instruction, both
// in the assembler (InstAlias) and in instruction selection (Pat).
multiclass TestBranch<bit op, string asm, SDNode node> {
def W : BaseTestBranch<GPR32, tbz_imm0_31_diag, op, asm, node> {
let Inst{31} = 0;
def X : BaseTestBranch<GPR64, tbz_imm32_63, op, asm, node> {
let Inst{31} = 1;
// Alias X-reg with 0-31 imm to W-Reg.
def : InstAlias<asm # "\t$Rd, $imm, $target",
(!cast<Instruction>(NAME#"W") GPR32as64:$Rd,
tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>;
// Selection: test a low bit of a 64-bit value via its sub_32 subregister.
def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target),
(!cast<Instruction>(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32),
tbz_imm0_31_diag:$imm, bb:$target)>;
// Unconditional branch (immediate) instructions.
// am_b_target: PC-relative branch-target operand of value type OtherVT,
// parsed through BranchTarget26Operand. Used by BranchImm.
def am_b_target : Operand<OtherVT> {
let EncoderMethod = "getBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = BranchTarget26Operand;
let OperandType = "OPERAND_PCREL";
// am_bl_target: same hooks as am_b_target but with value type i64 instead of
// OtherVT. Used by CallImm.
def am_bl_target : Operand<i64> {
let EncoderMethod = "getBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = BranchTarget26Operand;
let OperandType = "OPERAND_PCREL";
// BImm: immediate-branch encoding. `op` goes in bit 31, bits 30-26 are fixed
// and the 26-bit `addr` is in bits 25-0. Decoded by DecodeUnconditionalBranch.
// The input dag is supplied by the subclass so the operand type can vary.
class BImm<bit op, dag iops, string asm, list<dag> pattern>
: I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> {
bits<26> addr;
let Inst{31} = op;
let Inst{30-26} = 0b00101;
let Inst{25-0} = addr;
let DecoderMethod = "DecodeUnconditionalBranch";
// BranchImm / CallImm: BImm specialisations that differ only in the target
// operand type (am_b_target vs am_bl_target).
class BranchImm<bit op, string asm, list<dag> pattern>
: BImm<op, (ins am_b_target:$addr), asm, pattern>;
class CallImm<bit op, string asm, list<dag> pattern>
: BImm<op, (ins am_bl_target:$addr), asm, pattern>;
// Basic one-operand data processing instructions.
// BaseOneOperandData: $Rd = node($Rn) on a single register class. Bits 30-13
// are fixed, `opc` goes in bits 12-10, Rn in bits 9-5 and Rd in bits 4-0.
// Bit 31 is left to the users, which set it per register width.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseOneOperandData<bits<3> opc, RegisterClass regtype, string asm,
SDPatternOperator node>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
[(set regtype:$Rd, (node regtype:$Rn))]>,
Sched<[WriteI, ReadI]> {
bits<5> Rd;
bits<5> Rn;
let Inst{30-13} = 0b101101011000000000;
let Inst{12-10} = opc;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// OneOperandData: emits "Wr" (GPR32, bit 31 = 0) and "Xr" (GPR64, bit 31 = 1)
// from BaseOneOperandData. `node` defaults to null_frag.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass OneOperandData<bits<3> opc, string asm,
SDPatternOperator node = null_frag> {
def Wr : BaseOneOperandData<opc, GPR32, asm, node> {
let Inst{31} = 0;
def Xr : BaseOneOperandData<opc, GPR64, asm, node> {
let Inst{31} = 1;
// Single-width variants of BaseOneOperandData: OneWRegData is GPR32 with
// bit 31 = 0, OneXRegData is GPR64 with bit 31 = 1.
class OneWRegData<bits<3> opc, string asm, SDPatternOperator node>
: BaseOneOperandData<opc, GPR32, asm, node> {
let Inst{31} = 0;
class OneXRegData<bits<3> opc, string asm, SDPatternOperator node>
: BaseOneOperandData<opc, GPR64, asm, node> {
let Inst{31} = 1;
// SignAuthOneData: GPR64 $Rd output and GPR64sp $Rn input. Bits 31-15 are
// fixed, `opcode_prefix` goes in bits 14-12, `opcode` in bits 11-10, Rn in
// bits 9-5 and Rd in bits 4-0.
// NOTE(review): the pattern argument and the closing of the I<...> template
// list appear to be missing between the two lines below in this copy (compare
// SignAuthZero, which passes `[]>,`) - confirm against upstream.
class SignAuthOneData<bits<3> opcode_prefix, bits<2> opcode, string asm>
: I<(outs GPR64:$Rd), (ins GPR64sp:$Rn), asm, "\t$Rd, $Rn", "",
Sched<[WriteI, ReadI]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-15} = 0b11011010110000010;
let Inst{14-12} = opcode_prefix;
let Inst{11-10} = opcode;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// SignAuthZero: same fixed bits and opcode fields as SignAuthOneData but with
// no input operand; bits 9-5 are hard-wired to 0b11111 instead of holding Rn.
class SignAuthZero<bits<3> opcode_prefix, bits<2> opcode, string asm>
: I<(outs GPR64:$Rd), (ins), asm, "\t$Rd", "", []>, Sched<[]> {
bits<5> Rd;
let Inst{31-15} = 0b11011010110000010;
let Inst{14-12} = opcode_prefix;
let Inst{11-10} = opcode;
let Inst{9-5} = 0b11111;
let Inst{4-0} = Rd;
// SignAuthTwoOperand: $Rd = OpNode($Rn, $Rm) with GPR64 $Rn and GPR64sp $Rm.
// Bits 31-21 are fixed, Rm goes in bits 20-16, bits 15-14 are zero, `opc` is
// in bits 13-10, Rn in bits 9-5 and Rd in bits 4-0.
class SignAuthTwoOperand<bits<4> opc, string asm,
SDPatternOperator OpNode>
: I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64sp:$Rm),
asm, "\t$Rd, $Rn, $Rm", "",
[(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64sp:$Rm))]>,
Sched<[WriteI, ReadI, ReadI]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31-21} = 0b10011010110;
let Inst{20-16} = Rm;
let Inst{15-14} = 0b00;
let Inst{13-10} = opc;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Base class for the Armv8.4-A 8 and 16-bit flag manipulation instructions
// Reads NZCV (Uses) and has no outputs and no pattern. `sf` goes in bit 31,
// `sz` in bit 14 and Rn in bits 9-5; the remaining bits are fixed. FlagRotate
// re-assigns several of the fixed ranges.
class BaseFlagManipulation<bit sf, bit sz, dag iops, string asm, string ops>
: I<(outs), iops, asm, ops, "", []>,
Sched<[WriteI, ReadI, ReadI]> {
let Uses = [NZCV];
bits<5> Rn;
let Inst{31} = sf;
let Inst{30-15} = 0b0111010000000000;
let Inst{14} = sz;
let Inst{13-10} = 0b0010;
let Inst{9-5} = Rn;
let Inst{4-0} = 0b01101;
// FlagRotate: BaseFlagManipulation with sf=1 and sz=0 that re-assigns parts
// of the base encoding: the 6-bit `imm` goes in bits 20-15, bits 13-10 become
// 0b0001, bit 4 is 0 and the 4-bit `mask` goes in bits 3-0.
class FlagRotate<dag iops, string asm, string ops>
: BaseFlagManipulation<0b1, 0b0, iops, asm, ops> {
bits<6> imm;
bits<4> mask;
let Inst{20-15} = imm;
let Inst{13-10} = 0b0001;
let Inst{4} = 0b0;
let Inst{3-0} = mask;
// Basic two-operand data processing instructions.
// BaseBaseAddSubCarry: three-register carry form that reads NZCV. `isSub`
// goes in bit 30, Rm in bits 20-16, Rn in bits 9-5 and Rd in bits 4-0.
// Bits 31 and 29 are not set here; the AddSubCarry multiclass sets them.
class BaseBaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
asm, "\t$Rd, $Rn, $Rm", "", pattern>,
Sched<[WriteI, ReadI, ReadI]> {
let Uses = [NZCV];
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{30} = isSub;
let Inst{28-21} = 0b11010000;
let Inst{20-16} = Rm;
let Inst{15-10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// BaseAddSubCarry: non-flag-setting carry form; its pattern feeds NZCV to
// OpNode as a third operand.
class BaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
SDNode OpNode>
: BaseBaseAddSubCarry<isSub, regtype, asm,
[(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV))]>;
// BaseAddSubCarrySetFlags: as BaseAddSubCarry, but the instruction also
// defines NZCV (Defs) and the pattern records that as an implicit result.
class BaseAddSubCarrySetFlags<bit isSub, RegisterClass regtype, string asm,
SDNode OpNode>
: BaseBaseAddSubCarry<isSub, regtype, asm,
[(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV)),
(implicit NZCV)]> {
let Defs = [NZCV];
// AddSubCarry: emits four instructions. Bit 31 selects the width (0 for the
// GPR32 "W" forms, 1 for the GPR64 "X" forms) and bit 29 selects flag setting
// (0 for Wr/Xr, 1 for SWr/SXr). The flag-setting forms use `asm_setflags` and
// `OpNode_setflags`.
multiclass AddSubCarry<bit isSub, string asm, string asm_setflags,
SDNode OpNode, SDNode OpNode_setflags> {
def Wr : BaseAddSubCarry<isSub, GPR32, asm, OpNode> {
let Inst{31} = 0;
let Inst{29} = 0;
def Xr : BaseAddSubCarry<isSub, GPR64, asm, OpNode> {
let Inst{31} = 1;
let Inst{29} = 0;
// Sets flags.
def SWr : BaseAddSubCarrySetFlags<isSub, GPR32, asm_setflags,
OpNode_setflags> {
let Inst{31} = 0;
let Inst{29} = 1;
def SXr : BaseAddSubCarrySetFlags<isSub, GPR64, asm_setflags,
OpNode_setflags> {
let Inst{31} = 1;
let Inst{29} = 1;
// BaseTwoOperand: generic "$Rd = OpNode($Rn, $Rm)" encoding. The two input
// register classes default to the output class but can be overridden (SUBP
// does so). `opc` goes in bits 13-10; users may pass `?` for bits they set
// later (see BaseDiv and BaseShift). Bit 31 and the Sched list are left to
// the users.
class BaseTwoOperand<bits<4> opc, RegisterClass regtype, string asm,
SDPatternOperator OpNode,
RegisterClass in1regtype = regtype,
RegisterClass in2regtype = regtype>
: I<(outs regtype:$Rd), (ins in1regtype:$Rn, in2regtype:$Rm),
asm, "\t$Rd, $Rn, $Rm", "",
[(set regtype:$Rd, (OpNode in1regtype:$Rn, in2regtype:$Rm))]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{30-21} = 0b0011010110;
let Inst{20-16} = Rm;
let Inst{15-14} = 0b00;
let Inst{13-10} = opc;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// BaseDiv: BaseTwoOperand with opc = {0,0,1,?}; the unset low opcode bit
// (instruction bit 10) is filled with `isSigned`.
class BaseDiv<bit isSigned, RegisterClass regtype, string asm,
SDPatternOperator OpNode>
: BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> {
let Inst{10} = isSigned;
// Div: emits "Wr" (GPR32, bit 31 = 0, WriteID32) and "Xr" (GPR64, bit 31 = 1,
// WriteID64) from BaseDiv; both read their operands with ReadID.
multiclass Div<bit isSigned, string asm, SDPatternOperator OpNode> {
def Wr : BaseDiv<isSigned, GPR32, asm, OpNode>,
Sched<[WriteID32, ReadID, ReadID]> {
let Inst{31} = 0;
def Xr : BaseDiv<isSigned, GPR64, asm, OpNode>,
Sched<[WriteID64, ReadID, ReadID]> {
let Inst{31} = 1;
// BaseShift: BaseTwoOperand with opc = {1,0,?,?}; the two unset low opcode
// bits (instruction bits 11-10) are filled with `shift_type`. OpNode defaults
// to null_frag.
class BaseShift<bits<2> shift_type, RegisterClass regtype, string asm,
SDPatternOperator OpNode = null_frag>
: BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>,
Sched<[WriteIS, ReadI]> {
let Inst{11-10} = shift_type;
// Shift: emits "Wr" (GPR32, bit 31 = 0) and "Xr" (GPR64, bit 31 = 1).
// Wr is instantiated without OpNode, so it gets BaseShift's null_frag default
// and is selected only through the explicit patterns below. Those patterns
// cover a 32-bit value shifted by an i64 amount, and a 64-bit value shifted
// by an extended 32-bit amount.
multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> {
def Wr : BaseShift<shift_type, GPR32, asm> {
let Inst{31} = 0;
def Xr : BaseShift<shift_type, GPR64, asm, OpNode> {
let Inst{31} = 1;
// i32 value, i64 amount: use the low 32 bits of the amount register.
def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)),
(!cast<Instruction>(NAME # "Wr") GPR32:$Rn,
(EXTRACT_SUBREG i64:$Rm, sub_32))>;
// i32 value, amount is an extended GPR32: use the unextended register.
def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))),
(!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))),
(!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))),
(!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
// i64 value, amount is an extended GPR32: wrap the 32-bit register with
// SUBREG_TO_REG rather than emitting an explicit extension.
def : Pat<(i64 (OpNode GPR64:$Rn, (i64 (sext GPR32:$Rm)))),
(!cast<Instruction>(NAME # "Xr") GPR64:$Rn,
(SUBREG_TO_REG (i32 0), GPR32:$Rm, sub_32))>;
def : Pat<(i64 (OpNode GPR64:$Rn, (i64 (zext GPR32:$Rm)))),
(!cast<Instruction>(NAME # "Xr") GPR64:$Rn,
(SUBREG_TO_REG (i32 0), GPR32:$Rm, sub_32))>;
// ShiftAlias: three-register assembler alias "<asm>\t$dst, $src1, $src2" for
// `inst`, with 0 as the final InstAlias argument.
class ShiftAlias<string asm, Instruction inst, RegisterClass regtype>
: InstAlias<asm#"\t$dst, $src1, $src2",
(inst regtype:$dst, regtype:$src1, regtype:$src2), 0>;
// BaseMulAccum: four-register form "$Rd, $Rn, $Rm, $Ra". The multiplicand
// class (`multype`, for Rn/Rm) may differ from the accumulator/result class
// (`addtype`, for Ra/Rd). `opc` goes in bits 23-21, Rm in bits 20-16, `isSub`
// in bit 15, Ra in bits 14-10, Rn in bits 9-5 and Rd in bits 4-0. Bit 31 and
// the Sched list are left to the users.
class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
RegisterClass addtype, string asm,
list<dag> pattern>
: I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra),
asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<5> Ra;
let Inst{30-24} = 0b0011011;
let Inst{23-21} = opc;
let Inst{20-16} = Rm;
let Inst{15} = isSub;
let Inst{14-10} = Ra;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// MulAccum: emits "Wrrr" (all GPR32, bit 31 = 0) and "Xrrr" (all GPR64,
// bit 31 = 1) with opc = 0b000. The selection patterns are commented out, so
// the pattern lists are effectively empty (see the comment below for why).
multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
// MADD/MSUB generation is decided by MachineCombiner.cpp
def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
[/*(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))*/]>,
Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 0;
def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
[/*(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))*/]>,
Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 1;
// WideMulAccum: multiply two GPR32 operands, each extended with ExtNode, and
// combine the product with a GPR64 accumulator via AccNode. Bit 31 is 1.
class WideMulAccum<bit isSub, bits<3> opc, string asm,
SDNode AccNode, SDNode ExtNode>
: BaseMulAccum<isSub, opc, GPR32, GPR64, asm,
[(set GPR64:$Rd, (AccNode GPR64:$Ra,
(mul (ExtNode GPR32:$Rn), (ExtNode GPR32:$Rm))))]>,
Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 1;
// MulHi: 64-bit "$Rd = OpNode($Rn, $Rm)". Bits 31-24 are fixed, `opc` goes
// in bits 23-21, Rm in bits 20-16, bit 15 is 0, Rn in bits 9-5 and Rd in
// bits 4-0. Bits 14-10 are not assigned in this class; per the comment below
// they are handled by the fixMulHigh post-encoder hook.
class MulHi<bits<3> opc, string asm, SDNode OpNode>
: I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
asm, "\t$Rd, $Rn, $Rm", "",
[(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>,
Sched<[WriteIM64, ReadIM, ReadIM]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31-24} = 0b10011011;
let Inst{23-21} = opc;
let Inst{20-16} = Rm;
let Inst{15} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// The Ra field of SMULH and UMULH is unused: it should be assembled as 31
// (i.e. all bits 1) but is ignored by the processor.
let PostEncoderMethod = "fixMulHigh";
// Three-operand aliases for the four-operand multiply-accumulate forms: the
// accumulator operand is supplied as the zero register.
//   MulAccumWAlias    - all GPR32, accumulator WZR.
//   MulAccumXAlias    - all GPR64, accumulator XZR.
//   WideMulAccumAlias - GPR64 destination, GPR32 sources, accumulator XZR.
class MulAccumWAlias<string asm, Instruction inst>
: InstAlias<asm#"\t$dst, $src1, $src2",
(inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>;
class MulAccumXAlias<string asm, Instruction inst>
: InstAlias<asm#"\t$dst, $src1, $src2",
(inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>;
class WideMulAccumAlias<string asm, Instruction inst>
: InstAlias<asm#"\t$dst, $src1, $src2",
(inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>;
// BaseCRC32: GPR32 $Rd = OpNode(GPR32 $Rn, StreamReg $Rm); only the class of
// $Rm varies. `sf` goes in bit 31, Rm in bits 20-16, `C` in bit 12, `sz` in
// bits 11-10, Rn in bits 9-5 and Rd in bits 4-0. Gated on the HasCRC
// predicate.
class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg,
SDPatternOperator OpNode, string asm>
: I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm),
asm, "\t$Rd, $Rn, $Rm", "",
[(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>,
Sched<[WriteISReg, ReadI, ReadISReg]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31} = sf;
let Inst{30-21} = 0b0011010110;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b010;
let Inst{12} = C;
let Inst{11-10} = sz;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
let Predicates = [HasCRC];
// Address generation.
// ADRI: writes GPR64 $Xd from a 21-bit `label` operand. The label is split
// across the encoding: label{1-0} goes in bits 30-29 and label{20-2} in bits
// 23-5. `page` goes in bit 31 and Xd in bits 4-0. Decoded by
// DecodeAdrInstruction.
// NOTE(review): the `pattern` parameter is not referenced in the visible
// lines; the pattern argument and the closing of I<...> appear to be missing
// before the Sched line in this copy - confirm against upstream.
class ADRI<bit page, string asm, Operand adr, list<dag> pattern>
: I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "",
Sched<[WriteI]> {
bits<5> Xd;
bits<21> label;
let Inst{31} = page;
let Inst{30-29} = label{1-0};
let Inst{28-24} = 0b10000;
let Inst{23-5} = label{20-2};
let Inst{4-0} = Xd;
let DecoderMethod = "DecodeAdrInstruction";
// Move immediate.
// movimm32_imm: i32 immediate operand accepted by the assembler in the range
// 0..65535, with a custom encoder and the printImm printer.
def movimm32_imm : Operand<i32> {
let ParserMatchClass = AsmImmRange<0, 65535>;
let EncoderMethod = "getMoveWideImmOpValue";
let PrintMethod = "printImm";
// movimm32_shift / movimm64_shift: shifter operands printed with
// printShifter; they differ only in the parser match class.
def movimm32_shift : Operand<i32> {
let PrintMethod = "printShifter";
let ParserMatchClass = MovImm32ShifterOperand;
def movimm64_shift : Operand<i32> {
let PrintMethod = "printShifter";
let ParserMatchClass = MovImm64ShifterOperand;
// BaseMoveImmediate: writes $Rd from a 16-bit immediate and a shifter
// operand, printed as "$Rd, $imm$shift". `shift` is declared 6 bits wide but
// only shift{5-4} is encoded, in bits 22-21. `opc` goes in bits 30-29, the
// immediate in bits 20-5 and Rd in bits 4-0. Bit 31 is set by the
// MoveImmediate multiclass. No selection pattern.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseMoveImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
string asm>
: I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift),
asm, "\t$Rd, $imm$shift", "", []>,
Sched<[WriteImm]> {
bits<5> Rd;
bits<16> imm;
bits<6> shift;
let Inst{30-29} = opc;
let Inst{28-23} = 0b100101;
let Inst{22-21} = shift{5-4};
let Inst{20-5} = imm;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeMoveImmInstruction";
// MoveImmediate: emits "Wi" (GPR32, movimm32_shift, bit 31 = 0) and "Xi"
// (GPR64, movimm64_shift, bit 31 = 1) from BaseMoveImmediate.
multiclass MoveImmediate<bits<2> opc, string asm> {
def Wi : BaseMoveImmediate<opc, GPR32, movimm32_shift, asm> {
let Inst{31} = 0;
def Xi : BaseMoveImmediate<opc, GPR64, movimm64_shift, asm> {
let Inst{31} = 1;
// BaseInsertImmediate: same encoding layout as BaseMoveImmediate, but with
// an extra register input $src tied to the output ("$src = $Rd"), so the
// destination is read as well as written. Only shift{5-4} of the 6-bit
// `shift` is encoded. Bit 31 is set by the InsertImmediate multiclass.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseInsertImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
string asm>
: I<(outs regtype:$Rd),
(ins regtype:$src, movimm32_imm:$imm, shifter:$shift),
asm, "\t$Rd, $imm$shift", "$src = $Rd", []>,
Sched<[WriteI, ReadI]> {
bits<5> Rd;
bits<16> imm;
bits<6> shift;
let Inst{30-29} = opc;
let Inst{28-23} = 0b100101;
let Inst{22-21} = shift{5-4};
let Inst{20-5} = imm;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeMoveImmInstruction";
// InsertImmediate: emits "Wi" (GPR32, movimm32_shift, bit 31 = 0) and "Xi"
// (GPR64, movimm64_shift, bit 31 = 1) from BaseInsertImmediate.
multiclass InsertImmediate<bits<2> opc, string asm> {
def Wi : BaseInsertImmediate<opc, GPR32, movimm32_shift, asm> {
let Inst{31} = 0;
def Xi : BaseInsertImmediate<opc, GPR64, movimm64_shift, asm> {
let Inst{31} = 1;
// Add/Subtract
// BaseAddSubImm: shared encoding for the immediate add/sub forms. The caller
// supplies the operand string, the input dag and a single pattern dag (which
// is wrapped in a list here). `isSub` goes in bit 30, `setFlags` in bit 29,
// Rn in bits 9-5 and Rd in bits 4-0. Bits 31 and 23-10 are left to the
// subclasses.
class BaseAddSubImm<bit isSub, bit setFlags, RegisterClass dstRegtype,
string asm_inst, string asm_ops,
dag inputs, dag pattern>
: I<(outs dstRegtype:$Rd), inputs, asm_inst, asm_ops, "", [pattern]>,
Sched<[WriteI, ReadI]> {
bits<5> Rd;
bits<5> Rn;
let Inst{30} = isSub;
let Inst{29} = setFlags;
let Inst{28-24} = 0b10001;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// AddSubImmShift: BaseAddSubImm with a 14-bit shifted-immediate operand.
// imm{13-12} (the shift selector) goes in bits 23-22 and imm{11-0} in bits
// 21-10. Decoded by DecodeAddSubImmShift. Bit 31 is set by the users.
class AddSubImmShift<bit isSub, bit setFlags, RegisterClass dstRegtype,
RegisterClass srcRegtype, addsub_shifted_imm immtype,
string asm_inst, SDPatternOperator OpNode>
: BaseAddSubImm<isSub, setFlags, dstRegtype, asm_inst, "\t$Rd, $Rn, $imm",
(ins srcRegtype:$Rn, immtype:$imm),
(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))> {
bits<14> imm;
let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12
let Inst{21-10} = imm{11-0};
let DecoderMethod = "DecodeAddSubImmShift";
// BaseAddSubRegPseudo: pseudo-instruction for plain register/register
// add/sub, "$Rd = OpNode($Rn, $Rm)". It carries a selection pattern but no
// encoding.
class BaseAddSubRegPseudo<RegisterClass regtype,
SDPatternOperator OpNode>
: Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
[(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
Sched<[WriteI, ReadI, ReadI]>;
// BaseAddSubSReg: add/sub with a shifted-register second operand. The 8-bit
// `shift` field is split: shift{7-6} goes in bits 23-22 and shift{5-0} in
// bits 15-10. Bit 21 is 0 and bit 31 is set by the users. Decoded by
// DecodeThreeAddrSRegInstruction.
class BaseAddSubSReg<bit isSub, bit setFlags, RegisterClass regtype,
arith_shifted_reg shifted_regtype, string asm,
SDPatternOperator OpNode>
: I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
asm, "\t$Rd, $Rn, $Rm", "",
[(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>,
Sched<[WriteISReg, ReadI, ReadISReg]> {
// The operands are in order to match the 'addr' MI operands, so we
// don't need an encoder method and by-name matching. Just use the default
// in-order handling. Since we're using by-order, make sure the names
// do not match.
bits<5> dst;
bits<5> src1;
bits<5> src2;
bits<8> shift;
let Inst{30} = isSub;
let Inst{29} = setFlags;
let Inst{28-24} = 0b01011;
let Inst{23-22} = shift{7-6};
let Inst{21} = 0;
let Inst{20-16} = src2;
let Inst{15-10} = shift{5-0};
let Inst{9-5} = src1;
let Inst{4-0} = dst;
let DecoderMethod = "DecodeThreeAddrSRegInstruction";
// BaseAddSubEReg: add/sub with an extended-register second operand. The
// operand carries its own extend information and the 6-bit `ext` field is
// encoded as ext{5-3} in bits 15-13 and ext{2-0} in bits 12-10. Bits 23-21
// are 0b001 and bit 31 is set by the users.
// NOTE(review): the dag operands are named $R1/$R2/$R3 while the encoding
// fields are Rd/Rn/Rm/ext; presumably this is the same by-order operand
// matching described on BaseAddSubSReg - confirm.
class BaseAddSubEReg<bit isSub, bit setFlags, RegisterClass dstRegtype,
RegisterClass src1Regtype, Operand src2Regtype,
string asm, SDPatternOperator OpNode>
: I<(outs dstRegtype:$R1),
(ins src1Regtype:$R2, src2Regtype:$R3),
asm, "\t$R1, $R2, $R3", "",
[(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>,
Sched<[WriteIEReg, ReadI, ReadIEReg]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<6> ext;
let Inst{30} = isSub;
let Inst{29} = setFlags;
let Inst{28-24} = 0b01011;
let Inst{23-21} = 0b001;
let Inst{20-16} = Rm;
let Inst{15-13} = ext{5-3};
let Inst{12-10} = ext{2-0};
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeAddSubERegInstruction";
// BaseAddSubEReg64: extended-register add/sub where the register ($Rm) and
// the extend ($ext) are separate operands, printed as "$Rd, $Rn, $Rm$ext".
// It has no selection pattern. Only ext{5} (bit 15) and ext{2-0} (bits
// 12-10) are encoded here; bits 14-13 are left to the users (the Xrx64 defs
// in AddSub and AddSubS set them to 0b11). Bit 31 is also set by the users.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype,
RegisterClass src1Regtype, RegisterClass src2Regtype,
Operand ext_op, string asm>
: I<(outs dstRegtype:$Rd),
(ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext),
asm, "\t$Rd, $Rn, $Rm$ext", "", []>,
Sched<[WriteIEReg, ReadI, ReadIEReg]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<6> ext;
let Inst{30} = isSub;
let Inst{29} = setFlags;
let Inst{28-24} = 0b01011;
let Inst{23-21} = 0b001;
let Inst{20-16} = Rm;
let Inst{15} = ext{5};
let Inst{12-10} = ext{2-0};
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeAddSubERegInstruction";
// Aliases for register+register add/subtract.
// AddSubRegAlias: three-register spelling "<asm>\t$dst, $src1, $src2" mapped
// onto `inst`, whose result dag takes a fourth operand after the registers.
// NOTE(review): the result dag is truncated in this copy - the last operand
// (presumably `shiftExt`, which is otherwise unused) and the closing `)>;`
// are missing. Confirm against upstream.
class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
RegisterClass src1Regtype, RegisterClass src2Regtype,
int shiftExt>
: InstAlias<asm#"\t$dst, $src1, $src2",
(inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2,
// AddSub: emits the non-flag-setting add/sub family (setFlags = 0):
//   Wri/Xri   - shifted immediate (AddedComplexity 6).
//   Wrr/Xrr   - register/register pseudos.
//   Wrs/Xrs   - shifted register.
//   Wrx/Xrx   - extended register (AddedComplexity 1).
//   Xrx64     - 64-bit extended register with separate extend operand.
// It also emits assembler substitutions for negative immediates and
// shift-less three-register aliases. `alias` is the mnemonic used for the
// negative-immediate InstSubst entries.
multiclass AddSub<bit isSub, string mnemonic, string alias,
SDPatternOperator OpNode = null_frag> {
let hasSideEffects = 0, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
// Add/Subtract immediate
// Increase the weight of the immediate variant to try to match it before
// the extended register variant.
// We used to match the register variant before the immediate when the
// register argument could be implicitly zero-extended.
let AddedComplexity = 6 in
def Wri : AddSubImmShift<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32,
mnemonic, OpNode> {
let Inst{31} = 0;
let AddedComplexity = 6 in
def Xri : AddSubImmShift<isSub, 0, GPR64sp, GPR64sp, addsub_shifted_imm64,
mnemonic, OpNode> {
let Inst{31} = 1;
// Add/Subtract register - Only used for CodeGen
def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
// Add/Subtract shifted register
def Wrs : BaseAddSubSReg<isSub, 0, GPR32, arith_shifted_reg32, mnemonic,
OpNode> {
let Inst{31} = 0;
def Xrs : BaseAddSubSReg<isSub, 0, GPR64, arith_shifted_reg64, mnemonic,
OpNode> {
let Inst{31} = 1;
// Add/Subtract extended register
let AddedComplexity = 1, hasSideEffects = 0 in {
def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp,
arith_extended_reg32_i32, mnemonic, OpNode> {
let Inst{31} = 0;
def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp,
arith_extended_reg32to64_i64, mnemonic, OpNode> {
let Inst{31} = 1;
def Xrx64 : BaseAddSubEReg64<isSub, 0, GPR64sp, GPR64sp, GPR64,
arith_extendlsl64, mnemonic> {
// UXTX and SXTX only.
let Inst{14-13} = 0b11;
let Inst{31} = 1;
// add Rd, Rn, -imm -> sub Rd, Rn, imm
def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
// Register/register aliases with no shift when SP is not used.
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
GPR32, GPR32, GPR32, 0>;
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
GPR64, GPR64, GPR64, 0>;
// Register/register aliases with no shift when either the destination or
// first source register is SP.
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0
// NOTE(review): the two 64-bit aliases below are missing their instruction
// argument in this copy (AddSubRegAlias takes `inst` second); presumably
// NAME#"Xrx64", as in AddSubS's "Compare shorthands" - confirm upstream.
def : AddSubRegAlias<mnemonic,
GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0
def : AddSubRegAlias<mnemonic,
GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0
// AddSubS: flag-setting counterpart of AddSub (setFlags = 1). All
// instructions are marked isCompare and define NZCV. Unlike AddSub, the
// destination classes exclude SP (GPR32/GPR64) while the first source may
// still be SP. `cmp` is the compare mnemonic that aliases these instructions
// with a zero-register destination. `alias` and `cmpAlias` are the mnemonics
// accepted for the negative-immediate substitutions.
multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
string alias, string cmpAlias> {
let isCompare = 1, Defs = [NZCV] in {
// Add/Subtract immediate
def Wri : AddSubImmShift<isSub, 1, GPR32, GPR32sp, addsub_shifted_imm32,
mnemonic, OpNode> {
let Inst{31} = 0;
def Xri : AddSubImmShift<isSub, 1, GPR64, GPR64sp, addsub_shifted_imm64,
mnemonic, OpNode> {
let Inst{31} = 1;
// Add/Subtract register
def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
// Add/Subtract shifted register
def Wrs : BaseAddSubSReg<isSub, 1, GPR32, arith_shifted_reg32, mnemonic,
OpNode> {
let Inst{31} = 0;
def Xrs : BaseAddSubSReg<isSub, 1, GPR64, arith_shifted_reg64, mnemonic,
OpNode> {
let Inst{31} = 1;
// Add/Subtract extended register
let AddedComplexity = 1 in {
def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp,
arith_extended_reg32_i32, mnemonic, OpNode> {
let Inst{31} = 0;
def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp,
arith_extended_reg32_i64, mnemonic, OpNode> {
let Inst{31} = 1;
def Xrx64 : BaseAddSubEReg64<isSub, 1, GPR64, GPR64sp, GPR64,
arith_extendlsl64, mnemonic> {
// UXTX and SXTX only.
let Inst{14-13} = 0b11;
let Inst{31} = 1;
} // Defs = [NZCV]
// Support negative immediates, e.g. adds Rd, Rn, -imm -> subs Rd, Rn, imm
def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
// Compare aliases
// Each alias discards the result by using WZR/XZR as the destination.
def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>;
def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx")
WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx")
XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64")
XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs")
WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>;
def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs")
XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
// Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm
def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>;
def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>;
// Compare shorthands
def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs")
WZR, GPR32:$src1, GPR32:$src2, 0), 5>;
def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs")
XZR, GPR64:$src1, GPR64:$src2, 0), 5>;
def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx")
WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>;
def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64")
XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>;
// Register/register aliases with no shift when SP is not used.
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
GPR32, GPR32, GPR32, 0>;
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
GPR64, GPR64, GPR64, 0>;
// Register/register aliases with no shift when the first source register
// is SP.
def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
GPR32, GPR32sponly, GPR32, 16>; // UXTW #0
// NOTE(review): the 64-bit alias below is missing its instruction argument
// in this copy; presumably NAME#"Xrx64", matching the compare shorthand
// above - confirm against upstream.
def : AddSubRegAlias<mnemonic,
GPR64, GPR64sponly, GPR64, 24>; // UXTX #0
// AddSubG: BaseAddSubImm variant (setFlags = 0, 64-bit, bit 31 = 1) taking
// two immediates. The 6-bit `imm6` goes in bits 21-16 and the 4-bit `imm4`
// in bits 13-10. Bits 23-22 are 0b10. Bits 15-14 are encoded as 0 and are
// flagged in the Unpredictable mask.
// The assembler operand for imm6 is uimm6s16 while the selection pattern
// matches imm0_63.
class AddSubG<bit isSub, string asm_inst, SDPatternOperator OpNode>
: BaseAddSubImm<
isSub, 0, GPR64sp, asm_inst, "\t$Rd, $Rn, $imm6, $imm4",
(ins GPR64sp:$Rn, uimm6s16:$imm6, imm0_15:$imm4),
(set GPR64sp:$Rd, (OpNode GPR64sp:$Rn, imm0_63:$imm6, imm0_15:$imm4))> {
bits<6> imm6;
bits<4> imm4;
let Inst{31} = 1;
let Inst{23-22} = 0b10;
let Inst{21-16} = imm6;
let Inst{15-14} = 0b00;
let Inst{13-10} = imm4;
let Unpredictable{15-14} = 0b11;
// SUBP: BaseTwoOperand with opc = 0b0000, a GPR64 result and GPR64sp inputs.
// Bit 31 is 1 and `setsFlags` goes in bit 29.
class SUBP<bit setsFlags, string asm_instr, SDPatternOperator OpNode>
: BaseTwoOperand<0b0000, GPR64, asm_instr, OpNode, GPR64sp, GPR64sp> {
let Inst{31} = 1;
let Inst{29} = setsFlags;
// Extract
// SDTA64EXTR: type profile with 1 result and 3 operands; operands 1 and 2
// must have the same type as the result.
// NOTE(review): the constraint list is truncated in this copy (no closing
// `]>;`, and a constraint on operand 3 may be missing) - confirm upstream.
def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
// AArch64Extr: SelectionDAG node for AArch64ISD::EXTR using that profile.
def AArch64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>;
// BaseExtractImm: "$Rd, $Rn, $Rm, $imm" with a 6-bit immediate in bits 15-10.
// Rm goes in bits 20-16, Rn in bits 9-5 and Rd in bits 4-0. Bit 21 is 0.
// Bits 31 and 22 are set by the ExtractImm multiclass.
class BaseExtractImm<RegisterClass regtype, Operand imm_type, string asm,
list<dag> patterns>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm),
asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>,
Sched<[WriteExtr, ReadExtrHi]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<6> imm;
let Inst{30-23} = 0b00100111;
let Inst{21} = 0;
let Inst{20-16} = Rm;
let Inst{15-10} = imm;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// ExtractImm: emits "Wrri" (GPR32, imm0_31, bits 31 and 22 = 0, and the top
// immediate bit forced to 0) and "Xrri" (GPR64, imm0_63, bits 31 and 22 = 1).
// Both are selected from AArch64Extr.
multiclass ExtractImm<string asm> {
def Wrri : BaseExtractImm<GPR32, imm0_31, asm,
[(set GPR32:$Rd,
(AArch64Extr GPR32:$Rn, GPR32:$Rm, imm0_31:$imm))]> {
let Inst{31} = 0;
let Inst{22} = 0;
// imm<5> must be zero.
let imm{5} = 0;
def Xrri : BaseExtractImm<GPR64, imm0_63, asm,
[(set GPR64:$Rd,
(AArch64Extr GPR64:$Rn, GPR64:$Rm, imm0_63:$imm))]> {
let Inst{31} = 1;
let Inst{22} = 1;
// Bitfield
// BaseBitfieldImm: "$Rd, $Rn, $immr, $imms" with two 6-bit immediates:
// immr in bits 21-16 and imms in bits 15-10. `opc` goes in bits 30-29, Rn in
// bits 9-5 and Rd in bits 4-0. Bits 31 and 22 are set by the BitfieldImm
// multiclass. No selection pattern.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseBitfieldImm<bits<2> opc,
RegisterClass regtype, Operand imm_type, string asm>
: I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms),
asm, "\t$Rd, $Rn, $immr, $imms", "", []>,
Sched<[WriteIS, ReadI]> {
bits<5> Rd;
bits<5> Rn;
bits<6> immr;
bits<6> imms;
let Inst{30-29} = opc;
let Inst{28-23} = 0b100110;
let Inst{21-16} = immr;
let Inst{15-10} = imms;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// BitfieldImm: emits "Wri" (GPR32, imm0_31, bits 31 and 22 = 0) and "Xri"
// (GPR64, imm0_63, bits 31 and 22 = 1). In the 32-bit form the top bit of
// each immediate field (instruction bits 21 and 15) is forced to 0.
multiclass BitfieldImm<bits<2> opc, string asm> {
def Wri : BaseBitfieldImm<opc, GPR32, imm0_31, asm> {
let Inst{31} = 0;
let Inst{22} = 0;
// imms<5> and immr<5> must be zero, else ReservedValue().
let Inst{21} = 0;
let Inst{15} = 0;
def Xri : BaseBitfieldImm<opc, GPR64, imm0_63, asm> {
let Inst{31} = 1;
let Inst{22} = 1;
// BaseBitfieldImmWith2RegArgs: same encoding layout as BaseBitfieldImm, but
// with an additional register input $src tied to the output ("$src = $Rd"),
// so the destination is read as well as written.
// NOTE(review): the `ins` dag is truncated in this copy - the $imms operand
// and the closing `),` appear to be missing after $immr. Confirm upstream.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseBitfieldImmWith2RegArgs<bits<2> opc,
RegisterClass regtype, Operand imm_type, string asm>
: I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr,
asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>,
Sched<[WriteIS, ReadI]> {
bits<5> Rd;
bits<5> Rn;
bits<6> immr;
bits<6> imms;
let Inst{30-29} = opc;
let Inst{28-23} = 0b100110;
let Inst{21-16} = immr;
let Inst{15-10} = imms;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// BitfieldImmWith2RegArgs: emits "Wri" (GPR32, imm0_31, bits 31 and 22 = 0,
// instruction bits 21 and 15 forced to 0) and "Xri" (GPR64, imm0_63, bits 31
// and 22 = 1) from BaseBitfieldImmWith2RegArgs.
multiclass BitfieldImmWith2RegArgs<bits<2> opc, string asm> {
def Wri : BaseBitfieldImmWith2RegArgs<opc, GPR32, imm0_31, asm> {
let Inst{31} = 0;
let Inst{22} = 0;
// imms<5> and immr<5> must be zero, else ReservedValue().
let Inst{21} = 0;
let Inst{15} = 0;
def Xri : BaseBitfieldImmWith2RegArgs<opc, GPR64, imm0_63, asm> {
let Inst{31} = 1;
let Inst{22} = 1;
// Logical
// Logical (immediate)
// BaseLogicalImm: "$Rd, $Rn, $imm" with separate destination and source
// register classes. The 13-bit immediate is split into imm{12} (bit 22),
// imm{11-6} (bits 21-16) and imm{5-0} (bits 15-10). `opc` goes in bits 30-29.
// Bit 31 is set by the users. Decoded by DecodeLogicalImmInstruction.
class BaseLogicalImm<bits<2> opc, RegisterClass dregtype,
RegisterClass sregtype, Operand imm_type, string asm,
list<dag> pattern>
: I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm),
asm, "\t$Rd, $Rn, $imm", "", pattern>,
Sched<[WriteI, ReadI]> {
bits<5> Rd;
bits<5> Rn;
bits<13> imm;
let Inst{30-29} = opc;
let Inst{28-23} = 0b100100;
let Inst{22} = imm{12};
let Inst{21-16} = imm{11-6};
let Inst{15-10} = imm{5-0};
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeLogicalImmInstruction";
// Logical (shifted register)
// BaseLogicalSReg: logical operation with a shifted-register second operand.
// It has the same field layout as BaseAddSubSReg except that bits 30-29 hold
// `opc`, bits 28-24 are 0b01010 and bit 21 holds `N`. The 8-bit `shift` is
// split into shift{7-6} (bits 23-22) and shift{5-0} (bits 15-10). Bit 31 is
// set by the users.
class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype,
logical_shifted_reg shifted_regtype, string asm,
list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
asm, "\t$Rd, $Rn, $Rm", "", pattern>,
Sched<[WriteISReg, ReadI, ReadISReg]> {
// The operands are in order to match the 'addr' MI operands, so we
// don't need an encoder method and by-name matching. Just use the default
// in-order handling. Since we're using by-order, make sure the names
// do not match.
bits<5> dst;
bits<5> src1;
bits<5> src2;
bits<8> shift;
let Inst{30-29} = opc;
let Inst{28-24} = 0b01010;
let Inst{23-22} = shift{7-6};
let Inst{21} = N;
let Inst{20-16} = src2;
let Inst{15-10} = shift{5-0};
let Inst{9-5} = src1;
let Inst{4-0} = dst;
let DecoderMethod = "DecodeThreeAddrSRegInstruction";
// Aliases for register+register logical instructions.
// Accepts the plain three-register syntax "<asm> $dst, $src1, $src2" and maps
// it onto `inst` with the final operand fixed to 0.
// NOTE(review): the trailing 0 is presumably the shift operand of the
// shifted-register form -- confirm against the operand definition.
class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
: InstAlias<asm#"\t$dst, $src1, $src2",
(inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
// Logical-immediate instructions whose destination may be the stack pointer
// (destination classes GPR32sp / GPR64sp).
//   opc      - 2-bit opcode forwarded to BaseLogicalImm.
//   mnemonic - assembly mnemonic of the instruction.
//   OpNode   - SDNode matched by the selection patterns.
//   Alias    - second mnemonic accepted through the InstSubst records below.
multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
string Alias> {
let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
[(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
logical_imm32:$imm))]> {
let Inst{31} = 0;
let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
[(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
logical_imm64:$imm))]> {
let Inst{31} = 1;
// The `Alias` mnemonic is mapped onto the same Wri / Xri instructions, but
// takes its immediate through the logical_imm32_not / logical_imm64_not
// operands.
def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
// Flag-setting counterpart of LogicalImm: both defs are marked isCompare and
// list NZCV in Defs, and the destination classes are GPR32 / GPR64 rather than
// the *sp classes.
//   opc      - 2-bit opcode forwarded to BaseLogicalImm.
//   mnemonic - assembly mnemonic of the instruction.
//   OpNode   - SDNode matched by the selection patterns.
//   Alias    - second mnemonic accepted through the InstSubst records below.
multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
string Alias> {
let isCompare = 1, Defs = [NZCV] in {
def Wri : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic,
[(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> {
let Inst{31} = 0;
let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
def Xri : BaseLogicalImm<opc, GPR64, GPR64, logical_imm64, mnemonic,
[(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_imm64:$imm))]> {
let Inst{31} = 1;
} // end Defs = [NZCV]
// The `Alias` mnemonic maps onto the same instructions, with the immediate
// taken through the logical_imm32_not / logical_imm64_not operands.
def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
// Pseudo instruction for a two-register logical operation: $Rd is produced by
// applying OpNode to $Rn and $Rm, all in the register class `regtype`.
class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
: Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
[(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
Sched<[WriteI, ReadI, ReadI]>;
// Split from LogicalImm as not all instructions have both.
// Defines, for one logical operation:
//   Wrr / Xrr - register-register pseudos (BaseLogicalRegPseudo).
//   Wrs / Xrs - shifted-register instructions (BaseLogicalSReg), with
//               Inst{31} = 0 for the 32-bit and 1 for the 64-bit form.
// plus LogicalRegAlias records that map the unshifted syntax onto Wrs / Xrs.
//   opc, N   - encoding fields forwarded to BaseLogicalSReg.
//   mnemonic - assembly mnemonic.
//   OpNode   - operator matched by the selection patterns.
multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
SDPatternOperator OpNode> {
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
[(set GPR32:$Rd, (OpNode GPR32:$Rn,
logical_shifted_reg32:$Rm))]> {
let Inst{31} = 0;
def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
[(set GPR64:$Rd, (OpNode GPR64:$Rn,
logical_shifted_reg64:$Rm))]> {
let Inst{31} = 1;
def : LogicalRegAlias<mnemonic,
!cast<Instruction>(NAME#"Wrs"), GPR32>;
def : LogicalRegAlias<mnemonic,
!cast<Instruction>(NAME#"Xrs"), GPR64>;
// Split from LogicalReg to allow setting NZCV Defs
// Same set of records as LogicalReg (Wrr, Xrr, Wrs, Xrs and the two aliases),
// but every def lists NZCV in Defs and is explicitly marked as neither loading,
// storing, nor having side effects. OpNode defaults to null_frag.
multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic,
SDPatternOperator OpNode = null_frag> {
let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
[(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_shifted_reg32:$Rm))]> {
let Inst{31} = 0;
def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
[(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_shifted_reg64:$Rm))]> {
let Inst{31} = 1;
} // Defs = [NZCV]
def : LogicalRegAlias<mnemonic,
!cast<Instruction>(NAME#"Wrs"), GPR32>;
def : LogicalRegAlias<mnemonic,
!cast<Instruction>(NAME#"Xrs"), GPR64>;
// Conditionally set flags
// Conditional comparison against an immediate. The instruction has no register
// outputs; it both reads and writes NZCV (Uses and Defs below).
//   op       - single bit placed in Inst{30}.
//   regtype  - register class of $Rn.
//   immtype  - ImmLeaf used for the 5-bit immediate $imm.
//   mnemonic - assembly mnemonic.
//   OpNode   - SDNode matched by the selection pattern.
// Inst{31} is not assigned in this class.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype,
string mnemonic, SDNode OpNode>
: I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond),
mnemonic, "\t$Rn, $imm, $nzcv, $cond", "",
[(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv),
(i32 imm:$cond), NZCV))]>,
Sched<[WriteI, ReadI]> {
let Uses = [NZCV];
let Defs = [NZCV];
bits<5> Rn;
bits<5> imm;
bits<4> nzcv;
bits<4> cond;
let Inst{30} = op;
let Inst{29-21} = 0b111010010;
let Inst{20-16} = imm;
let Inst{15-12} = cond;
// Inst{11-10} is 0b10 here; the register form of this class uses 0b00.
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = nzcv;
// Conditional comparison against a register. The instruction has no register
// outputs; it both reads and writes NZCV (Uses and Defs below).
//   op       - single bit placed in Inst{30}.
//   regtype  - register class of $Rn and $Rm.
//   mnemonic - assembly mnemonic.
//   OpNode   - SDNode matched by the selection pattern.
// Inst{31} is not assigned in this class.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic,
SDNode OpNode>
: I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "",
[(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv),
(i32 imm:$cond), NZCV))]>,
Sched<[WriteI, ReadI, ReadI]> {
let Uses = [NZCV];
let Defs = [NZCV];
bits<5> Rn;
bits<5> Rm;
bits<4> nzcv;
bits<4> cond;
let Inst{30} = op;
let Inst{29-21} = 0b111010010;
let Inst{20-16} = Rm;
let Inst{15-12} = cond;
// Inst{11-10} is 0b00 here; the immediate form of this class uses 0b10.
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = nzcv;
// Instantiates the four conditional-comparison forms for one operation:
// immediate (Wi, Xi) and register (Wr, Xr), each in a 32-bit variant with
// Inst{31} = 0 and a 64-bit variant with Inst{31} = 1.
multiclass CondComparison<bit op, string mnemonic, SDNode OpNode> {
// immediate operand variants
// The 32-bit form uses imm32_0_31 and the 64-bit form uses imm0_31.
def Wi : BaseCondComparisonImm<op, GPR32, imm32_0_31, mnemonic, OpNode> {
let Inst{31} = 0;
def Xi : BaseCondComparisonImm<op, GPR64, imm0_31, mnemonic, OpNode> {
let Inst{31} = 1;
// register operand variants
def Wr : BaseCondComparisonReg<op, GPR32, mnemonic, OpNode> {
let Inst{31} = 0;
def Xr : BaseCondComparisonReg<op, GPR64, mnemonic, OpNode> {
let Inst{31} = 1;
// Conditional select
// Conditional select: $Rd is matched against AArch64csel of $Rn and $Rm under
// condition $cond. The instruction reads NZCV (Uses below).
//   op      - single bit placed in Inst{30}.
//   op2     - 2-bit field placed in Inst{11-10}.
//   regtype - register class of $Rd, $Rn and $Rm.
//   asm     - mnemonic.
// Inst{31} is not assigned in this class.
class BaseCondSelect<bit op, bits<2> op2, RegisterClass regtype, string asm>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
asm, "\t$Rd, $Rn, $Rm, $cond", "",
[(set regtype:$Rd,
(AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>,
Sched<[WriteI, ReadI, ReadI]> {
let Uses = [NZCV];
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<4> cond;
let Inst{30} = op;
let Inst{29-21} = 0b011010100;
let Inst{20-16} = Rm;
let Inst{15-12} = cond;
let Inst{11-10} = op2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Instantiates BaseCondSelect for GPR32 (Wr, Inst{31} = 0) and GPR64
// (Xr, Inst{31} = 1).
multiclass CondSelect<bit op, bits<2> op2, string asm> {
def Wr : BaseCondSelect<op, op2, GPR32, asm> {
let Inst{31} = 0;
def Xr : BaseCondSelect<op, op2, GPR64, asm> {
let Inst{31} = 1;
// Conditional select whose second source is first passed through `frag`:
// the pattern matches AArch64csel of $Rn and (frag $Rm) under $cond.
// The instruction reads NZCV (Uses below).
//   op      - single bit placed in Inst{30}.
//   op2     - 2-bit field placed in Inst{11-10}.
//   regtype - register class of $Rd, $Rn and $Rm.
//   asm     - mnemonic.
//   frag    - PatFrag applied to $Rm in the selection pattern.
// The encoding fields are identical to BaseCondSelect; Inst{31} is not
// assigned in this class.
class BaseCondSelectOp<bit op, bits<2> op2, RegisterClass regtype, string asm,
PatFrag frag>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
asm, "\t$Rd, $Rn, $Rm, $cond", "",
[(set regtype:$Rd,
(AArch64csel regtype:$Rn, (frag regtype:$Rm),
(i32 imm:$cond), NZCV))]>,
Sched<[WriteI, ReadI, ReadI]> {
let Uses = [NZCV];
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<4> cond;
let Inst{30} = op;
let Inst{29-21} = 0b011010100;
let Inst{20-16} = Rm;
let Inst{15-12} = cond;
let Inst{11-10} = op2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Immediate transform: reads the node's zero-extended value as an
// AArch64CC::CondCode and returns a target constant built from
// AArch64CC::getInvertedCondCode of that code.
def inv_cond_XFORM : SDNodeXForm<imm, [{
AArch64CC::CondCode CC = static_cast<AArch64CC::CondCode>(N->getZExtValue());
return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), SDLoc(N),
// Instantiates BaseCondSelectOp for GPR32 (Wr, Inst{31} = 0) and GPR64
// (Xr, Inst{31} = 1), and adds patterns for the operand-swapped case.
multiclass CondSelectOp<bit op, bits<2> op2, string asm, PatFrag frag> {
def Wr : BaseCondSelectOp<op, op2, GPR32, asm, frag> {
let Inst{31} = 0;
def Xr : BaseCondSelectOp<op, op2, GPR64, asm, frag> {
let Inst{31} = 1;
// When the `frag`-wrapped value is the first csel operand instead of the
// second, select the same instruction with $Rn / $Rm in their usual slots and
// the condition passed through inv_cond_XFORM.
def : Pat<(AArch64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV),
(!cast<Instruction>(NAME # Wr) GPR32:$Rn, GPR32:$Rm,
(inv_cond_XFORM imm:$cond))>;
def : Pat<(AArch64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV),
(!cast<Instruction>(NAME # Xr) GPR64:$Rn, GPR64:$Rm,
(inv_cond_XFORM imm:$cond))>;
// Special Mask Value
// i32 immediate operands that match only when the low bits of the value are
// all ones.
// maski8_or_more: the low 8 bits of Imm are all set.
def maski8_or_more : Operand<i32>,
ImmLeaf<i32, [{ return (Imm & 0xff) == 0xff; }]> {
// maski16_or_more: the low 16 bits of Imm are all set.
def maski16_or_more : Operand<i32>,
ImmLeaf<i32, [{ return (Imm & 0xffff) == 0xffff; }]> {
// Load/store
// (unsigned immediate)
// Indexed for 8-bit registers. offset is in range [0,4095].
// One ComplexPattern per access width (8/16/32/64/128). Each produces two
// i64 operands and is matched by the C++ function SelectAddrModeIndexed<N>.
def am_indexed8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed8", []>;
def am_indexed16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed16", []>;
def am_indexed32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []>;
def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>;
def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>;
// GIComplexOperandMatcher records for the same widths; each names the
// templated matcher selectAddrModeIndexed<N> and operates on s64.
def gi_am_indexed8 :
GIComplexOperandMatcher<s64, "selectAddrModeIndexed<8>">,
def gi_am_indexed16 :
GIComplexOperandMatcher<s64, "selectAddrModeIndexed<16>">,
def gi_am_indexed32 :
GIComplexOperandMatcher<s64, "selectAddrModeIndexed<32>">,
def gi_am_indexed64 :
GIComplexOperandMatcher<s64, "selectAddrModeIndexed<64>">,
def gi_am_indexed128 :
GIComplexOperandMatcher<s64, "selectAddrModeIndexed<128>">,
// Assembler operand class for an unsigned 12-bit offset with a given scale.
// The class name, render method, predicate method and diagnostic type are all
// derived from `Scale` by string concatenation.
class UImm12OffsetOperand<int Scale> : AsmOperandClass {
let Name = "UImm12Offset" # Scale;
let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">";
let PredicateMethod = "isUImm12Offset<" # Scale # ">";
let DiagnosticType = "InvalidMemoryIndexed" # Scale;
// One operand class per supported scale. The names must follow the pattern
// "UImm12OffsetScale<N>Operand" because uimm12_scaled looks them up by name.
def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>;
def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>;
def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>;
def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>;
def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>;
// i64 operand for a scaled unsigned 12-bit offset. The parser match class is
// resolved by name ("UImm12OffsetScale<Scale>Operand"); the encoder method is
// instantiated with the fixup kind fixup_aarch64_ldst_imm12_scale<Scale>, and
// printing goes through printUImm12Offset<Scale>.
class uimm12_scaled<int Scale> : Operand<i64> {
let ParserMatchClass
= !cast<AsmOperandClass>("UImm12OffsetScale" # Scale # "Operand");
let EncoderMethod
= "getLdStUImm12OpValue<AArch64::fixup_aarch64_ldst_imm12_scale" # Scale # ">";
let PrintMethod = "printUImm12Offset<" # Scale # ">";
// Concrete operands for scales 1, 2, 4, 8 and 16.
def uimm12s1 : uimm12_scaled<1>;
def uimm12s2 : uimm12_scaled<2>;
def uimm12s4 : uimm12_scaled<4>;
def uimm12s8 : uimm12_scaled<8>;
def uimm12s16 : uimm12_scaled<16>;
// Shared encoding for load/store with an unsigned 12-bit immediate offset,
// printed as "<asm> $Rt, [$Rn, $offset]".
//   sz      - 2-bit field placed in Inst{31-30}.
//   V       - single bit placed in Inst{26}.
//   opc     - 2-bit field placed in Inst{23-22}.
//   oops    - output operand dag.
//   iops    - input operand dag.
//   asm     - mnemonic.
//   pattern - list of DAG patterns forwarded to I<>.
class BaseLoadStoreUI<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
string asm, list<dag> pattern>
: I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
bits<5> Rt;
bits<5> Rn;
bits<12> offset;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b01;
let Inst{23-22} = opc;
let Inst{21-10} = offset;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeUnsignedLdStInstruction";
// Load with unsigned-immediate offset. Defines the instruction `ui` (marked
// mayLoad, AddedComplexity = 10) and an alias that accepts "<asm> $Rt, [$Rn]"
// by supplying an offset of 0.
//   regtype   - register operand of the loaded value $Rt.
//   indextype - operand type of $offset.
multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
Operand indextype, string asm, list<dag> pattern> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def ui : BaseLoadStoreUI<sz, V, opc, (outs regtype:$Rt),
(ins GPR64sp:$Rn, indextype:$offset),
asm, pattern>,
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
// Store with unsigned-immediate offset. Defines the instruction `ui` (marked
// mayStore, AddedComplexity = 10, no outputs) and an alias that accepts
// "<asm> $Rt, [$Rn]" by supplying an offset of 0.
//   regtype   - register operand of the stored value $Rt.
//   indextype - operand type of $offset.
multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
Operand indextype, string asm, list<dag> pattern> {
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def ui : BaseLoadStoreUI<sz, V, opc, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, indextype:$offset),
asm, pattern>,
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
// Same as StoreUI, but take a RegisterOperand. This is used by GlobalISel to
// substitute zero-registers automatically.
// TODO: Roll out zero-register substitution to GPR32/GPR64 and fold this back
// into StoreUI.
// As with StoreUI, this defines the instruction `ui` plus an alias for the
// "<asm> $Rt, [$Rn]" syntax with an offset of 0.
multiclass StoreUIz<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
Operand indextype, string asm, list<dag> pattern> {
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def ui : BaseLoadStoreUI<sz, V, opc, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, indextype:$offset),
asm, pattern>,
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
// Assembler operand class for the prefetch operation operand; parsing is
// delegated to tryParsePrefetch.
def PrefetchOperand : AsmOperandClass {
let Name = "Prefetch";
let ParserMethod = "tryParsePrefetch";
// i32 operand carrying the prefetch operation; printed with printPrefetchOp
// and parsed through PrefetchOperand.
def prfop : Operand<i32> {
let PrintMethod = "printPrefetchOp";
let ParserMatchClass = PrefetchOperand;
// Prefetch with unsigned-immediate offset. Reuses the BaseLoadStoreUI encoding
// with the prefetch operand (prfop) in the $Rt slot and a uimm12s8 offset.
// Modelled as having side effects rather than as a load or a store.
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
: BaseLoadStoreUI<sz, V, opc,
(outs), (ins prfop:$Rt, GPR64sp:$Rn, uimm12s8:$offset),
asm, pat>,
// Load literal
// Load literal address: 19-bit immediate. The low two bits of the target
// offset are implied zero and so are not part of the immediate.
// The operand is typed OPERAND_PCREL and uses dedicated encoder, decoder and
// printer methods together with the PCRelLabel19Operand parser class.
def am_ldrlit : Operand<iPTR> {
let EncoderMethod = "getLoadLiteralOpValue";
let DecoderMethod = "DecodePCRelLabel19";
let PrintMethod = "printAlignedLabel";
let ParserMatchClass = PCRelLabel19Operand;
let OperandType = "OPERAND_PCREL";
// Load from a PC-relative literal label, printed as "<asm> $Rt, $label".
//   opc     - 2-bit field placed in Inst{31-30}.
//   V       - single bit placed in Inst{26}.
//   regtype - register operand of the loaded value $Rt.
//   asm     - mnemonic.
//   pat     - list of DAG patterns forwarded to I<>.
let mayLoad = 1, mayStore = 0, hasSideEffects = 0, AddedComplexity = 20 in
class LoadLiteral<bits<2> opc, bit V, RegisterOperand regtype, string asm, list<dag> pat>
: I<(outs regtype:$Rt), (ins am_ldrlit:$label),
asm, "\t$Rt, $label", "", pat>,
Sched<[WriteLD]> {
bits<5> Rt;
bits<19> label;
let Inst{31-30} = opc;
let Inst{29-27} = 0b011;
let Inst{26} = V;
let Inst{25-24} = 0b00;
// The 19-bit label occupies Inst{23-5}.
let Inst{23-5} = label;
let Inst{4-0} = Rt;
// Prefetch from a PC-relative literal label. Same field layout as LoadLiteral,
// but with no outputs and the prefetch operand (prfop) in the $Rt slot.
// Modelled as having side effects rather than as a load or a store.
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
class PrefetchLiteral<bits<2> opc, bit V, string asm, list<dag> pat>
: I<(outs), (ins prfop:$Rt, am_ldrlit:$label),
asm, "\t$Rt, $label", "", pat>,
Sched<[WriteLD]> {
bits<5> Rt;
bits<19> label;
let Inst{31-30} = opc;
let Inst{29-27} = 0b011;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-5} = label;
let Inst{4-0} = Rt;
// Load/store register offset
// Register-offset addressing patterns with a 64-bit (X) offset register, one
// per access width. Each ComplexPattern produces four i64-typed operands and
// is matched by SelectAddrModeXRO<N>.
def ro_Xindexed8 : ComplexPattern<i64, 4, "SelectAddrModeXRO<8>", []>;
def ro_Xindexed16 : ComplexPattern<i64, 4, "SelectAddrModeXRO<16>", []>;
def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>;
def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>;
def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>;
// GIComplexOperandMatcher records naming selectAddrModeXRO<N>.
def gi_ro_Xindexed8 :
GIComplexOperandMatcher<s64, "selectAddrModeXRO<8>">,
def gi_ro_Xindexed16 :
GIComplexOperandMatcher<s64, "selectAddrModeXRO<16>">,
def gi_ro_Xindexed32 :
GIComplexOperandMatcher<s64, "selectAddrModeXRO<32>">,
def gi_ro_Xindexed64 :
GIComplexOperandMatcher<s64, "selectAddrModeXRO<64>">,
def gi_ro_Xindexed128 :
GIComplexOperandMatcher<s64, "selectAddrModeXRO<128>">,
// The same set for a 32-bit (W) offset register, matched by
// SelectAddrModeWRO<N>.
def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", []>;
def ro_Windexed16 : ComplexPattern<i64, 4, "SelectAddrModeWRO<16>", []>;
def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>;
def ro_Windexed64 : ComplexPattern<i64, 4, "SelectAddrModeWRO<64>", []>;
def ro_Windexed128 : ComplexPattern<i64, 4, "SelectAddrModeWRO<128>", []>;
// GIComplexOperandMatcher records naming selectAddrModeWRO<N>.
def gi_ro_Windexed8 :
GIComplexOperandMatcher<s64, "selectAddrModeWRO<8>">,
def gi_ro_Windexed16 :
GIComplexOperandMatcher<s64, "selectAddrModeWRO<16>">,
def gi_ro_Windexed32 :
GIComplexOperandMatcher<s64, "selectAddrModeWRO<32>">,
def gi_ro_Windexed64 :
GIComplexOperandMatcher<s64, "selectAddrModeWRO<64>">,
def gi_ro_Windexed128 :
GIComplexOperandMatcher<s64, "selectAddrModeWRO<128>">,
// Assembler operand class for the extend part of a register-offset address.
//   Reg   - "W" or "X", spliced into the class name, predicate and diagnostic.
//   Width - access width, also spliced into those names.
// The default render method is addMemExtendOperands.
class MemExtendOperand<string Reg, int Width> : AsmOperandClass {
let Name = "Mem" # Reg # "Extend" # Width;
let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">";
let RenderMethod = "addMemExtendOperands";
let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width;
// W-register variants. The 8-bit one overrides the render method.
def MemWExtend8Operand : MemExtendOperand<"W", 8> {
// The address "[x0, x1, lsl #0]" actually maps to the variant which performs
// the trivial shift.
let RenderMethod = "addMemExtend8Operands";
def MemWExtend16Operand : MemExtendOperand<"W", 16>;
def MemWExtend32Operand : MemExtendOperand<"W", 32>;
def MemWExtend64Operand : MemExtendOperand<"W", 64>;
def MemWExtend128Operand : MemExtendOperand<"W", 128>;
// X-register variants. The 8-bit one overrides the render method.
def MemXExtend8Operand : MemExtendOperand<"X", 8> {
// The address "[x0, x1, lsl #0]" actually maps to the variant which performs
// the trivial shift.
let RenderMethod = "addMemExtend8Operands";
def MemXExtend16Operand : MemExtendOperand<"X", 16>;
def MemXExtend32Operand : MemExtendOperand<"X", 32>;
def MemXExtend64Operand : MemExtendOperand<"X", 64>;
def MemXExtend128Operand : MemExtendOperand<"X", 128>;
// i32 operand describing the extend of a register-offset address.
//   ParserClass - assembler operand class used for parsing.
//   Reg         - register letter passed (as a character) to printMemExtend.
//   Width       - access width passed to printMemExtend.
// At the MachineInstr level the operand expands to two i32 immediates,
// $signed and $doshift.
class ro_extend<AsmOperandClass ParserClass, string Reg, int Width>
: Operand<i32> {
let ParserMatchClass = ParserClass;
let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">";
let DecoderMethod = "DecodeMemExtend";
let EncoderMethod = "getMemExtendOpValue";
let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift);
// One operand per register kind (W / X) and access width.
def ro_Wextend8 : ro_extend<MemWExtend8Operand, "w", 8>;
def ro_Wextend16 : ro_extend<MemWExtend16Operand, "w", 16>;
def ro_Wextend32 : ro_extend<MemWExtend32Operand, "w", 32>;
def ro_Wextend64 : ro_extend<MemWExtend64Operand, "w", 64>;
def ro_Wextend128 : ro_extend<MemWExtend128Operand, "w", 128>;
def ro_Xextend8 : ro_extend<MemXExtend8Operand, "x", 8>;
def ro_Xextend16 : ro_extend<MemXExtend16Operand, "x", 16>;
def ro_Xextend32 : ro_extend<MemXExtend32Operand, "x", 32>;
def ro_Xextend64 : ro_extend<MemXExtend64Operand, "x", 64>;
def ro_Xextend128 : ro_extend<MemXExtend128Operand, "x", 128>;
// Bundles, for one access width, the W- and X-offset ComplexPatterns with the
// matching W- and X-extend operands so they can be passed around as a unit.
class ROAddrMode<ComplexPattern windex, ComplexPattern xindex,
Operand wextend, Operand xextend> {
// CodeGen-level pattern covering the entire addressing mode.
ComplexPattern Wpat = windex;
ComplexPattern Xpat = xindex;
// Asm-level Operand covering the valid "uxtw #3" style syntax.
Operand Wext = wextend;
Operand Xext = xextend;
// One bundle per access width.
def ro8 : ROAddrMode<ro_Windexed8, ro_Xindexed8, ro_Wextend8, ro_Xextend8>;
def ro16 : ROAddrMode<ro_Windexed16, ro_Xindexed16, ro_Wextend16, ro_Xextend16>;
def ro32 : ROAddrMode<ro_Windexed32, ro_Xindexed32, ro_Wextend32, ro_Xextend32>;
def ro64 : ROAddrMode<ro_Windexed64, ro_Xindexed64, ro_Wextend64, ro_Xextend64>;
def ro128 : ROAddrMode<ro_Windexed128, ro_Xindexed128, ro_Wextend128,
// Shared encoding for 8-bit register-offset loads and stores, printed as
// "<asm> $Rt, [$Rn, $Rm, $extend]".
//   sz  - 2-bit field placed in Inst{31-30}.
//   V   - single bit placed in Inst{26}.
//   opc - 2-bit field placed in Inst{23-22}.
//   asm - mnemonic; pat - list of DAG patterns forwarded to I<>.
// NOTE(review): the dag parameters are named `ins` then `outs`, yet they are
// forwarded to I<> positionally and the multiclasses in this file pass the
// (outs ...) dag first and the (ins ...) dag second. The names are therefore
// the reverse of what each parameter actually carries.
// Inst{13} is not assigned here; the roW / roX defs set it to 0 / 1.
class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
bits<5> Rm;
bits<2> extend;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15} = extend{1}; // sign extend Rm?
let Inst{14} = 1;
let Inst{12} = extend{0}; // do shift?
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
// Alias accepting the register-offset syntax without an explicit extend,
// "<asm> $Rt, [$Rn, $Rm]", and mapping it onto INST with a 64-bit offset
// register and two trailing zero operands.
// NOTE(review): the two zeros presumably fill the $signed and $doshift
// sub-operands declared by ro_extend -- confirm.
class ROInstAlias<string asm, RegisterOperand regtype, Instruction INST>
: InstAlias<asm # "\t$Rt, [$Rn, $Rm]",
(INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
// 8-bit register-offset load. Defines roW (32-bit offset register,
// Inst{13} = 0) and roX (64-bit offset register, Inst{13} = 1), each with
// AddedComplexity = 10, plus an alias for the extend-less syntax that maps
// onto roX.
//   regtype - register operand of the loaded value $Rt.
//   Ty      - value type of the loaded value in the pattern.
//   loadop  - load operator matched by the pattern.
multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore8RO<sz, V, opc, regtype, asm,
(outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
let AddedComplexity = 10 in
def roX : LoadStore8RO<sz, V, opc, regtype, asm,
(outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
// 8-bit register-offset store. Defines roW (32-bit offset register,
// Inst{13} = 0) and roX (64-bit offset register, Inst{13} = 1), each with
// AddedComplexity = 10 and no outputs, plus an alias for the extend-less
// syntax that maps onto roX.
//   regtype - register operand of the stored value $Rt.
//   Ty      - value type of the stored value in the pattern.
//   storeop - store operator matched by the pattern.
multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
let AddedComplexity = 10 in
def roX : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
// Shared encoding for 16-bit register-offset loads and stores, printed as
// "<asm> $Rt, [$Rn, $Rm, $extend]". The field layout is the same as in
// LoadStore8RO.
// NOTE(review): the dag parameters are named `ins` then `outs`, yet they are
// forwarded to I<> positionally and the multiclasses in this file pass the
// (outs ...) dag first and the (ins ...) dag second.
// Inst{13} is not assigned here; the roW / roX defs set it to 0 / 1.
class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
bits<5> Rm;
bits<2> extend;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15} = extend{1}; // sign extend Rm?
let Inst{14} = 1;
let Inst{12} = extend{0}; // do shift?
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
// 16-bit register-offset load. Defines roW (32-bit offset register,
// Inst{13} = 0) and roX (64-bit offset register, Inst{13} = 1), each with
// AddedComplexity = 10, plus an alias for the extend-less syntax that maps
// onto roX.
multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
let AddedComplexity = 10 in
def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
// 16-bit register-offset store. Defines roW (32-bit offset register,
// Inst{13} = 0) and roX (64-bit offset register, Inst{13} = 1), each with
// AddedComplexity = 10 and no outputs, plus an alias for the extend-less
// syntax that maps onto roX.
multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
let AddedComplexity = 10 in
def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
// Shared encoding for 32-bit register-offset loads and stores, printed as
// "<asm> $Rt, [$Rn, $Rm, $extend]". The field layout is the same as in the
// other LoadStore*RO classes.
// NOTE(review): the dag parameters are named `ins` then `outs`, yet they are
// forwarded to I<> positionally and the multiclasses in this file pass the
// (outs ...) dag first and the (ins ...) dag second.
// Inst{13} is not assigned here; the roW / roX defs set it to 0 / 1.
class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
bits<5> Rm;
bits<2> extend;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15} = extend{1}; // sign extend Rm?
let Inst{14} = 1;
let Inst{12} = extend{0}; // do shift?
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
// 32-bit register-offset load. Defines roW (32-bit offset register,
// Inst{13} = 0) and roX (64-bit offset register, Inst{13} = 1), each with
// AddedComplexity = 10, plus an alias for the extend-less syntax that maps
// onto roX.
multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10 in
def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
let AddedComplexity = 10 in
def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
// 32-bit register-offset store. Defines roW (32-bit offset register,
// Inst{13} = 0) and roX (64-bit offset register, Inst{13} = 1), each with
// AddedComplexity = 10 and no outputs, plus an alias for the extend-less
// syntax that maps onto roX.
multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10 in
def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
let AddedComplexity = 10 in
def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
// Shared encoding for 64-bit register-offset loads and stores, printed as
// "<asm> $Rt, [$Rn, $Rm, $extend]". The field layout is the same as in the
// other LoadStore*RO classes.
// NOTE(review): the dag parameters are named `ins` then `outs`, yet they are
// forwarded to I<> positionally and the multiclasses in this file pass the
// (outs ...) dag first and the (ins ...) dag second.
// Inst{13} is not assigned here; the roW / roX defs set it to 0 / 1.
class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
bits<5> Rm;
bits<2> extend;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15} = extend{1}; // sign extend Rm?
let Inst{14} = 1;
let Inst{12} = extend{0}; // do shift?
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
// 64-bit register-offset load. Defines roW (32-bit offset register,
// Inst{13} = 0) and roX (64-bit offset register, Inst{13} = 1), plus an alias
// for the extend-less syntax that maps onto roX. Both defs carry
// AddedComplexity = 10 and are explicitly marked mayLoad with no store and no
// side effects.
multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
// 64-bit register-offset store. Defines roW (32-bit offset register,
// Inst{13} = 0) and roX (64-bit offset register, Inst{13} = 1), plus an alias
// for the extend-less syntax that maps onto roX. Both defs carry
// AddedComplexity = 10 and are explicitly marked mayStore with no load and no
// side effects.
multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
[(storeop (Ty regtype:$Rt),
(ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
// Shared encoding for 128-bit register-offset loads and stores, printed as
// "<asm> $Rt, [$Rn, $Rm, $extend]". The field layout is the same as in the
// other LoadStore*RO classes.
// NOTE(review): the dag parameters are named `ins` then `outs`, yet they are
// forwarded to I<> positionally and the multiclasses in this file pass the
// (outs ...) dag first and the (ins ...) dag second.
// Inst{13} is not assigned here; the roW / roX defs set it to 0 / 1.
class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, dag ins, dag outs, list<dag> pat>
: I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
bits<5> Rt;
bits<5> Rn;
bits<5> Rm;
bits<2> extend;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15} = extend{1}; // sign extend Rm?
let Inst{14} = 1;
let Inst{12} = extend{0}; // do shift?
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
// 128-bit register-offset load. Defines roW (32-bit offset register,
// Inst{13} = 0) and roX (64-bit offset register, Inst{13} = 1), plus an alias
// for the extend-less syntax that maps onto roX. Both defs carry
// AddedComplexity = 10 and are explicitly marked mayLoad with no store and no
// side effects.
multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator loadop> {
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
(ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
[(set (Ty regtype:$Rt),
(loadop (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
Sched<[WriteLDIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
// 128-bit register-offset store. Defines roW (32-bit offset register,
// Inst{13} = 0) and roX (64-bit offset register, Inst{13} = 1), plus an alias
// for the extend-less syntax that maps onto roX. Both defs are marked mayStore
// with no load and no side effects.
// Unlike the narrower Store*RO multiclasses, the `let` lists here do not set
// AddedComplexity.
multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, ValueType Ty, SDPatternOperator storeop> {
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b0;
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
Sched<[WriteSTIdx, ReadAdrBase]> {
let Inst{13} = 0b1;
def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
// Shared encoding for register-offset prefetch, printed as
// "<asm> $Rt, [$Rn, $Rm, $extend]". The field layout matches the
// LoadStore*RO classes; here the dag parameters are named in the order they
// are forwarded to I<> (outs, then ins).
// Modelled as having side effects rather than as a load or a store.
// Inst{13} is not assigned here; the roW / roX defs set it to 0 / 1.
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
class BasePrefetchRO<bits<2> sz, bit V, bits<2> opc, dag outs, dag ins,
string asm, list<dag> pat>
: I<outs, ins, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat>,
Sched<[WriteLD]> {
bits<5> Rt;
bits<5> Rn;
bits<5> Rm;
bits<2> extend;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15} = extend{1}; // sign extend Rm?
let Inst{14} = 1;
let Inst{12} = extend{0}; // do shift?
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
// Register-offset prefetch. Defines roW (32-bit offset register,
// Inst{13} = 0) and roX (64-bit offset register, Inst{13} = 1), both matching
// AArch64Prefetch and both using the 64-bit-width addressing patterns and
// extend operands (ro_*indexed64 / ro_*extend64).
multiclass PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm> {
def roW : BasePrefetchRO<sz, V, opc, (outs),
(ins prfop:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
asm, [(AArch64Prefetch imm:$Rt,
(ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend64:$extend))]> {
let Inst{13} = 0b0;
def roX : BasePrefetchRO<sz, V, opc, (outs),
(ins prfop:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
asm, [(AArch64Prefetch imm:$Rt,
(ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend64:$extend))]> {
let Inst{13} = 0b1;
// The extend-less syntax maps onto roX with both trailing operands set to 0.
// Note the alias string hard-codes the "prfm" mnemonic rather than using `asm`.
def : InstAlias<"prfm $Rt, [$Rn, $Rm]",
(!cast<Instruction>(NAME # "roX") prfop:$Rt,
GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
// Load/store unscaled immediate
// Unscaled-immediate addressing patterns, one per access width. Each
// ComplexPattern produces two i64-typed operands and is matched by
// SelectAddrModeUnscaled<N>.
def am_unscaled8 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled8", []>;
def am_unscaled16 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled16", []>;
def am_unscaled32 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled32", []>;
def am_unscaled64 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled64", []>;
def am_unscaled128 :ComplexPattern<i64, 2, "SelectAddrModeUnscaled128", []>;
// GIComplexOperandMatcher records naming selectAddrModeUnscaled<N>.
def gi_am_unscaled8 :
GIComplexOperandMatcher<s64, "selectAddrModeUnscaled8">,
def gi_am_unscaled16 :
GIComplexOperandMatcher<s64, "selectAddrModeUnscaled16">,
def gi_am_unscaled32 :
GIComplexOperandMatcher<s64, "selectAddrModeUnscaled32">,
def gi_am_unscaled64 :
GIComplexOperandMatcher<s64, "selectAddrModeUnscaled64">,
def gi_am_unscaled128 :
GIComplexOperandMatcher<s64, "selectAddrModeUnscaled128">,
// Shared encoding for load/store with a 9-bit unscaled immediate offset,
// printed as "<asm> $Rt, [$Rn, $offset]".
//   sz      - 2-bit field placed in Inst{31-30}.
//   V       - single bit placed in Inst{26}.
//   opc     - 2-bit field placed in Inst{23-22}.
//   oops    - output operand dag.
//   iops    - input operand dag.
//   asm     - mnemonic.
//   pattern - list of DAG patterns forwarded to I<>.
class BaseLoadStoreUnscale<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
string asm, list<dag> pattern>
: I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
bits<5> Rt;
bits<5> Rn;
bits<9> offset;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 0;
let Inst{20-12} = offset;
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeSignedLdStInstruction";
// Armv8.4 LDAPR & STLR with Immediate Offset instruction
// Load form. Reuses BaseLoadStoreUnscale with V = 0 and no selection pattern,
// then overrides two bits of the base encoding: Inst{29} becomes 0 and
// Inst{24} becomes 1. An alias accepts "<asm> $Rt, [$Rn]" with offset 0.
// NOTE(review): this load form is scheduled with WriteST, the same write
// resource used by the store form below -- confirm that is intended.
multiclass BaseLoadUnscaleV84<string asm, bits<2> sz, bits<2> opc,
RegisterOperand regtype > {
def i : BaseLoadStoreUnscale<sz, 0, opc, (outs regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset), asm, []>,
Sched<[WriteST]> {
let Inst{29} = 0;
let Inst{24} = 1;
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
// Store counterpart of BaseLoadUnscaleV84. Reuses BaseLoadStoreUnscale with
// V = 0, no outputs and no selection pattern, then overrides two bits of the
// base encoding: Inst{29} becomes 0 and Inst{24} becomes 1. An alias accepts
// "<asm> $Rt, [$Rn]" with offset 0.
multiclass BaseStoreUnscaleV84<string asm, bits<2> sz, bits<2> opc,
RegisterOperand regtype > {
def i : BaseLoadStoreUnscale<sz, 0, opc, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
asm, []>,
Sched<[WriteST]> {
let Inst{29} = 0;
let Inst{24} = 1;
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
// Load with unscaled simm9 offset. Defines the instruction `i` and an alias
// that accepts "<asm> $Rt, [$Rn]" by supplying an offset of 0.
//   regtype - register operand of the loaded value $Rt.
//   pattern - list of DAG patterns forwarded to the base class.
multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, list<dag> pattern> {
let AddedComplexity = 1 in // try this before LoadUI
def i : BaseLoadStoreUnscale<sz, V, opc, (outs regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset), asm, pattern>,
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
// Store with unscaled simm9 offset. Defines the instruction `i` (no outputs)
// and an alias that accepts "<asm> $Rt, [$Rn]" by supplying an offset of 0.
//   regtype - register operand of the stored value $Rt.
//   pattern - list of DAG patterns forwarded to the base class.
multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, list<dag> pattern> {
let AddedComplexity = 1 in // try this before StoreUI
def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
asm, pattern>,
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
// Prefetch form of BaseLoadStoreUnscale: the $Rt slot carries a prfop operand
// instead of a register. The def is flagged as neither load nor store but as
// having side effects. An alias accepts "[$Rn]" with offset 0.
multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm,
list<dag> pat> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
(ins prfop:$Rt, GPR64sp:$Rn, simm9:$offset),
asm, pat>,
// NOTE(review): the def above ends with a trailing comma; the line that
// completes it is not visible here - confirm against the upstream file.
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>;
// Load/store unscaled immediate, unprivileged
// Shared encoding class for the unprivileged "\t$Rt, [$Rn, $offset]" forms.
// The bit layout matches BaseLoadStoreUnscale except that bits 11-10 are 0b10,
// and no selection pattern is accepted (an empty list is passed to I).
class BaseLoadStoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
dag oops, dag iops, string asm>
: I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", []> {
// Operand fields that are placed into the encoding below.
bits<5> Rt;
bits<5> Rn;
bits<9> offset;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 0;
let Inst{20-12} = offset;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeSignedLdStInstruction";
// Load form of BaseLoadStoreUnprivileged: instruction `i` produces $Rt from
// [$Rn, simm9 $offset] and is flagged mayLoad only; an alias accepts "[$Rn]"
// with offset 0.
multiclass LoadUnprivileged<bits<2> sz, bit V, bits<2> opc,
RegisterClass regtype, string asm> {
let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in
def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset), asm>,
// NOTE(review): the def above ends with a trailing comma; the line that
// completes it is not visible here - confirm against the upstream file.
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
// Store form of BaseLoadStoreUnprivileged: instruction `i` has no outputs, is
// flagged mayStore only, and an alias accepts "[$Rn]" with offset 0.
multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
RegisterClass regtype, string asm> {
let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
// NOTE(review): the argument list of `i` is not closed in this view; the
// remaining line(s) appear to be missing - confirm against the upstream file.
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
// Load/store pre-indexed
// Shared encoding class for the pre-indexed "\t$Rt, [$Rn, $offset]!" forms.
// In addition to sz/V/opc and the operand dags it forwards a constraint string
// (cstr) and a pattern list to I. Bits 11-10 are 0b11 in this format.
class BaseLoadStorePreIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
string asm, string cstr, list<dag> pat>
: I<oops, iops, asm, "\t$Rt, [$Rn, $offset]!", cstr, pat> {
// Operand fields that are placed into the encoding below.
bits<5> Rt;
bits<5> Rn;
bits<9> offset;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0;
let Inst{23-22} = opc;
let Inst{21} = 0;
let Inst{20-12} = offset;
let Inst{11-10} = 0b11;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeSignedLdStInstruction";
// Pre-indexed load and store classes; both are declared with
// hasSideEffects = 0 and tie $Rn to the extra $wback output, which is also
// marked earlyclobber.
let hasSideEffects = 0 in {
// Load: outputs are $wback and the loaded register $Rt; no selection pattern.
let mayStore = 0, mayLoad = 1 in
class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm>
: BaseLoadStorePreIdx<sz, V, opc,
(outs GPR64sp:$wback, regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset), asm,
"$Rn = $wback,@earlyclobber $wback", []>,
Sched<[WriteLD, WriteAdr]>;
// Store: the only output is $wback; the pattern sets it from
// storeop applied to the value (typed as Ty), the base and the offset.
let mayStore = 1, mayLoad = 0 in
class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, SDPatternOperator storeop, ValueType Ty>
: BaseLoadStorePreIdx<sz, V, opc,
(outs GPR64sp:$wback),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
asm, "$Rn = $wback,@earlyclobber $wback",
[(set GPR64sp:$wback,
(storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
Sched<[WriteAdr, WriteST]>;
} // hasSideEffects = 0
// Load/store post-indexed
// Shared encoding class for the post-indexed "\t$Rt, [$Rn], $offset" forms.
// Same field layout as the pre-indexed base class, with bits 11-10 = 0b01.
class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
string asm, string cstr, list<dag> pat>
: I<oops, iops, asm, "\t$Rt, [$Rn], $offset", cstr, pat> {
// Operand fields that are placed into the encoding below.
bits<5> Rt;
bits<5> Rn;
bits<9> offset;
let Inst{31-30} = sz;
let Inst{29-27} = 0b111;
let Inst{26} = V;
let Inst{25-24} = 0b00;
let Inst{23-22} = opc;
let Inst{21} = 0b0;
let Inst{20-12} = offset;
let Inst{11-10} = 0b01;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodeSignedLdStInstruction";
// Post-indexed load and store classes; both are declared with
// hasSideEffects = 0 and tie $Rn to the extra $wback output, which is also
// marked earlyclobber.
let hasSideEffects = 0 in {
// Load: outputs are $wback and the loaded register $Rt; no selection pattern.
let mayStore = 0, mayLoad = 1 in
class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm>
: BaseLoadStorePostIdx<sz, V, opc,
(outs GPR64sp:$wback, regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset),
asm, "$Rn = $wback,@earlyclobber $wback", []>,
Sched<[WriteLD, WriteAdr]>;
// Store: the only output is $wback; the pattern sets it from
// storeop applied to the value (typed as Ty), the base and the offset.
let mayStore = 1, mayLoad = 0 in
class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype,
string asm, SDPatternOperator storeop, ValueType Ty>
: BaseLoadStorePostIdx<sz, V, opc,
(outs GPR64sp:$wback),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
asm, "$Rn = $wback,@earlyclobber $wback",
[(set GPR64sp:$wback,
(storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
Sched<[WriteAdr, WriteST]>;
} // hasSideEffects = 0
// Load/store pair
// (indexed, offset)
// Shared encoding class for the two-register "\t$Rt, $Rt2, [$Rn, $offset]"
// forms. Template arguments: opc (bits 31-30), V (bit 26), L (bit 22), the
// operand dags and the mnemonic. The offset field is 7 bits wide.
class BaseLoadStorePairOffset<bits<2> opc, bit V, bit L, dag oops, dag iops,
string asm>
: I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
// Operand fields that are placed into the encoding below.
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
bits<7> offset;
let Inst{31-30} = opc;
let Inst{29-27} = 0b101;
let Inst{26} = V;
// Bits 25-23 are 0b010 in this (signed-offset) pair format.
let Inst{25-23} = 0b010;
let Inst{22} = L;
let Inst{21-15} = offset;
let Inst{14-10} = Rt2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodePairLdStInstruction";
// Load form of BaseLoadStorePairOffset (L = 1): instruction `i` produces $Rt
// and $Rt2 from [$Rn, $offset]; an alias accepts "[$Rn]" with offset 0.
multiclass LoadPairOffset<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
def i : BaseLoadStorePairOffset<opc, V, 1,
(outs regtype:$Rt, regtype:$Rt2),
(ins GPR64sp:$Rn, indextype:$offset), asm>,
Sched<[WriteLD, WriteLDHi]>;
def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, 0)>;
// Store form of BaseLoadStorePairOffset (L = 0): instruction `i` has no
// outputs and takes $Rt, $Rt2, $Rn and $offset; an alias accepts "[$Rn]" with
// offset 0.
multiclass StorePairOffset<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
def i : BaseLoadStorePairOffset<opc, V, 0, (outs),
(ins regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, indextype:$offset),
// NOTE(review): the argument list of `i` is not closed in this view; the
// remaining line(s) appear to be missing - confirm against the upstream file.
def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, 0)>;
// (pre-indexed)
// Shared encoding class for the pre-indexed pair form
// "\t$Rt, $Rt2, [$Rn, $offset]!". The constraint tying $Rn to $wback (and
// marking $wback earlyclobber) is fixed here rather than passed in.
class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
string asm>
: I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback,@earlyclobber $wback", []> {
// Operand fields that are placed into the encoding below.
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
bits<7> offset;
let Inst{31-30} = opc;
let Inst{29-27} = 0b101;
let Inst{26} = V;
// Bits 25-23 are 0b011 in this (pre-indexed) pair format.
let Inst{25-23} = 0b011;
let Inst{22} = L;
let Inst{21-15} = offset;
let Inst{14-10} = Rt2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodePairLdStInstruction";
// Pre-indexed pair load and store classes, both declared with
// hasSideEffects = 0 and both producing the $wback output.
let hasSideEffects = 0 in {
// Load (L = 1): outputs are $wback, $Rt and $Rt2.
let mayStore = 0, mayLoad = 1 in
class LoadPairPreIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm>
: BaseLoadStorePairPreIdx<opc, V, 1,
(outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
(ins GPR64sp:$Rn, indextype:$offset), asm>,
Sched<[WriteLD, WriteLDHi, WriteAdr]>;
// Store (L = 0): the only output is $wback.
let mayStore = 1, mayLoad = 0 in
class StorePairPreIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand indextype, string asm>
: BaseLoadStorePairPreIdx<opc, V, 0, (outs GPR64sp:$wback),
(ins regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, indextype:$offset),
// NOTE(review): the base-class argument list is not closed before the Sched
// line in this view; a line appears to be missing - confirm against upstream.
Sched<[WriteAdr, WriteSTP]>;
} // hasSideEffects = 0
// (post-indexed)
// Shared encoding class for the post-indexed pair form
// "\t$Rt, $Rt2, [$Rn], $offset". The constraint tying $Rn to $wback (and
// marking $wback earlyclobber) is fixed here rather than passed in.
class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
string asm>
: I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback,@earlyclobber $wback", []> {
// Operand fields that are placed into the encoding below.
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
bits<7> offset;
let Inst{31-30} = opc;
let Inst{29-27} = 0b101;
let Inst{26} = V;
// Bits 25-23 are 0b001 in this (post-indexed) pair format.
let Inst{25-23} = 0b001;
let Inst{22} = L;
let Inst{21-15} = offset;
let Inst{14-10} = Rt2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodePairLdStInstruction";
// Post-indexed pair load and store classes, both declared with
// hasSideEffects = 0 and both producing the $wback output.
let hasSideEffects = 0 in {
// Load (L = 1): outputs are $wback, $Rt and $Rt2.
let mayStore = 0, mayLoad = 1 in
class LoadPairPostIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand idxtype, string asm>
: BaseLoadStorePairPostIdx<opc, V, 1,
(outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
(ins GPR64sp:$Rn, idxtype:$offset), asm>,
Sched<[WriteLD, WriteLDHi, WriteAdr]>;
// Store (L = 0): the only output is $wback.
let mayStore = 1, mayLoad = 0 in
class StorePairPostIdx<bits<2> opc, bit V, RegisterOperand regtype,
Operand idxtype, string asm>
: BaseLoadStorePairPostIdx<opc, V, 0, (outs GPR64sp:$wback),
(ins regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, idxtype:$offset),
// NOTE(review): the base-class argument list is not closed before the Sched
// line in this view; a line appears to be missing - confirm against upstream.
Sched<[WriteAdr, WriteSTP]>;
} // hasSideEffects = 0
// (no-allocate)
// Shared encoding class for the no-allocate pair form
// "\t$Rt, $Rt2, [$Rn, $offset]". Same field layout as the other pair base
// classes, with bits 25-23 = 0b000.
class BaseLoadStorePairNoAlloc<bits<2> opc, bit V, bit L, dag oops, dag iops,
string asm>
: I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
// Operand fields that are placed into the encoding below.
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
bits<7> offset;
let Inst{31-30} = opc;
let Inst{29-27} = 0b101;
let Inst{26} = V;
let Inst{25-23} = 0b000;
let Inst{22} = L;
let Inst{21-15} = offset;
let Inst{14-10} = Rt2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let DecoderMethod = "DecodePairLdStInstruction";
// Load form of BaseLoadStorePairNoAlloc (L = 1): instruction `i` produces $Rt
// and $Rt2 from [$Rn, $offset]; an alias accepts "[$Rn]" with offset 0.
multiclass LoadPairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
def i : BaseLoadStorePairNoAlloc<opc, V, 1,
(outs regtype:$Rt, regtype:$Rt2),
(ins GPR64sp:$Rn, indextype:$offset), asm>,
Sched<[WriteLD, WriteLDHi]>;
def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, 0)>;
// Store form of BaseLoadStorePairNoAlloc (L = 0): instruction `i` has no
// outputs and takes $Rt, $Rt2, $Rn and $offset; an alias accepts "[$Rn]" with
// offset 0.
multiclass StorePairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
Operand indextype, string asm> {
let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in
def i : BaseLoadStorePairNoAlloc<opc, V, 0, (outs),
(ins regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, indextype:$offset),
// NOTE(review): the argument list of `i` is not closed in this view; the
// remaining line(s) appear to be missing - confirm against the upstream file.
def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
(!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
GPR64sp:$Rn, 0)>;
// Load/store exclusive
// True exclusive operations write to and/or read from the system's exclusive
// monitors, which as far as a compiler is concerned can be modelled as a
// random shared memory address. Hence LoadExclusive mayStore.
// Since these instructions have the undefined register bits set to 1 in
// their canonical form, we need a post encoder method to set those bits
// to 1 when encoding these instructions. We do this using the
// fixLoadStoreExclusive function. This function has template parameters:
// fixLoadStoreExclusive<int hasRs, int hasRt2>
// hasRs indicates that the instruction uses the Rs field, so we won't set
// it to 1 (and the same for Rt2). We don't need template parameters for
// the other register fields since Rt and Rn are always used.
// Root class for the exclusive / acquire / release forms below. It is declared
// with hasSideEffects, mayLoad and mayStore all set; subclasses narrow these.
// Only the size, fixed opcode bits and the o2/L/o1/o0 selector bits are encoded
// here; register fields are assigned by the subclasses.
let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
class BaseLoadStoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
dag oops, dag iops, string asm, string operands>
: I<oops, iops, asm, operands, "", []> {
let Inst{31-30} = sz;
let Inst{29-24} = 0b001000;
let Inst{23} = o2;
let Inst{22} = L;
let Inst{21} = o1;
let Inst{15} = o0;
let DecoderMethod = "DecodeExclusiveLdStInstruction";
// Neither Rs nor Rt2 operands.
// Variant with only $Rt and $Rn operands. The two unused 5-bit register slots
// (bits 20-16 and 14-10) are encoded as all ones and flagged Unpredictable;
// the post-encoder is instantiated as fixLoadStoreExclusive<0,0>.
class LoadStoreExclusiveSimple<bits<2> sz, bit o2, bit L, bit o1, bit o0,
dag oops, dag iops, string asm, string operands>
: BaseLoadStoreExclusive<sz, o2, L, o1, o0, oops, iops, asm, operands> {
bits<5> Rt;
bits<5> Rn;
let Inst{20-16} = 0b11111;
let Unpredictable{20-16} = 0b11111;
let Inst{14-10} = 0b11111;
let Unpredictable{14-10} = 0b11111;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
// Simple load acquires don't set the exclusive monitor
// Load with a single result $Rt from [$Rn]; flagged mayLoad only, overriding
// the mayStore default inherited from the exclusive base class.
let mayLoad = 1, mayStore = 0 in
class LoadAcquire<bits<2> sz, bit o2, bit L, bit o1, bit o0,
RegisterClass regtype, string asm>
: LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
(ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
// NOTE(review): the class header ends with a trailing comma; the line that
// completes it is not visible here - confirm against the upstream file.
// Load with a single result $Rt from [$Rn], built on the simple ($Rt/$Rn only)
// exclusive encoding. No mayLoad/mayStore override is visible on this class.
class LoadExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
RegisterClass regtype, string asm>
: LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
(ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
// NOTE(review): the class header ends with a trailing comma; the line that
// completes it is not visible here - confirm against the upstream file.
// Two-result load ($Rt, $Rt2) from [$Rn]. Rt2 occupies bits 14-10; bits 20-16
// are not assigned here, and the post-encoder is instantiated as
// fixLoadStoreExclusive<0,1>.
class LoadExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
RegisterClass regtype, string asm>
: BaseLoadStoreExclusive<sz, o2, L, o1, o0,
(outs regtype:$Rt, regtype:$Rt2),
(ins GPR64sp0:$Rn), asm,
"\t$Rt, $Rt2, [$Rn]">,
Sched<[WriteLD, WriteLDHi]> {
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
let Inst{14-10} = Rt2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let PostEncoderMethod = "fixLoadStoreExclusive<0,1>";
// Simple store release operations do not check the exclusive monitor.
// Store of $Rt to [$Rn] with no outputs; flagged mayStore only, overriding the
// mayLoad default inherited from the exclusive base class.
let mayLoad = 0, mayStore = 1 in
class StoreRelease<bits<2> sz, bit o2, bit L, bit o1, bit o0,
RegisterClass regtype, string asm>
: LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs),
(ins regtype:$Rt, GPR64sp0:$Rn),
asm, "\t$Rt, [$Rn]">,
// NOTE(review): the class header ends with a trailing comma; the line that
// completes it is not visible here - confirm against the upstream file.
// Store of $Rt to [$Rn] that also produces a 32-bit result $Ws, encoded in
// bits 20-16 and marked earlyclobber. Flagged as both mayLoad and mayStore.
// Bits 14-10 are not assigned here; the post-encoder is
// fixLoadStoreExclusive<1,0>.
let mayLoad = 1, mayStore = 1 in
class StoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
RegisterClass regtype, string asm>
: BaseLoadStoreExclusive<sz, o2, L, o1, o0, (outs GPR32:$Ws),
(ins regtype:$Rt, GPR64sp0:$Rn),
asm, "\t$Ws, $Rt, [$Rn]">,
Sched<[WriteSTX]> {
bits<5> Ws;
bits<5> Rt;
bits<5> Rn;
let Inst{20-16} = Ws;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let Constraints = "@earlyclobber $Ws";
let PostEncoderMethod = "fixLoadStoreExclusive<1,0>";
// Two-register store ($Rt, $Rt2) to [$Rn] that also produces a 32-bit result
// $Ws (earlyclobber). Both optional register slots are used: Ws in bits 20-16
// and Rt2 in bits 14-10. No PostEncoderMethod is set in the visible body.
class StoreExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
RegisterClass regtype, string asm>
: BaseLoadStoreExclusive<sz, o2, L, o1, o0,
(outs GPR32:$Ws),
(ins regtype:$Rt, regtype:$Rt2, GPR64sp0:$Rn),
asm, "\t$Ws, $Rt, $Rt2, [$Rn]">,
Sched<[WriteSTX]> {
bits<5> Ws;
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
let Inst{20-16} = Ws;
let Inst{14-10} = Rt2;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
let Constraints = "@earlyclobber $Ws";
// Armv8.5-A Memory Tagging Extension
// Root encoding class for the memory-tagging forms below. It fixes bits 31-24
// and bit 21, places opc1 in bits 23-22, opc2 in bits 11-10 and Rn in bits
// 9-5, and leaves bits 20-12 and 4-0 to subclasses. The scheduling list is
// empty.
class BaseMemTag<bits<2> opc1, bits<2> opc2, string asm_insn,
string asm_opnds, string cstr, dag oops, dag iops>
: I<oops, iops, asm_insn, asm_opnds, cstr, []>,
Sched<[]> {
bits<5> Rn;
let Inst{31-24} = 0b11011001;
let Inst{23-22} = opc1;
let Inst{21} = 1;
// Inst{20-12} defined by subclass
let Inst{11-10} = opc2;
let Inst{9-5} = Rn;
// Inst{4-0} defined by subclass
// BaseMemTag variant with no immediate: bits 20-12 are zero and Rt fills bits
// 4-0. The Load bit forms the low bit of opc1 ({0b1, Load}) and is also used
// as the mayLoad flag.
class MemTagVector<bit Load, string asm_insn, string asm_opnds,
dag oops, dag iops>
: BaseMemTag<{0b1, Load}, 0b00, asm_insn, asm_opnds,
"", oops, iops> {
bits<5> Rt;
let Inst{20-12} = 0b000000000;
let Inst{4-0} = Rt;
let mayLoad = Load;
// BaseMemTag load variant (opc1 = 0b01, opc2 = 0b00). $Rt is both an input and,
// through the "$Rt = $wback" constraint, the $wback output. The simm9s16
// $offset operand is stored as a 9-bit field in bits 20-12.
class MemTagLoad<string asm_insn, string asm_opnds>
: BaseMemTag<0b01, 0b00, asm_insn, asm_opnds, "$Rt = $wback",
(outs GPR64:$wback),
(ins GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)> {
bits<5> Rt;
bits<9> offset;
let Inst{20-12} = offset;
let Inst{4-0} = Rt;
let mayLoad = 1;
// BaseMemTag store variant: forwards all template arguments to BaseMemTag,
// adds Rt (bits 4-0) and a 9-bit offset (bits 20-12), and sets mayStore.
class BaseMemTagStore<bits<2> opc1, bits<2> opc2, string asm_insn,
string asm_opnds, string cstr, dag oops, dag iops>
: BaseMemTag<opc1, opc2, asm_insn, asm_opnds, cstr, oops, iops> {
bits<5> Rt;
bits<9> offset;
let Inst{20-12} = offset;
let Inst{4-0} = Rt;
let mayStore = 1;
// Emits three addressing forms of a BaseMemTagStore instruction, selected by
// opc2, plus an alias that accepts "[$Rn]" and maps to the Offset form with
// offset 0.
multiclass MemTagStore<bits<2> opc1, string insn> {
// Offset form (opc2 = 0b10): no outputs, no constraint.
def Offset :
BaseMemTagStore<opc1, 0b10, insn, "\t$Rt, [$Rn, $offset]", "",
(outs), (ins GPR64sp:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;
// Pre-index form (opc2 = 0b11): $Rn is tied to the $wback output.
def PreIndex :
BaseMemTagStore<opc1, 0b11, insn, "\t$Rt, [$Rn, $offset]!",
"$Rn = $wback",
(outs GPR64sp:$wback),
(ins GPR64sp:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;
// Post-index form (opc2 = 0b01): $Rn is tied to the $wback output.
def PostIndex :
BaseMemTagStore<opc1, 0b01, insn, "\t$Rt, [$Rn], $offset",
"$Rn = $wback",
(outs GPR64sp:$wback),
(ins GPR64sp:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;
def : InstAlias<insn # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "Offset") GPR64sp:$Rt, GPR64sp:$Rn, 0)>;
// Exception generation
// Instruction with a single 16-bit immediate operand and no outputs. op1 goes
// in bits 23-21, the immediate in bits 20-5 and ll in bits 1-0; bits 31-24 and
// 4-2 are fixed. Flagged as having side effects but no memory access.
let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
: I<(outs), (ins i32_imm0_65535:$imm), asm, "\t$imm", "", []>,
Sched<[WriteSys]> {
bits<16> imm;
let Inst{31-24} = 0b11010100;
let Inst{23-21} = op1;
let Inst{20-5} = imm;
let Inst{4-2} = 0b000;
let Inst{1-0} = ll;
// UDF : Permanently UNDEFINED instructions. Format: Opc = 0x0000, 16 bit imm.
// Trap-flagged instruction with a single uimm16 operand: the upper half of the
// word (bits 31-16) is the opc template argument and the lower half (bits
// 15-0) is the immediate. The scheduling list is empty.
let hasSideEffects = 1, isTrap = 1, mayLoad = 0, mayStore = 0 in {
class UDFType<bits<16> opc, string asm>
: I<(outs), (ins uimm16:$imm),
asm, "\t$imm", "", []>,
Sched<[]> {
bits<16> imm;
let Inst{31-16} = opc;
let Inst{15-0} = imm;
let Predicates = [HasFPARMv8] in {
// Floating point to integer conversion
// Encoding class for "\t$Rd, $Rn" conversions from srcType to dstType with no
// scale operand: bit 21 is 1 and bits 15-10 are zero. Bit 31 is not assigned
// here; the defs that instantiate this class set it.
class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode,
RegisterClass srcType, RegisterClass dstType,
string asm, list<dag> pattern>
: I<(outs dstType:$Rd), (ins srcType:$Rn),
asm, "\t$Rd, $Rn", "", pattern>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
let Inst{30-29} = 0b00;
let Inst{28-24} = 0b11110;
let Inst{23-22} = type;
let Inst{21} = 1;
let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
let Inst{15-10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Encoding class for "\t$Rd, $Rn, $scale" conversions from srcType to dstType:
// bit 21 is 0 and the 6-bit scale operand fills bits 15-10. Bit 31 is not
// assigned here; the defs that instantiate this class set it.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode,
RegisterClass srcType, RegisterClass dstType,
Operand immType, string asm, list<dag> pattern>
: I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
asm, "\t$Rd, $Rn, $scale", "", pattern>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
bits<6> scale;
let Inst{30-29} = 0b00;
let Inst{28-24} = 0b11110;
let Inst{23-22} = type;
let Inst{21} = 0;
let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
let Inst{15-10} = scale;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Instantiates BaseFPToIntegerUnscaled for every pairing of source FPR16 /
// FPR32 / FPR64 with destination GPR32 / GPR64. The type argument is 0b11,
// 0b00 and 0b01 for the three source classes; bit 31 is 0 for GPR32 results and
// 1 for GPR64 results. The FPR16 variants additionally require HasFullFP16.
// Each pattern applies OpN to the source register.
multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
SDPatternOperator OpN> {
// Unscaled half-precision to 32-bit
def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm,
[(set GPR32:$Rd, (OpN (f16 FPR16:$Rn)))]> {
let Inst{31} = 0; // 32-bit GPR flag
let Predicates = [HasFullFP16];
// Unscaled half-precision to 64-bit
def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm,
[(set GPR64:$Rd, (OpN (f16 FPR16:$Rn)))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Predicates = [HasFullFP16];
// Unscaled single-precision to 32-bit
def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm,
[(set GPR32:$Rd, (OpN FPR32:$Rn))]> {
let Inst{31} = 0; // 32-bit GPR flag
// Unscaled single-precision to 64-bit
def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm,
[(set GPR64:$Rd, (OpN FPR32:$Rn))]> {
let Inst{31} = 1; // 64-bit GPR flag
// Unscaled double-precision to 32-bit
def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm,
[(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> {
let Inst{31} = 0; // 32-bit GPR flag
// Unscaled double-precision to 64-bit
def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm,
[(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> {
let Inst{31} = 1; // 64-bit GPR flag
// Instantiates BaseFPToInteger for every pairing of source FPR16 / FPR32 /
// FPR64 with destination GPR32 / GPR64. Each pattern applies OpN to the
// product (fmul) of the source register and the fixed-point $scale operand.
// The GPR32 variants also force the top bit of the scale field (scale{5}) to
// 1; the FPR16 variants require HasFullFP16.
multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
SDPatternOperator OpN> {
// Scaled half-precision to 32-bit
def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32,
fixedpoint_f16_i32, asm,
[(set GPR32:$Rd, (OpN (fmul (f16 FPR16:$Rn),
fixedpoint_f16_i32:$scale)))]> {
let Inst{31} = 0; // 32-bit GPR flag
let scale{5} = 1;
let Predicates = [HasFullFP16];
// Scaled half-precision to 64-bit
def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64,
fixedpoint_f16_i64, asm,
[(set GPR64:$Rd, (OpN (fmul (f16 FPR16:$Rn),
fixedpoint_f16_i64:$scale)))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Predicates = [HasFullFP16];
// Scaled single-precision to 32-bit
def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32,
fixedpoint_f32_i32, asm,
[(set GPR32:$Rd, (OpN (fmul FPR32:$Rn,
fixedpoint_f32_i32:$scale)))]> {
let Inst{31} = 0; // 32-bit GPR flag
let scale{5} = 1;
// Scaled single-precision to 64-bit
def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64,
fixedpoint_f32_i64, asm,
[(set GPR64:$Rd, (OpN (fmul FPR32:$Rn,
fixedpoint_f32_i64:$scale)))]> {
let Inst{31} = 1; // 64-bit GPR flag
// Scaled double-precision to 32-bit
def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32,
fixedpoint_f64_i32, asm,
[(set GPR32:$Rd, (OpN (fmul FPR64:$Rn,
fixedpoint_f64_i32:$scale)))]> {
let Inst{31} = 0; // 32-bit GPR flag
let scale{5} = 1;
// Scaled double-precision to 64-bit
def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64,
fixedpoint_f64_i64, asm,
[(set GPR64:$Rd, (OpN (fmul FPR64:$Rn,
fixedpoint_f64_i64:$scale)))]> {
let Inst{31} = 1; // 64-bit GPR flag
// Integer to floating point conversion
// Encoding class for "\t$Rd, $Rn, $scale" conversions from srcType to dstType.
// isUnsigned goes in bit 16 and the 6-bit scale in bits 15-10. Bit 31 and bits
// 23-22 are not assigned here; the defs that instantiate this class set them.
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseIntegerToFP<bit isUnsigned,
RegisterClass srcType, RegisterClass dstType,
Operand immType, string asm, list<dag> pattern>
: I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
asm, "\t$Rd, $Rn, $scale", "", pattern>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
bits<6> scale;
let Inst{30-24} = 0b0011110;
let Inst{21-17} = 0b00001;
let Inst{16} = isUnsigned;
let Inst{15-10} = scale;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Encoding class for "\t$Rd, $Rn" conversions from srcType to dstType with a
// built-in pattern: $Rd (typed as dvt) is set to `node` applied to $Rn.
// Bits 15-10 are zero. Bit 31 and bits 23-22 are not assigned here; the defs
// that instantiate this class set them.
class BaseIntegerToFPUnscaled<bit isUnsigned,
RegisterClass srcType, RegisterClass dstType,
ValueType dvt, string asm, SDNode node>
: I<(outs dstType:$Rd), (ins srcType:$Rn),
asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
// NOTE(review): `scale` is declared but not referenced in the visible body or
// operand list - confirm whether it is still needed.
bits<6> scale;
let Inst{30-24} = 0b0011110;
let Inst{21-17} = 0b10001;
let Inst{16} = isUnsigned;
let Inst{15-10} = 0b000000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Instantiates the unscaled and scaled integer-to-FP classes for every pairing
// of source GPR32 / GPR64 with destination FPR16 / FPR32 / FPR64. Bit 31
// selects the GPR width and bits 23-22 the FPR width (0b11 / 0b00 / 0b01 for
// 16 / 32 / 64 bits). FPR16 variants require HasFullFP16. Scaled patterns
// divide (fdiv) `node` of the source by the fixed-point $scale operand, and
// the GPR32 scaled variants force scale{5} to 1.
multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
// Unscaled
def UWHri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR16, f16, asm, node> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b00; // 32-bit FPR flag
def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b01; // 64-bit FPR flag
def UXHri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR16, f16, asm, node> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b00; // 32-bit FPR flag
def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b01; // 64-bit FPR flag
// Scaled
def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm,
[(set (f16 FPR16:$Rd),
(fdiv (node GPR32:$Rn),
fixedpoint_f16_i32:$scale))]> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let scale{5} = 1;
let Predicates = [HasFullFP16];
def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm,
[(set FPR32:$Rd,
(fdiv (node GPR32:$Rn),
fixedpoint_f32_i32:$scale))]> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b00; // 32-bit FPR flag
let scale{5} = 1;
def SWDri: BaseIntegerToFP<isUnsigned, GPR32, FPR64, fixedpoint_f64_i32, asm,
[(set FPR64:$Rd,
(fdiv (node GPR32:$Rn),
fixedpoint_f64_i32:$scale))]> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b01; // 64-bit FPR flag
let scale{5} = 1;
def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm,
[(set (f16 FPR16:$Rd),
(fdiv (node GPR64:$Rn),
fixedpoint_f16_i64:$scale))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm,
[(set FPR32:$Rd,
(fdiv (node GPR64:$Rn),
fixedpoint_f32_i64:$scale))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b00; // 32-bit FPR flag
def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm,
[(set FPR64:$Rd,
(fdiv (node GPR64:$Rn),
fixedpoint_f64_i64:$scale))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b01; // 64-bit FPR flag
// Unscaled integer <-> floating point conversion (i.e. FMOV)
// Encoding class for "\t$Rd, $Rn" register moves between srcType and dstType.
// The selection pattern list is intentionally empty (see the comment inside
// the argument list). Bit 31 and bits 23-22 are not assigned here; the defs
// that instantiate this class set them.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode,
RegisterClass srcType, RegisterClass dstType,
string asm>
: I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "",
// We use COPY_TO_REGCLASS for these bitconvert operations.
// copyPhysReg() expands the resultant COPY instructions after
// regalloc is done. This gives greater freedom for the allocator
// and related passes (coalescing, copy propagation, et al.) to
// be more effective.
[/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>,
Sched<[WriteFCopy]> {
bits<5> Rd;
bits<5> Rn;
let Inst{30-24} = 0b0011110;
let Inst{21} = 1;
let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
let Inst{15-10} = 0b000000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Encoding class for a move from srcType into an indexed element of dstType
// (the $idx operand is a VectorIndex1). The operand string is written in
// `{...|...}` form with two alternatives that place the `kind` suffix
// differently. Bit 31 and bit 22 are not assigned here.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseUnscaledConversionToHigh<bits<2> rmode, bits<3> opcode,
RegisterClass srcType, RegisterOperand dstType, string asm,
string kind>
: I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
"{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>,
Sched<[WriteFCopy]> {
bits<5> Rd;
bits<5> Rn;
let Inst{30-23} = 0b00111101;
let Inst{21} = 1;
let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
let Inst{15-10} = 0b000000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeFMOVLaneInstruction";
// Encoding class for a move from an indexed element of srcType (the $idx
// operand is a VectorIndex1) into dstType. The operand string is written in
// `{...|...}` form with two alternatives that place the `kind` suffix
// differently. Bit 31 and bit 22 are not assigned here.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode,
RegisterOperand srcType, RegisterClass dstType, string asm,
string kind>
: I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
"{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>,
Sched<[WriteFCopy]> {
bits<5> Rd;
bits<5> Rn;
let Inst{30-23} = 0b00111101;
let Inst{21} = 1;
let Inst{20-19} = rmode;
let Inst{18-16} = opcode;
let Inst{15-10} = 0b000000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
let DecoderMethod = "DecodeFMOVLaneInstruction";
// Instantiates the unscaled move classes. Defs with opcode 0b111 take a GPR
// source and an FPR destination; defs with opcode 0b110 go the other way.
// Bit 31 selects the GPR width and bits 23-22 the FPR width; FPR16 variants
// require HasFullFP16. The last two defs use the V128 indexed-element classes
// with kind ".d", rmode 0b01 and bit 22 cleared.
multiclass UnscaledConversion<string asm> {
def WHr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR16, asm> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
def XHr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR16, asm> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b00; // 32-bit FPR flag
def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b01; // 64-bit FPR flag
def HWr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR32, asm> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
def HXr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR64, asm> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b11; // 16-bit FPR flag
let Predicates = [HasFullFP16];
def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> {
let Inst{31} = 0; // 32-bit GPR flag
let Inst{23-22} = 0b00; // 32-bit FPR flag
def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> {
let Inst{31} = 1; // 64-bit GPR flag
let Inst{23-22} = 0b01; // 64-bit FPR flag
// GPR64 -> indexed element of V128.
def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128,
asm, ".d"> {
let Inst{31} = 1;
let Inst{22} = 0;
// Indexed element of V128 -> GPR64.
def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64,
asm, ".d"> {
let Inst{31} = 1;
let Inst{22} = 0;
// Floating point conversion
// Encoding class for "\t$Rd, $Rn" conversions between two FP register classes.
// `type` goes in bits 23-22 and `opcode` in bits 16-15; all other non-register
// bits are fixed.
class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType,
RegisterClass srcType, string asm, list<dag> pattern>
: I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-24} = 0b00011110;
let Inst{23-22} = type;
let Inst{21-17} = 0b10001;
let Inst{16-15} = opcode;
let Inst{14-10} = 0b10000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Instantiates BaseFPConversion for the six ordered pairs of distinct FP
// register classes. In the visible defs the first argument (type) is 0b01 /
// 0b11 / 0b00 for FPR64 / FPR16 / FPR32 sources and the second (opcode) is
// 0b11 / 0b00 / 0b01 for FPR16 / FPR32 / FPR64 destinations. Narrowing defs
// use any_fpround and widening defs use fpextend.
multiclass FPConversion<string asm> {
// Double-precision to Half-precision
def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm,
[(set (f16 FPR16:$Rd), (any_fpround FPR64:$Rn))]>;
// Double-precision to Single-precision
def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
[(set FPR32:$Rd, (any_fpround FPR64:$Rn))]>;
// Half-precision to Double-precision
def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm,
[(set FPR64:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
// Half-precision to Single-precision
def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm,
[(set FPR32:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
// Single-precision to Double-precision
def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
[(set FPR64:$Rd, (fpextend FPR32:$Rn))]>;
// Single-precision to Half-precision
def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm,
[(set (f16 FPR16:$Rd), (any_fpround FPR32:$Rn))]>;
// Single operand floating point data processing
// Encoding class for one-source FP operations "\t$Rd, $Rn" where source and
// destination share regtype. The built-in pattern sets $Rd to `node` applied
// to $Rn, both typed as vt. The 6-bit opcode fills bits 20-15; bits 23-22 are
// not assigned here and are set by the instantiating defs.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSingleOperandFPData<bits<6> opcode, RegisterClass regtype,
ValueType vt, string asm, SDPatternOperator node>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
[(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>,
Sched<[WriteF]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-24} = 0b00011110;
let Inst{21} = 0b1;
let Inst{20-15} = opcode;
let Inst{14-10} = 0b10000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Instantiates BaseSingleOperandFPData for FPR16, FPR32 and FPR64. The 4-bit
// opcode is extended to 6 bits by prepending 0b00. Bits 23-22 select the
// width, and the FPR16 variant requires HasFullFP16. `node` defaults to
// null_frag.
multiclass SingleOperandFPData<bits<4> opcode, string asm,
SDPatternOperator node = null_frag> {
def Hr : BaseSingleOperandFPData<{0b00,opcode}, FPR16, f16, asm, node> {
let Inst{23-22} = 0b11; // 16-bit size flag
let Predicates = [HasFullFP16];
def Sr : BaseSingleOperandFPData<{0b00,opcode}, FPR32, f32, asm, node> {
let Inst{23-22} = 0b00; // 32-bit size flag
def Dr : BaseSingleOperandFPData<{0b00,opcode}, FPR64, f64, asm, node> {
let Inst{23-22} = 0b01; // 64-bit size flag
// Like SingleOperandFPData but takes the full 6-bit opcode and defines only
// the FPR32 and FPR64 variants (no FPR16 def). `node` defaults to null_frag.
multiclass SingleOperandFPNo16<bits<6> opcode, string asm,
SDPatternOperator node = null_frag>{
def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> {
let Inst{23-22} = 0b00; // 32-bit registers
def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> {
let Inst{23-22} = 0b01; // 64-bit registers
// FRInt[32|64][Z|N] instructions
// Thin wrapper over SingleOperandFPNo16: the 2-bit opcode is extended to the
// 6-bit form by prepending 0b0100. `node` defaults to null_frag.
multiclass FRIntNNT<bits<2> opcode, string asm, SDPatternOperator node = null_frag> :
SingleOperandFPNo16<{0b0100,opcode}, asm, node>;
// Two operand floating point data processing
// Encoding class for two-source FP operations "\t$Rd, $Rn, $Rm" where all
// three registers share regtype. Rm fills bits 20-16 and the 4-bit opcode
// fills bits 15-12; bits 23-22 are not assigned here and are set by the
// instantiating defs. The pattern list is supplied by the caller.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
string asm, list<dag> pat>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
asm, "\t$Rd, $Rn, $Rm", "", pat>,
Sched<[WriteF]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31-24} = 0b00011110;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Instantiates BaseTwoOperandFPData for FPR16, FPR32 and FPR64 with a pattern
// that sets $Rd to `node` applied to $Rn and $Rm. Bits 23-22 select the width,
// and the FPR16 variant requires HasFullFP16. `node` defaults to null_frag.
multiclass TwoOperandFPData<bits<4> opcode, string asm,
SDPatternOperator node = null_frag> {
def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
[(set (f16 FPR16:$Rd),
(node (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]> {
let Inst{23-22} = 0b11; // 16-bit size flag
let Predicates = [HasFullFP16];
def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
[(set (f32 FPR32:$Rd),
(node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> {
let Inst{23-22} = 0b00; // 32-bit size flag
def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
[(set (f64 FPR64:$Rd),
(node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> {
let Inst{23-22} = 0b01; // 64-bit size flag
// Same three width variants as TwoOperandFPData, but each pattern wraps the
// `node` result in fneg, and `node` is a required SDNode with no default.
multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
[(set (f16 FPR16:$Rd), (fneg (node (f16 FPR16:$Rn), (f16 FPR16:$Rm))))]> {
let Inst{23-22} = 0b11; // 16-bit size flag
let Predicates = [HasFullFP16];
def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
[(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> {
let Inst{23-22} = 0b00; // 32-bit size flag
def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
[(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> {
let Inst{23-22} = 0b01; // 64-bit size flag
// Three operand floating point data processing
// Encoding class for three-source FP operations "\t$Rd, $Rn, $Rm, $Ra" where
// all registers share regtype. isNegated goes in bit 21 and isSub in bit 15;
// Rm and Ra fill bits 20-16 and 14-10. Bits 23-22 are not assigned here and
// are set by the instantiating defs.
// NOTE(review): unlike the neighbouring FP base classes, no
// mayLoad/mayStore/hasSideEffects `let` precedes this class in this view -
// confirm whether that is intentional.
class BaseThreeOperandFPData<bit isNegated, bit isSub,
RegisterClass regtype, string asm, list<dag> pat>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra),
asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>,
Sched<[WriteFMul]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<5> Ra;
let Inst{31-24} = 0b00011111;
let Inst{21} = isNegated;
let Inst{20-16} = Rm;
let Inst{15} = isSub;
let Inst{14-10} = Ra;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Instantiates BaseThreeOperandFPData for FPR16, FPR32 and FPR64 with a
// pattern that sets $Rd to `node` applied to $Rn, $Rm and $Ra. Bits 23-22
// select the width, and the FPR16 variant requires HasFullFP16.
multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
SDPatternOperator node> {
def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm,
[(set (f16 FPR16:$Rd),
(node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> {
let Inst{23-22} = 0b11; // 16-bit size flag
let Predicates = [HasFullFP16];
def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm,
[(set FPR32:$Rd,
(node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> {
let Inst{23-22} = 0b00; // 32-bit size flag
def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm,
[(set FPR64:$Rd,
(node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
let Inst{23-22} = 0b01; // 64-bit size flag
// Floating point data comparisons

// Compare of a single register against the literal "#0.0" printed in the
// operand string. The instruction has no register outputs.
//   signalAllNans - written to Inst{4}.
//   regtype       - register class of $Rn.
//   asm, pat      - mnemonic and selection pattern passed through to I.
// Inst{23-22} and Inst{20-16} are not assigned here.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseOneOperandFPComparison<bit signalAllNans,
                                 RegisterClass regtype, string asm,
                                 list<dag> pat>
    : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>,
      Sched<[WriteFCmp]> {
  bits<5> Rn;
  let Inst{31-24} = 0b00011110;
  let Inst{21}    = 1;

  let Inst{15-10} = 0b001000;
  let Inst{9-5}   = Rn;
  let Inst{4}     = signalAllNans;
  let Inst{3-0}   = 0b1000;

  // Rm should be 0b00000 canonically, but we need to accept any value.
  let PostEncoderMethod = "fixOneOperandFPComparison";
}
// Register-register floating-point compare, "asm Rn, Rm", with no register
// outputs.
//   signalAllNans - written to Inst{4}.
//   regtype       - register class of both operands.
//   asm, pat      - mnemonic and selection pattern passed through to I.
// Inst{23-22} is not assigned here.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
                                 string asm, list<dag> pat>
    : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>,
      Sched<[WriteFCmp]> {
  bits<5> Rm;
  bits<5> Rn;
  let Inst{31-24} = 0b00011110;
  let Inst{21}    = 1;
  let Inst{20-16} = Rm;
  let Inst{15-10} = 0b001000;
  let Inst{9-5}   = Rn;
  let Inst{4}     = signalAllNans;
  let Inst{3-0}   = 0b0000;
}
// Floating-point compares for FPR16, FPR32 and FPR64, each in a
// register-register form (*rr) and a compare-with-fpimm0 form (*ri).
// Every def lists NZCV in Defs and as an implicit result of its pattern.
//   signalAllNans - forwarded to the base comparison classes.
//   asm           - assembly mnemonic.
//   OpNode        - DAG operator used in the patterns (default null_frag).
// The FPR16 forms require HasFullFP16.
multiclass FPComparison<bit signalAllNans, string asm,
                        SDPatternOperator OpNode = null_frag> {
  let Defs = [NZCV] in {
  def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm,
      [(OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)), (implicit NZCV)]> {
    let Inst{23-22} = 0b11;
    let Predicates = [HasFullFP16];
  }

  def Hri : BaseOneOperandFPComparison<signalAllNans, FPR16, asm,
      [(OpNode (f16 FPR16:$Rn), fpimm0), (implicit NZCV)]> {
    let Inst{23-22} = 0b11;
    let Predicates = [HasFullFP16];
  }

  def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm,
      [(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> {
    let Inst{23-22} = 0b00;
  }

  def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm,
      [(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> {
    let Inst{23-22} = 0b00;
  }

  def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm,
      [(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> {
    let Inst{23-22} = 0b01;
  }

  def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm,
      [(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> {
    let Inst{23-22} = 0b01;
  }
  } // Defs = [NZCV]
}
// Floating point conditional comparisons

// Conditional compare, "mnemonic Rn, Rm, #nzcv, cond". The instruction both
// uses and defines NZCV and has no register outputs.
//   signalAllNans - written to Inst{4}.
//   regtype       - register class of $Rn and $Rm.
//   mnemonic, pat - mnemonic and selection pattern passed through to I.
// Inst{23-22} is not assigned here.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype,
                           string mnemonic, list<dag> pat>
    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
         mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>,
      Sched<[WriteFCmp]> {
  let Uses = [NZCV];
  let Defs = [NZCV];

  bits<5> Rn;
  bits<5> Rm;
  bits<4> nzcv;
  bits<4> cond;

  let Inst{31-24} = 0b00011110;
  let Inst{21}    = 1;
  let Inst{20-16} = Rm;
  let Inst{15-12} = cond;
  let Inst{11-10} = 0b01;
  let Inst{9-5}   = Rn;
  let Inst{4}     = signalAllNans;
  let Inst{3-0}   = nzcv;
}
// Instantiates BaseFPCondComparison for FPR16 (Hrr), FPR32 (Srr) and FPR64
// (Drr). Each pattern sets NZCV from (OpNode Rn, Rm, nzcv, cond, NZCV).
// OpNode defaults to null_frag; the FPR16 form requires HasFullFP16.
multiclass FPCondComparison<bit signalAllNans, string mnemonic,
                            SDPatternOperator OpNode = null_frag> {
  def Hrr : BaseFPCondComparison<signalAllNans, FPR16, mnemonic,
      [(set NZCV, (OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm), (i32 imm:$nzcv),
                          (i32 imm:$cond), NZCV))]> {
    let Inst{23-22} = 0b11;
    let Predicates = [HasFullFP16];
  }

  def Srr : BaseFPCondComparison<signalAllNans, FPR32, mnemonic,
      [(set NZCV, (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm), (i32 imm:$nzcv),
                          (i32 imm:$cond), NZCV))]> {
    let Inst{23-22} = 0b00;
  }

  def Drr : BaseFPCondComparison<signalAllNans, FPR64, mnemonic,
      [(set NZCV, (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm), (i32 imm:$nzcv),
                          (i32 imm:$cond), NZCV))]> {
    let Inst{23-22} = 0b01;
  }
}
// Floating point conditional select

// Conditional select, "asm Rd, Rn, Rm, cond". The built-in pattern matches
// (AArch64csel Rn, Rm, cond, NZCV) with Rn typed as `vt`.
//   regtype - register class of $Rd, $Rn and $Rm.
//   vt      - value type used to type $Rn in the pattern.
//   asm     - assembly mnemonic.
// Inst{23-22} is not assigned here.
class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm>
    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
         asm, "\t$Rd, $Rn, $Rm, $cond", "",
         [(set regtype:$Rd,
               (AArch64csel (vt regtype:$Rn), regtype:$Rm,
                          (i32 imm:$cond), NZCV))]>,
      Sched<[WriteF]> {
  bits<5> Rd;
  bits<5> Rn;
  bits<5> Rm;
  bits<4> cond;

  let Inst{31-24} = 0b00011110;
  let Inst{21}    = 1;
  let Inst{20-16} = Rm;
  let Inst{15-12} = cond;
  let Inst{11-10} = 0b11;
  let Inst{9-5}   = Rn;
  let Inst{4-0}   = Rd;
}
// Instantiates BaseFPCondSelect for FPR16/f16 (Hrrr), FPR32/f32 (Srrr) and
// FPR64/f64 (Drrr). All defs list NZCV in Uses; the FPR16 form requires
// HasFullFP16.
multiclass FPCondSelect<string asm> {
  let Uses = [NZCV] in {
  def Hrrr : BaseFPCondSelect<FPR16, f16, asm> {
    let Inst{23-22} = 0b11;
    let Predicates = [HasFullFP16];
  }

  def Srrr : BaseFPCondSelect<FPR32, f32, asm> {
    let Inst{23-22} = 0b00;
  }

  def Drrr : BaseFPCondSelect<FPR64, f64, asm> {
    let Inst{23-22} = 0b01;
  }
  } // Uses = [NZCV]
}
// Floating point move immediate

// Move of an encoded floating-point immediate into a register,
// "asm Rd, #imm". The 8-bit immediate is placed in Inst{20-13}.
//   regtype   - destination register class.
//   fpimmtype - immediate operand type, also used as the pattern source.
//   asm       - assembly mnemonic.
// Inst{23-22} is not assigned here.
class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm>
  : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "",
      [(set regtype:$Rd, fpimmtype:$imm)]>,
    Sched<[WriteFImm]> {
  bits<5> Rd;
  bits<8> imm;
  let Inst{31-24} = 0b00011110;
  let Inst{21}    = 1;
  let Inst{20-13} = imm;
  let Inst{12-5}  = 0b10000000;
  let Inst{4-0}   = Rd;
}
// Instantiates BaseFPMoveImmediate for FPR16/fpimm16 (Hi), FPR32/fpimm32
// (Si) and FPR64/fpimm64 (Di). The FPR16 form requires HasFullFP16.
multiclass FPMoveImmediate<string asm> {
  def Hi : BaseFPMoveImmediate<FPR16, fpimm16, asm> {
    let Inst{23-22} = 0b11;
    let Predicates = [HasFullFP16];
  }

  def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> {
    let Inst{23-22} = 0b00;
  }

  def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> {
    let Inst{23-22} = 0b01;
  }
}
} // end of 'let Predicates = [HasFPARMv8]'
// AdvSIMD
let Predicates = [HasNEON] in {
// AdvSIMD three register vector instructions

// Encoding skeleton for "asm Rd, Rn, Rm" where all three operands share one
// register operand type.
//   Q, U    - written to Inst{30} and Inst{29}.
//   size    - 3-bit field written to Inst{23-21}.
//   opcode  - 5-bit field written to Inst{15-11}.
//   regtype - register operand type of $Rd, $Rn and $Rm.
//   asm     - assembly mnemonic.
//   kind    - arrangement suffix; the operand string offers one form with the
//             suffix on every register and one with it after the mnemonic.
//   pattern - selection pattern passed through to I.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDThreeSameVector<bit Q, bit U, bits<3> size, bits<5> opcode,
                              RegisterOperand regtype, string asm, string kind,
                              list<dag> pattern>
  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
      "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>,
    Sched<[WriteV]> {
  bits<5> Rd;
  bits<5> Rn;
  bits<5> Rm;
  let Inst{31}    = 0;
  let Inst{30}    = Q;
  let Inst{29}    = U;
  let Inst{28-24} = 0b01110;
  let Inst{23-21} = size;
  let Inst{20-16} = Rm;
  let Inst{15-11} = opcode;
  let Inst{10}    = 1;
  let Inst{9-5}   = Rn;
  let Inst{4-0}   = Rd;
}
// Variant of the three-same-vector skeleton in which $Rd is also an input:
// the output is named $dst and constrained with "$Rd = $dst". Bit layout and
// parameters are the same as BaseSIMDThreeSameVector.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
                                  RegisterOperand regtype, string asm, string kind,
                                  list<dag> pattern>
  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm,
      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
      "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
    Sched<[WriteV]> {
  bits<5> Rd;
  bits<5> Rn;
  bits<5> Rm;
  let Inst{31}    = 0;
  let Inst{30}    = Q;
  let Inst{29}    = U;
  let Inst{28-24} = 0b01110;
  let Inst{23-21} = size;
  let Inst{20-16} = Rm;
  let Inst{15-11} = opcode;
  let Inst{10}    = 1;
  let Inst{9-5}   = Rn;
  let Inst{4-0}   = Rd;
}
// Pseudo instruction with one output ($dst) and three inputs ($Rd, $Rn, $Rm)
// of the same register operand type; no encoding is defined here.
// NOTE(review): the parent list ended on a dangling comma; Sched<[WriteV]>
// is supplied to match the sibling three-same-vector classes - confirm.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDThreeSameVectorPseudo<RegisterOperand regtype, list<dag> pattern>
  : Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), pattern>,
    Sched<[WriteV]>;
// Pseudo forms of a three-input logical vector operation. Only the v8i8
// (V64) and v16i8 (V128) pseudos are defined; the remaining integer vector
// types of each width are selected onto them through the anonymous patterns
// below.
multiclass SIMDLogicalThreeVectorPseudo<SDPatternOperator OpNode> {
  def v8i8  : BaseSIMDThreeSameVectorPseudo<V64,
             [(set (v8i8 V64:$dst),
                   (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
  def v16i8  : BaseSIMDThreeSameVectorPseudo<V128,
             [(set (v16i8 V128:$dst),
                   (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
                           (v16i8 V128:$Rm)))]>;

  // 64-bit types reuse the v8i8 pseudo.
  def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
                           (v4i16 V64:$RHS))),
          (!cast<Instruction>(NAME#"v8i8")
            V64:$LHS, V64:$MHS, V64:$RHS)>;
  def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
                           (v2i32 V64:$RHS))),
          (!cast<Instruction>(NAME#"v8i8")
            V64:$LHS, V64:$MHS, V64:$RHS)>;
  def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
                           (v1i64 V64:$RHS))),
          (!cast<Instruction>(NAME#"v8i8")
            V64:$LHS, V64:$MHS, V64:$RHS)>;

  // 128-bit types reuse the v16i8 pseudo.
  def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
                           (v8i16 V128:$RHS))),
      (!cast<Instruction>(NAME#"v16i8")
        V128:$LHS, V128:$MHS, V128:$RHS)>;
  def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
                           (v4i32 V128:$RHS))),
      (!cast<Instruction>(NAME#"v16i8")
        V128:$LHS, V128:$MHS, V128:$RHS)>;
  def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
                           (v2i64 V128:$RHS))),
      (!cast<Instruction>(NAME#"v16i8")
        V128:$LHS, V128:$MHS, V128:$RHS)>;
}
// All operand sizes distinguished in the encoding.
// Defines v8i8, v16i8, v4i16, v8i16, v2i32, v4i32 and v2i64 forms; the
// 3-bit size argument is 0b001/0b011/0b101/0b111 for .b/.h/.s/.d and Q is 0
// for V64, 1 for V128. Each pattern is (OpNode Rn, Rm) on the def's type.
multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
                               SDPatternOperator OpNode> {
  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
                                      asm, ".8b",
         [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
                                      asm, ".16b",
         [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
                                      asm, ".4h",
         [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
                                      asm, ".8h",
         [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
                                      asm, ".2s",
         [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
                                      asm, ".4s",
         [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
  def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128,
                                      asm, ".2d",
         [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
}
// Additional selection patterns that map (OpNode LHS, RHS) for each integer
// vector type onto the already-defined instruction named inst # <type>.
//   inst   - name prefix of the target instruction records.
//   OpNode - DAG operator to match.
multiclass SIMDThreeSameVectorExtraPatterns<string inst, SDPatternOperator OpNode> {
  def : Pat<(v8i8 (OpNode V64:$LHS, V64:$RHS)),
            (!cast<Instruction>(inst#"v8i8") V64:$LHS, V64:$RHS)>;
  def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
            (!cast<Instruction>(inst#"v4i16") V64:$LHS, V64:$RHS)>;
  def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
            (!cast<Instruction>(inst#"v2i32") V64:$LHS, V64:$RHS)>;
  def : Pat<(v16i8 (OpNode V128:$LHS, V128:$RHS)),
            (!cast<Instruction>(inst#"v16i8") V128:$LHS, V128:$RHS)>;
  def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
            (!cast<Instruction>(inst#"v8i16") V128:$LHS, V128:$RHS)>;
  def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
            (!cast<Instruction>(inst#"v4i32") V128:$LHS, V128:$RHS)>;
  def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
            (!cast<Instruction>(inst#"v2i64") V128:$LHS, V128:$RHS)>;
}
// As above, but D sized elements unsupported.
// Defines the v8i8, v16i8, v4i16, v8i16, v2i32 and v4i32 forms only. The
// result type is attached to the OpNode expression rather than to $Rd.
multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
                                  SDPatternOperator OpNode> {
  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
                                      asm, ".8b",
        [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>;
  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
                                      asm, ".16b",
        [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>;
  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
                                      asm, ".4h",
        [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
                                      asm, ".8h",
        [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
                                      asm, ".2s",
        [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
                                      asm, ".4s",
        [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
}
// B/H/S element forms built on BaseSIMDThreeSameVectorTied: the pattern is
// (OpNode Rd, Rn, Rm) and the result is written to $dst.
multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
                                      SDPatternOperator OpNode> {
  def v8i8  : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64,
                                      asm, ".8b",
             [(set (v8i8 V64:$dst),
                   (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
  def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128,
                                      asm, ".16b",
             [(set (v16i8 V128:$dst),
                   (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
  def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64,
                                      asm, ".4h",
             [(set (v4i16 V64:$dst),
                   (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
  def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128,
                                      asm, ".8h",
             [(set (v8i16 V128:$dst),
                   (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
  def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64,
                                      asm, ".2s",
             [(set (v2i32 V64:$dst),
                   (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
  def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128,
                                      asm, ".4s",
             [(set (v4i32 V128:$dst),
                   (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
}
// As above, but only B sized elements supported.
// Defines just the v8i8 (V64) and v16i8 (V128) forms.
multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
                                SDPatternOperator OpNode> {
  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
                         asm, ".8b",
    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
                         asm, ".16b",
    [(set (v16i8 V128:$Rd),
          (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
}
// As above, but only floating point elements supported.
// S supplies the top bit of the 3-bit size argument and opc the low three
// bits of the 5-bit opcode argument: the f16 forms pass {S,0b10} and
// {0b00,opc}; the f32 and f64 forms pass {S,0b01} / {S,0b11} and {0b11,opc}.
// The f16 forms require HasNEON and HasFullFP16.
multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
                                 string asm, SDPatternOperator OpNode> {
  let Predicates = [HasNEON, HasFullFP16] in {
  def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
                                      asm, ".4h",
        [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
  def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
                                      asm, ".8h",
        [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
  } // Predicates = [HasNEON, HasFullFP16]
  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
                                      asm, ".2s",
        [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
                                      asm, ".4s",
        [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
                                      asm, ".2d",
        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
}
// Floating-point vector forms whose result is an integer vector with the
// same lane count as the inputs (e.g. v4f16 inputs produce v4i16). Field
// construction matches SIMDThreeSameVectorFP; the f16 forms require HasNEON
// and HasFullFP16.
multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
                                    string asm,
                                    SDPatternOperator OpNode> {
  let Predicates = [HasNEON, HasFullFP16] in {
  def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
                                      asm, ".4h",
        [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
  def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
                                      asm, ".8h",
        [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
  } // Predicates = [HasNEON, HasFullFP16]
  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
                                      asm, ".2s",
        [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
                                      asm, ".4s",
        [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
                                      asm, ".2d",
        [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
}
// Floating-point vector forms built on BaseSIMDThreeSameVectorTied: the
// pattern is (OpNode Rd, Rn, Rm) with the result written to $dst. Field
// construction matches SIMDThreeSameVectorFP; the f16 forms require HasNEON
// and HasFullFP16.
multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc,
                                     string asm, SDPatternOperator OpNode> {
  let Predicates = [HasNEON, HasFullFP16] in {
  def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64,
                                          asm, ".4h",
     [(set (v4f16 V64:$dst),
           (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
  def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128,
                                          asm, ".8h",
     [(set (v8f16 V128:$dst),
           (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
  } // Predicates = [HasNEON, HasFullFP16]
  def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64,
                                          asm, ".2s",
     [(set (v2f32 V64:$dst),
           (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
  def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128,
                                          asm, ".4s",
     [(set (v4f32 V128:$dst),
           (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
  def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128,
                                          asm, ".2d",
     [(set (v2f64 V128:$dst),
           (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
}
// As above, but D and B sized elements unsupported.
// Defines only the v4i16, v8i16, v2i32 and v4i32 forms.
multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
                                 SDPatternOperator OpNode> {
  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
                                      asm, ".4h",
        [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
                                      asm, ".8h",
        [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
                                      asm, ".2s",
        [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
                                      asm, ".4s",
        [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
}
// Logical three vector ops share opcode bits, and only use B sized elements.
// The opcode argument is fixed at 0b00011 and the 3-bit size argument is
// {size,1}. Only v8i8 and v16i8 instructions are defined; the other integer
// vector types of each width are selected onto them by the anonymous
// patterns. OpNode defaults to null_frag.
multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
                                  SDPatternOperator OpNode = null_frag> {
  def v8i8  : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64,
                                     asm, ".8b",
                         [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>;
  def v16i8  : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128,
                                     asm, ".16b",
                         [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>;

  // 64-bit types reuse the v8i8 instruction.
  def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
  def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
  def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)),
          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;

  // 128-bit types reuse the v16i8 instruction.
  def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
  def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
  def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
}
// Tied-operand logical three vector ops: opcode argument 0b00011 and size
// argument {size,1}, with patterns of the form (OpNode Rd, Rn, Rm) writing
// $dst. Only v8i8 and v16i8 instructions are defined; the other integer
// vector types of each width are selected onto them by the anonymous
// patterns. OpNode defaults to null_frag.
multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
                                  string asm, SDPatternOperator OpNode = null_frag> {
  def v8i8  : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64,
                                     asm, ".8b",
             [(set (v8i8 V64:$dst),
                   (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
  def v16i8  : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128,
                                     asm, ".16b",
             [(set (v16i8 V128:$dst),
                   (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
                           (v16i8 V128:$Rm)))]>;

  // 64-bit types reuse the v8i8 instruction.
  def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
                           (v4i16 V64:$RHS))),
          (!cast<Instruction>(NAME#"v8i8")
            V64:$LHS, V64:$MHS, V64:$RHS)>;
  def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
                           (v2i32 V64:$RHS))),
          (!cast<Instruction>(NAME#"v8i8")
            V64:$LHS, V64:$MHS, V64:$RHS)>;
  def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
                           (v1i64 V64:$RHS))),
          (!cast<Instruction>(NAME#"v8i8")
            V64:$LHS, V64:$MHS, V64:$RHS)>;

  // 128-bit types reuse the v16i8 instruction.
  def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
                           (v8i16 V128:$RHS))),
      (!cast<Instruction>(NAME#"v16i8")
        V128:$LHS, V128:$MHS, V128:$RHS)>;
  def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
                           (v4i32 V128:$RHS))),
      (!cast<Instruction>(NAME#"v16i8")
        V128:$LHS, V128:$MHS, V128:$RHS)>;
  def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
                           (v2i64 V128:$RHS))),
      (!cast<Instruction>(NAME#"v16i8")
        V128:$LHS, V128:$MHS, V128:$RHS)>;
}
// ARMv8.2-A Dot Product Instructions (Vector): These instructions extract
// bytes from S-sized elements.
//   Q, U      - forwarded to BaseSIMDThreeSameVectorTied.
//   Mixed     - low bit of the 5-bit opcode argument {0b1001, Mixed}.
//   kind1     - arrangement suffix of the accumulator/destination ($Rd).
//   kind2     - arrangement suffix of the two sources ($Rn, $Rm).
//   AccumType - value type of $dst and $Rd in the pattern.
//   InputType - value type of $Rn and $Rm in the pattern.
// AsmString is overridden because destination and sources use different
// suffixes.
class BaseSIMDThreeSameVectorDot<bit Q, bit U, bit Mixed, string asm, string kind1,
                                 string kind2, RegisterOperand RegType,
                                 ValueType AccumType, ValueType InputType,
                                 SDPatternOperator OpNode> :
        BaseSIMDThreeSameVectorTied<Q, U, 0b100, {0b1001, Mixed}, RegType, asm, kind1,
        [(set (AccumType RegType:$dst),
              (OpNode (AccumType RegType:$Rd),
                      (InputType RegType:$Rn),
                      (InputType RegType:$Rm)))]> {
  let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
}
// Instantiates the dot-product class for V64 (.2s accumulator, .8b inputs)
// and V128 (.4s accumulator, .16b inputs).
multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperator OpNode> {
  def v8i8  : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64,
                                           v2i32, v8i8, OpNode>;
  def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128,
                                           v4i32, v16i8, OpNode>;
}
// ARMv8.2-A Fused Multiply Add-Long Instructions (Vector): These instructions
// select inputs from 4H vectors and accumulate outputs to a 2S vector (or from
// 8H to 4S, when Q=1).
//   b13         - written to Inst{13}, overriding that bit of the fixed
//                 0b11101 opcode argument.
//   size        - 3-bit size argument forwarded to the tied base class.
//   kind1/kind2 - arrangement suffixes of the destination and the sources.
//   AccumType   - value type of $dst and $Rd in the pattern.
//   InputType   - value type of $Rn and $Rm in the pattern.
class BaseSIMDThreeSameVectorFML<bit Q, bit U, bit b13, bits<3> size, string asm, string kind1,
                                 string kind2, RegisterOperand RegType,
                                 ValueType AccumType, ValueType InputType,
                                 SDPatternOperator OpNode> :
        BaseSIMDThreeSameVectorTied<Q, U, size, 0b11101, RegType, asm, kind1,
                [(set (AccumType RegType:$dst),
                      (OpNode (AccumType RegType:$Rd),
                              (InputType RegType:$Rn),
                              (InputType RegType:$Rm)))]> {
  let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
  let Inst{13} = b13;
}
// Instantiates the FML class for V64 (v2f32 accumulator, v4f16 inputs,
// printed ".2s"/".2h") and V128 (v4f32 accumulator, v8f16 inputs, printed
// ".4s"/".4h").
multiclass SIMDThreeSameVectorFML<bit U, bit b13, bits<3> size, string asm,
                                  SDPatternOperator OpNode> {
  def v4f16 : BaseSIMDThreeSameVectorFML<0, U, b13, size, asm, ".2s", ".2h", V64,
                                         v2f32, v4f16, OpNode>;
  def v8f16 : BaseSIMDThreeSameVectorFML<1, U, b13, size, asm, ".4s", ".4h", V128,
                                         v4f32, v8f16, OpNode>;
}
// AdvSIMD two register vector instructions.

// Encoding skeleton for "asm Rd, Rn" where both operands use the same
// register operand type.
//   Q, U    - written to Inst{30} and Inst{29}.
//   size    - written to Inst{23-22}.
//   opcode  - written to Inst{16-12}.
//   size2   - written to Inst{20-19}.
//   dstkind - arrangement suffix printed on $Rd.
//   srckind - arrangement suffix printed on $Rn.
//   pattern - selection pattern passed through to I.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
                            bits<2> size2, RegisterOperand regtype, string asm,
                            string dstkind, string srckind, list<dag> pattern>
  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
      "{\t$Rd" # dstkind # ", $Rn" # srckind #
      "|" # dstkind # "\t$Rd, $Rn}", "", pattern>,
    Sched<[WriteV]> {
  bits<5> Rd;
  bits<5> Rn;
  let Inst{31}    = 0;
  let Inst{30}    = Q;
  let Inst{29}    = U;
  let Inst{28-24} = 0b01110;
  let Inst{23-22} = size;
  let Inst{21} = 0b1;
  let Inst{20-19} = size2;
  let Inst{18-17} = 0b00;
  let Inst{16-12} = opcode;
  let Inst{11-10} = 0b10;
  let Inst{9-5}   = Rn;
  let Inst{4-0}   = Rd;
}
// Variant of the two-same-vector skeleton in which $Rd is also an input: the
// output is named $dst and constrained with "$Rd = $dst". Bit layout and
// parameters are the same as BaseSIMDTwoSameVector.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
                                bits<2> size2, RegisterOperand regtype,
                                string asm, string dstkind, string srckind,
                                list<dag> pattern>
  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm,
      "{\t$Rd" # dstkind # ", $Rn" # srckind #
      "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
    Sched<[WriteV]> {
  bits<5> Rd;
  bits<5> Rn;
  let Inst{31}    = 0;
  let Inst{30}    = Q;
  let Inst{29}    = U;
  let Inst{28-24} = 0b01110;
  let Inst{23-22} = size;
  let Inst{21} = 0b1;
  let Inst{20-19} = size2;
  let Inst{18-17} = 0b00;
  let Inst{16-12} = opcode;
  let Inst{11-10} = 0b10;
  let Inst{9-5}   = Rn;
  let Inst{4-0}   = Rd;
}
// Supports B, H, and S element sizes.
// The size argument is 0b00/0b01/0b10 for .b/.h/.s; Q is 0 for V64 and 1 for
// V128. Each pattern is (OpNode Rn) on the def's own vector type.
multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm,
                            SDPatternOperator OpNode> {
  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
                                      asm, ".8b", ".8b",
                          [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
                                      asm, ".16b", ".16b",
                          [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
                                      asm, ".4h", ".4h",
                          [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
                                      asm, ".8h", ".8h",
                          [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
  def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
                                      asm, ".2s", ".2s",
                          [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
  def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
                                      asm, ".4s", ".4s",
                          [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}
// Encoding skeleton for "asm Rd, Rn, #amount", where the shift amount is a
// fixed string baked into the operand text rather than an encoded operand.
// The destination is always V128; no selection pattern is attached.
//   Q, size - written to Inst{30} and Inst{23-22}.
//   regtype - register operand type of the source $Rn.
//   dstkind - arrangement suffix printed on $Rd.
//   srckind - arrangement suffix printed on $Rn.
//   amount  - shift amount text printed after '#'.
class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
                            RegisterOperand regtype, string asm, string dstkind,
                            string srckind, string amount>
  : I<(outs V128:$Rd), (ins regtype:$Rn), asm,
      "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount #
      "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>,
    Sched<[WriteV]> {
  bits<5> Rd;
  bits<5> Rn;
  let Inst{31}    = 0;
  let Inst{30}    = Q;
  let Inst{29-24} = 0b101110;
  let Inst{23-22} = size;
  let Inst{21-10} = 0b100001001110;
  let Inst{9-5}   = Rn;
  let Inst{4-0}   = Rd;
}
// "shll" (V64 source) and "shll2" (V128 source) forms for B, H and S source
// elements. The printed shift amount is "8", "16" or "32" respectively, and
// the destination suffix names an element twice as wide as the source's.
// All defs are marked hasSideEffects = 0.
multiclass SIMDVectorLShiftLongBySizeBHS {
  let hasSideEffects = 0 in {
  def v8i8  : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64,
                                             "shll", ".8h",  ".8b", "8">;
  def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128,
                                             "shll2", ".8h", ".16b", "8">;
  def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64,
                                             "shll", ".4s",  ".4h", "16">;
  def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128,
                                             "shll2", ".4s", ".8h", "16">;
  def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64,
                                             "shll", ".2d",  ".2s", "32">;
  def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128,
                                             "shll2", ".2d", ".4s", "32">;
  }
}
// Supports all element sizes.
// Two-register forms whose result has half as many lanes as the source, each
// lane twice as wide (e.g. v8i8 -> v4i16). Defs are named <src>_<dst>.
multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm,
                             SDPatternOperator OpNode> {
  def v8i8_v4i16   : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
                                      asm, ".4h", ".8b",
               [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
  def v16i8_v8i16  : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
                                      asm, ".8h", ".16b",
               [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
  def v4i16_v2i32  : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
                                      asm, ".2s", ".4h",
               [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
  def v8i16_v4i32  : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
                                      asm, ".4s", ".8h",
               [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
  def v2i32_v1i64  : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
                                      asm, ".1d", ".2s",
               [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
  def v4i32_v2i64  : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
                                      asm, ".2d", ".4s",
               [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}
// Tied-operand counterpart of SIMDLongTwoVector: the pattern is
// (OpNode Rd, Rn), where $Rd has the wider result type and is tied to $dst.
multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
                                 SDPatternOperator OpNode> {
  def v8i8_v4i16  : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64,
                                          asm, ".4h", ".8b",
      [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd),
                                      (v8i8 V64:$Rn)))]>;
  def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128,
                                          asm, ".8h", ".16b",
      [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd),
                                      (v16i8 V128:$Rn)))]>;
  def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64,
                                          asm, ".2s", ".4h",
      [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd),
                                      (v4i16 V64:$Rn)))]>;
  def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128,
                                          asm, ".4s", ".8h",
      [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
                                      (v8i16 V128:$Rn)))]>;
  def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64,
                                          asm, ".1d", ".2s",
      [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd),
                                      (v2i32 V64:$Rn)))]>;
  def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128,
                                          asm, ".2d", ".4s",
      [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd),
                                      (v4i32 V128:$Rn)))]>;
}
// Supports all element sizes, except 1xD.
// Tied-operand forms: the pattern is (OpNode Rd, Rn) with the result written
// to $dst. Defines v8i8, v16i8, v4i16, v8i16, v2i32, v4i32 and v2i64.
multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm,
                                 SDPatternOperator OpNode> {
  def v8i8  : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64,
                                    asm, ".8b", ".8b",
    [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>;
  def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128,
                                    asm, ".16b", ".16b",
    [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
  def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64,
                                    asm, ".4h", ".4h",
    [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>;
  def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128,
                                    asm, ".8h", ".8h",
    [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>;
  def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64,
                                    asm, ".2s", ".2s",
    [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>;
  def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128,
                                    asm, ".4s", ".4s",
    [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
  def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128,
                                    asm, ".2d", ".2d",
    [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>;
}
// Untied two-register forms for v8i8, v16i8, v4i16, v8i16, v2i32, v4i32 and
// v2i64 (no 64-bit-wide D form). The size argument is 0b00/0b01/0b10/0b11
// for .b/.h/.s/.d; OpNode defaults to null_frag.
multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
                            SDPatternOperator OpNode = null_frag> {
  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
                                asm, ".8b", ".8b",
    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
                                asm, ".16b", ".16b",
    [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
                                asm, ".4h", ".4h",
    [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
                                asm, ".8h", ".8h",
    [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
  def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
                                asm, ".2s", ".2s",
    [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
  def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
                                asm, ".4s", ".4s",
    [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
  def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128,
                                asm, ".2d", ".2d",
    [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
}
// Supports only B element sizes.
// Unlike the other two-vector multiclasses here, the 2-bit size field is
// supplied by the caller instead of being derived from the element type.
multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
                          SDPatternOperator OpNode> {
  def v8i8  : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64,
                                asm, ".8b", ".8b",
                    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
  def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128,
                                asm, ".16b", ".16b",
                    [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
}
// Supports only B and H element sizes.
// The source operand is left untyped in the patterns; only the result
// carries a vector type.
multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
                           SDPatternOperator OpNode> {
  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
                                asm, ".8b", ".8b",
                    [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>;
  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
                                asm, ".16b", ".16b",
                    [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>;
  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
                                asm, ".4h", ".4h",
                    [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>;
  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
                                asm, ".8h", ".8h",
                    [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>;
}
// Supports H, S and D element sizes, uses high bit of the size field
// as an extra opcode bit.
// The size argument is {S,1} for f16 and f64 and {S,0} for f32; the f16
// forms pass size2 = 0b11 where the others pass 0b00, and require HasNEON
// and HasFullFP16.
multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
                           SDPatternOperator OpNode> {
  let Predicates = [HasNEON, HasFullFP16] in {
  def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
                                asm, ".4h", ".4h",
                          [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
  def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
                                asm, ".8h", ".8h",
                          [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
  } // Predicates = [HasNEON, HasFullFP16]
  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
                                asm, ".2s", ".2s",
                          [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
                                asm, ".4s", ".4s",
                          [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
                                asm, ".2d", ".2d",
                          [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
}
// Supports only S and D element sizes
// Defines v2f32, v4f32 and v2f64 forms. The size argument is 0b00 for the
// f32 forms and 0b01 for the f64 form; OpNode defaults to null_frag.
multiclass SIMDTwoVectorSD<bit U, bits<5> opc, string asm,
                           SDPatternOperator OpNode = null_frag> {
  def v2f32 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
                                asm, ".2s", ".2s",
                          [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
  def v4f32 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
                                asm, ".4s", ".4s",
                          [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
  def v2f64 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
                                asm, ".2d", ".2d",
                          [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
}
// Thin wrapper over SIMDTwoVectorSD: builds the 5-bit opcode as {0b1111,op}
// and forwards U, asm and OpNode unchanged. OpNode defaults to null_frag.
multiclass FRIntNNTVector<bit U, bit op, string asm,
SDPatternOperator OpNode = null_frag> :
SIMDTwoVectorSD<U, {0b1111,op}, asm, OpNode>;
// Supports only S element size.
// Defines v2i32 (V64) and v4i32 (V128) forms; the size argument is {S,0}.
multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
                           SDPatternOperator OpNode> {
  def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
                                asm, ".2s", ".2s",
                          [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
  def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
                                asm, ".4s", ".4s",
                          [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
}
// Two-register forms taking a floating-point vector and producing an integer
// vector with the same lane count (e.g. v4f16 -> v4i16). Defs are named
// after the source type. Field construction matches SIMDTwoVectorFP; the f16
// forms require HasNEON and HasFullFP16.
multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
                                SDPatternOperator OpNode> {
  let Predicates = [HasNEON, HasFullFP16] in {
  def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
                                asm, ".4h", ".4h",
                          [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
  def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
                                asm, ".8h", ".8h",
                          [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
  } // Predicates = [HasNEON, HasFullFP16]
  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
                                asm, ".2s", ".2s",
                          [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
                                asm, ".4s", ".4s",
                          [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
                                asm, ".2d", ".2d",
                          [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
}
// Two-register forms taking an integer vector and producing a floating-point
// vector with the same lane count (e.g. v4i16 -> v4f16). Defs are named
// after the result type. Field construction matches SIMDTwoVectorFP; the f16
// forms require HasNEON and HasFullFP16.
multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
                                SDPatternOperator OpNode> {
  let Predicates = [HasNEON, HasFullFP16] in {
  def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
                                asm, ".4h", ".4h",
                          [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
  def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
                                asm, ".8h", ".8h",
                          [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
  } // Predicates = [HasNEON, HasFullFP16]
  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
                                asm, ".2s", ".2s",
                          [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
                                asm, ".4s", ".4s",
                          [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
                                asm, ".2d", ".2d",
                          [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
}
// Base class for two-register vector instructions whose source and
// destination use different register operands (`inreg` vs `outreg`).
//   Q, U, size, opcode - encoding fields placed into Inst below.
//   inreg / outreg     - register operand of the source ($Rn) / result ($Rd).
//   asm                - mnemonic.
//   outkind / inkind   - arrangement suffixes for $Rd / $Rn.
//   pattern            - selection pattern list handed to I<>.
// The assembly string has two '|'-separated spellings: one with the suffix
// attached to each register, one with `outkind` attached to the mnemonic.
class BaseSIMDMixedTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand inreg, RegisterOperand outreg,
string asm, string outkind, string inkind,
list<dag> pattern>
: I<(outs outreg:$Rd), (ins inreg:$Rn), asm,
"{\t$Rd" # outkind # ", $Rn" # inkind #
"|" # outkind # "\t$Rd, $Rn}", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
// 32-bit encoding: fixed bits interleaved with Q, U, size, opcode, Rn, Rd.
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21-17} = 0b10000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Tied variant of BaseSIMDMixedTwoVector. The result is named $dst and the
// destination register additionally appears as the input $Rd; the constraint
// string "$Rd = $dst" ties the two together. The assembly string and the
// bit layout are otherwise identical to the untied class (only $Rd and $Rn
// are printed and encoded).
class BaseSIMDMixedTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand inreg, RegisterOperand outreg,
string asm, string outkind, string inkind,
list<dag> pattern>
: I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm,
"{\t$Rd" # outkind # ", $Rn" # inkind #
"|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
// 32-bit encoding: fixed bits interleaved with Q, U, size, opcode, Rn, Rd.
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21-17} = 0b10000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Instantiates the mixed two-vector base classes for three element sizes.
//   v8i8 / v4i16 / v2i32  - Q=0, V128 source, V64 result, with a pattern
//                           applying OpNode to the wider source vector.
//   v16i8 / v8i16 / v4i32 - Q=1, mnemonic asm#"2", tied base class, V128
//                           source and result, no attached pattern.
// The three anonymous Pats at the end select the tied records: they match a
// concat_vectors of an existing V64 value with the OpNode result, and pass
// that V64 value as the tied operand by inserting it into the dsub
// subregister of an IMPLICIT_DEF.
multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64,
asm, ".8b", ".8h",
[(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128,
asm#"2", ".16b", ".8h", []>;
def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64,
asm, ".4h", ".4s",
[(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
def v8i16 : BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128,
asm#"2", ".8h", ".4s", []>;
def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64,
asm, ".2s", ".2d",
[(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128,
asm#"2", ".4s", ".2d", []>;
// Selection patterns for the tied "2" records.
def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))),
(!cast<Instruction>(NAME # "v16i8")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))),
(!cast<Instruction>(NAME # "v8i16")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))),
(!cast<Instruction>(NAME # "v4i32")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
// Base class for vector compare-against-zero instructions: one source ($Rn),
// one result ($Rd), same register operand for both.
//   size / size2 - encoded in Inst{23-22} and Inst{20-19} respectively.
//   kind         - arrangement suffix used for both registers.
//   zero         - text printed after '#' as the literal operand.
//   dty / sty    - result / source value types used in the built-in pattern
//                  (set (dty $Rd), (OpNode (sty $Rn))).
// Like the other vector bases, the assembly string has two '|'-separated
// spellings (suffix on the registers, or suffix on the mnemonic).
class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2,
bits<5> opcode, RegisterOperand regtype, string asm,
string kind, string zero, ValueType dty,
ValueType sty, SDNode OpNode>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero #
"|" # kind # "\t$Rd, $Rn, #" # zero # "}", "",
[(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
// 32-bit encoding.
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21} = 0b1;
let Inst{20-19} = size2;
let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Comparisons support all element sizes, except 1xD.
//
// Defines seven "rz"-suffixed records over BaseSIMDCmpTwoVector, one per
// integer arrangement from .8b through .2d. Every record prints the literal
// "0", passes size2 = 0b00, and uses the same integer vector type for source
// and result.
multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm,
SDNode OpNode> {
def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64,
asm, ".8b", "0",
v8i8, v8i8, OpNode>;
def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128,
asm, ".16b", "0",
v16i8, v16i8, OpNode>;
def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64,
asm, ".4h", "0",
v4i16, v4i16, OpNode>;
def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128,
asm, ".8h", "0",
v8i16, v8i16, OpNode>;
def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64,
asm, ".2s", "0",
v2i32, v2i32, OpNode>;
def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128,
asm, ".4s", "0",
v4i32, v4i32, OpNode>;
def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128,
asm, ".2d", "0",
v2i64, v2i64, OpNode>;
// FP Comparisons support only S and D element sizes (and H for v8.2a).
//
// Defines five "rz" records over BaseSIMDCmpTwoVector. Each takes a
// floating-point source vector, yields an integer vector with the same lane
// count, and prints the literal operand as "0.0". The .4h/.8h records are
// guarded by [HasNEON, HasFullFP16] and pass size2 = 0b11; the others pass
// size2 = 0b00.
// Two groups of InstAliases follow, accepting "#0" in place of "#0.0":
// first with the arrangement suffix on each register, then with the suffix
// on the mnemonic.
// NOTE(review): the closing braces of the two alias `let Predicates` groups
// are not visible in this excerpt; presumably each covers only its .4h/.8h
// aliases - confirm against the full file.
multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
string asm, SDNode OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64,
asm, ".4h", "0.0",
v4i16, v4f16, OpNode>;
def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128,
asm, ".8h", "0.0",
v8i16, v8f16, OpNode>;
} // Predicates = [HasNEON, HasFullFP16]
def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64,
asm, ".2s", "0.0",
v2i32, v2f32, OpNode>;
def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128,
asm, ".4s", "0.0",
v4i32, v4f32, OpNode>;
def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128,
asm, ".2d", "0.0",
v2i64, v2f64, OpNode>;
// Aliases with the arrangement suffix on each register operand.
let Predicates = [HasNEON, HasFullFP16] in {
def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0",
(!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
def : InstAlias<asm # "\t$Vd.8h, $Vn.8h, #0",
(!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>;
def : InstAlias<asm # "\t$Vd.2s, $Vn.2s, #0",
(!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
def : InstAlias<asm # "\t$Vd.4s, $Vn.4s, #0",
(!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
def : InstAlias<asm # "\t$Vd.2d, $Vn.2d, #0",
(!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
// Aliases with the arrangement suffix on the mnemonic.
let Predicates = [HasNEON, HasFullFP16] in {
def : InstAlias<asm # ".4h\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
def : InstAlias<asm # ".8h\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>;
def : InstAlias<asm # ".2s\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
def : InstAlias<asm # ".4s\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
def : InstAlias<asm # ".2d\t$Vd, $Vn, #0",
(!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
// Base class for two-register FP conversion instructions whose source
// (`intype`, $Rn) and result (`outtype`, $Rd) register operands may differ.
// VdTy / VnTy are the arrangement suffixes appended to $Rd / $Rn. Unlike the
// neighbouring vector bases, the assembly string is a single !strconcat
// spelling with no '|' alternative. The record is marked as not loading,
// not storing and having no side effects.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand outtype, RegisterOperand intype,
string asm, string VdTy, string VnTy,
list<dag> pattern>
: I<(outs outtype:$Rd), (ins intype:$Rn), asm,
!strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
// 32-bit encoding.
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21-17} = 0b10000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Tied variant of BaseSIMDFPCvtTwoVector: the result is $dst and the
// destination register is also an input ($Rd), tied by the "$Rd = $dst"
// constraint. Assembly string and bit layout match the untied class.
// NOTE(review): unlike the untied class directly above, no
// `let mayLoad/mayStore/hasSideEffects` prefix is visible here - confirm
// whether that is intentional.
class BaseSIMDFPCvtTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand outtype, RegisterOperand intype,
string asm, string VdTy, string VnTy,
list<dag> pattern>
: I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm,
!strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
// 32-bit encoding.
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21-17} = 0b10000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Four pattern-less records over BaseSIMDFPCvtTwoVector, all producing a
// V128 result with a wider-element suffix than the source:
//   v4i16: .4h (V64)  -> .4s      v8i16: .8h (V128) -> .4s, mnemonic asm#"2"
//   v2i32: .2s (V64)  -> .2d      v4i32: .4s (V128) -> .2d, mnemonic asm#"2"
// The size field is {S,0} for the .4s results and {S,1} for the .2d results.
multiclass SIMDFPWidenTwoVector<bit U, bit S, bits<5> opc, string asm> {
def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64,
asm, ".4s", ".4h", []>;
def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128,
asm#"2", ".4s", ".8h", []>;
def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64,
asm, ".2d", ".2s", []>;
def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128,
asm#"2", ".2d", ".4s", []>;
// Four pattern-less records taking a V128 source with a wider-element suffix
// than the result:
//   v4i16: .4s -> .4h (V64 result, untied base)
//   v8i16: .4s -> .8h (V128 result, tied base, mnemonic asm#"2")
//   v2i32: .2d -> .2s (V64 result, untied base)
//   v4i32: .2d -> .4s (V128 result, tied base, mnemonic asm#"2")
// The "2" records use the tied base class, so the destination register is
// also an input.
multiclass SIMDFPNarrowTwoVector<bit U, bit S, bits<5> opc, string asm> {
def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128,
asm, ".4h", ".4s", []>;
def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128,
asm#"2", ".8h", ".4s", []>;
def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
asm, ".2s", ".2d", []>;
def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
asm#"2", ".4s", ".2d", []>;
// Defines a .2d -> .2s record (v2f32) whose pattern applies the intrinsic
// OpNode to a v2f64 source, and a tied asm#"2" record (v4f32, .2d -> .4s)
// with no attached pattern. The trailing Pat selects the tied record for a
// concat_vectors of an existing v2f32 value with the OpNode result, passing
// that value as the tied operand via INSERT_SUBREG into dsub of an
// IMPLICIT_DEF.
multiclass SIMDFPInexactCvtTwoVector<bit U, bit S, bits<5> opc, string asm,
Intrinsic OpNode> {
def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
asm, ".2s", ".2d",
[(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
asm#"2", ".4s", ".2d", []>;
def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))),
(!cast<Instruction>(NAME # "v4f32")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
// AdvSIMD three register different-size vector instructions.
//
// Base class for three-register vector instructions where the result and the
// two sources may each use a different register operand and arrangement
// suffix (outkind, inkind1, inkind2).
// `size` is three bits wide and is split across the encoding: bit 0 goes to
// Inst{30} and bits 2-1 go to Inst{23-22}.
// The assembly string has two '|'-separated spellings (suffix on every
// register, or `outkind` on the mnemonic).
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDDifferentThreeVector<bit U, bits<3> size, bits<4> opcode,
RegisterOperand outtype, RegisterOperand intype1,
RegisterOperand intype2, string asm,
string outkind, string inkind1, string inkind2,
list<dag> pattern>
: I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm,
"{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
"|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
// 32-bit encoding.
let Inst{31} = 0;
let Inst{30} = size{0};
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size{2-1};
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = opcode;
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Tied variant of BaseSIMDDifferentThreeVector: the result is $dst and the
// destination register is also the first input ($Rd), tied by the
// "$Rd = $dst" constraint. The printed operands ($Rd, $Rn, $Rm) and the bit
// layout, including the split of the 3-bit `size` over Inst{30} and
// Inst{23-22}, are the same as in the untied class.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDDifferentThreeVectorTied<bit U, bits<3> size, bits<4> opcode,
RegisterOperand outtype, RegisterOperand intype1,
RegisterOperand intype2, string asm,
string outkind, string inkind1, string inkind2,
list<dag> pattern>
: I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm,
"{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
"|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
// 32-bit encoding.
let Inst{31} = 0;
let Inst{30} = size{0};
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size{2-1};
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-12} = opcode;
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// FIXME: TableGen doesn't know how to deal with expanded types that also
// change the element count (in this case, placing the results in
// the high elements of the result register rather than the low
// elements). Until that's fixed, we can't code-gen those.
//
// Narrowing three-register records for three element sizes.
//   *_v8i8 / *_v4i16 / *_v2i32   - V64 result from two V128 sources, with a
//                                  pattern applying the intrinsic IntOp.
//   *_v16i8 / *_v8i16 / *_v4i32  - tied asm#"2" records with V128 result and
//                                  sources; they carry an empty pattern list
//                                  and are selected by the Pats below.
multiclass SIMDNarrowThreeVectorBHS<bit U, bits<4> opc, string asm,
Intrinsic IntOp> {
def v8i16_v8i8 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
V64, V128, V128,
asm, ".8b", ".8h", ".8h",
[(set (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".16b", ".8h", ".8h",
[]>;
def v4i32_v4i16 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
V64, V128, V128,
asm, ".4h", ".4s", ".4s",
[(set (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".8h", ".4s", ".4s",
[]>;
def v2i64_v2i32 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V64, V128, V128,
asm, ".2s", ".2d", ".2d",
[(set (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".4s", ".2d", ".2d",
[]>;
// Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in
// a version attached to an instruction.
// The tied records take three inputs ($Rd, $Rn, $Rm); the existing V64 value
// is supplied as $Rd by inserting it into dsub of an IMPLICIT_DEF.
def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn),
(v8i16 V128:$Rm))),
(!cast<Instruction>(NAME # "v8i16_v16i8")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn),
(v4i32 V128:$Rm))),
(!cast<Instruction>(NAME # "v4i32_v8i16")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn),
(v2i64 V128:$Rm))),
(!cast<Instruction>(NAME # "v2i64_v4i32")
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
// Three-register widening records for byte and doubleword sources.
//   v8i8  - .8b x .8b -> .8h on V64 sources, with a pattern applying IntOp.
//   v16i8 - asm#"2", .16b x .16b -> .8h on V128 sources, no attached
//           pattern; selected by the trailing Pat, which matches IntOp on
//           extract_high_v16i8 of both sources.
//   v1i64 / v2i64 - .1d / .2d sources printed with a ".1q" result suffix,
//           no patterns, introduced under `let Predicates = [HasAES]`.
// NOTE(review): the closing brace of the HasAES group is not visible in this
// excerpt; presumably it ends after v2i64 - confirm against the full file.
multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
Intrinsic IntOp> {
def v8i8 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
V128, V64, V64,
asm, ".8h", ".8b", ".8b",
[(set (v8i16 V128:$Rd), (IntOp (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b", []>;
let Predicates = [HasAES] in {
def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc,
V128, V64, V64,
asm, ".1q", ".1d", ".1d", []>;
def v2i64 : BaseSIMDDifferentThreeVector<U, 0b111, opc,
V128, V128, V128,
asm#"2", ".1q", ".2d", ".2d", []>;
def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)),
(v8i8 (extract_high_v16i8 V128:$Rm)))),
(!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
// Widening three-register records for H and S source elements only.
// The records without "2" take two V64 sources; the asm#"2" records take two
// V128 sources and their patterns apply OpNode to the extract_high_* of each
// source. All results are V128 (.4s from halfwords, .2d from words).
multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
[(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 V128:$Rm)))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
[(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 V128:$Rm)))]>;
// Widening three-register records for B, H and S source elements in which
// OpNode itself produces a narrow result that the pattern then widens with
// zext: (set wide $Rd, (zext (narrow (OpNode $Rn, $Rm)))).
// The asm#"2" records take V128 sources and apply OpNode to the
// extract_high_* of each source. OpNode defaults to null_frag.
multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
V128, V64, V64,
asm, ".8h", ".8b", ".8b",
[(set (v8i16 V128:$Rd),
(zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))))]>;
def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b",
[(set (v8i16 V128:$Rd),
(zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
(extract_high_v16i8 V128:$Rm)))))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
[(set (v4i32 V128:$Rd),
(zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$Rd),
(zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 V128:$Rm)))))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
[(set (v2i64 V128:$Rd),
(zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$Rd),
(zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 V128:$Rm)))))]>;
// Accumulating counterpart of SIMDLongThreeVectorBHSabdl, built on the tied
// base class. Each pattern adds the tied input $Rd to the zero-extended
// narrow OpNode result and binds the sum to $dst:
//   (set wide $dst, (add wide $Rd, (zext (narrow (OpNode $Rn, $Rm)))))
// The asm#"2" records take V128 sources and apply OpNode to the
// extract_high_* of each source.
multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
string asm,
SDPatternOperator OpNode> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
V128, V64, V64,
asm, ".8h", ".8b", ".8b",
[(set (v8i16 V128:$dst),
(add (v8i16 V128:$Rd),
(zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))))]>;
def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b",
[(set (v8i16 V128:$dst),
(add (v8i16 V128:$Rd),
(zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
(extract_high_v16i8 V128:$Rm))))))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
[(set (v4i32 V128:$dst),
(add (v4i32 V128:$Rd),
(zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$dst),
(add (v4i32 V128:$Rd),
(zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 V128:$Rm))))))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
[(set (v2i64 V128:$dst),
(add (v2i64 V128:$Rd),
(zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$dst),
(add (v2i64 V128:$Rd),
(zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 V128:$Rm))))))]>;
// Widening three-register records for B, H and S source elements where
// OpNode directly yields the wide result type (no zext in the pattern).
// Records without "2" take V64 sources; asm#"2" records take V128 sources
// and apply OpNode to the extract_high_* of each. OpNode defaults to
// null_frag.
multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
V128, V64, V64,
asm, ".8h", ".8b", ".8b",
[(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b",
[(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 V128:$Rn),
(extract_high_v16i8 V128:$Rm)))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
[(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 V128:$Rm)))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
[(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 V128:$Rm)))]>;
// Tied (accumulating) widening records for B, H and S source elements.
// OpNode is a three-operand node: it receives the tied wide input $Rd plus
// the two narrow sources, and its result is bound to $dst. The asm#"2"
// records take V128 sources and pass the extract_high_* of each to OpNode.
multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc,
string asm,
SDPatternOperator OpNode> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
V128, V64, V64,
asm, ".8h", ".8b", ".8b",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".16b", ".16b",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd),
(extract_high_v16i8 V128:$Rn),
(extract_high_v16i8 V128:$Rm)))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd),
(extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 V128:$Rm)))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
[(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd),
(extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 V128:$Rm)))]>;
// Tied widening records for H and S source elements whose patterns combine
// the tied wide input $Rd with an int_aarch64_neon_sqdmull of the two narrow
// sources using the caller-supplied `Accum` operator:
//   (set wide $dst, (Accum wide $Rd, (wide (sqdmull $Rn, $Rm))))
// The asm#"2" records take V128 sources and feed the extract_high_* of each
// into the sqdmull intrinsic.
multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm,
SDPatternOperator Accum> {
def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
V128, V64, V64,
asm, ".4s", ".4h", ".4h",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
(v4i16 V64:$Rm)))))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".8h", ".8h",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 V128:$Rm)))))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
V128, V64, V64,
asm, ".2d", ".2s", ".2s",
[(set (v2i64 V128:$dst),
(Accum (v2i64 V128:$Rd),
(v2i64 (int_aarch64_neon_sqdmull (v2i32 V64:$Rn),
(v2i32 V64:$Rm)))))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".4s", ".4s",
[(set (v2i64 V128:$dst),
(Accum (v2i64 V128:$Rd),
(v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 V128:$Rm)))))]>;
// "Wide" three-register records for B, H and S elements: the first source
// ($Rn) already has the wide result type and is always V128, while the
// second source ($Rm) is narrow. Records without "2" take $Rm in V64; the
// asm#"2" records take $Rm in V128 and pass its extract_high_* to OpNode.
multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc,
V128, V128, V64,
asm, ".8h", ".8h", ".8b",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i8 V64:$Rm)))]>;
def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc,
V128, V128, V128,
asm#"2", ".8h", ".8h", ".16b",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
(extract_high_v16i8 V128:$Rm)))]>;
def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc,
V128, V128, V64,
asm, ".4s", ".4s", ".4h",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i16 V64:$Rm)))]>;
def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc,
V128, V128, V128,
asm#"2", ".4s", ".4s", ".8h",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
(extract_high_v8i16 V128:$Rm)))]>;
def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc,
V128, V128, V64,
asm, ".2d", ".2d", ".2s",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i32 V64:$Rm)))]>;
def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc,
V128, V128, V128,
asm#"2", ".2d", ".2d", ".4s",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
(extract_high_v4i32 V128:$Rm)))]>;
// AdvSIMD bitwise extract from vector
//
// Base class taking two source vectors ($Rn, $Rm) of the same register
// operand plus an i32 immediate, and producing $Rd in that same operand.
// The built-in pattern selects AArch64ext with the immediate as third
// operand. Only the low four bits of the immediate are encoded
// (bits<4> imm -> Inst{14-11}); the `size` bit goes to Inst{30}.
class BaseSIMDBitwiseExtract<bit size, RegisterOperand regtype, ValueType vty,
string asm, string kind>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" #
"|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "",
[(set (vty regtype:$Rd),
(AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<4> imm;
// 32-bit encoding.
let Inst{31} = 0;
let Inst{30} = size;
let Inst{29-21} = 0b101110000;
let Inst{20-16} = Rm;
let Inst{15} = 0;
let Inst{14-11} = imm;
let Inst{10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Instantiates BaseSIMDBitwiseExtract for the .8b (V64, size=0) and .16b
// (V128, size=1) arrangements. The v8i8 record additionally forces the top
// bit of the 4-bit immediate field to 0.
multiclass SIMDBitwiseExtract<string asm> {
def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> {
let imm{3} = 0;
def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">;
// AdvSIMD zip vector
//
// Base class for three-register permute-style instructions: two sources and
// one result, all in the same register operand and printed with the same
// `kind` suffix. The built-in pattern applies OpNode to the two sources and
// yields `valty`. The 3-bit `size` is split across the encoding (bit 0 to
// Inst{30}, bits 2-1 to Inst{23-22}); `opc` occupies Inst{14-12}.
class BaseSIMDZipVector<bits<3> size, bits<3> opc, RegisterOperand regtype,
string asm, string kind, SDNode OpNode, ValueType valty>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
"|" # kind # "\t$Rd, $Rn, $Rm}", "",
[(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
// 32-bit encoding.
let Inst{31} = 0;
let Inst{30} = size{0};
let Inst{29-24} = 0b001110;
let Inst{23-22} = size{2-1};
let Inst{21} = 0;
let Inst{20-16} = Rm;
let Inst{15} = 0;
let Inst{14-12} = opc;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Instantiates BaseSIMDZipVector for seven integer arrangements (.8b through
// .2d); the size value 0b110 is not instantiated. The trailing anonymous
// Pats reuse the integer records for floating-point vector types with the
// same lane layout: v4f16/v8f16 select the v4i16/v8i16 records, v2f32/v4f32
// select v2i32/v4i32, and v2f64 selects v2i64.
multiclass SIMDZipVector<bits<3>opc, string asm,
SDNode OpNode> {
def v8i8 : BaseSIMDZipVector<0b000, opc, V64,
asm, ".8b", OpNode, v8i8>;
def v16i8 : BaseSIMDZipVector<0b001, opc, V128,
asm, ".16b", OpNode, v16i8>;
def v4i16 : BaseSIMDZipVector<0b010, opc, V64,
asm, ".4h", OpNode, v4i16>;
def v8i16 : BaseSIMDZipVector<0b011, opc, V128,
asm, ".8h", OpNode, v8i16>;
def v2i32 : BaseSIMDZipVector<0b100, opc, V64,
asm, ".2s", OpNode, v2i32>;
def v4i32 : BaseSIMDZipVector<0b101, opc, V128,
asm, ".4s", OpNode, v4i32>;
def v2i64 : BaseSIMDZipVector<0b111, opc, V128,
asm, ".2d", OpNode, v2i64>;
// Floating-point vector types map onto the equally-sized integer records.
def : Pat<(v4f16 (OpNode V64:$Rn, V64:$Rm)),
(!cast<Instruction>(NAME#"v4i16") V64:$Rn, V64:$Rm)>;
def : Pat<(v8f16 (OpNode V128:$Rn, V128:$Rm)),
(!cast<Instruction>(NAME#"v8i16") V128:$Rn, V128:$Rm)>;
def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
(!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
(!cast<Instruction>(NAME#"v4i32") V128:$Rn, V128:$Rm)>;
def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)),
(!cast<Instruction>(NAME#"v2i64") V128:$Rn, V128:$Rm)>;
// AdvSIMD three register scalar instructions
//
// Base class for scalar instructions with two sources ($Rn, $Rm) and one
// result ($Rd), all in the same register class `regtype`. Printed as
// "asm\t$Rd, $Rn, $Rm" with no arrangement suffix. `size` is three bits wide
// here and occupies Inst{23-21}; `opcode` occupies Inst{15-11}.
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDThreeScalar<bit U, bits<3> size, bits<5> opcode,
RegisterClass regtype, string asm,
list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
"\t$Rd, $Rn, $Rm", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
// 32-bit encoding.
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-21} = size;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Tied three-register scalar base class. The operand lists are supplied by
// the caller as `oops` / `iops`; the constraint "$Rd = $dst" ties the input
// named $Rd to the output named $dst, so the caller's dags must use those
// names. Compared with BaseSIMDThreeScalar, `size` is two bits
// (Inst{23-22}) and Inst{21} is a separate `R` bit.
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode,
dag oops, dag iops, string asm,
list<dag> pattern>
: I<oops, iops, asm, "\t$Rd, $Rn, $Rm", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
// 32-bit encoding.
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21} = R;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Defines a single FPR64 record (v1i64, size = 0b111) whose pattern applies
// OpNode to two v1i64 operands and yields a v1i64 result.
multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
[(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
// Defines scalar records for all four element widths: v1i64 (FPR64),
// v1i32 (FPR32), v1i16 (FPR16) and v1i8 (FPR8), with size values 0b111,
// 0b101, 0b011 and 0b001. Only v1i64 carries an attached pattern (on the
// v1i64 vector type). The two trailing Pats additionally select the v1i64
// and v1i32 records when OpNode is applied to plain i64 / i32 values; the
// 16-bit and 8-bit records have no selection pattern here.
multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
[(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, []>;
def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
def v1i8 : BaseSIMDThreeScalar<U, 0b001, opc, FPR8 , asm, []>;
def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
(!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
(!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
// Defines scalar records for 32-bit (v1i32, FPR32, size 0b101) and 16-bit
// (v1i16, FPR16, size 0b011) operands. Only the 32-bit record has a
// pattern, written on the register class without explicit value types.
multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm,
[(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
// Tied scalar records for 32-bit (size 0b10) and 16-bit (size 0b01)
// operands. Each takes the destination register as an extra input ($Rd)
// tied to $dst by the base class. Both records have empty pattern lists, so
// the OpNode parameter is not referenced anywhere in this multiclass.
multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst),
(ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm),
asm, []>;
def v1i16: BaseSIMDThreeScalarTied<U, 0b01, R, opc, (outs FPR16:$dst),
(ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm),
asm, []>;
// Floating-point three-register scalar records named NAME#64, NAME#32 and
// NAME#16. The 3-bit size is {S,0b11} / {S,0b01} / {S,0b10} respectively.
// The 5-bit opcode is the 3-bit `opc` prefixed with 0b11 for the 64- and
// 32-bit records and with 0b00 for the 16-bit record, which also requires
// [HasNEON, HasFullFP16]. The trailing Pat reuses the 64-bit record for
// OpNode on v1f64 operands. OpNode defaults to null_frag.
multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
let Predicates = [HasNEON, HasFullFP16] in {
def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
[(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]>;
} // Predicates = [HasNEON, HasFullFP16]
def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
// Floating-point scalar compare records named NAME#64, NAME#32 and NAME#16.
// Encoding arguments follow SIMDFPThreeScalar (size {S,0b11} / {S,0b01} /
// {S,0b10}; opcode {0b11,opc} for 64/32-bit and {0b00,opc} for 16-bit).
// The 64- and 32-bit patterns take FP operands and yield an integer result
// of the same width (f64 -> i64, f32 -> i32). The 16-bit record requires
// [HasNEON, HasFullFP16] and carries an empty pattern list. The trailing Pat
// reuses the 64-bit record for OpNode on v1f64 operands producing v1i64.
multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
let Predicates = [HasNEON, HasFullFP16] in {
def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
[]>;
} // Predicates = [HasNEON, HasFullFP16]
def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
// Base class for three-register scalar instructions whose result and source
// register classes differ. Operand lists (`oops`, `iops`) and the constraint
// string (`cstr`) are supplied by the caller, so the same class serves both
// untied and tied records. Differs from BaseSIMDThreeScalar in the fixed
// bits: Inst{21} is 1 and Inst{10} is 0, with a 2-bit size in Inst{23-22}.
class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode,
dag oops, dag iops, string asm, string cstr, list<dag> pat>
: I<oops, iops, asm,
"\t$Rd, $Rn, $Rm", cstr, pat>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
// 32-bit encoding.
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21} = 1;
let Inst{20-16} = Rm;
let Inst{15-11} = opcode;
let Inst{10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Untied mixed-width scalar records:
//   i16 - two FPR16 sources, FPR32 result, size 0b01, no pattern.
//   i32 - two FPR32 sources, FPR64 result, size 0b10, with a pattern applying
//         OpNode to two i32 values and yielding i64.
// Both pass an empty constraint string. OpNode defaults to null_frag.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
(outs FPR32:$Rd),
(ins FPR16:$Rn, FPR16:$Rm), asm, "", []>;
def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc,
(outs FPR64:$Rd),
(ins FPR32:$Rn, FPR32:$Rm), asm, "",
[(set (i64 FPR64:$Rd), (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
// Tied mixed-width scalar records. Each names its result $dst, takes the
// wide destination register as an extra input $Rd, and passes the constraint
// "$Rd = $dst" to tie them.
//   i16 - FPR32 accumulator with two FPR16 sources, no pattern.
//   i32 - FPR64 accumulator with two FPR32 sources; the pattern applies the
//         three-operand OpNode to (i64 $Rd, i32 $Rn, i32 $Rm).
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
(outs FPR32:$dst),
(ins FPR32:$Rd, FPR16:$Rn, FPR16:$Rm),
asm, "$Rd = $dst", []>;
def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc,
(outs FPR64:$dst),
(ins FPR64:$Rd, FPR32:$Rn, FPR32:$Rm),
asm, "$Rd = $dst",
[(set (i64 FPR64:$dst),
(OpNode (i64 FPR64:$Rd), (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
// AdvSIMD two register scalar instructions
//
// Base class for scalar instructions with one source ($Rn in `regtype2`) and
// one result ($Rd in `regtype`); the two register classes may differ.
// Printed as "asm\t$Rd, $Rn". Two separate 2-bit fields are encoded:
// `size` in Inst{23-22} and `size2` in Inst{20-19}.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
RegisterClass regtype, RegisterClass regtype2,
string asm, list<dag> pat>
: I<(outs regtype:$Rd), (ins regtype2:$Rn), asm,
"\t$Rd, $Rn", "", pat>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
// 32-bit encoding.
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21} = 0b1;
let Inst{20-19} = size2;
let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Tied two-register scalar base class: the result is $dst and the
// destination register is also an input ($Rd in `regtype`), tied by
// "$Rd = $dst"; the real source is $Rn in `regtype2`. Unlike
// BaseSIMDTwoScalar there is no `size2` parameter - Inst{21-17} is the fixed
// value 0b10000.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode,
RegisterClass regtype, RegisterClass regtype2,
string asm, list<dag> pat>
: I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm,
"\t$Rd, $Rn", "$Rd = $dst", pat>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
// 32-bit encoding.
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21-17} = 0b10000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Base class for scalar compare-against-zero instructions: one source and
// one result in the same register class, printed as "asm\t$Rd, $Rn, #<zero>"
// where `zero` is the caller-supplied literal text. The class attaches no
// selection pattern (empty list); users add Pats separately. The bit layout
// matches BaseSIMDTwoScalar (size in Inst{23-22}, size2 in Inst{20-19}).
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
RegisterClass regtype, string asm, string zero>
: I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
"\t$Rd, $Rn, #" # zero, "", []>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
// 32-bit encoding.
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21} = 0b1;
let Inst{20-19} = size2;
let Inst{18-17} = 0b00;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Scalar instruction taking an FPR64 source and producing an FPR32 result,
// selected for the int_aarch64_sisd_fcvtxn intrinsic (f64 -> f32).
// Inst{31-17} is a fixed prefix; only `opcode` and the registers vary.
class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
: I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
[(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-17} = 0b011111100110000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Defines the 64-bit compare-with-immediate form `v1i64rz` (immediate text
// "0") and a pattern selecting it for `OpNode` applied to a v1i64 value held
// in an FPR64 register.
multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, 0b00, opc, FPR64, asm, "0">;
def : Pat<(v1i64 (OpNode FPR64:$Rn)),
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
// Floating-point compare-with-immediate forms. The canonical immediate text
// is "0.0"; the FPR16 form is only defined under [HasNEON, HasFullFP16].
// `S` supplies the high bit of the 2-bit size field.
multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">;
def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">;
let Predicates = [HasNEON, HasFullFP16] in {
def v1i16rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">;
// Aliases that additionally accept the spelling "#0" for the immediate.
// NOTE(review): the trailing 0 is presumably InstAlias's emit flag, making
// these parse-only aliases - confirm against the InstAlias definition.
def : InstAlias<asm # "\t$Rd, $Rn, #0",
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>;
def : InstAlias<asm # "\t$Rd, $Rn, #0",
(!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>;
let Predicates = [HasNEON, HasFullFP16] in {
def : InstAlias<asm # "\t$Rd, $Rn, #0",
(!cast<Instruction>(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>;
// Select the 64-bit form for OpNode on a v1f64 operand yielding v1i64.
def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))),
(!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
// 64-bit-only scalar two-register operation. The instruction's own pattern
// matches OpNode on v1i64; the extra Pat reuses the same instruction for
// OpNode on a plain i64 held in an FPR64 register.
multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm,
[(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>;
def : Pat<(i64 (OpNode (i64 FPR64:$Rn))),
(!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>;
// Pattern-less floating-point scalar two-register forms for FPR64 and FPR32,
// plus an FPR16 form that is only defined under [HasNEON, HasFullFP16].
// `S` supplies the high bit of the 2-bit size field.
multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> {
def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>;
def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>;
let Predicates = [HasNEON, HasFullFP16] in {
def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>;
// Same three register widths as SIMDFPTwoScalar, but each form carries a
// selection pattern applying `OpNode` to the f64 / f32 / f16 source.
// The f16 form is only defined under [HasNEON, HasFullFP16].
multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,
[(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,
[(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
let Predicates = [HasNEON, HasFullFP16] in {
def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
[(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>;
// Scalar two-register operation at all four element widths (size field
// 0b11 / 0b10 / 0b01 / 0b00 for FPR64 / FPR32 / FPR16 / FPR8). Only the
// 64- and 32-bit forms carry a selection pattern; the trailing Pat maps
// OpNode on v1i64 onto the 64-bit form as well.
multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm,
[(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR32, asm,
[(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR16, asm, []>;
def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR8 , asm, []>;
def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))),
(!cast<Instruction>(NAME # v1i64) FPR64:$Rn)>;
// Tied counterpart of SIMDTwoScalarBHSD: the intrinsic `OpNode` takes the
// current destination value ($Rd) and the source ($Rn) and produces $dst.
// Only the 64- and 32-bit forms carry a pattern; the trailing Pat maps the
// v1i64 flavour of the intrinsic onto the 64-bit instruction.
multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm,
Intrinsic OpNode> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def v1i64 : BaseSIMDTwoScalarTied<U, 0b11, opc, FPR64, FPR64, asm,
[(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn)))]>;
def v1i32 : BaseSIMDTwoScalarTied<U, 0b10, opc, FPR32, FPR32, asm,
[(set (i32 FPR32:$dst), (OpNode (i32 FPR32:$Rd), (i32 FPR32:$Rn)))]>;
def v1i16 : BaseSIMDTwoScalarTied<U, 0b01, opc, FPR16, FPR16, asm, []>;
def v1i8 : BaseSIMDTwoScalarTied<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))),
(!cast<Instruction>(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>;
// Scalar two-register forms whose source register class is one step wider
// than the destination (FPR64->FPR32, FPR32->FPR16, FPR16->FPR8). Only the
// FPR64->FPR32 form carries a selection pattern.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR64, asm,
[(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR32, asm, []>;
def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR16, asm, []>;
// AdvSIMD scalar pairwise instructions
// Encoding template for a scalar pairwise instruction: a vector source
// ($Rn, `vectype`) and a scalar result ($Rd, `regtype`). The operand string
// has two assembler-variant spellings: the first appends `kind` to $Rn, the
// second appends `kind` to the mnemonic instead. No selection pattern.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDPairwiseScalar<bit U, bits<2> size, bits<5> opcode,
RegisterOperand regtype, RegisterOperand vectype,
string asm, string kind>
: I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
"{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21-17} = 0b11000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Single pairwise form: a V128 source printed with the ".2d" arrangement,
// producing a 64-bit scalar (FPR64Op).
multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> {
def v2i64p : BaseSIMDPairwiseScalar<U, 0b11, opc, FPR64Op, V128,
asm, ".2d">;
// Floating-point pairwise forms for ".2h", ".2s" and ".2d" sources. The
// half-precision form uses U = 0 and is only defined under
// [HasNEON, HasFullFP16]; the other two use U = 1.
multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> {
let Predicates = [HasNEON, HasFullFP16] in {
def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64,
asm, ".2h">;
def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64,
asm, ".2s">;
def v2i64p : BaseSIMDPairwiseScalar<1, {S,1}, opc, FPR64Op, V128,
asm, ".2d">;
// AdvSIMD across lanes instructions
// Encoding template for an across-lanes instruction: a vector source
// ($Rn, `vectype`) reduced to a scalar register ($Rd, `regtype`).
// Q selects Inst{30}, U Inst{29}, size Inst{23-22}, opcode Inst{16-12}.
// As in the pairwise template, `kind` is appended either to $Rn or to the
// mnemonic depending on the assembler variant.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDAcrossLanes<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterClass regtype, RegisterOperand vectype,
string asm, string kind, list<dag> pattern>
: I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
"{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21-17} = 0b11000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Across-lanes forms whose scalar result has the same width as one source
// element: .8b/.16b -> FPR8, .4h/.8h -> FPR16, .4s -> FPR32.
// No selection patterns are attached.
multiclass SIMDAcrossLanesBHS<bit U, bits<5> opcode,
string asm> {
def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64,
asm, ".8b", []>;
def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128,
asm, ".16b", []>;
def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64,
asm, ".4h", []>;
def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128,
asm, ".8h", []>;
def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128,
asm, ".4s", []>;
// Across-lanes forms whose scalar result is one step wider than a source
// element: .8b/.16b -> FPR16, .4h/.8h -> FPR32, .4s -> FPR64.
// No selection patterns are attached.
multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> {
def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64,
asm, ".8b", []>;
def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128,
asm, ".16b", []>;
def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64,
asm, ".4h", []>;
def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128,
asm, ".8h", []>;
def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128,
asm, ".4s", []>;
// Floating-point across-lanes forms, each selected for the intrinsic `intOp`
// on the matching vector type. The half-precision forms use U = 0 and are
// only defined under [HasNEON, HasFullFP16]; the .4s form uses U = 1.
// `sz1` supplies the high bit of the size field.
multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm,
Intrinsic intOp> {
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64,
asm, ".4h",
[(set (f16 FPR16:$Rd), (intOp (v4f16 V64:$Rn)))]>;
def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128,
asm, ".8h",
[(set (f16 FPR16:$Rd), (intOp (v8f16 V128:$Rn)))]>;
} // Predicates = [HasNEON, HasFullFP16]
def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
asm, ".4s",
[(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>;
// AdvSIMD INS/DUP instructions
// FIXME: There has got to be a better way to factor these. ugh.
// Common encoding for the INS/DUP family. The operand lists, syntax strings,
// constraints and pattern are supplied by the derived class. This class
// leaves Inst{20-16} and Inst{14-11} unset; the derived classes and defs
// below assign them (imm5 / lane index bits and a 4-bit selector).
class BaseSIMDInsDup<bit Q, bit op, dag outs, dag ins, string asm,
string operands, string constraints, list<dag> pattern>
: I<outs, ins, asm, operands, constraints, pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = op;
let Inst{28-21} = 0b01110000;
let Inst{15} = 0;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// "dup" from a scalar register of class `regtype` into every element of a
// vector register; selected for AArch64dup. `imm5` is written verbatim into
// Inst{20-16} and Inst{14-11} is fixed to 0b0001.
class SIMDDupFromMain<bit Q, bits<5> imm5, string size, ValueType vectype,
RegisterOperand vecreg, RegisterClass regtype>
: BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins regtype:$Rn), "dup",
"{\t$Rd" # size # ", $Rn" #
"|" # size # "\t$Rd, $Rn}", "",
[(set (vectype vecreg:$Rd), (AArch64dup regtype:$Rn))]> {
let Inst{20-16} = imm5;
let Inst{14-11} = 0b0000 + 0b0001;
// "dup" of one lane ($idx) of a V128 source into every element of the
// destination vector; selected for `OpNode` (a duplane node supplied by the
// derived class). Inst{14-11} is fixed to 0b0000; the derived classes encode
// the lane index into Inst{20-16}.
// Note: `elttype` is accepted but not referenced in the visible body.
class SIMDDupFromElement<bit Q, string dstkind, string srckind,
ValueType vectype, ValueType insreg,
RegisterOperand vecreg, Operand idxtype,
ValueType elttype, SDNode OpNode>
: BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins V128:$Rn, idxtype:$idx), "dup",
"{\t$Rd" # dstkind # ", $Rn" # srckind # "$idx" #
"|" # dstkind # "\t$Rd, $Rn$idx}", "",
[(set (vectype vecreg:$Rd),
(OpNode (insreg V128:$Rn), idxtype:$idx))]> {
let Inst{14-11} = 0b0000;
// Lane dup for 64-bit elements (".2d" <- ".d"): a 1-bit lane index in
// Inst{20}, with Inst{19-16} fixed to 0b1000.
class SIMDDup64FromElement
: SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128,
VectorIndexD, i64, AArch64duplane64> {
bits<1> idx;
let Inst{20} = idx;
let Inst{19-16} = 0b1000;
// Lane dup for 32-bit elements (source ".s"): a 2-bit lane index in
// Inst{20-19}, with Inst{18-16} fixed to 0b100.
class SIMDDup32FromElement<bit Q, string size, ValueType vectype,
RegisterOperand vecreg>
: SIMDDupFromElement<Q, size, ".s", vectype, v4i32, vecreg,
VectorIndexS, i64, AArch64duplane32> {
bits<2> idx;
let Inst{20-19} = idx;
let Inst{18-16} = 0b100;
// Lane dup for 16-bit elements (source ".h"): a 3-bit lane index in
// Inst{20-18}, with Inst{17-16} fixed to 0b10.
class SIMDDup16FromElement<bit Q, string size, ValueType vectype,
RegisterOperand vecreg>
: SIMDDupFromElement<Q, size, ".h", vectype, v8i16, vecreg,
VectorIndexH, i64, AArch64duplane16> {
bits<3> idx;
let Inst{20-18} = idx;
let Inst{17-16} = 0b10;
// Lane dup for 8-bit elements (source ".b"): a 4-bit lane index in
// Inst{20-17}, with Inst{16} fixed to 1.
class SIMDDup8FromElement<bit Q, string size, ValueType vectype,
RegisterOperand vecreg>
: SIMDDupFromElement<Q, size, ".b", vectype, v16i8, vecreg,
VectorIndexB, i64, AArch64duplane8> {
bits<4> idx;
let Inst{20-17} = idx;
let Inst{16} = 1;
// Shared template for moving one lane ($idx) of a V128 register into a
// scalar register of class `regtype`. `imm4` is written to Inst{14-11};
// the lane-index bits in Inst{20-16} are assigned by each def.
class BaseSIMDMov<bit Q, string size, bits<4> imm4, RegisterClass regtype,
Operand idxtype, string asm, list<dag> pattern>
: BaseSIMDInsDup<Q, 0, (outs regtype:$Rd), (ins V128:$Rn, idxtype:$idx), asm,
"{\t$Rd, $Rn" # size # "$idx" #
"|" # size # "\t$Rd, $Rn$idx}", "", pattern> {
let Inst{14-11} = imm4;
// "smov" lane-to-scalar move: BaseSIMDMov with imm4 = 0b0101 and no
// selection pattern.
class SIMDSMov<bit Q, string size, RegisterClass regtype,
Operand idxtype>
: BaseSIMDMov<Q, size, 0b0101, regtype, idxtype, "smov", []>;
// "umov" lane-to-scalar move: BaseSIMDMov with imm4 = 0b0111, selected for
// vector_extract of lane $idx from a `vectype` value held in V128.
class SIMDUMov<bit Q, string size, ValueType vectype, RegisterClass regtype,
Operand idxtype>
: BaseSIMDMov<Q, size, 0b0111, regtype, idxtype, "umov",
[(set regtype:$Rd, (vector_extract (vectype V128:$Rn), idxtype:$idx))]>;
// Assembler alias with mnemonic `asm` for a lane-to-scalar move instruction
// `inst`, using the same two operand spellings as BaseSIMDMov.
// Takes five arguments: mnemonic, size suffix, target instruction,
// destination register class and index operand type.
class SIMDMovAlias<string asm, string size, Instruction inst,
RegisterClass regtype, Operand idxtype>
: InstAlias<asm#"{\t$dst, $src"#size#"$idx" #
"|" # size # "\t$dst, $src$idx}",
(inst regtype:$dst, V128:$src, idxtype:$idx)>;
// Instantiates the "smov" forms. The def name gives the source element width
// and destination width (e.g. vi8to32: .b lane -> GPR32). Each def encodes
// its lane index in the upper bits of Inst{20-16} and fills the remaining
// low bits with a fixed marker (1, 0b10 or 0b100) that grows with the
// element size.
multiclass SMov {
def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> {
bits<4> idx;
let Inst{20-17} = idx;
let Inst{16} = 1;
def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> {
bits<4> idx;
let Inst{20-17} = idx;
let Inst{16} = 1;
def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> {
bits<3> idx;
let Inst{20-18} = idx;
let Inst{17-16} = 0b10;
def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> {
bits<3> idx;
let Inst{20-18} = idx;
let Inst{17-16} = 0b10;
def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> {
bits<2> idx;
let Inst{20-19} = idx;
let Inst{18-16} = 0b100;
// Instantiates the "umov" forms for .b/.h/.s lanes (into GPR32) and .d lanes
// (into GPR64). Each def encodes its lane index in the upper bits of
// Inst{20-16} and fills the remaining low bits with a fixed marker
// (1, 0b10, 0b100 or 0b1000) that grows with the element size.
multiclass UMov {
  def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> {
    bits<4> idx;
    let Inst{20-17} = idx;
    let Inst{16} = 1;
  }
  def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> {
    bits<3> idx;
    let Inst{20-18} = idx;
    let Inst{17-16} = 0b10;
  }
  def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> {
    bits<2> idx;
    let Inst{20-19} = idx;
    let Inst{18-16} = 0b100;
  }
  def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> {
    bits<1> idx;
    let Inst{20} = idx;
    let Inst{19-16} = 0b1000;
  }

  // "mov" aliases for the 32- and 64-bit lane forms. SIMDMovAlias takes the
  // aliased instruction as its third argument, so each alias names the def
  // above with the same register class and index operand.
  def : SIMDMovAlias<"mov", ".s",
                     !cast<Instruction>(NAME#"vi32"),
                     GPR32, VectorIndexS>;
  def : SIMDMovAlias<"mov", ".d",
                     !cast<Instruction>(NAME#"vi64"),
                     GPR64, VectorIndexD>;
}
// "ins" of a scalar register ($Rn, `regtype`) into lane $idx of a V128
// register. $Rd is both read and written ("$Rd = $dst"). Selected for
// vector_insert; Inst{14-11} is fixed to 0b0011.
class SIMDInsFromMain<string size, ValueType vectype,
RegisterClass regtype, Operand idxtype>
: BaseSIMDInsDup<1, 0, (outs V128:$dst),
(ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins",
"{\t$Rd" # size # "$idx, $Rn" #
"|" # size # "\t$Rd$idx, $Rn}",
"$Rd = $dst",
[(set V128:$dst,
(vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> {
let Inst{14-11} = 0b0011;
// "ins" of lane $idx2 of V128 source $Rn into lane $idx of V128 register
// $Rd, which is both read and written ("$Rd = $dst"). Uses op = 1 in
// BaseSIMDInsDup; the defs instantiating this class assign Inst{20-16}
// and Inst{14-11}.
// NOTE(review): the selection pattern below looks truncated - the operator
// wrapping these operands and the closing of the dag/class are not visible
// here. Confirm against the complete file before editing.
class SIMDInsFromElement<string size, ValueType vectype,
ValueType elttype, Operand idxtype>
: BaseSIMDInsDup<1, 1, (outs V128:$dst),
(ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins",
"{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" #
"|" # size # "\t$Rd$idx, $Rn$idx2}",
"$Rd = $dst",
[(set V128:$dst,
(vectype V128:$Rd),
(elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)),
// "mov" alias for an insert-from-scalar-register instruction `inst`.
// Takes four arguments: size suffix, target instruction, source register
// class and index operand type.
class SIMDInsMainMovAlias<string size, Instruction inst,
RegisterClass regtype, Operand idxtype>
: InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" #
"|" # size #"\t$dst$idx, $src}",
(inst V128:$dst, idxtype:$idx, regtype:$src)>;
// "mov" alias for an insert-from-lane instruction `inst`.
// Takes three arguments: size suffix, target instruction and the index
// operand type used for both the destination and source lane.
class SIMDInsElementMovAlias<string size, Instruction inst,
Operand idxtype>
: InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2"
# "|" # size #"\t$dst$idx, $src$idx2}",
(inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>;
// Instantiates every "ins" form plus the preferred "mov" aliases.
//   vi*gpr  : insert from a general-purpose register into lane $idx.
//   vi*lane : insert lane $idx2 of another vector into lane $idx.
// The destination lane index occupies the upper bits of Inst{20-16}, with a
// fixed marker (1, 0b10, 0b100, 0b1000) in the remaining low bits. For the
// lane forms the source lane index occupies the upper bits of Inst{14-11};
// the leftover low bits are left unspecified ({?}).
multiclass SIMDIns {
  def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> {
    bits<4> idx;
    let Inst{20-17} = idx;
    let Inst{16} = 1;
  }
  def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> {
    bits<3> idx;
    let Inst{20-18} = idx;
    let Inst{17-16} = 0b10;
  }
  def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> {
    bits<2> idx;
    let Inst{20-19} = idx;
    let Inst{18-16} = 0b100;
  }
  def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> {
    bits<1> idx;
    let Inst{20} = idx;
    let Inst{19-16} = 0b1000;
  }

  def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> {
    bits<4> idx;
    bits<4> idx2;
    let Inst{20-17} = idx;
    let Inst{16} = 1;
    let Inst{14-11} = idx2;
  }
  def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> {
    bits<3> idx;
    bits<3> idx2;
    let Inst{20-18} = idx;
    let Inst{17-16} = 0b10;
    let Inst{14-12} = idx2;
    let Inst{11} = {?};
  }
  def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> {
    bits<2> idx;
    bits<2> idx2;
    let Inst{20-19} = idx;
    let Inst{18-16} = 0b100;
    let Inst{14-13} = idx2;
    let Inst{12-11} = {?,?};
  }
  def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> {
    bits<1> idx;
    bits<1> idx2;
    let Inst{20} = idx;
    let Inst{19-16} = 0b1000;
    let Inst{14} = idx2;
    let Inst{13-11} = {?,?,?};
  }

  // For all forms of the INS instruction, the "mov" mnemonic is the
  // preferred alias. Why they didn't just call the instruction "mov" in
  // the first place is a very good question indeed...
  def : SIMDInsMainMovAlias<".b", !cast<Instruction>(NAME#"vi8gpr"),
                            GPR32, VectorIndexB>;
  def : SIMDInsMainMovAlias<".h", !cast<Instruction>(NAME#"vi16gpr"),
                            GPR32, VectorIndexH>;
  def : SIMDInsMainMovAlias<".s", !cast<Instruction>(NAME#"vi32gpr"),
                            GPR32, VectorIndexS>;
  def : SIMDInsMainMovAlias<".d", !cast<Instruction>(NAME#"vi64gpr"),
                            GPR64, VectorIndexD>;

  // SIMDInsElementMovAlias takes <size, inst, idxtype>; the index operand
  // type of each alias matches the one used by the aliased *lane def.
  def : SIMDInsElementMovAlias<".b", !cast<Instruction>(NAME#"vi8lane"),
                               VectorIndexB>;
  def : SIMDInsElementMovAlias<".h", !cast<Instruction>(NAME#"vi16lane"),
                               VectorIndexH>;
  def : SIMDInsElementMovAlias<".s", !cast<Instruction>(NAME#"vi32lane"),
                               VectorIndexS>;
  def : SIMDInsElementMovAlias<".d", !cast<Instruction>(NAME#"vi64lane"),
                               VectorIndexD>;
}
// Encoding template for a table-lookup instruction: a register list $Vn
// (`listtype`) and an index vector $Vm produce $Vd. `len` goes to
// Inst{14-13} and `op` to Inst{12}. No selection pattern is attached.
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDTableLookup<bit Q, bits<2> len, bit op, RegisterOperand vectype,
RegisterOperand listtype, string asm, string kind>
: I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm,
"\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>,
Sched<[WriteV]> {
bits<5> Vd;
bits<5> Vn;
bits<5> Vm;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29-21} = 0b001110000;
let Inst{20-16} = Vm;
let Inst{15} = 0;
let Inst{14-13} = len;
let Inst{12} = op;
let Inst{11-10} = 0b00;
let Inst{9-5} = Vn;
let Inst{4-0} = Vd;
// Tied variant of BaseSIMDTableLookup: $Vd is also an input and is
// constrained to the output via "$Vd = $dst". Encoding layout is identical.
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDTableLookupTied<bit Q, bits<2> len, bit op, RegisterOperand vectype,
RegisterOperand listtype, string asm, string kind>
: I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm,
"\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>,
Sched<[WriteV]> {
bits<5> Vd;
bits<5> Vn;
bits<5> Vm;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29-21} = 0b001110000;
let Inst{20-16} = Vm;
let Inst{15} = 0;
let Inst{14-13} = len;
let Inst{12} = op;
let Inst{11-10} = 0b00;
let Inst{9-5} = Vn;
let Inst{4-0} = Vd;
// Alias for a table-lookup instruction `inst` whose syntax is
// "<asm>\t$dst, $lst, $index". Takes four arguments: mnemonic (including
// any arrangement suffix), target instruction, vector operand and list
// operand. The alias is registered with a trailing 0 argument.
class SIMDTableLookupAlias<string asm, Instruction inst,
RegisterOperand vectype, RegisterOperand listtype>
: InstAlias<!strconcat(asm, "\t$dst, $lst, $index"),
(inst vectype:$dst, listtype:$lst, vectype:$index), 0>;
// Instantiates the untied table-lookup forms for one to four table
// registers (len = 0b00..0b11) with a 64-bit (".8b", Q = 0) or 128-bit
// (".16b", Q = 1) index/result vector, plus one alias per form that
// carries the arrangement on the mnemonic and uses the *128 list operands.
multiclass SIMDTableLookup<bit op, string asm> {
  def v8i8One   : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b,
                                      asm, ".8b">;
  def v8i8Two   : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b,
                                      asm, ".8b">;
  def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b,
                                      asm, ".8b">;
  def v8i8Four  : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b,
                                      asm, ".8b">;
  def v16i8One  : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b,
                                      asm, ".16b">;
  def v16i8Two  : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b,
                                      asm, ".16b">;
  def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b,
                                      asm, ".16b">;
  def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b,
                                      asm, ".16b">;

  // SIMDTableLookupAlias takes the aliased instruction as its second
  // argument; each alias targets the def with the same vector width and
  // table length.
  def : SIMDTableLookupAlias<asm # ".8b",
                         !cast<Instruction>(NAME#"v8i8One"),
                         V64, VecListOne128>;
  def : SIMDTableLookupAlias<asm # ".8b",
                         !cast<Instruction>(NAME#"v8i8Two"),
                         V64, VecListTwo128>;
  def : SIMDTableLookupAlias<asm # ".8b",
                         !cast<Instruction>(NAME#"v8i8Three"),
                         V64, VecListThree128>;
  def : SIMDTableLookupAlias<asm # ".8b",
                         !cast<Instruction>(NAME#"v8i8Four"),
                         V64, VecListFour128>;
  def : SIMDTableLookupAlias<asm # ".16b",
                         !cast<Instruction>(NAME#"v16i8One"),
                         V128, VecListOne128>;
  def : SIMDTableLookupAlias<asm # ".16b",
                         !cast<Instruction>(NAME#"v16i8Two"),
                         V128, VecListTwo128>;
  def : SIMDTableLookupAlias<asm # ".16b",
                         !cast<Instruction>(NAME#"v16i8Three"),
                         V128, VecListThree128>;
  def : SIMDTableLookupAlias<asm # ".16b",
                         !cast<Instruction>(NAME#"v16i8Four"),
                         V128, VecListFour128>;
}
// Instantiates the tied table-lookup forms (destination is also an input)
// for one to four table registers (len = 0b00..0b11) with a 64-bit (".8b",
// Q = 0) or 128-bit (".16b", Q = 1) index/result vector, plus one alias per
// form that carries the arrangement on the mnemonic and uses the *128 list
// operands.
multiclass SIMDTableLookupTied<bit op, string asm> {
  def v8i8One   : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b,
                                          asm, ".8b">;
  def v8i8Two   : BaseSIMDTableLookupTied<0, 0b01, op, V64, VecListTwo16b,
                                          asm, ".8b">;
  def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b,
                                          asm, ".8b">;
  def v8i8Four  : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b,
                                          asm, ".8b">;
  def v16i8One  : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b,
                                          asm, ".16b">;
  def v16i8Two  : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b,
                                          asm, ".16b">;
  def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b,
                                          asm, ".16b">;
  def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b,
                                          asm, ".16b">;

  // SIMDTableLookupAlias takes the aliased instruction as its second
  // argument; each alias targets the def with the same vector width and
  // table length.
  def : SIMDTableLookupAlias<asm # ".8b",
                         !cast<Instruction>(NAME#"v8i8One"),
                         V64, VecListOne128>;
  def : SIMDTableLookupAlias<asm # ".8b",
                         !cast<Instruction>(NAME#"v8i8Two"),
                         V64, VecListTwo128>;
  def : SIMDTableLookupAlias<asm # ".8b",
                         !cast<Instruction>(NAME#"v8i8Three"),
                         V64, VecListThree128>;
  def : SIMDTableLookupAlias<asm # ".8b",
                         !cast<Instruction>(NAME#"v8i8Four"),
                         V64, VecListFour128>;
  def : SIMDTableLookupAlias<asm # ".16b",
                         !cast<Instruction>(NAME#"v16i8One"),
                         V128, VecListOne128>;
  def : SIMDTableLookupAlias<asm # ".16b",
                         !cast<Instruction>(NAME#"v16i8Two"),
                         V128, VecListTwo128>;
  def : SIMDTableLookupAlias<asm # ".16b",
                         !cast<Instruction>(NAME#"v16i8Three"),
                         V128, VecListThree128>;
  def : SIMDTableLookupAlias<asm # ".16b",
                         !cast<Instruction>(NAME#"v16i8Four"),
                         V128, VecListFour128>;
}
// AdvSIMD scalar CPY
// Encoding template for the scalar copy of one vector lane into a scalar
// FP/SIMD register. The mnemonic is hard-coded to "mov". Inst{20-16} (lane
// index plus size marker) is assigned by each def; no selection pattern.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
string kind, Operand idxtype>
: I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov",
"{\t$dst, $src" # kind # "$idx" #
"|\t$dst, $src$idx}", "", []>,
Sched<[WriteV]> {
bits<5> dst;
bits<5> src;
let Inst{31-21} = 0b01011110000;
let Inst{15-10} = 0b000001;
let Inst{9-5} = src;
let Inst{4-0} = dst;
// Alias with mnemonic `asm` for a scalar-copy instruction `inst`.
// Takes six arguments: mnemonic, size suffix, target instruction, scalar
// register class, vector operand and index operand type. The alias is
// registered with a trailing 0 argument.
class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
RegisterClass regtype, RegisterOperand vectype, Operand idxtype>
: InstAlias<asm # "{\t$dst, $src" # size # "$index"
# "|\t$dst, $src$index}",
(inst regtype:$dst, vectype:$src, idxtype:$index), 0>;
// Instantiates the scalar lane-copy forms for .b/.h/.s/.d lanes, a pattern
// selecting the 64-bit form, and the "dup" spellings as aliases.
// Each def encodes its lane index in the upper bits of Inst{20-16} and
// fills the remaining low bits with a fixed marker that grows with the
// element size. The `asm` parameter is not referenced in this body; the
// mnemonic comes from BaseSIMDScalarCPY.
multiclass SIMDScalarCPY<string asm> {
  def i8 : BaseSIMDScalarCPY<FPR8, V128, ".b", VectorIndexB> {
    bits<4> idx;
    let Inst{20-17} = idx;
    let Inst{16} = 1;
  }
  def i16 : BaseSIMDScalarCPY<FPR16, V128, ".h", VectorIndexH> {
    bits<3> idx;
    let Inst{20-18} = idx;
    let Inst{17-16} = 0b10;
  }
  def i32 : BaseSIMDScalarCPY<FPR32, V128, ".s", VectorIndexS> {
    bits<2> idx;
    let Inst{20-19} = idx;
    let Inst{18-16} = 0b100;
  }
  def i64 : BaseSIMDScalarCPY<FPR64, V128, ".d", VectorIndexD> {
    bits<1> idx;
    let Inst{20} = idx;
    let Inst{19-16} = 0b1000;
  }

  // Extracting a 64-bit lane and re-wrapping it as v1i64 is a single lane
  // copy. The source dag binds the same $idx operand the result dag uses.
  def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src),
                                                           VectorIndexD:$idx)))),
            (!cast<Instruction>(NAME # i64) V128:$src, VectorIndexD:$idx)>;

  // 'DUP' mnemonic aliases. SIMDScalarCPYAlias takes the aliased
  // instruction as its third argument; each alias targets the def with the
  // matching register class and index operand.
  def : SIMDScalarCPYAlias<"dup", ".b",
                           !cast<Instruction>(NAME#"i8"),
                           FPR8, V128, VectorIndexB>;
  def : SIMDScalarCPYAlias<"dup", ".h",
                           !cast<Instruction>(NAME#"i16"),
                           FPR16, V128, VectorIndexH>;
  def : SIMDScalarCPYAlias<"dup", ".s",
                           !cast<Instruction>(NAME#"i32"),
                           FPR32, V128, VectorIndexS>;
  def : SIMDScalarCPYAlias<"dup", ".d",
                           !cast<Instruction>(NAME#"i64"),
                           FPR64, V128, VectorIndexD>;
}
// AdvSIMD modified immediate instructions
// Common encoding for modified-immediate instructions. The 8-bit immediate
// is split across the encoding: imm8{7-5} -> Inst{18-16} and
// imm8{4-0} -> Inst{9-5}. Inst{15-12} is left for derived classes to set.
class BaseSIMDModifiedImm<bit Q, bit op, bit op2, dag oops, dag iops,
string asm, string op_string,
string cstr, list<dag> pattern>
: I<oops, iops, asm, op_string, cstr, pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<8> imm8;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = op;
let Inst{28-19} = 0b0111100000;
let Inst{18-16} = imm8{7-5};
let Inst{11} = op2;
let Inst{10} = 1;
let Inst{9-5} = imm8{4-0};
let Inst{4-0} = Rd;
// Vector-destination modified-immediate template. The input list is the
// immediate concatenated (!con) with an optional shift operand dag, and
// `opt_shift` is the matching text appended to the assembly string.
// Decoding is delegated to DecodeModImmInstruction.
class BaseSIMDModifiedImmVector<bit Q, bit op, bit op2, RegisterOperand vectype,
Operand immtype, dag opt_shift_iop,
string opt_shift, string asm, string kind,
list<dag> pattern>
: BaseSIMDModifiedImm<Q, op, op2, (outs vectype:$Rd),
!con((ins immtype:$imm8), opt_shift_iop), asm,
"{\t$Rd" # kind # ", $imm8" # opt_shift #
"|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
"", pattern> {
let DecoderMethod = "DecodeModImmInstruction";
// Tied variant of BaseSIMDModifiedImmVector: $Rd is also an input
// ("$Rd = $dst"), op2 is fixed to 0, and decoding is delegated to
// DecodeModImmTiedInstruction.
class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype,
Operand immtype, dag opt_shift_iop,
string opt_shift, string asm, string kind,
list<dag> pattern>
: BaseSIMDModifiedImm<Q, op, 0, (outs vectype:$dst),
!con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop),
asm, "{\t$Rd" # kind # ", $imm8" # opt_shift #
"|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
"$Rd = $dst", pattern> {
let DecoderMethod = "DecodeModImmTiedInstruction";
// Modified-immediate form with a logical_vec_shift operand. The 2-bit shift
// encoding goes to Inst{14-13}; `b15_b12` supplies Inst{15} (its bit 1) and
// Inst{12} (its bit 0).
class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
: BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
(ins logical_vec_shift:$shift),
"$shift", asm, kind, pattern> {
bits<2> shift;
let Inst{15} = b15_b12{1};
let Inst{14-13} = shift;
let Inst{12} = b15_b12{0};
// Tied counterpart of BaseSIMDModifiedImmVectorShift; the shift and
// `b15_b12` bits are placed identically.
class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
: BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
(ins logical_vec_shift:$shift),
"$shift", asm, kind, pattern> {
bits<2> shift;
let Inst{15} = b15_b12{1};
let Inst{14-13} = shift;
let Inst{12} = b15_b12{0};
// Modified-immediate form with a logical_vec_hw_shift operand. Only the low
// bit of the shift encoding is used (Inst{13}); Inst{14} is fixed to 0.
// `b15_b12` supplies Inst{15} and Inst{12}.
class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
: BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
(ins logical_vec_hw_shift:$shift),
"$shift", asm, kind, pattern> {
bits<2> shift;
let Inst{15} = b15_b12{1};
let Inst{14} = 0;
let Inst{13} = shift{0};
let Inst{12} = b15_b12{0};
// Tied counterpart of BaseSIMDModifiedImmVectorShiftHalf; the shift and
// `b15_b12` bits are placed identically.
class BaseSIMDModifiedImmVectorShiftHalfTied<bit Q, bit op, bits<2> b15_b12,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
: BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
(ins logical_vec_hw_shift:$shift),
"$shift", asm, kind, pattern> {
bits<2> shift;
let Inst{15} = b15_b12{1};
let Inst{14} = 0;
let Inst{13} = shift{0};
let Inst{12} = b15_b12{0};
// Untied, pattern-less shifted modified-immediate forms: the .4h/.8h defs
// use the half-word shift template with `hw_cmode`, the .2s/.4s defs use
// the word shift template with `w_cmode`.
multiclass SIMDModifiedImmVectorShift<bit op, bits<2> hw_cmode, bits<2> w_cmode,
string asm> {
def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64,
asm, ".4h", []>;
def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128,
asm, ".8h", []>;
def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64,
asm, ".2s", []>;
def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128,
asm, ".4s", []>;
// Tied shifted modified-immediate forms for .4h/.8h/.2s/.4s. Each def is
// selected for `OpNode` applied to the current destination value ($Rd), the
// 8-bit immediate and the shift amount, both matched as i32 immediates.
multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode,
bits<2> w_cmode, string asm,
SDNode OpNode> {
def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64,
asm, ".4h",
[(set (v4i16 V64:$dst), (OpNode V64:$Rd,
(i32 imm:$shift)))]>;
def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128,
asm, ".8h",
[(set (v8i16 V128:$dst), (OpNode V128:$Rd,
(i32 imm:$shift)))]>;
def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64,
asm, ".2s",
[(set (v2i32 V64:$dst), (OpNode V64:$Rd,
(i32 imm:$shift)))]>;
def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128,
asm, ".4s",
[(set (v4i32 V128:$dst), (OpNode V128:$Rd,
(i32 imm:$shift)))]>;
// Modified-immediate form with a move_vec_shift operand. The upper three
// bits of `cmode` go to Inst{15-13}; the single-bit shift encoding takes
// Inst{12} (so cmode{0} is not used).
class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode,
RegisterOperand vectype, string asm,
string kind, list<dag> pattern>
: BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
(ins move_vec_shift:$shift),
"$shift", asm, kind, pattern> {
bits<1> shift;
let Inst{15-13} = cmode{3-1};
let Inst{12} = shift;
// Vector modified-immediate form without a shift operand: the optional
// shift dag is empty and the whole 4-bit `cmode` is written to Inst{15-12}.
class SIMDModifiedImmVectorNoShift<bit Q, bit op, bit op2, bits<4> cmode,
RegisterOperand vectype,
Operand imm_type, string asm,
string kind, list<dag> pattern>
: BaseSIMDModifiedImmVector<Q, op, op2, vectype, imm_type, (ins), "",
asm, kind, pattern> {
let Inst{15-12} = cmode;
// Scalar (FPR64 destination) modified-immediate form without a shift
// operand. The immediate operand type is simdimmtype10, `cmode` fills
// Inst{15-12}, op2 is fixed to 0, and decoding is delegated to
// DecodeModImmInstruction.
class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm,
list<dag> pattern>
: BaseSIMDModifiedImm<Q, op, 0, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm,
"\t$Rd, $imm8", "", pattern> {
let Inst{15-12} = cmode;
let DecoderMethod = "DecodeModImmInstruction";
// AdvSIMD indexed element
// Encoding template for an indexed-element ("by element") instruction:
// $Rd = op($Rn, $Rm[$idx]). The operand string has two assembler-variant
// spellings: the first appends the per-operand kind suffixes, the second
// puts `apple_kind` on the mnemonic instead.
//
// Q -> Inst{30}, U -> Inst{29}, Scalar -> Inst{28}, size -> Inst{23-22},
// opc -> Inst{15-12}. Bits 21 and 11 (and bit 20 where $Rm is restricted
// to 4 bits) hold the lane index and must be set by the derived class.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDIndexed<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
                      RegisterOperand dst_reg, RegisterOperand lhs_reg,
                      RegisterOperand rhs_reg, Operand vec_idx, string asm,
                      string apple_kind, string dst_kind, string lhs_kind,
                      string rhs_kind, list<dag> pattern>
  : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx),
      // The mnemonic must precede the operand string, exactly as in
      // BaseSIMDIndexedTied.
      asm,
      "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
      "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>,
    Sched<[WriteV]> {
  bits<5> Rd;
  bits<5> Rn;
  bits<5> Rm;

  let Inst{31}    = 0;
  let Inst{30}    = Q;
  let Inst{29}    = U;
  let Inst{28}    = Scalar;
  let Inst{27-24} = 0b1111;
  let Inst{23-22} = size;
  // Bit 21 must be set by the derived class.
  let Inst{20-16} = Rm;
  let Inst{15-12} = opc;
  // Bit 11 must be set by the derived class.
  let Inst{10}    = 0;
  let Inst{9-5}   = Rn;
  let Inst{4-0}   = Rd;
}
// Tied variant of the indexed-element template: $Rd is also an input
// (accumulator) and is constrained to the output via "$Rd = $dst".
// The encoding layout matches BaseSIMDIndexed; the lane-index bits
// (21 and 11) are left for the derived class.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
RegisterOperand dst_reg, RegisterOperand lhs_reg,
RegisterOperand rhs_reg, Operand vec_idx, string asm,
string apple_kind, string dst_kind, string lhs_kind,
string rhs_kind, list<dag> pattern>
: I<(outs dst_reg:$dst),
(ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm,
"{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
"|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28} = Scalar;
let Inst{27-24} = 0b1111;
let Inst{23-22} = size;
// Bit 21 must be set by the derived class.
let Inst{20-16} = Rm;
let Inst{15-12} = opc;
// Bit 11 must be set by the derived class.
let Inst{10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Armv8.6 BFloat16 Extension
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in {
// Vector BFloat16 dot product with accumulator: selected for
// int_aarch64_neon_bfdot(acc, Rn, Rm). The AsmString is overridden because
// the destination (`kind1`) and the sources (`kind2`) use different
// arrangement suffixes.
class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1,
string kind2, RegisterOperand RegType,
ValueType AccumType, ValueType InputType>
: BaseSIMDThreeSameVectorTied<Q, U, 0b010, 0b11111, RegType, asm, kind1, [(set (AccumType RegType:$dst),
(int_aarch64_neon_bfdot (AccumType RegType:$Rd),
(InputType RegType:$Rn),
(InputType RegType:$Rm)))]> {
let AsmString = !strconcat(asm,
"{\t$Rd" # kind1 # ", $Rn" # kind2 #
", $Rm" # kind2 # "}");
// 64-bit (.2s <- .4h) and 128-bit (.4s <- .8h) BFloat16 dot product forms.
// The bf16 inputs are typed as v8i8 / v16i8 in the selection pattern.
multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,
v2f32, v8i8>;
def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,
v4f32, v16i8>;
// Indexed (by-element) BFloat16 dot product with accumulator. The second
// source is one 32-bit lane of $Rm (a pair of bf16 values) broadcast with
// AArch64duplane32 and bit-converted to the input vector type.
//
// The lane index is a 2-bit value: idx{1} -> Inst{11} (H) and
// idx{0} -> Inst{21} (L). The instruction operand is declared as
// VectorIndexS, so the selection pattern must bind $idx with that same
// operand type (as BaseSIMDThreeSameVectorDotIndex does).
class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
                                      string dst_kind, string lhs_kind,
                                      string rhs_kind,
                                      RegisterOperand RegType,
                                      ValueType AccumType,
                                      ValueType InputType>
  : BaseSIMDIndexedTied<Q, U, 0b0, 0b01, 0b1111,
                        RegType, RegType, V128, VectorIndexS,
                        asm, "", dst_kind, lhs_kind, rhs_kind,
        [(set (AccumType RegType:$dst),
              (AccumType (int_aarch64_neon_bfdot
                                 (AccumType RegType:$Rd),
                                 (InputType RegType:$Rn),
                                 (InputType (bitconvert (AccumType
                                    (AArch64duplane32 (v4f32 V128:$Rm),
                                        VectorIndexS:$idx)))))))]> {
  bits<2> idx;
  let Inst{21}    = idx{0};  // L
  let Inst{11}    = idx{1};  // H
}
// 64-bit and 128-bit indexed BFloat16 dot product forms; the indexed source
// is printed with the ".2h" suffix in both.
multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {
def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",
".2h", V64, v2f32, v8i8>;
def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",
".2h", V128, v4f32, v16i8>;
// Vector BFloat16 multiply-accumulate-long into a v4f32 accumulator,
// selected for `OpNode`(acc, Rn, Rm) with the bf16 sources typed as v16i8.
// The AsmString is overridden so the sources print as ".8h" while the
// destination prints as ".4s".
class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
: BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
// Indexed (by-element) BFloat16 multiply-accumulate-long. $Rm is restricted
// to V128_lo and encoded in only four bits (Inst{19-16}); the 3-bit lane
// index is split as idx{2} -> Inst{11} and idx{1-0} -> Inst{21-20}.
// NOTE(review): the selection pattern below looks truncated - the index
// operand of AArch64duplane16 and the closing of the dag are not visible
// before the Sched<> line. Confirm against the complete file before editing.
class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>
: I<(outs V128:$dst),
(ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm,
"{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
[(set (v4f32 V128:$dst),
(v4f32 (OpNode (v4f32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 (bitconvert (v8bf16
(AArch64duplane16 (v8bf16 V128_lo:$Rm),
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<4> Rm;
bits<3> idx;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29-22} = 0b00111111;
let Inst{21-20} = idx{1-0};
let Inst{19-16} = Rm;
let Inst{15-12} = 0b1111;
let Inst{11} = idx{2}; // H
let Inst{10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// BFloat16 matrix multiply-accumulate into a v4f32 accumulator, selected for
// int_aarch64_neon_bfmmla with the bf16 sources typed as v16i8.
// The AsmString is overridden so the sources print as ".8h" while the
// destination prints as ".4s".
class SIMDThreeSameVectorBF16MatrixMul<string asm>
: BaseSIMDThreeSameVectorTied<1, 1, 0b010, 0b11101,
V128, asm, ".4s",
[(set (v4f32 V128:$dst),
(int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]> {
let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
", $Rm", ".8h", "}");
// "bfcvtn": narrows a v4f32 source to bf16, selected for
// int_aarch64_neon_bfcvtn with the result typed as v8bf16.
// NOTE(review): the `class <name>` header that this base-class list belongs
// to is not visible here - restore it from the complete file.
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
"bfcvtn", ".4h", ".4s",
[(set (v8bf16 V128:$Rd),
(int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
// "bfcvtn2": tied form selected for int_aarch64_neon_bfcvtn2, which takes
// the existing v8bf16 destination value and a v4f32 source.
// NOTE(review): the `class <name>` header that this base-class list belongs
// to is not visible here - restore it from the complete file.
: BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
"bfcvtn2", ".8h", ".4s",
[(set (v8bf16 V128:$dst),
(int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
// Scalar conversion selected for int_aarch64_neon_bfcvt. Despite the class
// name, the operands as written take an f32 source (FPR32) and produce a
// bf16 result (FPR16). Inst{31-10} is entirely fixed; scheduled as
// WriteFCvt rather than WriteV.
class BF16ToSinglePrecision<string asm>
: I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
[(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31-10} = 0b0001111001100011010000;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
} // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0
// Armv8.6 Matrix Multiply Extension
// Integer matrix multiply-accumulate into a v4i32 accumulator, selected for
// `OpNode`(acc, Rn, Rm) with v16i8 sources. `B` forms the low bit of the
// 5-bit opcode ({0b1010, B}). The AsmString is overridden so the sources
// print as ".16b" while the destination prints as ".4s".
class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNode>
: BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s",
[(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
(v16i8 V128:$Rn),
(v16i8 V128:$Rm)))]> {
let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}";
// ARMv8.2-A Dot Product Instructions (Indexed)
// Indexed (by-element) dot product with accumulator. The second source is
// one 32-bit lane of $Rm broadcast with AArch64duplane32 and bit-converted
// to the input vector type. `Mixed` forms the low bit of the 4-bit opcode
// ({0b111, Mixed}). The 2-bit lane index is split as idx{1} -> Inst{11}
// and idx{0} -> Inst{21}.
class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, bit Mixed, bits<2> size, string asm,
string dst_kind, string lhs_kind, string rhs_kind,
RegisterOperand RegType,
ValueType AccumType, ValueType InputType,
SDPatternOperator OpNode> :
BaseSIMDIndexedTied<Q, U, 0b0, size, {0b111, Mixed}, RegType, RegType, V128,
VectorIndexS, asm, "", dst_kind, lhs_kind, rhs_kind,
[(set (AccumType RegType:$dst),
(AccumType (OpNode (AccumType RegType:$Rd),
(InputType RegType:$Rn),
(InputType (bitconvert (AccumType
(AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx)))))))]> {
bits<2> idx;
let Inst{21} = idx{0}; // L
let Inst{11} = idx{1}; // H
// 64-bit (.2s <- .8b) and 128-bit (.4s <- .16b) indexed dot product forms;
// the indexed source is printed with the ".4b" suffix in both.
multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string asm,
SDPatternOperator OpNode> {
def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b",
V64, v2i32, v8i8, OpNode>;
def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b",
V128, v4i32, v16i8, OpNode>;
// ARMv8.2-A Fused Multiply Add-Long Instructions (Indexed)
// Tied by-element format: the pattern accumulates into $Rd (result $dst) the
// OpNode of $Rn and a 16-bit lane of $Rm duplicated with AArch64duplane16.
//   Q, U, opc  - encoding fields forwarded to BaseSIMDIndexedTied.
//   asm        - mnemonic.
//   dst_kind, lhs_kind, rhs_kind - arrangement suffixes forwarded for printing.
//   RegType    - register operand class used for $Rd and $Rn.
//   AccumType  - value type of the accumulator/result.
//   InputType  - value type of the multiplied inputs.
//   OpNode     - selection node matched by the pattern.
class BaseSIMDThreeSameVectorFMLIndex<bit Q, bit U, bits<4> opc, string asm,
string dst_kind, string lhs_kind,
string rhs_kind, RegisterOperand RegType,
ValueType AccumType, ValueType InputType,
SDPatternOperator OpNode> :
BaseSIMDIndexedTied<Q, U, 0, 0b10, opc, RegType, RegType, V128,
VectorIndexH, asm, "", dst_kind, lhs_kind, rhs_kind,
[(set (AccumType RegType:$dst),
(AccumType (OpNode (AccumType RegType:$Rd),
(InputType RegType:$Rn),
(InputType (AArch64duplane16 (v8f16 V128:$Rm),
VectorIndexH:$idx)))))]> {
// idx = H:L:M
// The 3-bit lane index is scattered over bits 11, 21 and 20.
bits<3> idx;
let Inst{11} = idx{2}; // H
let Inst{21} = idx{1}; // L
let Inst{20} = idx{0}; // M
// Instantiates the indexed fused multiply-add-long format for the 64-bit
// (.2s from .2h) and 128-bit (.4s from .4h) arrangements; the indexed operand
// prints as ".h".
//   U, opc - encoding fields forwarded to the base class.
//   asm    - mnemonic.
//   OpNode - selection node for the accumulate pattern.
multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
// Q = 0: V64 registers, v2f32 accumulator, v4f16 inputs.
def v4f16 : BaseSIMDThreeSameVectorFMLIndex<0, U, opc, asm, ".2s", ".2h", ".h",
V64, v2f32, v4f16, OpNode>;
// Q = 1: V128 registers, v4f32 accumulator, v8f16 inputs.
def v8f16 : BaseSIMDThreeSameVectorFMLIndex<1, U, opc, asm, ".4s", ".4h", ".h",
V128, v4f32, v8f16, OpNode>;
// Floating-point by-element (indexed) instructions without an accumulator:
// vector forms for .4h/.8h/.2s/.4s/.2d and scalar forms for h/s/d. Each
// pattern applies OpNode to $Rn and the lane of $Rm selected by $idx
// (duplicated across the vector for vector forms, extracted for scalar forms).
//   U, opc - encoding fields forwarded to BaseSIMDIndexed.
//   asm    - mnemonic.
//   OpNode - two-operand selection node.
// Lane index encoding used below:
//   3-bit index: Inst{11}, Inst{21}, Inst{20} = idx{2}, idx{1}, idx{0}
//   2-bit index: Inst{11}, Inst{21} = idx{1}, idx{0}
//   1-bit index: Inst{11} = idx{0}, Inst{21} = 0
multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
// Half-precision vector forms are gated on NEON plus full FP16 support and
// take $Rm from V128_lo.
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc,
V64, V64,
V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4f16 V64:$Rd),
(OpNode (v4f16 V64:$Rn),
(v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc,
V128, V128,
V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8f16 V128:$Rd),
(OpNode (v8f16 V128:$Rn),
(v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
} // Predicates = [HasNEON, HasFullFP16]
// Single-precision vector forms: 2-bit lane index into a v4f32 $Rm.
def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
V64, V64,
V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2f32 V64:$Rd),
(OpNode (v2f32 V64:$Rn),
(v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4f32 V128:$Rd),
(OpNode (v4f32 V128:$Rn),
(v4f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// Double-precision vector form: 1-bit lane index; bit 21 is fixed to 0.
def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc,
V128, V128,
V128, VectorIndexD,
asm, ".2d", ".2d", ".2d", ".d",
[(set (v2f64 V128:$Rd),
(OpNode (v2f64 V128:$Rn),
(v2f64 (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> {
bits<1> idx;
let Inst{11} = idx{0};
let Inst{21} = 0;
// Scalar forms: the second operand is a single lane taken with
// vector_extract. The half-precision form is again FP16-gated.
let Predicates = [HasNEON, HasFullFP16] in {
def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc,
FPR16Op, FPR16Op, V128_lo, VectorIndexH,
asm, ".h", "", "", ".h",
[(set (f16 FPR16Op:$Rd),
(OpNode (f16 FPR16Op:$Rn),
(f16 (vector_extract (v8f16 V128_lo:$Rm),
VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
} // Predicates = [HasNEON, HasFullFP16]
def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (f32 FPR32Op:$Rd),
(OpNode (f32 FPR32Op:$Rn),
(f32 (vector_extract (v4f32 V128:$Rm),
VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc,
FPR64Op, FPR64Op, V128, VectorIndexD,
asm, ".d", "", "", ".d",
[(set (f64 FPR64Op:$Rd),
(OpNode (f64 FPR64Op:$Rn),
(f64 (vector_extract (v2f64 V128:$Rm),
VectorIndexD:$idx))))]> {
bits<1> idx;
let Inst{11} = idx{0};
let Inst{21} = 0;
// Selection patterns for the tied (accumulating) FP by-element instructions.
// Each pattern matches OpNode(accumulator $Rd, $Rn, <lane or dup of $Rm>) and
// selects the instruction record named INST # "<suffix>_indexed".
//   INST   - name prefix of the instruction records to select.
//   OpNode - three-operand selection node.
// A scalar that is DUP'ed into every lane is handled by placing it in a
// vector register with SUBREG_TO_REG and indexing lane 0.
// NOTE(review): in this copy the DUPLANE source patterns do not show the
// `$idx` operand that their result dags reference; the lines appear to be
// missing - confirm against upstream before relying on this text.
multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
let Predicates = [HasNEON, HasFullFP16] in {
// Patterns for f16: DUPLANE, DUP scalar and vector_extract.
def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
(AArch64duplane16 (v8f16 V128_lo:$Rm),
(!cast<Instruction>(INST # "v8i16_indexed")
V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>;
def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
(AArch64dup (f16 FPR16Op_lo:$Rm)))),
(!cast<Instruction>(INST # "v8i16_indexed") V128:$Rd, V128:$Rn,
(SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>;
def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
(AArch64duplane16 (v8f16 V128_lo:$Rm),
(!cast<Instruction>(INST # "v4i16_indexed")
V64:$Rd, V64:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>;
def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
(AArch64dup (f16 FPR16Op_lo:$Rm)))),
(!cast<Instruction>(INST # "v4i16_indexed") V64:$Rd, V64:$Rn,
(SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>;
def : Pat<(f16 (OpNode (f16 FPR16:$Rd), (f16 FPR16:$Rn),
(vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))),
(!cast<Instruction>(INST # "v1i16_indexed") FPR16:$Rd, FPR16:$Rn,
V128_lo:$Rm, VectorIndexH:$idx)>;
} // Predicates = [HasNEON, HasFullFP16]
// 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
(AArch64duplane32 (v4f32 V128:$Rm),
// The record-name suffix is pasted as a quoted string, like every other
// pattern in this multiclass.
(!cast<Instruction>(INST # "v2i32_indexed")
V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
(AArch64dup (f32 FPR32Op:$Rm)))),
(!cast<Instruction>(INST # "v2i32_indexed") V64:$Rd, V64:$Rn,
(SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
// 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar.
def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
(AArch64duplane32 (v4f32 V128:$Rm),
(!cast<Instruction>(INST # "v4i32_indexed")
V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
(AArch64dup (f32 FPR32Op:$Rm)))),
(!cast<Instruction>(INST # "v4i32_indexed") V128:$Rd, V128:$Rn,
(SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
// 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar.
def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
(AArch64duplane64 (v2f64 V128:$Rm),
// The .2d instruction indexes 64-bit lanes, so the index operand is
// VectorIndexD (matching the scalar f64 pattern at the end of this multiclass).
(!cast<Instruction>(INST # "v2i64_indexed")
V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexD:$idx)>;
def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
(AArch64dup (f64 FPR64Op:$Rm)))),
(!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn,
(SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
// 32-bit scalar version: lane extracted from a .4s vector.
def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
(vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
(!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
V128:$Rm, VectorIndexS:$idx)>;
// 64-bit scalar version: lane extracted from a .2d vector.
def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
(vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
(!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn,
V128:$Rm, VectorIndexD:$idx)>;
// Tied (accumulating) floating-point by-element instructions: vector forms
// for .4h/.8h/.2s/.4s/.2d and scalar forms for h/s/d. Every def here has an
// empty pattern list, so no selection patterns are attached in this
// multiclass itself.
//   U, opc - encoding fields forwarded to BaseSIMDIndexedTied.
//   asm    - mnemonic.
// Lane index encoding used below:
//   3-bit index: Inst{11}, Inst{21}, Inst{20} = idx{2}, idx{1}, idx{0}
//   2-bit index: Inst{11}, Inst{21} = idx{1}, idx{0}
//   1-bit index: Inst{11} = idx{0}, Inst{21} = 0
multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
// Half-precision vector forms: FP16-gated, $Rm from V128_lo.
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64,
V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h", []> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc,
V128, V128,
V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h", []> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
} // Predicates = [HasNEON, HasFullFP16]
// Single-precision vector forms.
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64,
V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s", []> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s", []> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// Double-precision vector form.
def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc,
V128, V128,
V128, VectorIndexD,
asm, ".2d", ".2d", ".2d", ".d", []> {
bits<1> idx;
let Inst{11} = idx{0};
let Inst{21} = 0;
// Scalar forms (h is FP16-gated).
let Predicates = [HasNEON, HasFullFP16] in {
def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc,
FPR16Op, FPR16Op, V128_lo, VectorIndexH,
asm, ".h", "", "", ".h", []> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
} // Predicates = [HasNEON, HasFullFP16]
def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s", []> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc,
FPR64Op, FPR64Op, V128, VectorIndexD,
asm, ".d", "", "", ".d", []> {
bits<1> idx;
let Inst{11} = idx{0};
let Inst{21} = 0;
// Selection patterns that map lane-style nodes onto the integer by-element
// instructions named NAME # "<suffix>_indexed".
//   OpNodeLane  - node whose indexed operand is a 64-bit vector; the operand
//                 is widened with SUBREG_TO_REG (dsub) before selection.
//   OpNodeLaneQ - node whose indexed operand is already a 128-bit vector.
// The lane index is passed through UImmS1XForm in every result dag.
// The record-name suffixes are pasted as quoted strings, matching the other
// !cast uses in this file.
// NOTE(review): in this copy the source dags do not show the `$idx` operand
// that the result dags reference; the lines appear to be missing - confirm
// against upstream.
// NOTE(review): the v2i32 OpNodeLane pattern binds $Rm as V64 but re-types it
// as V64_lo in the result dag, unlike its v4i32 sibling - confirm intent.
multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane,
SDPatternOperator OpNodeLaneQ> {
// 16-bit lanes: indexed operand restricted to the *_lo register classes.
def : Pat<(v4i16 (OpNodeLane
(v4i16 V64:$Rn), (v4i16 V64_lo:$Rm),
(!cast<Instruction>(NAME # "v4i16_indexed") $Rn,
(SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub),
(UImmS1XForm $idx))>;
def : Pat<(v4i16 (OpNodeLaneQ
(v4i16 V64:$Rn), (v8i16 V128_lo:$Rm),
(!cast<Instruction>(NAME # "v4i16_indexed") $Rn, $Rm,
(UImmS1XForm $idx))>;
def : Pat<(v8i16 (OpNodeLane
(v8i16 V128:$Rn), (v4i16 V64_lo:$Rm),
(!cast<Instruction>(NAME # "v8i16_indexed") $Rn,
(SUBREG_TO_REG (i32 0), $Rm, dsub),
(UImmS1XForm $idx))>;
def : Pat<(v8i16 (OpNodeLaneQ
(v8i16 V128:$Rn), (v8i16 V128_lo:$Rm),
(!cast<Instruction>(NAME # "v8i16_indexed") $Rn, $Rm,
(UImmS1XForm $idx))>;
// 32-bit lanes.
def : Pat<(v2i32 (OpNodeLane
(v2i32 V64:$Rn), (v2i32 V64:$Rm),
(!cast<Instruction>(NAME # "v2i32_indexed") $Rn,
(SUBREG_TO_REG (i32 0), (v2i32 V64_lo:$Rm), dsub),
(UImmS1XForm $idx))>;
def : Pat<(v2i32 (OpNodeLaneQ
(v2i32 V64:$Rn), (v4i32 V128:$Rm),
(!cast<Instruction>(NAME # "v2i32_indexed") $Rn, $Rm,
(UImmS1XForm $idx))>;
def : Pat<(v4i32 (OpNodeLane
(v4i32 V128:$Rn), (v2i32 V64:$Rm),
(!cast<Instruction>(NAME # "v4i32_indexed") $Rn,
(SUBREG_TO_REG (i32 0), $Rm, dsub),
(UImmS1XForm $idx))>;
def : Pat<(v4i32 (OpNodeLaneQ
(v4i32 V128:$Rn),
(v4i32 V128:$Rm),
(!cast<Instruction>(NAME # "v4i32_indexed") $Rn, $Rm,
(UImmS1XForm $idx))>;
// Integer by-element instructions on 16- and 32-bit elements, without an
// accumulator: vector forms (.4h/.8h/.2s/.4s) and scalar forms (h/s). Vector
// patterns apply OpNode to $Rn and a duplicated lane of $Rm.
//   U, opc - encoding fields forwarded to BaseSIMDIndexed.
//   asm    - mnemonic.
//   OpNode - two-operand selection node.
// 3-bit H index -> Inst{11}, Inst{21}, Inst{20}; 2-bit S index -> Inst{11},
// Inst{21}.
multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
// 16-bit element vector forms; $Rm comes from V128_lo.
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4i16 V64:$Rd),
(OpNode (v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8i16 V128:$Rd),
(OpNode (v8i16 V128:$Rn),
(v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
// 32-bit element vector forms.
def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
V64, V64,
V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2i32 V64:$Rd),
(OpNode (v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4i32 V128:$Rd),
(OpNode (v4i32 V128:$Rn),
(v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// Scalar forms. The 16-bit scalar has no selection pattern; the 32-bit
// scalar matches OpNode on an i32 lane taken with vector_extract.
def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
FPR16Op, FPR16Op, V128_lo, VectorIndexH,
asm, ".h", "", "", ".h", []> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (i32 FPR32Op:$Rd),
(OpNode FPR32Op:$Rn,
(i32 (vector_extract (v4i32 V128:$Rm),
VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// Vector-only integer by-element instructions on 16- and 32-bit elements
// (.4h/.8h/.2s/.4s), without an accumulator and without scalar forms. Each
// pattern applies OpNode to $Rn and a duplicated lane of $Rm.
//   U, opc - encoding fields forwarded to BaseSIMDIndexed.
//   asm    - mnemonic.
//   OpNode - two-operand selection node.
multiclass SIMDVectorIndexedHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
// 16-bit elements: 3-bit lane index, $Rm from V128_lo.
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
V64, V64,
V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4i16 V64:$Rd),
(OpNode (v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8i16 V128:$Rd),
(OpNode (v8i16 V128:$Rn),
(v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
// 32-bit elements: 2-bit lane index.
def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
V64, V64,
V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2i32 V64:$Rd),
(OpNode (v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4i32 V128:$Rd),
(OpNode (v4i32 V128:$Rn),
(v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// Tied (accumulating) vector integer by-element instructions on 16- and
// 32-bit elements (.4h/.8h/.2s/.4s). Each pattern computes
// OpNode($Rd, $Rn, duplicated lane of $Rm) and writes it to $dst.
//   U, opc - encoding fields forwarded to BaseSIMDIndexedTied.
//   asm    - mnemonic.
//   OpNode - three-operand selection node (accumulator first).
multiclass SIMDVectorIndexedHSTied<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
// 16-bit elements: 3-bit lane index, $Rm from V128_lo.
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64,
V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4i16 V64:$dst),
(OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
(v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
// 32-bit elements: 2-bit lane index.
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
V64, V64,
V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2i32 V64:$dst),
(OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
(v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// Widening ("long") integer by-element instructions: results are v4i32 (from
// 16-bit elements) or v2i64 (from 32-bit elements), plus scalar forms with no
// selection pattern.
//   U, opc - encoding fields forwarded to BaseSIMDIndexed.
//   asm    - mnemonic; the defs that read the high half append "2" to it.
//   OpNode - two-operand selection node.
multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
// .4s <- .4h: 64-bit $Rn, lane of $Rm duplicated as v4i16.
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
V128, V64,
V128_lo, VectorIndexH,
asm, ".4s", ".4s", ".4h", ".h",
[(set (v4i32 V128:$Rd),
(OpNode (v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
// "2" form: both operands are taken through extract_high_v8i16.
def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm#"2", ".4s", ".4s", ".8h", ".h",
[(set (v4i32 V128:$Rd),
(OpNode (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
// .2d <- .2s.
def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
V128, V64,
V128, VectorIndexS,
asm, ".2d", ".2d", ".2s", ".s",
[(set (v2i64 V128:$Rd),
(OpNode (v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// "2" form: both operands are taken through extract_high_v4i32.
def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm#"2", ".2d", ".2d", ".4s", ".s",
[(set (v2i64 V128:$Rd),
(OpNode (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// Scalar forms: 32-bit result from a 16-bit source, and 64-bit result from a
// 32-bit source. Both have empty pattern lists.
def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
FPR32Op, FPR16Op, V128_lo, VectorIndexH,
asm, ".h", "", "", ".h", []> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
FPR64Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s", []> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// Tied widening by-element instructions whose patterns combine an
// accumulator with int_aarch64_neon_sqdmull of $Rn and a lane of $Rm:
// vector forms (.4s<-.4h, .4s<-.8h "2", .2d<-.2s, .2d<-.4s "2") and scalar
// forms.
//   U, opc - encoding fields forwarded to BaseSIMDIndexedTied.
//   asm    - mnemonic; the high-half defs append "2" to it.
//   Accum  - node applied to the accumulator ($Rd) and the sqdmull result.
multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
SDPatternOperator Accum> {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
V128, V64,
V128_lo, VectorIndexH,
asm, ".4s", ".4s", ".4h", ".h",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqdmull
(v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
// FIXME: it would be nice to use the scalar (v1i32) instruction here, but an
// intermediate EXTRACT_SUBREG would be untyped.
// Scalar i32 accumulate expressed via lane 0 of the vector sqdmull; it
// selects the v4i16_indexed vector instruction with $Rd placed in a vector
// register through SUBREG_TO_REG (ssub).
// NOTE(review): this pattern looks truncated in this copy (the `$idx`
// operand of the source dag and the end of the result dag are not visible),
// and the record-name suffix is pasted unquoted unlike `NAME # "d"` elsewhere
// in the file - confirm against upstream.
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
(i32 (vector_extract (v4i32
(int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
(i64 0))))),
(!cast<Instruction>(NAME # v4i16_indexed)
(SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn,
V128_lo:$Rm, VectorIndexH:$idx),
// "2" form: $Rn is taken through extract_high_v8i16.
def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm#"2", ".4s", ".4s", ".8h", ".h",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqdmull
(extract_high_v8i16 V128:$Rn),
(AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
V128, V64,
V128, VectorIndexS,
asm, ".2d", ".2d", ".2s", ".s",
[(set (v2i64 V128:$dst),
(Accum (v2i64 V128:$Rd),
(v2i64 (int_aarch64_neon_sqdmull
(v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// "2" form: $Rn is taken through extract_high_v4i32.
def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm#"2", ".2d", ".2d", ".4s", ".s",
[(set (v2i64 V128:$dst),
(Accum (v2i64 V128:$Rd),
(v2i64 (int_aarch64_neon_sqdmull
(extract_high_v4i32 V128:$Rn),
(AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// Scalar forms. The 32-bit-result form has no pattern; the 64-bit-result
// form matches Accum over int_aarch64_neon_sqdmulls_scalar.
def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
FPR32Op, FPR16Op, V128_lo, VectorIndexH,
asm, ".h", "", "", ".h", []> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
FPR64Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (i64 FPR64Op:$dst),
(Accum (i64 FPR64Op:$Rd),
(i64 (int_aarch64_neon_sqdmulls_scalar
(i32 FPR32Op:$Rn),
(i32 (vector_extract (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// Vector-only widening by-element instructions (.4s<-.4h, .4s<-.8h "2",
// .2d<-.2s, .2d<-.4s "2"), without an accumulator. All defs are explicitly
// marked as not loading, not storing and having no side effects.
//   U, opc - encoding fields forwarded to BaseSIMDIndexed.
//   asm    - mnemonic; the high-half defs append "2" to it.
//   OpNode - two-operand selection node.
multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
V128, V64,
V128_lo, VectorIndexH,
asm, ".4s", ".4s", ".4h", ".h",
[(set (v4i32 V128:$Rd),
(OpNode (v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
// "2" form: both operands are taken through extract_high_v8i16.
def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm#"2", ".4s", ".4s", ".8h", ".h",
[(set (v4i32 V128:$Rd),
(OpNode (extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
V128, V64,
V128, VectorIndexS,
asm, ".2d", ".2d", ".2s", ".s",
[(set (v2i64 V128:$Rd),
(OpNode (v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// "2" form: both operands are taken through extract_high_v4i32.
def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm#"2", ".2d", ".2d", ".4s", ".s",
[(set (v2i64 V128:$Rd),
(OpNode (extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// Tied (accumulating) vector-only widening by-element instructions
// (.4s<-.4h, .4s<-.8h "2", .2d<-.2s, .2d<-.4s "2"). Each pattern computes
// OpNode($Rd, $Rn-part, lane of $Rm) into $dst. All defs are explicitly
// marked as not loading, not storing and having no side effects.
//   U, opc - encoding fields forwarded to BaseSIMDIndexedTied.
//   asm    - mnemonic; the high-half defs append "2" to it.
//   OpNode - three-operand selection node (accumulator first).
multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
V128, V64,
V128_lo, VectorIndexH,
asm, ".4s", ".4s", ".4h", ".h",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
// "2" form: the multiplied operands are taken through extract_high_v8i16.
def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
asm#"2", ".4s", ".4s", ".8h", ".h",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd),
(extract_high_v8i16 V128:$Rn),
(extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
V128, V64,
V128, VectorIndexS,
asm, ".2d", ".2d", ".2s", ".s",
[(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// "2" form: the multiplied operands are taken through extract_high_v4i32.
def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
V128, V128,
V128, VectorIndexS,
asm#"2", ".2d", ".2d", ".4s", ".s",
[(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd),
(extract_high_v4i32 V128:$Rn),
(extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// AdvSIMD scalar shift by immediate
// Base format for scalar shift-by-immediate instructions: one register
// result ($Rd), one register source ($Rn) and an immediate ($imm), printed
// as "$Rd, $Rn, $imm". Marked as not loading, not storing and side-effect
// free.
//   U         - value placed in bit 29.
//   opc       - 5-bit opcode placed in bits 15-11.
//   fixed_imm - 7-bit template for bits 22-16; users pass `?` for the bits
//               that they later fill from $imm.
//   regtype1  - register class of the result.
//   regtype2  - register class of the source.
//   immtype   - operand type of the shift amount.
//   asm       - mnemonic.
//   pattern   - selection pattern list (may be empty).
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDScalarShift<bit U, bits<5> opc, bits<7> fixed_imm,
RegisterClass regtype1, RegisterClass regtype2,
Operand immtype, string asm, list<dag> pattern>
: I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm),
asm, "\t$Rd, $Rn, $imm", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<7> imm;
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-23} = 0b111110;
let Inst{22-16} = fixed_imm;
let Inst{15-11} = opc;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Tied variant of the scalar shift-by-immediate format: $Rd is also an input
// and is constrained to the result with "$Rd = $dst". The bit layout is the
// same as in BaseSIMDScalarShift.
//   U         - value placed in bit 29.
//   opc       - 5-bit opcode placed in bits 15-11.
//   fixed_imm - 7-bit template for bits 22-16.
//   regtype1  - register class of the result and of the tied input $Rd.
//   regtype2  - register class of the source $Rn.
//   immtype   - operand type of the shift amount.
//   asm       - mnemonic.
//   pattern   - selection pattern list (may be empty).
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm,
RegisterClass regtype1, RegisterClass regtype2,
Operand immtype, string asm, list<dag> pattern>
: I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm),
asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<7> imm;
let Inst{31-30} = 0b01;
let Inst{29} = U;
let Inst{28-23} = 0b111110;
let Inst{22-16} = fixed_imm;
let Inst{15-11} = opc;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Scalar right-shift-immediate instructions on FP registers (h, s and d
// forms), all without selection patterns. The leading fixed bits of the
// 7-bit immediate field select the element size; the remaining `?` bits are
// filled from $imm.
//   U, opc - encoding fields forwarded to BaseSIMDScalarShift.
//   asm    - mnemonic.
multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> {
// The 16-bit form is gated on NEON plus full FP16 support.
let Predicates = [HasNEON, HasFullFP16] in {
def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
FPR16, FPR16, vecshiftR16, asm, []> {
let Inst{19-16} = imm{3-0};
} // Predicates = [HasNEON, HasFullFP16]
def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
FPR32, FPR32, vecshiftR32, asm, []> {
let Inst{20-16} = imm{4-0};
def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftR64, asm, []> {
let Inst{21-16} = imm{5-0};
// 64-bit scalar right shift by immediate. The instruction's own pattern
// matches OpNode on i64; the extra Pat selects the same instruction when the
// value is typed v1i64.
//   U, opc - encoding fields forwarded to BaseSIMDScalarShift.
//   asm    - mnemonic.
//   OpNode - (value, i32 shift amount) selection node.
multiclass SIMDScalarRShiftD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftR64, asm,
[(set (i64 FPR64:$Rd),
(OpNode (i64 FPR64:$Rn), (i32 vecshiftR64:$imm)))]> {
let Inst{21-16} = imm{5-0};
def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))),
(!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>;
// Tied (accumulating) 64-bit scalar right shift by immediate. The
// instruction's own pattern matches OpNode($Rd, $Rn, imm) on i64; the extra
// Pat covers the v1i64-typed form.
//   U, opc - encoding fields forwarded to BaseSIMDScalarShiftTied.
//   asm    - mnemonic.
//   OpNode - three-operand selection node; defaults to null_frag.
// NOTE(review): the trailing Pat appears cut off in this copy (its result
// dag is not closed) - confirm against upstream.
multiclass SIMDScalarRShiftDTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftR64, asm,
[(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn),
(i32 vecshiftR64:$imm)))]> {
let Inst{21-16} = imm{5-0};
def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
(i32 vecshiftR64:$imm))),
(!cast<Instruction>(NAME # "d") FPR64:$Rd, FPR64:$Rn,
// 64-bit scalar left shift by immediate. The pattern matches OpNode on a
// v1i64 value with an i32 shift amount of type vecshiftL64.
//   U, opc - encoding fields forwarded to BaseSIMDScalarShift.
//   asm    - mnemonic.
//   OpNode - (value, shift amount) selection node.
multiclass SIMDScalarLShiftD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftL64, asm,
[(set (v1i64 FPR64:$Rd),
(OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
let Inst{21-16} = imm{5-0};
// Tied 64-bit scalar left shift by immediate, with no selection pattern.
// Marked as not loading, not storing and side-effect free.
//   U, opc - encoding fields forwarded to BaseSIMDScalarShiftTied.
//   asm    - mnemonic.
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
multiclass SIMDScalarLShiftDTied<bit U, bits<5> opc, string asm> {
def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftL64, asm, []> {
let Inst{21-16} = imm{5-0};
// Narrowing scalar right shift by immediate: each form reads a register one
// size larger than the one it writes (b<-h, h<-s, s<-d). Only the s form
// carries a selection pattern (i32 result from an i64 source).
//   U, opc - encoding fields forwarded to BaseSIMDScalarShift.
//   asm    - mnemonic.
//   OpNode - selection node for the s form; defaults to null_frag.
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
multiclass SIMDScalarRShiftBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
FPR8, FPR16, vecshiftR8, asm, []> {
let Inst{18-16} = imm{2-0};
def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
FPR16, FPR32, vecshiftR16, asm, []> {
let Inst{19-16} = imm{3-0};
def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
FPR32, FPR64, vecshiftR32, asm,
[(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn), vecshiftR32:$imm))]> {
let Inst{20-16} = imm{4-0};
// Scalar left shift by immediate for all four element sizes (b, h, s, d).
// The b and h forms have no pattern; s and d match OpNode on i32/i64, and an
// extra Pat maps the v1i64-typed form onto the d instruction.
//   U, opc - encoding fields forwarded to BaseSIMDScalarShift.
//   asm    - mnemonic.
//   OpNode - (value, i32 shift amount) selection node.
multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
FPR8, FPR8, vecshiftL8, asm, []> {
let Inst{18-16} = imm{2-0};
def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
FPR16, FPR16, vecshiftL16, asm, []> {
let Inst{19-16} = imm{3-0};
def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
FPR32, FPR32, vecshiftL32, asm,
[(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn), (i32 vecshiftL32:$imm)))]> {
let Inst{20-16} = imm{4-0};
def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftL64, asm,
[(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
let Inst{21-16} = imm{5-0};
def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))),
(!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>;
// Scalar right shift by immediate for all four element sizes (b, h, s, d),
// same-size source and result, none with a selection pattern.
//   U, opc - encoding fields forwarded to BaseSIMDScalarShift.
//   asm    - mnemonic.
multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> {
def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
FPR8, FPR8, vecshiftR8, asm, []> {
let Inst{18-16} = imm{2-0};
def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
FPR16, FPR16, vecshiftR16, asm, []> {
let Inst{19-16} = imm{3-0};
def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
FPR32, FPR32, vecshiftR32, asm, []> {
let Inst{20-16} = imm{4-0};
def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
FPR64, FPR64, vecshiftR64, asm, []> {
let Inst{21-16} = imm{5-0};
// AdvSIMD vector shift by immediate
// Base format for vector shift-by-immediate instructions: one vector result
// ($Rd), one vector source ($Rn) and an immediate ($imm). Two assembly
// syntaxes are provided via the "{...|...}" alternation: arrangement
// suffixes on the registers, or the destination arrangement on the mnemonic.
// Marked as not loading, not storing and side-effect free.
//   Q         - value placed in bit 30.
//   U         - value placed in bit 29.
//   opc       - 5-bit opcode placed in bits 15-11.
//   fixed_imm - 7-bit template for bits 22-16; users fill the `?` bits
//               from their own `imm` field.
//   dst_reg, src_reg   - register operand classes of result and source.
//   immtype            - operand type of the shift amount.
//   asm                - mnemonic.
//   dst_kind, src_kind - arrangement suffixes used when printing.
//   pattern            - selection pattern list (may be empty).
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDVectorShift<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
RegisterOperand dst_reg, RegisterOperand src_reg,
Operand immtype,
string asm, string dst_kind, string src_kind,
list<dag> pattern>
: I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm),
asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
"|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-23} = 0b011110;
let Inst{22-16} = fixed_imm;
let Inst{15-11} = opc;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Tied variant of the vector shift-by-immediate format: $Rd is also an input
// and is constrained to the result with "$Rd = $dst". The bit layout and
// assembly strings are the same as in BaseSIMDVectorShift.
//   Q         - value placed in bit 30.
//   U         - value placed in bit 29.
//   opc       - 5-bit opcode placed in bits 15-11.
//   fixed_imm - 7-bit template for bits 22-16.
//   vectype1  - register operand class of the result and the tied input $Rd.
//   vectype2  - register operand class of the source $Rn.
//   immtype   - operand type of the shift amount.
//   asm       - mnemonic.
//   dst_kind, src_kind - arrangement suffixes used when printing.
//   pattern   - selection pattern list (may be empty).
let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
RegisterOperand vectype1, RegisterOperand vectype2,
Operand immtype,
string asm, string dst_kind, string src_kind,
list<dag> pattern>
: I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm),
asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
"|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-23} = 0b011110;
let Inst{22-16} = fixed_imm;
let Inst{15-11} = opc;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Vector right-shift-immediate instructions whose patterns take a
// floating-point vector source and produce an integer vector result of the
// same element width (.4h/.8h/.2s/.4s/.2d).
//   U, opc - encoding fields forwarded to BaseSIMDVectorShift.
//   asm    - mnemonic.
//   OpNode - intrinsic taking (FP vector, i32 immediate).
// Each def declares its own `imm` field sized to the free (`?`) bits of the
// 7-bit immediate template and writes it into the low bits of Inst{22-16}.
multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
Intrinsic OpNode> {
// Half-precision forms are gated on NEON plus full FP16 support.
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftR16,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftR16,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
} // Predicates = [HasNEON, HasFullFP16]
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftR32,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
V128, V128, vecshiftR64,
asm, ".2d", ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> {
bits<6> imm;
let Inst{21-16} = imm;
// Vector right-shift-immediate instructions whose patterns take an integer
// vector source and produce a floating-point vector result of the same
// element width (.4h/.8h/.2s/.4s/.2d) - the mirror image of
// SIMDVectorRShiftSD.
//   U, opc - encoding fields forwarded to BaseSIMDVectorShift.
//   asm    - mnemonic.
//   OpNode - intrinsic taking (integer vector, i32 immediate).
multiclass SIMDVectorRShiftToFP<bit U, bits<5> opc, string asm,
Intrinsic OpNode> {
// Half-precision forms are gated on NEON plus full FP16 support.
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftR16,
asm, ".4h", ".4h",
[(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftR16,
asm, ".8h", ".8h",
[(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
} // Predicates = [HasNEON, HasFullFP16]
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32,
asm, ".2s", ".2s",
[(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftR32,
asm, ".4s", ".4s",
[(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
V128, V128, vecshiftR64,
asm, ".2d", ".2d",
[(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> {
bits<6> imm;
let Inst{21-16} = imm;
// Narrowing vector right shift by immediate. For each element size there is
// a plain form that reads a 128-bit source and writes a 64-bit result
// (.8b<-.8h, .4h<-.4s, .2s<-.2d), and a tied "2" form (mnemonic asm#"2")
// with no inline pattern and hasSideEffects cleared.
//   U, opc - encoding fields forwarded to the base formats.
//   asm    - mnemonic.
//   OpNode - (wide vector, shift amount) selection node.
multiclass SIMDVectorRShiftNarrowBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
V64, V128, vecshiftR16Narrow,
asm, ".8b", ".8h",
[(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> {
bits<3> imm;
let Inst{18-16} = imm;
def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
V128, V128, vecshiftR16Narrow,
asm#"2", ".16b", ".8h", []> {
bits<3> imm;
let Inst{18-16} = imm;
let hasSideEffects = 0;
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V64, V128, vecshiftR32Narrow,
asm, ".4h", ".4s",
[(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> {
bits<4> imm;
let Inst{19-16} = imm;
def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftR32Narrow,
asm#"2", ".8h", ".4s", []> {
bits<4> imm;
let Inst{19-16} = imm;
let hasSideEffects = 0;
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V128, vecshiftR64Narrow,
asm, ".2s", ".2d",
[(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> {
bits<5> imm;
let Inst{20-16} = imm;
def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftR64Narrow,
asm#"2", ".4s", ".2d", []> {
bits<5> imm;
let Inst{20-16} = imm;
let hasSideEffects = 0;
// TableGen doesn't like patterns w/ INSERT_SUBREG on the instructions
// themselves, so put them here instead.
// Patterns involving what's effectively an insert high and a normal
// intrinsic, represented by CONCAT_VECTORS.
// NOTE(review): the three Pats below look truncated in this copy - the
// shift-amount operand of the source dags and the $Rd operand of the tied
// result instructions are not visible. Confirm against upstream.
def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn),
(!cast<Instruction>(NAME # "v16i8_shift")
V128:$Rn, vecshiftR16Narrow:$imm)>;
def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn),
(!cast<Instruction>(NAME # "v8i16_shift")
V128:$Rn, vecshiftR32Narrow:$imm)>;
def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn),
(!cast<Instruction>(NAME # "v4i32_shift")
V128:$Rn, vecshiftR64Narrow:$imm)>;
// Vector left-shift-by-immediate for all seven integer arrangements
// (8b/16b/4h/8h/2s/4s/2d). Source and result have the same vector type; the
// shift amount uses the vecshiftL<N> operand for the lane width N.
//   U, opc - forwarded unchanged to BaseSIMDVectorShift.
//   asm    - mnemonic string.
//   OpNode - operator matched as (OpNode <vector>, (i32 imm)).
// Immediate widths are 3/4/5/6 bits for 8/16/32/64-bit lanes, copied into
// Inst{18-16}, Inst{19-16}, Inst{20-16} and Inst{21-16} respectively.
multiclass SIMDVectorLShiftBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
// 8-bit lanes, V64.
def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
V64, V64, vecshiftL8,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
(i32 vecshiftL8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
// 8-bit lanes, V128.
def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
V128, V128, vecshiftL8,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
(i32 vecshiftL8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
// 16-bit lanes, V64.
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftL16,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
(i32 vecshiftL16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
// 16-bit lanes, V128.
def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftL16,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
(i32 vecshiftL16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
// 32-bit lanes, V64.
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftL32,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
(i32 vecshiftL32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
// 32-bit lanes, V128.
def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftL32,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
(i32 vecshiftL32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
// 64-bit lanes, V128.
def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
V128, V128, vecshiftL64,
asm, ".2d", ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
(i32 vecshiftL64:$imm)))]> {
bits<6> imm;
let Inst{21-16} = imm;
// Vector right-shift-by-immediate for all seven integer arrangements
// (8b/16b/4h/8h/2s/4s/2d). Same shape as the left-shift multiclass, but the
// shift amount uses the vecshiftR<N> operand for the lane width N.
//   U, opc - forwarded unchanged to BaseSIMDVectorShift.
//   asm    - mnemonic string.
//   OpNode - operator matched as (OpNode <vector>, (i32 imm)).
// Immediate widths are 3/4/5/6 bits for 8/16/32/64-bit lanes, copied into
// Inst{18-16}, Inst{19-16}, Inst{20-16} and Inst{21-16} respectively.
multiclass SIMDVectorRShiftBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
// 8-bit lanes, V64.
def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
V64, V64, vecshiftR8,
asm, ".8b", ".8b",
[(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
(i32 vecshiftR8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
// 8-bit lanes, V128.
def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
V128, V128, vecshiftR8,
asm, ".16b", ".16b",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
(i32 vecshiftR8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
// 16-bit lanes, V64.
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftR16,
asm, ".4h", ".4h",
[(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
(i32 vecshiftR16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
// 16-bit lanes, V128.
def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftR16,
asm, ".8h", ".8h",
[(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
(i32 vecshiftR16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
// 32-bit lanes, V64.
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32,
asm, ".2s", ".2s",
[(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
(i32 vecshiftR32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
// 32-bit lanes, V128.
def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftR32,
asm, ".4s", ".4s",
[(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
(i32 vecshiftR32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
// 64-bit lanes, V128.
def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
V128, V128, vecshiftR64,
asm, ".2d", ".2d",
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
(i32 vecshiftR64:$imm)))]> {
bits<6> imm;
let Inst{21-16} = imm;
// Tied (read-modify-write) vector right-shift-by-immediate for all seven
// integer arrangements. Each pattern feeds the current destination value
// ($Rd) to OpNode together with the source ($Rn) and the immediate, and
// defines $dst.
//   U, opc - forwarded unchanged to BaseSIMDVectorShiftTied.
//   asm    - mnemonic string.
//   OpNode - three-operand operator; defaults to null_frag.
// The enclosing `let` marks every def as not loading, not storing and free of
// side effects.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDVectorRShiftBHSDTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
// 8-bit lanes, V64; 3-bit immediate in Inst{18-16}.
def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
V64, V64, vecshiftR8, asm, ".8b", ".8b",
[(set (v8i8 V64:$dst),
(OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
(i32 vecshiftR8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
// 8-bit lanes, V128; 3-bit immediate in Inst{18-16}.
def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
V128, V128, vecshiftR8, asm, ".16b", ".16b",
[(set (v16i8 V128:$dst),
(OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
(i32 vecshiftR8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
// 16-bit lanes, V64; 4-bit immediate in Inst{19-16}.
def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftR16, asm, ".4h", ".4h",
[(set (v4i16 V64:$dst),
(OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
(i32 vecshiftR16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
// 16-bit lanes, V128; 4-bit immediate in Inst{19-16}.
def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftR16, asm, ".8h", ".8h",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
(i32 vecshiftR16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
// 32-bit lanes, V64; 5-bit immediate in Inst{20-16}.
def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftR32, asm, ".2s", ".2s",
[(set (v2i32 V64:$dst),
(OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
(i32 vecshiftR32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
// 32-bit lanes, V128; 5-bit immediate in Inst{20-16}.
def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftR32, asm, ".4s", ".4s",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
(i32 vecshiftR32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
// 64-bit lanes, V128; 6-bit immediate in Inst{21-16}.
def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
V128, V128, vecshiftR64,
asm, ".2d", ".2d", [(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
(i32 vecshiftR64:$imm)))]> {
bits<6> imm;
let Inst{21-16} = imm;
// Tied (read-modify-write) vector left-shift-by-immediate for all seven
// integer arrangements. Each pattern feeds the current destination value
// ($Rd) to OpNode together with the source ($Rn) and the immediate, and
// defines $dst. The shift amount uses the vecshiftL<N> operands.
//   U, opc - forwarded unchanged to BaseSIMDVectorShiftTied.
//   asm    - mnemonic string.
//   OpNode - three-operand operator; defaults to null_frag.
multiclass SIMDVectorLShiftBHSDTied<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
// 8-bit lanes, V64; 3-bit immediate in Inst{18-16}.
def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
V64, V64, vecshiftL8,
asm, ".8b", ".8b",
[(set (v8i8 V64:$dst),
(OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
(i32 vecshiftL8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
// 8-bit lanes, V128; 3-bit immediate in Inst{18-16}.
def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
V128, V128, vecshiftL8,
asm, ".16b", ".16b",
[(set (v16i8 V128:$dst),
(OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
(i32 vecshiftL8:$imm)))]> {
bits<3> imm;
let Inst{18-16} = imm;
// 16-bit lanes, V64; 4-bit immediate in Inst{19-16}.
def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
V64, V64, vecshiftL16,
asm, ".4h", ".4h",
[(set (v4i16 V64:$dst),
(OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
(i32 vecshiftL16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
// 16-bit lanes, V128; 4-bit immediate in Inst{19-16}.
def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftL16,
asm, ".8h", ".8h",
[(set (v8i16 V128:$dst),
(OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
(i32 vecshiftL16:$imm)))]> {
bits<4> imm;
let Inst{19-16} = imm;
// 32-bit lanes, V64; 5-bit immediate in Inst{20-16}.
def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
V64, V64, vecshiftL32,
asm, ".2s", ".2s",
[(set (v2i32 V64:$dst),
(OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
(i32 vecshiftL32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
// 32-bit lanes, V128; 5-bit immediate in Inst{20-16}.
def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftL32,
asm, ".4s", ".4s",
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
(i32 vecshiftL32:$imm)))]> {
bits<5> imm;
let Inst{20-16} = imm;
// 64-bit lanes, V128; 6-bit immediate in Inst{21-16}.
def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
V128, V128, vecshiftL64,
asm, ".2d", ".2d",
[(set (v2i64 V128:$dst),
(OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
(i32 vecshiftL64:$imm)))]> {
bits<6> imm;
let Inst{21-16} = imm;
// Widening ("long") vector left shifts: the result is a V128 vector whose
// lanes are twice as wide as the source lanes (i8->i16, i16->i32, i32->i64).
//   U, opc - forwarded unchanged to BaseSIMDVectorShift.
//   asm    - mnemonic; the second-half forms use asm#"2".
//   OpNode - operator matched as (OpNode <narrow vector>, imm).
// The plain forms read a V64 source. The "2" forms read a V128 source and
// apply OpNode to extract_high_<type> of it.
multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
// v8i8 (V64) -> v8i16; 3-bit immediate in Inst{18-16}.
def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
V128, V64, vecshiftL8, asm, ".8h", ".8b",
[(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> {
bits<3> imm;
let Inst{18-16} = imm;
// High half of v16i8 (V128) -> v8i16; 3-bit immediate in Inst{18-16}.
def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
V128, V128, vecshiftL8,
asm#"2", ".8h", ".16b",
[(set (v8i16 V128:$Rd),
(OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> {
bits<3> imm;
let Inst{18-16} = imm;
// v4i16 (V64) -> v4i32; 4-bit immediate in Inst{19-16}.
def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
V128, V64, vecshiftL16, asm, ".4s", ".4h",
[(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> {
bits<4> imm;
let Inst{19-16} = imm;
// High half of v8i16 (V128) -> v4i32; 4-bit immediate in Inst{19-16}.
def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
V128, V128, vecshiftL16,
asm#"2", ".4s", ".8h",
[(set (v4i32 V128:$Rd),
(OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> {
bits<4> imm;
let Inst{19-16} = imm;
// v2i32 (V64) -> v2i64; 5-bit immediate in Inst{20-16}.
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
V128, V64, vecshiftL32, asm, ".2d", ".2s",
[(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> {
bits<5> imm;
let Inst{20-16} = imm;
// High half of v4i32 (V128) -> v2i64; 5-bit immediate in Inst{20-16}.
def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
V128, V128, vecshiftL32,
asm#"2", ".2d", ".4s",
[(set (v2i64 V128:$Rd),
(OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> {
bits<5> imm;
let Inst{20-16} = imm;
// Vector load/store
// SIMD ldX/stX no-index memory references don't allow the optional
// ", #0" constant and handle post-indexing explicitly, so we use
// a more specialized parse method for them. Otherwise, it's the same as
// the general GPR64sp handling.
// Encoding base for the vector-list load/store forms without writeback
// (assembly operands "$Vt, [$Rn]").
//   Q       - Inst{30}; the users in this file pass 1 for the 128-bit
//             arrangements and 0 for the 64-bit ones.
//   L       - Inst{22}; the load multiclasses pass 1, the store ones 0.
//   opcode  - Inst{15-12}.
//   size    - Inst{11-10}.
//   asm, oops, iops, pattern - forwarded to the generic instruction class I.
// Inst{21-16} is fixed to zero in this form.
class BaseSIMDLdSt<bit Q, bit L, bits<4> opcode, bits<2> size,
string asm, dag oops, dag iops, list<dag> pattern>
: I<oops, iops, asm, "\t$Vt, [$Rn]", "", pattern> {
bits<5> Vt;
bits<5> Rn;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29-23} = 0b0011000;
let Inst{22} = L;
let Inst{21-16} = 0b000000;
let Inst{15-12} = opcode;
let Inst{11-10} = size;
let Inst{9-5} = Rn;
let Inst{4-0} = Vt;
// Encoding base for the post-indexed vector-list load/store forms (assembly
// operands "$Vt, [$Rn], $Xm"). The constraint "$Rn = $wback" ties the base
// register input to the written-back output; no selection pattern is given.
//   Q       - Inst{30}.
//   L       - Inst{22}; the load multiclasses pass 1, the store ones 0.
//   opcode  - Inst{15-12}.
//   size    - Inst{11-10}.
//   asm, oops, iops - forwarded to the generic instruction class I.
// Compared with the non-writeback base, Inst{29-23} has its low bit set
// (0b0011001), Inst{21} is zero and Inst{20-16} carries the Xm register.
class BaseSIMDLdStPost<bit Q, bit L, bits<4> opcode, bits<2> size,
string asm, dag oops, dag iops>
: I<oops, iops, asm, "\t$Vt, [$Rn], $Xm", "$Rn = $wback", []> {
bits<5> Vt;
bits<5> Rn;
bits<5> Xm;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29-23} = 0b0011001;
let Inst{22} = L;
let Inst{21} = 0;
let Inst{20-16} = Xm;
let Inst{15-12} = opcode;
let Inst{11-10} = size;
let Inst{9-5} = Rn;
let Inst{4-0} = Vt;
// The immediate form of AdvSIMD post-indexed addressing is encoded with
// register post-index addressing from the zero register.
// Emits the four assembler aliases for one vector-list load/store layout.
//   BaseName - instruction name prefix; the target instruction is looked up
//              as BaseName # Count # "v" # layout (plus "_POST").
//   asm      - mnemonic.
//   layout   - arrangement suffix such as "8b" or "4s".
//   Count    - register-count word used in the record names ("One", "Two"...).
//   Offset   - post-index immediate accepted by the "#imm" spellings; also
//              selects the GPR64pi<Offset> register operand.
//   Size     - 64 or 128; selects the untyped VecList<Count><Size> operand
//              used when the layout is written on the mnemonic.
// NOTE(review): the trailing 1/0 on each InstAlias is presumably the emit
// priority from Target.td (0 = parse-only, never printed) - confirm there.
multiclass SIMDLdStAliases<string BaseName, string asm, string layout, string Count,
int Offset, int Size> {
// E.g. "ld1 { v0.8b, v1.8b }, [x1], #16"
// "ld1\t$Vt, [$Rn], #16"
// may get mapped to
// (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
(!cast<Instruction>(BaseName # Count # "v" # layout # "_POST")
!cast<RegisterOperand>("VecList" # Count # layout):$Vt,
XZR), 1>;
// E.g. "ld1.8b { v0, v1 }, [x1], #16"
// "ld1.8b\t$Vt, [$Rn], #16"
// may get mapped to
// (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
(!cast<Instruction>(BaseName # Count # "v" # layout # "_POST")
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
XZR), 0>;
// E.g. "ld1.8b { v0, v1 }, [x1]"
// "ld1.8b\t$Vt, [$Rn]"
// may get mapped to
// (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
(!cast<Instruction>(BaseName # Count # "v" # layout)
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
GPR64sp:$Rn), 0>;
// E.g. "ld1.8b { v0, v1 }, [x1], x2"
// "ld1.8b\t$Vt, [$Rn], $Xm"
// may get mapped to
// (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi16:$Xm)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
(!cast<Instruction>(BaseName # Count # "v" # layout # "_POST")
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
// Defines the vector-list LOAD instructions for one register count: seven
// non-writeback records, seven post-indexed "_POST" records, and the assembler
// aliases for each layout.
//   BaseName  - name prefix handed to SIMDLdStAliases for instruction lookup.
//   Count     - register-count word ("One", "Two", ...), used by the aliases.
//   asm       - mnemonic.
//   veclist   - operand-name prefix; veclist # "<layout>" names the typed
//               vector-list register operand.
//   Offset128 - post-index amount for the 128-bit layouts (16b/8h/4s/2d).
//   Offset64  - post-index amount for the 64-bit layouts (8b/4h/2s).
//   opcode    - Inst{15-12} of every record.
// All records pass L = 1. Q is 1 for the 128-bit layouts and 0 for the 64-bit
// ones; size is 0b00/0b01/0b10/0b11 for b/h/s/d elements.
multiclass BaseSIMDLdN<string BaseName, string Count, string asm, string veclist,
int Offset128, int Offset64, bits<4> opcode> {
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
// Non-writeback forms: the vector list is the only output.
def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm,
(outs !cast<RegisterOperand>(veclist # "16b"):$Vt),
(ins GPR64sp:$Rn), []>;
def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm,
(outs !cast<RegisterOperand>(veclist # "8h"):$Vt),
(ins GPR64sp:$Rn), []>;
def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm,
(outs !cast<RegisterOperand>(veclist # "4s"):$Vt),
(ins GPR64sp:$Rn), []>;
def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm,
(outs !cast<RegisterOperand>(veclist # "2d"):$Vt),
(ins GPR64sp:$Rn), []>;
def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm,
(outs !cast<RegisterOperand>(veclist # "8b"):$Vt),
(ins GPR64sp:$Rn), []>;
def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm,
(outs !cast<RegisterOperand>(veclist # "4h"):$Vt),
(ins GPR64sp:$Rn), []>;
def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm,
(outs !cast<RegisterOperand>(veclist # "2s"):$Vt),
(ins GPR64sp:$Rn), []>;
// Post-indexed forms: $wback is an extra output and $Xm uses the
// GPR64pi<Offset> operand matching the layout width.
def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "16b"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "8h"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "4s"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "2d"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "8b"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "4h"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "2s"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
// Assembler aliases, one set per layout.
defm : SIMDLdStAliases<BaseName, asm, "16b", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "8h", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "4s", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "2d", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "8b", Count, Offset64, 64>;
defm : SIMDLdStAliases<BaseName, asm, "4h", Count, Offset64, 64>;
defm : SIMDLdStAliases<BaseName, asm, "2s", Count, Offset64, 64>;
// Only ld1/st1 have a v1d version.
// Defines the vector-list STORE instructions for one register count: seven
// non-writeback records, seven post-indexed "_POST" records, and the assembler
// aliases for each layout.
//   BaseName  - name prefix handed to SIMDLdStAliases for instruction lookup.
//   Count     - register-count word ("One", "Two", ...), used by the aliases.
//   asm       - mnemonic.
//   veclist   - operand-name prefix; veclist # "<layout>" names the typed
//               vector-list register operand.
//   Offset128 - post-index amount for the 128-bit layouts (16b/8h/4s/2d).
//   Offset64  - post-index amount for the 64-bit layouts (8b/4h/2s).
//   opcode    - Inst{15-12} of every record.
// All records pass L = 0 and take the vector list as an input. Q is 1 for the
// 128-bit layouts and 0 for the 64-bit ones; size is 0b00/0b01/0b10/0b11 for
// b/h/s/d elements.
multiclass BaseSIMDStN<string BaseName, string Count, string asm, string veclist,
int Offset128, int Offset64, bits<4> opcode> {
let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in {
// Non-writeback forms: no outputs.
def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
GPR64sp:$Rn), []>;
def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
GPR64sp:$Rn), []>;
def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
GPR64sp:$Rn), []>;
def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
GPR64sp:$Rn), []>;
def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
GPR64sp:$Rn), []>;
def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
GPR64sp:$Rn), []>;
def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
GPR64sp:$Rn), []>;
// Post-indexed forms: $wback is the only output and $Xm uses the
// GPR64pi<Offset> operand matching the layout width.
def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
// Assembler aliases, one set per layout.
defm : SIMDLdStAliases<BaseName, asm, "16b", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "8h", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "4s", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "2d", Count, Offset128, 128>;
defm : SIMDLdStAliases<BaseName, asm, "8b", Count, Offset64, 64>;
defm : SIMDLdStAliases<BaseName, asm, "4h", Count, Offset64, 64>;
defm : SIMDLdStAliases<BaseName, asm, "2s", Count, Offset64, 64>;
// Load multiclass that inherits every record of BaseSIMDLdN (same arguments,
// forwarded unchanged) and adds the "1d" layout: a non-writeback record, a
// post-indexed record and the matching aliases. The 1d records use Q = 0,
// size = 0b11 and the 64-bit post-index amount Offset64.
multiclass BaseSIMDLd1<string BaseName, string Count, string asm, string veclist,
int Offset128, int Offset64, bits<4> opcode>
: BaseSIMDLdN<BaseName, Count, asm, veclist, Offset128, Offset64, opcode> {
// LD1 instructions have extra "1d" variants.
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm,
(outs !cast<RegisterOperand>(veclist # "1d"):$Vt),
(ins GPR64sp:$Rn), []>;
def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm,
(outs GPR64sp:$wback,
!cast<RegisterOperand>(veclist # "1d"):$Vt),
(ins GPR64sp:$Rn,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
defm : SIMDLdStAliases<BaseName, asm, "1d", Count, Offset64, 64>;
// Store multiclass that inherits every record of BaseSIMDStN (same arguments,
// forwarded unchanged) and adds the "1d" layout: a non-writeback record, a
// post-indexed record and the matching aliases. The 1d records use Q = 0,
// L = 0, size = 0b11 and the 64-bit post-index amount Offset64.
multiclass BaseSIMDSt1<string BaseName, string Count, string asm, string veclist,
int Offset128, int Offset64, bits<4> opcode>
: BaseSIMDStN<BaseName, Count, asm, veclist, Offset128, Offset64, opcode> {
// ST1 instructions have extra "1d" variants.
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs),
(ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
GPR64sp:$Rn), []>;
def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm,
(outs GPR64sp:$wback),
(ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
defm : SIMDLdStAliases<BaseName, asm, "1d", Count, Offset64, 64>;
// Instantiates the LD1-style loads for one to four registers. Each line passes
// the register-count word, the matching VecList operand prefix, the 128-bit
// and 64-bit post-index amounts (16 and 8 per register) and the opcode field
// for that register count.
multiclass SIMDLd1Multiple<string asm> {
defm One : BaseSIMDLd1<NAME, "One", asm, "VecListOne", 16, 8, 0b0111>;
defm Two : BaseSIMDLd1<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1010>;
defm Three : BaseSIMDLd1<NAME, "Three", asm, "VecListThree", 48, 24, 0b0110>;
defm Four : BaseSIMDLd1<NAME, "Four", asm, "VecListFour", 64, 32, 0b0010>;
// Instantiates the ST1-style stores for one to four registers. Arguments
// mirror the load counterpart: register-count word, VecList operand prefix,
// 128-bit and 64-bit post-index amounts (16 and 8 per register) and the opcode
// field for that register count.
multiclass SIMDSt1Multiple<string asm> {
defm One : BaseSIMDSt1<NAME, "One", asm, "VecListOne", 16, 8, 0b0111>;
defm Two : BaseSIMDSt1<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1010>;
defm Three : BaseSIMDSt1<NAME, "Three", asm, "VecListThree", 48, 24, 0b0110>;
defm Four : BaseSIMDSt1<NAME, "Four", asm, "VecListFour", 64, 32, 0b0010>;
// Two-register vector-list loads: opcode 0b1000, post-index amounts 32 / 16.
// Uses BaseSIMDLdN, so no "1d" layout is defined.
multiclass SIMDLd2Multiple<string asm> {
defm Two : BaseSIMDLdN<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1000>;
// Two-register vector-list stores: opcode 0b1000, post-index amounts 32 / 16.
// Uses BaseSIMDStN, so no "1d" layout is defined.
multiclass SIMDSt2Multiple<string asm> {
defm Two : BaseSIMDStN<NAME, "Two", asm, "VecListTwo", 32, 16, 0b1000>;
// Three-register vector-list loads: opcode 0b0100, post-index amounts 48 / 24.
// Uses BaseSIMDLdN, so no "1d" layout is defined.
multiclass SIMDLd3Multiple<string asm> {
defm Three : BaseSIMDLdN<NAME, "Three", asm, "VecListThree", 48, 24, 0b0100>;
// Three-register vector-list stores: opcode 0b0100, post-index amounts 48 / 24.
// Uses BaseSIMDStN, so no "1d" layout is defined.
multiclass SIMDSt3Multiple<string asm> {
defm Three : BaseSIMDStN<NAME, "Three", asm, "VecListThree", 48, 24, 0b0100>;
// Four-register vector-list loads: opcode 0b0000, post-index amounts 64 / 32.
// Uses BaseSIMDLdN, so no "1d" layout is defined.
multiclass SIMDLd4Multiple<string asm> {
defm Four : BaseSIMDLdN<NAME, "Four", asm, "VecListFour", 64, 32, 0b0000>;
// Four-register vector-list stores: opcode 0b0000, post-index amounts 64 / 32.
// Uses BaseSIMDStN, so no "1d" layout is defined.
multiclass SIMDSt4Multiple<string asm> {
defm Four : BaseSIMDStN<NAME, "Four", asm, "VecListFour", 64, 32, 0b0000>;
// AdvSIMD Load/store single-element
// Common encoding base for the single-element load/store forms.
//   L        - Inst{22}.
//   R        - Inst{21}.
//   opcode   - Inst{15-13}.
//   asm, operands, cst, oops, iops, pattern - forwarded to the generic
//              instruction class I (cst is the constraint string).
// Only the bits shared by all forms are set here. Inst{30}, Inst{23},
// Inst{20-16} and Inst{12-10} are left for the derived classes to fill in.
class BaseSIMDLdStSingle<bit L, bit R, bits<3> opcode,
string asm, string operands, string cst,
dag oops, dag iops, list<dag> pattern>
: I<oops, iops, asm, operands, cst, pattern> {
bits<5> Vt;
bits<5> Rn;
let Inst{31} = 0;
let Inst{29-24} = 0b001101;
let Inst{22} = L;
let Inst{21} = R;
let Inst{15-13} = opcode;
let Inst{9-5} = Rn;
let Inst{4-0} = Vt;
// Same encoding as the untied single-element base, but the constraint string
// handed to I is prefixed with "$Vt = $dst," so the vector-list input is tied
// to the $dst output. The caller's own constraint (cst) is appended after it.
//   L        - Inst{22}.
//   R        - Inst{21}.
//   opcode   - Inst{15-13}.
// Inst{30}, Inst{23}, Inst{20-16} and Inst{12-10} are left for the derived
// classes to fill in.
class BaseSIMDLdStSingleTied<bit L, bit R, bits<3> opcode,
string asm, string operands, string cst,
dag oops, dag iops, list<dag> pattern>
: I<oops, iops, asm, operands, "$Vt = $dst," # cst, pattern> {
bits<5> Vt;
bits<5> Rn;
let Inst{31} = 0;
let Inst{29-24} = 0b001101;
let Inst{22} = L;
let Inst{21} = R;
let Inst{15-13} = opcode;
let Inst{9-5} = Rn;
let Inst{4-0} = Vt;
// Non-writeback load built on BaseSIMDLdStSingle with L fixed to 1 and an
// empty pattern list; used by SIMDLdR for the ld<N>r-style forms (see the
// alias examples in this file).
//   Q        - Inst{30}.
//   R        - forwarded to the base (Inst{21}).
//   opcode   - forwarded to the base (Inst{15-13}).
//   S        - Inst{12}.
//   size     - Inst{11-10}.
//   listtype - vector-list operand type of the $Vt output.
// Inst{23} and Inst{20-16} are zero in this form.
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm,
DAGOperand listtype>
: BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "",
(outs listtype:$Vt), (ins GPR64sp:$Rn),
[]> {
let Inst{30} = Q;
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = S;
let Inst{11-10} = size;
// Post-indexed counterpart of the non-writeback ld<N>r-style load: adds the
// $wback output (tied to $Rn through "$Rn = $wback") and the $Xm input.
//   Q        - Inst{30}.
//   R        - forwarded to the base (Inst{21}).
//   opcode   - forwarded to the base (Inst{15-13}).
//   S        - Inst{12}.
//   size     - Inst{11-10}.
//   listtype - vector-list operand type of the $Vt output.
//   GPR64pi  - operand type of $Xm (note: a parameter here, not a global).
// Inst{23} is 1 and Inst{20-16} carries Xm in this form.
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size,
string asm, DAGOperand listtype, DAGOperand GPR64pi>
: BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm",
"$Rn = $wback",
(outs GPR64sp:$wback, listtype:$Vt),
(ins GPR64sp:$Rn, GPR64pi:$Xm), []> {
bits<5> Xm;
let Inst{30} = Q;
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = S;
let Inst{11-10} = size;
// Emits the four assembler aliases for one ld<N>r layout. Unlike the
// vector-list alias multiclass, the target instruction name does not include
// Count: it is BaseName # "v" # layout (plus "_POST").
//   BaseName - instruction name prefix.
//   asm      - mnemonic.
//   layout   - arrangement suffix such as "8b" or "4s".
//   Count    - register-count word, used only to pick the VecList operand.
//   Offset   - post-index immediate accepted by the "#imm" spellings; also
//              selects the GPR64pi<Offset> register operand.
//   Size     - 64 or 128; selects the untyped VecList<Count><Size> operand
//              used when the layout is written on the mnemonic.
multiclass SIMDLdrAliases<string BaseName, string asm, string layout, string Count,
int Offset, int Size> {
// E.g. "ld1r { v0.8b }, [x1], #1"
// "ld1r\t$Vt, [$Rn], #1"
// may get mapped to
// (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
(!cast<Instruction>(BaseName # "v" # layout # "_POST")
!cast<RegisterOperand>("VecList" # Count # layout):$Vt,
XZR), 1>;
// E.g. "ld1r.8b { v0 }, [x1], #1"
// "ld1r.8b\t$Vt, [$Rn], #1"
// may get mapped to
// (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
(!cast<Instruction>(BaseName # "v" # layout # "_POST")
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
XZR), 0>;
// E.g. "ld1r.8b { v0 }, [x1]"
// "ld1r.8b\t$Vt, [$Rn]"
// may get mapped to
// (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
(!cast<Instruction>(BaseName # "v" # layout)
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
GPR64sp:$Rn), 0>;
// E.g. "ld1r.8b { v0 }, [x1], x2"
// "ld1r.8b\t$Vt, [$Rn], $Xm"
// may get mapped to
// (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm)
def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
(!cast<Instruction>(BaseName # "v" # layout # "_POST")
!cast<RegisterOperand>("VecList" # Count # Size):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
// Defines the ld<N>r-style loads for all eight layouts (8b/16b/4h/8h/2s/4s/
// 1d/2d): a non-writeback record, a "_POST" record and the aliases for each.
//   R, opcode, S - forwarded unchanged to every record.
//   asm          - mnemonic.
//   Count        - register-count word; "VecList" # Count # <layout> names
//                  the vector-list operand.
//   Offset1/2/4/8 - post-index amounts for b/h/s/d elements respectively;
//                  each selects the GPR64pi<Offset> operand of the "_POST"
//                  record and is passed on to the aliases.
// Q is 0 for the 64-bit layouts and 1 for the 128-bit ones; size is
// 0b00/0b01/0b10/0b11 for b/h/s/d elements.
multiclass SIMDLdR<bit R, bits<3> opcode, bit S, string asm, string Count,
int Offset1, int Offset2, int Offset4, int Offset8> {
// Non-writeback forms.
def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm,
!cast<DAGOperand>("VecList" # Count # "8b")>;
def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm,
!cast<DAGOperand>("VecList" # Count #"16b")>;
def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm,
!cast<DAGOperand>("VecList" # Count #"4h")>;
def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm,
!cast<DAGOperand>("VecList" # Count #"8h")>;
def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm,
!cast<DAGOperand>("VecList" # Count #"2s")>;
def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm,
!cast<DAGOperand>("VecList" # Count #"4s")>;
def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm,
!cast<DAGOperand>("VecList" # Count #"1d")>;
def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm,
!cast<DAGOperand>("VecList" # Count #"2d")>;
// Post-indexed forms; the offset is chosen by element size, not by Q.
def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm,
!cast<DAGOperand>("VecList" # Count # "8b"),
!cast<DAGOperand>("GPR64pi" # Offset1)>;
def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm,
!cast<DAGOperand>("VecList" # Count # "16b"),
!cast<DAGOperand>("GPR64pi" # Offset1)>;
def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm,
!cast<DAGOperand>("VecList" # Count # "4h"),
!cast<DAGOperand>("GPR64pi" # Offset2)>;
def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm,
!cast<DAGOperand>("VecList" # Count # "8h"),
!cast<DAGOperand>("GPR64pi" # Offset2)>;
def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm,
!cast<DAGOperand>("VecList" # Count # "2s"),
!cast<DAGOperand>("GPR64pi" # Offset4)>;
def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm,
!cast<DAGOperand>("VecList" # Count # "4s"),
!cast<DAGOperand>("GPR64pi" # Offset4)>;
def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm,
!cast<DAGOperand>("VecList" # Count # "1d"),
!cast<DAGOperand>("GPR64pi" # Offset8)>;
def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm,
!cast<DAGOperand>("VecList" # Count # "2d"),
!cast<DAGOperand>("GPR64pi" # Offset8)>;
// Assembler aliases, one set per layout.
defm : SIMDLdrAliases<NAME, asm, "8b", Count, Offset1, 64>;
defm : SIMDLdrAliases<NAME, asm, "16b", Count, Offset1, 128>;
defm : SIMDLdrAliases<NAME, asm, "4h", Count, Offset2, 64>;
defm : SIMDLdrAliases<NAME, asm, "8h", Count, Offset2, 128>;
defm : SIMDLdrAliases<NAME, asm, "2s", Count, Offset4, 64>;
defm : SIMDLdrAliases<NAME, asm, "4s", Count, Offset4, 128>;
defm : SIMDLdrAliases<NAME, asm, "1d", Count, Offset8, 64>;
defm : SIMDLdrAliases<NAME, asm, "2d", Count, Offset8, 128>;
// Single-element load/store with a 4-bit lane index, non-writeback, untied.
// The index is spread over three fields: idx{3} -> Inst{30}, idx{2} ->
// Inst{12}, idx{1-0} -> Inst{11-10}. Inst{23} and Inst{20-16} are zero.
class SIMDLdStSingleB<bit L, bit R, bits<3> opcode, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
pattern> {
// idx encoded in Q:S:size fields.
bits<4> idx;
let Inst{30} = idx{3};
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = idx{2};
let Inst{11-10} = idx{1-0};
// Tied variant of the 4-bit-index single-element form (non-writeback): derives
// from BaseSIMDLdStSingleTied so $Vt is tied to $dst. Index layout is
// idx{3} -> Inst{30}, idx{2} -> Inst{12}, idx{1-0} -> Inst{11-10}.
class SIMDLdStSingleBTied<bit L, bit R, bits<3> opcode, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
oops, iops, pattern> {
// idx encoded in Q:S:size fields.
bits<4> idx;
let Inst{30} = idx{3};
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = idx{2};
let Inst{11-10} = idx{1-0};
// Post-indexed, untied variant of the 4-bit-index single-element form:
// Inst{23} is 1, Inst{20-16} carries Xm, and "$Rn = $wback" ties the base
// register to the written-back output. No selection pattern.
class SIMDLdStSingleBPost<bit L, bit R, bits<3> opcode, string asm,
dag oops, dag iops>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q:S:size fields.
bits<4> idx;
bits<5> Xm;
let Inst{30} = idx{3};
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = idx{2};
let Inst{11-10} = idx{1-0};
// Post-indexed, tied variant of the 4-bit-index single-element form: combines
// the "$Vt = $dst," tie added by the tied base with "$Rn = $wback".
// Inst{23} is 1 and Inst{20-16} carries Xm. No selection pattern.
class SIMDLdStSingleBTiedPost<bit L, bit R, bits<3> opcode, string asm,
dag oops, dag iops>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q:S:size fields.
bits<4> idx;
bits<5> Xm;
let Inst{30} = idx{3};
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = idx{2};
let Inst{11-10} = idx{1-0};
// Single-element load/store with a 3-bit lane index, non-writeback, untied.
// Index layout: idx{2} -> Inst{30}, idx{1} -> Inst{12}, idx{0} -> Inst{11}.
// The one-bit `size` parameter fills the remaining Inst{10}.
class SIMDLdStSingleH<bit L, bit R, bits<3> opcode, bit size, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
pattern> {
// idx encoded in Q:S:size<1> fields.
bits<3> idx;
let Inst{30} = idx{2};
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = idx{1};
let Inst{11} = idx{0};
let Inst{10} = size;
// Tied variant of the 3-bit-index single-element form (non-writeback): derives
// from BaseSIMDLdStSingleTied so $Vt is tied to $dst. Index layout is
// idx{2} -> Inst{30}, idx{1} -> Inst{12}, idx{0} -> Inst{11}; size -> Inst{10}.
class SIMDLdStSingleHTied<bit L, bit R, bits<3> opcode, bit size, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
oops, iops, pattern> {
// idx encoded in Q:S:size<1> fields.
bits<3> idx;
let Inst{30} = idx{2};
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = idx{1};
let Inst{11} = idx{0};
let Inst{10} = size;
// Post-indexed, untied variant of the 3-bit-index single-element form:
// Inst{23} is 1, Inst{20-16} carries Xm, and "$Rn = $wback" ties the base
// register to the written-back output. No selection pattern.
class SIMDLdStSingleHPost<bit L, bit R, bits<3> opcode, bit size, string asm,
dag oops, dag iops>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q:S:size<1> fields.
bits<3> idx;
bits<5> Xm;
let Inst{30} = idx{2};
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = idx{1};
let Inst{11} = idx{0};
let Inst{10} = size;
// Post-indexed, tied variant of the 3-bit-index single-element form: combines
// the "$Vt = $dst," tie added by the tied base with "$Rn = $wback".
// Inst{23} is 1 and Inst{20-16} carries Xm. No selection pattern.
class SIMDLdStSingleHTiedPost<bit L, bit R, bits<3> opcode, bit size, string asm,
dag oops, dag iops>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q:S:size<1> fields.
bits<3> idx;
bits<5> Xm;
let Inst{30} = idx{2};
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = idx{1};
let Inst{11} = idx{0};
let Inst{10} = size;
// Single-element load/store with a 2-bit lane index, non-writeback, untied.
// Index layout: idx{1} -> Inst{30}, idx{0} -> Inst{12}. The two-bit `size`
// parameter is written to Inst{11-10} as given.
class SIMDLdStSingleS<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
pattern> {
// idx encoded in Q:S fields.
bits<2> idx;
let Inst{30} = idx{1};
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = idx{0};
let Inst{11-10} = size;
// Tied variant of the 2-bit-index single-element form (non-writeback): derives
// from BaseSIMDLdStSingleTied so $Vt is tied to $dst. Index layout is
// idx{1} -> Inst{30}, idx{0} -> Inst{12}; size -> Inst{11-10}.
class SIMDLdStSingleSTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
oops, iops, pattern> {
// idx encoded in Q:S fields.
bits<2> idx;
let Inst{30} = idx{1};
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = idx{0};
let Inst{11-10} = size;
// Post-indexed, untied variant of the 2-bit-index single-element form:
// Inst{23} is 1, Inst{20-16} carries Xm, and "$Rn = $wback" ties the base
// register to the written-back output. No selection pattern.
class SIMDLdStSingleSPost<bit L, bit R, bits<3> opcode, bits<2> size,
string asm, dag oops, dag iops>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q:S fields.
bits<2> idx;
bits<5> Xm;
let Inst{30} = idx{1};
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = idx{0};
let Inst{11-10} = size;
// Post-indexed, tied variant of the 2-bit-index single-element form: combines
// the "$Vt = $dst," tie added by the tied base with "$Rn = $wback".
// Inst{23} is 1 and Inst{20-16} carries Xm. No selection pattern.
class SIMDLdStSingleSTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
string asm, dag oops, dag iops>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q:S fields.
bits<2> idx;
bits<5> Xm;
let Inst{30} = idx{1};
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = idx{0};
let Inst{11-10} = size;
// Single-element load/store with a 1-bit lane index, non-writeback, untied.
// The index goes to Inst{30}; Inst{12} is fixed to 0 and the two-bit `size`
// parameter is written to Inst{11-10} as given.
class SIMDLdStSingleD<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
pattern> {
// idx encoded in Q field.
bits<1> idx;
let Inst{30} = idx;
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = 0;
let Inst{11-10} = size;
// Tied variant of the 1-bit-index single-element form (non-writeback): derives
// from BaseSIMDLdStSingleTied so $Vt is tied to $dst. The index goes to
// Inst{30}; Inst{12} is 0; size -> Inst{11-10}.
class SIMDLdStSingleDTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
dag oops, dag iops, list<dag> pattern>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
oops, iops, pattern> {
// idx encoded in Q field.
bits<1> idx;
let Inst{30} = idx;
let Inst{23} = 0;
let Inst{20-16} = 0b00000;
let Inst{12} = 0;
let Inst{11-10} = size;
// Post-indexed, untied variant of the 1-bit-index single-element form:
// Inst{23} is 1, Inst{20-16} carries Xm, and "$Rn = $wback" ties the base
// register to the written-back output. No selection pattern.
class SIMDLdStSingleDPost<bit L, bit R, bits<3> opcode, bits<2> size,
string asm, dag oops, dag iops>
: BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q field.
bits<1> idx;
bits<5> Xm;
let Inst{30} = idx;
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = 0;
let Inst{11-10} = size;
// Post-indexed, tied variant of the 1-bit-index single-element form: combines
// the "$Vt = $dst," tie added by the tied base with "$Rn = $wback".
// Inst{23} is 1 and Inst{20-16} carries Xm. No selection pattern.
class SIMDLdStSingleDTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
string asm, dag oops, dag iops>
: BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
"$Rn = $wback", oops, iops, []> {
// idx encoded in Q field.
bits<1> idx;
bits<5> Xm;
let Inst{30} = idx;
let Inst{23} = 1;
let Inst{20-16} = Xm;
let Inst{12} = 0;
let Inst{11-10} = size;
// Byte-lane single-structure loads. Defines <NAME>i8 (no writeback) and
// <NAME>i8_POST (post-indexed by the GPR64pi operand passed in). Both pass
// L = 1 and take the register list as input $Vt and output $dst.
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDLdSingleBTied<bit R, bits<3> opcode, string asm,
RegisterOperand listtype,
RegisterOperand GPR64pi> {
def i8 : SIMDLdStSingleBTied<1, R, opcode, asm,
(outs listtype:$dst),
(ins listtype:$Vt, VectorIndexB:$idx,
GPR64sp:$Rn), []>;
// The post-indexed def additionally returns the updated base in $wback.
def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm,
(outs GPR64sp:$wback, listtype:$dst),
(ins listtype:$Vt, VectorIndexB:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
// Halfword-lane single-structure loads. Defines <NAME>i16 and
// <NAME>i16_POST, both with L = 1. Note that `size` is a single bit here,
// whereas the S and D multiclasses take bits<2>.
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDLdSingleHTied<bit R, bits<3> opcode, bit size, string asm,
RegisterOperand listtype,
RegisterOperand GPR64pi> {
def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm,
(outs listtype:$dst),
(ins listtype:$Vt, VectorIndexH:$idx,
GPR64sp:$Rn), []>;
// The post-indexed def additionally returns the updated base in $wback.
def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm,
(outs GPR64sp:$wback, listtype:$dst),
(ins listtype:$Vt, VectorIndexH:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
// Word-lane single-structure loads. Defines <NAME>i32 and <NAME>i32_POST,
// both with L = 1 and indexed through VectorIndexS.
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDLdSingleSTied<bit R, bits<3> opcode, bits<2> size,string asm,
RegisterOperand listtype,
RegisterOperand GPR64pi> {
def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm,
(outs listtype:$dst),
(ins listtype:$Vt, VectorIndexS:$idx,
GPR64sp:$Rn), []>;
// The post-indexed def additionally returns the updated base in $wback.
def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm,
(outs GPR64sp:$wback, listtype:$dst),
(ins listtype:$Vt, VectorIndexS:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
// Doubleword-lane single-structure loads. Defines <NAME>i64 and
// <NAME>i64_POST, both with L = 1 and indexed through VectorIndexD.
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDLdSingleDTied<bit R, bits<3> opcode, bits<2> size, string asm,
RegisterOperand listtype, RegisterOperand GPR64pi> {
def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm,
(outs listtype:$dst),
(ins listtype:$Vt, VectorIndexD:$idx,
GPR64sp:$Rn), []>;
// The post-indexed def additionally returns the updated base in $wback.
def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm,
(outs GPR64sp:$wback, listtype:$dst),
(ins listtype:$Vt, VectorIndexD:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
// Byte-lane single-structure stores. Defines <NAME>i8 (no outputs) and
// <NAME>i8_POST (only output is the written-back base $wback). Both pass
// L = 0 and use the untied SIMDLdStSingleB* classes.
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
multiclass SIMDStSingleB<bit R, bits<3> opcode, string asm,
RegisterOperand listtype, RegisterOperand GPR64pi> {
def i8 : SIMDLdStSingleB<0, R, opcode, asm,
(outs), (ins listtype:$Vt, VectorIndexB:$idx,
GPR64sp:$Rn), []>;
def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm,
(outs GPR64sp:$wback),
(ins listtype:$Vt, VectorIndexB:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
// Halfword-lane single-structure stores. Defines <NAME>i16 and
// <NAME>i16_POST with L = 0; `size` is a single bit for this lane width.
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
multiclass SIMDStSingleH<bit R, bits<3> opcode, bit size, string asm,
RegisterOperand listtype, RegisterOperand GPR64pi> {
def i16 : SIMDLdStSingleH<0, R, opcode, size, asm,
(outs), (ins listtype:$Vt, VectorIndexH:$idx,
GPR64sp:$Rn), []>;
def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm,
(outs GPR64sp:$wback),
(ins listtype:$Vt, VectorIndexH:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
// Word-lane single-structure stores. Defines <NAME>i32 and <NAME>i32_POST
// with L = 0, indexed through VectorIndexS.
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
multiclass SIMDStSingleS<bit R, bits<3> opcode, bits<2> size,string asm,
RegisterOperand listtype, RegisterOperand GPR64pi> {
def i32 : SIMDLdStSingleS<0, R, opcode, size, asm,
(outs), (ins listtype:$Vt, VectorIndexS:$idx,
GPR64sp:$Rn), []>;
def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm,
(outs GPR64sp:$wback),
(ins listtype:$Vt, VectorIndexS:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
// Doubleword-lane single-structure stores. Defines <NAME>i64 and
// <NAME>i64_POST with L = 0, indexed through VectorIndexD.
let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
multiclass SIMDStSingleD<bit R, bits<3> opcode, bits<2> size, string asm,
RegisterOperand listtype, RegisterOperand GPR64pi> {
def i64 : SIMDLdStSingleD<0, R, opcode, size, asm,
(outs), (ins listtype:$Vt, VectorIndexD:$idx,
GPR64sp:$Rn), []>;
def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm,
(outs GPR64sp:$wback),
(ins listtype:$Vt, VectorIndexD:$idx,
GPR64sp:$Rn, GPR64pi:$Xm)>;
// Assembler aliases for the single-lane load/store defs NAME # Type and
// NAME # Type # "_POST". At the use sites in this file Type is
// "i8"/"i16"/"i32"/"i64", Count is "One".."Four", and Offset is the
// post-index immediate, which also selects the GPR64pi<Offset> operand.
// NOTE(review): the trailing 1/0 argument of each InstAlias is presumably the
// Emit flag (whether the printer may use the alias) - confirm.
multiclass SIMDLdStSingleAliases<string asm, string layout, string Type,
string Count, int Offset, Operand idxtype> {
// Immediate post-index form, layout carried by the register list:
// asm "\t$Vt$idx, [$Rn], #<Offset>"
// is mapped to
// (<NAME><Type>_POST VecList<Count><layout>:$Vt, idxtype:$idx, XZR)
def : InstAlias<asm # "\t$Vt$idx, [$Rn], #" # Offset,
(!cast<Instruction>(NAME # Type # "_POST")
!cast<RegisterOperand>("VecList" # Count # layout):$Vt,
idxtype:$idx, XZR), 1>;
// Immediate post-index form, layout suffixed to the mnemonic:
// asm "." layout "\t$Vt$idx, [$Rn], #<Offset>"
// is mapped to
// (<NAME><Type>_POST VecList<Count>128:$Vt, idxtype:$idx, XZR)
def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], #" # Offset,
(!cast<Instruction>(NAME # Type # "_POST")
!cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
idxtype:$idx, XZR), 0>;
// No-writeback form, layout suffixed to the mnemonic:
// asm "." layout "\t$Vt$idx, [$Rn]"
// is mapped to
// (<NAME><Type> VecList<Count>128:$Vt, idxtype:$idx, GPR64sp:$Rn)
def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn]",
(!cast<Instruction>(NAME # Type)
!cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
idxtype:$idx, GPR64sp:$Rn), 0>;
// Register post-index form, layout suffixed to the mnemonic:
// asm "." layout "\t$Vt$idx, [$Rn], $Xm"
// is mapped to
// (<NAME><Type>_POST VecList<Count>128:$Vt, GPR64pi<Offset>:$Xm)
def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], $Xm",
(!cast<Instruction>(NAME # Type # "_POST")
!cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
!cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
// Aliases for one-register lists ("One"); the post-index immediate equals
// the lane size in bytes: 1, 2, 4, 8 for b, h, s, d.
multiclass SIMDLdSt1SingleAliases<string asm> {
defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "One", 1, VectorIndexB>;
defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>;
defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>;
defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>;
// Aliases for two-register lists ("Two"); the post-index immediate is twice
// the lane size in bytes: 2, 4, 8, 16 for b, h, s, d.
multiclass SIMDLdSt2SingleAliases<string asm> {
defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "Two", 2, VectorIndexB>;
defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4, VectorIndexH>;
defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8, VectorIndexS>;
defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>;
// Aliases for three-register lists ("Three"); the post-index immediate is
// three times the lane size in bytes: 3, 6, 12, 24 for b, h, s, d.
multiclass SIMDLdSt3SingleAliases<string asm> {
defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "Three", 3, VectorIndexB>;
defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6, VectorIndexH>;
defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>;
defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>;
// Aliases for four-register lists ("Four"); the post-index immediate is four
// times the lane size in bytes: 4, 8, 16, 32 for b, h, s, d.
multiclass SIMDLdSt4SingleAliases<string asm> {
defm "" : SIMDLdStSingleAliases<asm, "b", "i8", "Four", 4, VectorIndexB>;
defm "" : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8, VectorIndexH>;
defm "" : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
defm "" : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
} // end of 'let Predicates = [HasNEON]'
// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract
let Predicates = [HasNEON, HasRDM] in {
// Thin wrapper over BaseSIMDThreeSameVectorTied that takes a 2-bit size and
// passes {size,0} (size followed by a constant 0 bit) as the base class's
// size argument. All other arguments are forwarded unchanged.
class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand regtype, string asm,
string kind, list<dag> pattern>
: BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind,
pattern> {
// Vector (non-indexed) rounding-doubling multiply-accumulate forms for
// 16- and 32-bit elements. Each def selects
//   Accum($Rd, int_aarch64_neon_sqrdmulh($Rn, $Rm)) -> $dst
// where Accum is the accumulate operator supplied by the instantiation.
// Q is 0 for the V64 defs and 1 for the V128 defs; size is 0b01 for
// halfword elements and 0b10 for word elements.
multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
SDPatternOperator Accum> {
def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h",
[(set (v4i16 V64:$dst),
(Accum (v4i16 V64:$Rd),
(v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn),
(v4i16 V64:$Rm)))))]>;
def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h",
[(set (v8i16 V128:$dst),
(Accum (v8i16 V128:$Rd),
(v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn),
(v8i16 V128:$Rm)))))]>;
def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s",
[(set (v2i32 V64:$dst),
(Accum (v2i32 V64:$Rd),
(v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn),
(v2i32 V64:$Rm)))))]>;
def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn),
(v4i32 V128:$Rm)))))]>;
// By-element (indexed) rounding-doubling multiply-accumulate forms. Each
// vector def selects
//   Accum($Rd, int_aarch64_neon_sqrdmulh($Rn, duplane($Rm, $idx))) -> $dst.
// Halfword forms carry a 3-bit index in Inst{11}, Inst{21}, Inst{20} and
// take $Rm from V128_lo; word forms carry a 2-bit index in Inst{11},
// Inst{21} and take $Rm from V128.
// NOTE(review): V128_lo is presumably needed because Inst{20} holds idx{0}
// in the halfword forms - confirm against BaseSIMDIndexedTied.
multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
SDPatternOperator Accum> {
// 4 x i16 destination, element taken from a v8i16 register.
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
V64, V64, V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4i16 V64:$dst),
(Accum (v4i16 V64:$Rd),
(v4i16 (int_aarch64_neon_sqrdmulh
(v4i16 V64:$Rn),
(v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
// 8 x i16 destination.
def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
V128, V128, V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8i16 V128:$dst),
(Accum (v8i16 V128:$Rd),
(v8i16 (int_aarch64_neon_sqrdmulh
(v8i16 V128:$Rn),
(v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
// 2 x i32 destination, element taken from a v4i32 register.
def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
V64, V64, V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2i32 V64:$dst),
(Accum (v2i32 V64:$Rd),
(v2i32 (int_aarch64_neon_sqrdmulh
(v2i32 V64:$Rn),
(v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// FIXME: it would be nice to use the scalar (v1i32) instruction here, but
// an intermediate EXTRACT_SUBREG would be untyped.
// FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we
// got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..)))
// NOTE(review): the result dag of this Pat looks truncated in this view
// (it stops after the !cast of v2i32_indexed) - check against the full file.
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
(i32 (vector_extract
(v4i32 (insert_subvector
(v2i32 (int_aarch64_neon_sqrdmulh
(v2i32 V64:$Rn),
(v2i32 (AArch64duplane32
(v4i32 V128:$Rm),
(i32 0))),
(i64 0))))),
(v2i32 (!cast<Instruction>(NAME # v2i32_indexed)
// 4 x i32 destination.
def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
V128, V128, V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4i32 V128:$dst),
(Accum (v4i32 V128:$Rd),
(v4i32 (int_aarch64_neon_sqrdmulh
(v4i32 V128:$Rn),
(v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
// FIXME: it would be nice to use the scalar (v1i32) instruction here, but
// an intermediate EXTRACT_SUBREG would be untyped.
// NOTE(review): the result dag of this Pat also looks truncated in this
// view - check against the full file.
def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
(i32 (vector_extract
(v4i32 (int_aarch64_neon_sqrdmulh
(v4i32 V128:$Rn),
(v4i32 (AArch64duplane32
(v4i32 V128:$Rm),
(i64 0))))),
(v4i32 (!cast<Instruction>(NAME # v4i32_indexed)
// Scalar halfword form (Scalar bit = 1); no selection pattern.
def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
FPR16Op, FPR16Op, V128_lo,
VectorIndexH, asm, ".h", "", "", ".h",
[]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
let Inst{20} = idx{0};
// Scalar word form (Scalar bit = 1); the element is read with
// vector_extract rather than a duplane node.
def i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (i32 FPR32Op:$dst),
(Accum (i32 FPR32Op:$Rd),
(i32 (int_aarch64_neon_sqrdmulh
(i32 FPR32Op:$Rn),
(i32 (vector_extract (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
} // let Predicates = [HasNEON, HasRDM]
// ARMv8.3 Complex ADD/MLA instructions
// Assembler operand class for a complex-rotation immediate. Angle and
// Remainder are spliced into the isComplexRotation<Angle, Remainder>
// predicate name; Type ("Even"/"Odd" at the use sites in this file) suffixes
// both the diagnostic type and the operand class name.
class ComplexRotationOperand<int Angle, int Remainder, string Type>
: AsmOperandClass {
let PredicateMethod = "isComplexRotation<" # Angle # ", " # Remainder # ">";
let DiagnosticType = "InvalidComplexRotation" # Type;
let Name = "ComplexRotation" # Type;
// Rotation operand parsed/printed with <90, 0> ("Even"). The transform
// encodes the immediate as value / 90. The leaf predicate only checks the
// range 0..270; it does not itself test for a multiple of 90.
def complexrotateop : Operand<i32>, TImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((N->getSExtValue() / 90), SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = ComplexRotationOperand<90, 0, "Even">;
let PrintMethod = "printComplexRotationOp<90, 0>";
// Rotation operand parsed/printed with <180, 90> ("Odd"). The transform
// encodes the immediate as (value - 90) / 180. The leaf predicate uses the
// same 0..270 range check as complexrotateop.
def complexrotateopodd : Operand<i32>, TImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(((N->getSExtValue() - 90) / 180), SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">;
let PrintMethod = "printComplexRotationOp<180, 90>";
// Untied three-same-vector complex format: $Rd is the only output and the
// inputs are $Rn, $Rm and a rotation operand. The rotation occupies a single
// encoding bit (Inst{12}); Inst{11} is fixed to 0.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDThreeSameVectorComplex<bit Q, bit U, bits<2> size, bits<3> opcode,
RegisterOperand regtype, Operand rottype,
string asm, string kind, list<dag> pattern>
: I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, rottype:$rot), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $rot"
"|" # kind # "\t$Rd, $Rn, $Rm, $rot}", "", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<1> rot;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21} = 0;
let Inst{20-16} = Rm;
let Inst{15-13} = opcode;
// Non-tied version (FCADD) only has one rotation bit
let Inst{12} = rot;
let Inst{11} = 0;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// 8.3 CompNum - Floating-point complex number support
// Instantiates BaseSIMDThreeSameVectorComplex for v4f16/v8f16 (which also
// require HasFullFP16) and for v2f32/v4f32/v2f64.
// NOTE(review): every pattern sets $dst and reads $Rd as an input, but
// BaseSIMDThreeSameVectorComplex declares (outs $Rd) and has no $dst. The
// patterns only make sense if OpNode discards them (e.g. null_frag) - confirm
// at the instantiation site.
multiclass SIMDThreeSameVectorComplexHSD<bit U, bits<3> opcode, Operand rottype,
string asm, SDPatternOperator OpNode>{
let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
def v4f16 : BaseSIMDThreeSameVectorComplex<0, U, 0b01, opcode, V64, rottype,
asm, ".4h",
[(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd),
(v4f16 V64:$Rn),
(v4f16 V64:$Rm),
(rottype i32:$rot)))]>;
def v8f16 : BaseSIMDThreeSameVectorComplex<1, U, 0b01, opcode, V128, rottype,
asm, ".8h",
[(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
(v8f16 V128:$Rn),
(v8f16 V128:$Rm),
(rottype i32:$rot)))]>;
let Predicates = [HasComplxNum, HasNEON] in {
def v2f32 : BaseSIMDThreeSameVectorComplex<0, U, 0b10, opcode, V64, rottype,
asm, ".2s",
[(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd),
(v2f32 V64:$Rn),
(v2f32 V64:$Rm),
(rottype i32:$rot)))]>;
def v4f32 : BaseSIMDThreeSameVectorComplex<1, U, 0b10, opcode, V128, rottype,
asm, ".4s",
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
(v4f32 V128:$Rn),
(v4f32 V128:$Rm),
(rottype i32:$rot)))]>;
def v2f64 : BaseSIMDThreeSameVectorComplex<1, U, 0b11, opcode, V128, rottype,
asm, ".2d",
[(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd),
(v2f64 V128:$Rn),
(v2f64 V128:$Rm),
(rottype i32:$rot)))]>;
// Tied three-same-vector complex format: $Rd is an input tied to the output
// $dst ("$Rd = $dst"). Unlike the untied class, the rotation is two bits wide
// and fills Inst{12-11}.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDThreeSameVectorTiedComplex<bit Q, bit U, bits<2> size,
bits<3> opcode,
RegisterOperand regtype,
Operand rottype, string asm,
string kind, list<dag> pattern>
: I<(outs regtype:$dst),
(ins regtype:$Rd, regtype:$Rn, regtype:$Rm, rottype:$rot), asm,
"{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $rot"
"|" # kind # "\t$Rd, $Rn, $Rm, $rot}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<2> rot;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21} = 0;
let Inst{20-16} = Rm;
let Inst{15-13} = opcode;
// Two rotation bits in the tied form.
let Inst{12-11} = rot;
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Instantiates BaseSIMDThreeSameVectorTiedComplex for v4f16/v8f16 (which
// also require HasFullFP16) and for v2f32/v4f32/v2f64. Each def selects
//   OpNode($Rd, $Rn, $Rm, rot) -> $dst
// with $Rd being the tied accumulator input.
multiclass SIMDThreeSameVectorTiedComplexHSD<bit U, bits<3> opcode,
Operand rottype, string asm,
SDPatternOperator OpNode> {
let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
def v4f16 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b01, opcode, V64,
rottype, asm, ".4h",
[(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd),
(v4f16 V64:$Rn),
(v4f16 V64:$Rm),
(rottype i32:$rot)))]>;
def v8f16 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b01, opcode, V128,
rottype, asm, ".8h",
[(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
(v8f16 V128:$Rn),
(v8f16 V128:$Rm),
(rottype i32:$rot)))]>;
let Predicates = [HasComplxNum, HasNEON] in {
def v2f32 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b10, opcode, V64,
rottype, asm, ".2s",
[(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd),
(v2f32 V64:$Rn),
(v2f32 V64:$Rm),
(rottype i32:$rot)))]>;
def v4f32 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b10, opcode, V128,
rottype, asm, ".4s",
[(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
(v4f32 V128:$Rn),
(v4f32 V128:$Rm),
(rottype i32:$rot)))]>;
def v2f64 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b11, opcode, V128,
rottype, asm, ".2d",
[(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd),
(v2f64 V128:$Rn),
(v2f64 V128:$Rm),
(rottype i32:$rot)))]>;
// Tied by-element complex format: $Rd is an input tied to $dst, $Rm is
// indexed by $idx, and a 2-bit rotation sits in Inst{14-13}. Bits 21 and 11
// are left for the derived defs, which place the index bits there.
// NOTE(review): the `asm` parameter is not visibly passed to I<> in this
// view (the operand string directly follows the ins dag) - an `asm,` line may
// be missing; check against the full file.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDIndexedTiedComplex<bit Q, bit U, bit Scalar, bits<2> size,
bit opc1, bit opc2, RegisterOperand dst_reg,
RegisterOperand lhs_reg,
RegisterOperand rhs_reg, Operand vec_idx,
Operand rottype, string asm, string apple_kind,
string dst_kind, string lhs_kind,
string rhs_kind, list<dag> pattern>
: I<(outs dst_reg:$dst),
(ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx, rottype:$rot),
"{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind #
"$idx, $rot" # "|" # apple_kind #
"\t$Rd, $Rn, $Rm$idx, $rot}", "$Rd = $dst", pattern>,
Sched<[WriteV]> {
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
bits<2> rot;
let Inst{31} = 0;
let Inst{30} = Q;
let Inst{29} = U;
let Inst{28} = Scalar;
let Inst{27-24} = 0b1111;
let Inst{23-22} = size;
// Bit 21 must be set by the derived class.
let Inst{20-16} = Rm;
let Inst{15} = opc1;
let Inst{14-13} = rot;
let Inst{12} = opc2;
// Bit 11 must be set by the derived class.
let Inst{10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// The complex instructions index by pairs of elements, so the VectorIndexes
// don't match the lane types, and the index bits are different from the other
// classes.
// Concretely: v4f16 uses VectorIndexD with a 1-bit index in Inst{21},
// v8f16 uses VectorIndexS with a 2-bit index in Inst{11}:Inst{21}, and
// v4f32 uses VectorIndexD with a 1-bit index in Inst{11}. None of the defs
// carries a selection pattern.
multiclass SIMDIndexedTiedComplexHSD<bit U, bit opc1, bit opc2, Operand rottype,
string asm, SDPatternOperator OpNode> {
let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
def v4f16_indexed : BaseSIMDIndexedTiedComplex<0, 1, 0, 0b01, opc1, opc2, V64,
V64, V128, VectorIndexD, rottype, asm, ".4h", ".4h",
".4h", ".h", []> {
bits<1> idx;
let Inst{11} = 0;
let Inst{21} = idx{0};
def v8f16_indexed : BaseSIMDIndexedTiedComplex<1, 1, 0, 0b01, opc1, opc2,
V128, V128, V128, VectorIndexS, rottype, asm, ".8h",
".8h", ".8h", ".h", []> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
} // Predicates = [HasComplxNum, HasNEON, HasFullFP16]
let Predicates = [HasComplxNum, HasNEON] in {
def v4f32_indexed : BaseSIMDIndexedTiedComplex<1, 1, 0, 0b10, opc1, opc2,
V128, V128, V128, VectorIndexD, rottype, asm, ".4s",
".4s", ".4s", ".s", []> {
bits<1> idx;
let Inst{11} = idx{0};
let Inst{21} = 0;
} // Predicates = [HasComplxNum, HasNEON]
// Crypto extensions
// Common format for the AES instructions: two 5-bit register fields (Rd, Rn)
// on ".16b" operands, a fixed 16-bit prefix in Inst{31-16}, a 4-bit opcode in
// Inst{15-12} and 0b10 in Inst{11-10}.
// NOTE(review): the superclass list ends with a trailing comma and no '{'
// in this view - a line (presumably a Sched<...> entry) appears to be missing.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class AESBase<bits<4> opc, string asm, dag outs, dag ins, string cstr,
list<dag> pat>
: I<outs, ins, asm, "{\t$Rd.16b, $Rn.16b|.16b\t$Rd, $Rn}", cstr, pat>,
bits<5> Rd;
bits<5> Rn;
let Inst{31-16} = 0b0100111000101000;
let Inst{15-12} = opc;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Untied AES instruction: $Rd = OpNode($Rn) on v16i8, no operand constraint.
class AESInst<bits<4> opc, string asm, Intrinsic OpNode>
: AESBase<opc, asm, (outs V128:$Rd), (ins V128:$Rn), "",
[(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
// Tied AES instruction: $dst = OpNode($Rd, $Rn) on v16i8, with the input $Rd
// constrained to the output register ("$Rd = $dst").
class AESTiedInst<bits<4> opc, string asm, Intrinsic OpNode>
: AESBase<opc, asm, (outs V128:$dst), (ins V128:$Rd, V128:$Rn),
"$Rd = $dst",
[(set (v16i8 V128:$dst),
(OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
// Three-register SHA format with $Rd tied to $dst. dst_lhs_kind is the
// suffix printed on $Rd and $Rn; $Rm always prints as ".4s". The opcode
// occupies Inst{14-12}.
// NOTE(review): the superclass list ends with a trailing comma and no '{'
// in this view - a line (presumably a Sched<...> entry) appears to be missing.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class SHA3OpTiedInst<bits<3> opc, string asm, string dst_lhs_kind,
dag oops, dag iops, list<dag> pat>
: I<oops, iops, asm,
"{\t$Rd" # dst_lhs_kind # ", $Rn" # dst_lhs_kind # ", $Rm.4s" #
"|.4s\t$Rd, $Rn, $Rm}", "$Rd = $dst", pat>,
bits<5> Rd;
bits<5> Rn;
bits<5> Rm;
let Inst{31-21} = 0b01011110000;
let Inst{20-16} = Rm;
let Inst{15} = 0;
let Inst{14-12} = opc;
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// SHA3OpTiedInst with operand classes FPR128 ($Rd/$dst), FPR32 ($Rn) and
// V128 ($Rm); $Rd and $Rn print without a suffix.
class SHATiedInstQSV<bits<3> opc, string asm, Intrinsic OpNode>
: SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
(ins FPR128:$Rd, FPR32:$Rn, V128:$Rm),
[(set (v4i32 FPR128:$dst),
(OpNode (v4i32 FPR128:$Rd), (i32 FPR32:$Rn),
(v4i32 V128:$Rm)))]>;
// SHA3OpTiedInst with all three registers in V128, printed with ".4s".
class SHATiedInstVVV<bits<3> opc, string asm, Intrinsic OpNode>
: SHA3OpTiedInst<opc, asm, ".4s", (outs V128:$dst),
(ins V128:$Rd, V128:$Rn, V128:$Rm),
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
(v4i32 V128:$Rm)))]>;
// SHA3OpTiedInst with FPR128 for $Rd/$dst and $Rn, and V128 for $Rm; $Rd and
// $Rn print without a suffix.
class SHATiedInstQQV<bits<3> opc, string asm, Intrinsic OpNode>
: SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
(ins FPR128:$Rd, FPR128:$Rn, V128:$Rm),
[(set (v4i32 FPR128:$dst),
(OpNode (v4i32 FPR128:$Rd), (v4i32 FPR128:$Rn),
(v4i32 V128:$Rm)))]>;
// Two-register SHA format: `kind` is the suffix printed on both operands,
// `cstr` the operand constraint (empty or a tie), and the 4-bit opcode sits
// in Inst{15-12}.
// NOTE(review): the superclass list ends with a trailing comma and no '{'
// in this view - a line (presumably a Sched<...> entry) appears to be missing.
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class SHA2OpInst<bits<4> opc, string asm, string kind,
string cstr, dag oops, dag iops,
list<dag> pat>
: I<oops, iops, asm, "{\t$Rd" # kind # ", $Rn" # kind #
"|" # kind # "\t$Rd, $Rn}", cstr, pat>,
bits<5> Rd;
bits<5> Rn;
let Inst{31-16} = 0b0101111000101000;
let Inst{15-12} = opc;
let Inst{11-10} = 0b10;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
// Tied two-register SHA instruction on v4i32: $dst = OpNode($Rd, $Rn) with
// "$Rd = $dst", printed with ".4s".
class SHATiedInstVV<bits<4> opc, string asm, Intrinsic OpNode>
: SHA2OpInst<opc, asm, ".4s", "$Rd = $dst", (outs V128:$dst),
(ins V128:$Rd, V128:$Rn),
[(set (v4i32 V128:$dst),
(OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
// Untied scalar SHA instruction: $Rd = OpNode($Rn) on i32 in FPR32, with no
// operand suffix and no constraint.
class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
: SHA2OpInst<opc, asm, "", "", (outs FPR32:$Rd), (ins FPR32:$Rn),
[(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
// Armv8.2-A Crypto extensions
// Common base for the Armv8.2-A crypto formats below. It fixes
// Inst{31-25} = 0b1100111 and places Vn in Inst{9-5} and Vd in Inst{4-0};
// derived classes fill the remaining bits.
class BaseCryptoV82<dag oops, dag iops, string asm, string asmops, string cst,
list<dag> pattern>
: I <oops, iops, asm, asmops, cst, pattern>, Sched<[WriteV]> {
bits<5> Vd;
bits<5> Vn;
let Inst{31-25} = 0b1100111;
let Inst{9-5} = Vn;
let Inst{4-0} = Vd;
// Two-register tied crypto format. The assembly strings used by the
// subclasses print only $Vd and $Vn; the $Vm input exists to carry the tie
// "$Vm = $Vd" and has no encoding field of its own (Inst{20-15} is a
// constant).
class CryptoRRTied<bits<1>op0, bits<2>op1, string asm, string asmops>
: BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm, asmops,
"$Vm = $Vd", []> {
// Same value BaseCryptoV82 already assigns to these bits.
let Inst{31-25} = 0b1100111;
let Inst{24-21} = 0b0110;
let Inst{20-15} = 0b000001;
let Inst{14} = op0;
let Inst{13-12} = 0b00;
let Inst{11-10} = op1;
// CryptoRRTied printed with the ".2d" arrangement.
class CryptoRRTied_2D<bits<1>op0, bits<2>op1, string asm>
: CryptoRRTied<op0, op1, asm, "{\t$Vd.2d, $Vn.2d|.2d\t$Vd, $Vn}">;
// CryptoRRTied printed with the ".4s" arrangement.
class CryptoRRTied_4S<bits<1>op0, bits<2>op1, string asm>
: CryptoRRTied<op0, op1, asm, "{\t$Vd.4s, $Vn.4s|.4s\t$Vd, $Vn}">;
// Three-register crypto format: adds Vm in Inst{20-16} to BaseCryptoV82 and
// selects the operation with op0 (Inst{14}) and op1 (Inst{11-10}). The
// operand dags, assembly string and constraint come from the subclass.
class CryptoRRR<bits<1> op0, bits<2>op1, dag oops, dag iops, string asm,
string asmops, string cst>
: BaseCryptoV82<oops, iops, asm , asmops, cst, []> {
bits<5> Vm;
let Inst{24-21} = 0b0011;
let Inst{20-16} = Vm;
let Inst{15} = 0b1;
let Inst{14} = op0;
let Inst{13-12} = 0b00;
let Inst{11-10} = op1;
// Untied CryptoRRR on V128 registers, printed with ".2d".
class CryptoRRR_2D<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm,
"{\t$Vd.2d, $Vn.2d, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "">;
// Tied CryptoRRR on V128 registers, printed with ".2d": $Vd is an input
// constrained to the output $Vdst.
class CryptoRRRTied_2D<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm), asm,
"{\t$Vd.2d, $Vn.2d, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">;
// Untied CryptoRRR on V128 registers, printed with ".4s".
class CryptoRRR_4S<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm,
"{\t$Vd.4s, $Vn.4s, $Vm.4s|.4s\t$Vd, $Vn, $Vm}", "">;
// Tied CryptoRRR on V128 registers, printed with ".4s": $Vd is an input
// constrained to the output $Vdst.
class CryptoRRRTied_4S<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm), asm,
"{\t$Vd.4s, $Vn.4s, $Vm.4s|.4s\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">;
// Tied CryptoRRR whose $Vd/$Vdst and $Vn are FPR128 (printed without a
// suffix) while $Vm is a V128 printed with ".2d".
class CryptoRRRTied<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs FPR128:$Vdst), (ins FPR128:$Vd, FPR128:$Vn, V128:$Vm),
asm, "{\t$Vd, $Vn, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">;
// Four-register crypto format (Vd, Vn, Vm, Va), untied and without a
// pattern. Vm goes to Inst{20-16}, Va to Inst{14-10}, and op0 in
// Inst{22-21} selects the operation.
class CryptoRRRR<bits<2>op0, string asm, string asmops>
: BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, V128:$Va), asm,
asmops, "", []> {
bits<5> Vm;
bits<5> Va;
let Inst{24-23} = 0b00;
let Inst{22-21} = op0;
let Inst{20-16} = Vm;
let Inst{15} = 0b0;
let Inst{14-10} = Va;
// CryptoRRRR printed with the ".16b" arrangement on all four registers.
class CryptoRRRR_16B<bits<2>op0, string asm>
: CryptoRRRR<op0, asm, "{\t$Vd.16b, $Vn.16b, $Vm.16b, $Va.16b" #
"|.16b\t$Vd, $Vn, $Vm, $Va}"> {
// CryptoRRRR printed with the ".4s" arrangement on all four registers.
class CryptoRRRR_4S<bits<2>op0, string asm>
: CryptoRRRR<op0, asm, "{\t$Vd.4s, $Vn.4s, $Vm.4s, $Va.4s" #
"|.4s\t$Vd, $Vn, $Vm, $Va}"> {
// Three-register crypto format with a 6-bit unsigned immediate (uimm6) in
// Inst{15-10}, printed with ".2d". Untied and without a pattern.
class CryptoRRRi6<string asm>
: BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, uimm6:$imm), asm,
"{\t$Vd.2d, $Vn.2d, $Vm.2d, $imm" #
"|.2d\t$Vd, $Vn, $Vm, $imm}", "", []> {
bits<6> imm;
bits<5> Vm;
let Inst{24-21} = 0b0100;
let Inst{20-16} = Vm;
let Inst{15-10} = imm;
// The two assignments below repeat what BaseCryptoV82 already sets.
let Inst{9-5} = Vn;
let Inst{4-0} = Vd;
// Tied three-register crypto format with a lane index on $Vm. The index is
// a VectorIndexS operand named $imm, encoded as two bits in Inst{13-12};
// $Vd is an input constrained to the output $Vdst.
class CryptoRRRi2Tied<bits<1>op0, bits<2>op1, string asm>
: BaseCryptoV82<(outs V128:$Vdst),
(ins V128:$Vd, V128:$Vn, V128:$Vm, VectorIndexS:$imm),
asm, "{\t$Vd.4s, $Vn.4s, $Vm.s$imm" #
"|.4s\t$Vd, $Vn, $Vm$imm}", "$Vd = $Vdst", []> {
bits<2> imm;
bits<5> Vm;
let Inst{24-21} = 0b0010;
let Inst{20-16} = Vm;
let Inst{15} = 0b1;
let Inst{14} = op0;
let Inst{13-12} = imm;
let Inst{11-10} = op1;
// v8.1 atomic instructions extension:
// * CAS
// * CASP
// * SWP
// * LDOPregister<OP>, and aliases STOPregister<OP>
// Instruction encodings:
// 31 30|29 24|23|22|21|20 16|15|14 10|9 5|4 0
// CAS SZ |001000|1 |A |1 |Rs |R |11111 |Rn |Rt
// CASP 0|SZ|001000|0 |A |1 |Rs |R |11111 |Rn |Rt
// SWP SZ |111000|A |R |1 |Rs |1 |OPC|00|Rn |Rt
// LD SZ |111000|A |R |1 |Rs |0 |OPC|00|Rn |Rt
// ST SZ |111000|A |R |1 |Rs |0 |OPC|00|Rn |11111
// Instruction syntax:
// CAS{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
// CAS{<order>} <Xs>, <Xt>, [<Xn|SP>]
// CASP{<order>} <Ws>, <W(s+1)>, <Wt>, <W(t+1)>, [<Xn|SP>]
// CASP{<order>} <Xs>, <X(s+1)>, <Xt>, <X(t+1)>, [<Xn|SP>]
// SWP{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
// SWP{<order>} <Xs>, <Xt>, [<Xn|SP>]
// LD<OP>{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
// LD<OP>{<order>} <Xs>, <Xt>, [<Xn|SP>]
// ST<OP>{<order>}[<size>] <Ws>, [<Xn|SP>]
// ST<OP>{<order>} <Xs>, [<Xn|SP>]
// Shared encoding for CAS and CASP. Sz, NP, Acq and Rel are left unset here
// and are supplied by the derived classes / multiclasses via `let`. Per the
// encoding table above: bit 23 (NP) is 1 for CAS and 0 for CASP, bit 22 is
// the acquire bit and bit 15 the release bit.
let Predicates = [HasLSE], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
class BaseCASEncoding<dag oops, dag iops, string asm, string operands,
string cstr, list<dag> pattern>
: I<oops, iops, asm, operands, cstr, pattern> {
bits<2> Sz;
bit NP;
bit Acq;
bit Rel;
bits<5> Rs;
bits<5> Rn;
bits<5> Rt;
let Inst{31-30} = Sz;
let Inst{29-24} = 0b001000;
let Inst{23} = NP;
let Inst{22} = Acq;
let Inst{21} = 0b1;
let Inst{20-16} = Rs;
let Inst{15} = Rel;
let Inst{14-10} = 0b11111;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
// Repeats the predicate list already applied by the enclosing `let`.
let Predicates = [HasLSE];
// Single-register compare-and-swap: "cas<order><size> $Rs, $Rt, [$Rn]".
// $Rs is an input tied to the output $out ("$out = $Rs"). NP is fixed to 1.
class BaseCAS<string order, string size, RegisterClass RC>
: BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
"cas" # order # size, "\t$Rs, $Rt, [$Rn]",
"$out = $Rs",[]>,
Sched<[WriteAtomic]> {
let NP = 1;
// Defines the B/H/W/X size variants of CAS for one memory ordering. Sz is
// 0b00/0b01/0b10/0b11 respectively; B, H and W use GPR32, X uses GPR64. The
// Acq/Rel multiclass arguments are copied into the record fields of the same
// name.
multiclass CompareAndSwap<bits<1> Acq, bits<1> Rel, string order> {
let Sz = 0b00, Acq = Acq, Rel = Rel in def B : BaseCAS<order, "b", GPR32>;
let Sz = 0b01, Acq = Acq, Rel = Rel in def H : BaseCAS<order, "h", GPR32>;
let Sz = 0b10, Acq = Acq, Rel = Rel in def W : BaseCAS<order, "", GPR32>;
let Sz = 0b11, Acq = Acq, Rel = Rel in def X : BaseCAS<order, "", GPR64>;
// Register-pair compare-and-swap: "casp<order><size> $Rs, $Rt, [$Rn]", where
// RC is a register operand (the sequential-pair operands at the use sites).
// $Rs is tied to $out, and NP is fixed to 0.
class BaseCASP<string order, string size, RegisterOperand RC>
: BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
"casp" # order # size, "\t$Rs, $Rt, [$Rn]",
"$out = $Rs",[]>,
Sched<[WriteAtomic]> {
let NP = 0;
// Defines the W and X pair variants of CASP for one memory ordering, with
// Sz = 0b00 and 0b01 respectively (the table above shows bit 31 as 0 for
// CASP, so only the low Sz bit varies).
multiclass CompareAndSwapPair<bits<1> Acq, bits<1> Rel, string order> {
let Sz = 0b00, Acq = Acq, Rel = Rel in
def W : BaseCASP<order, "", WSeqPairClassOperand>;
let Sz = 0b01, Acq = Acq, Rel = Rel in
def X : BaseCASP<order, "", XSeqPairClassOperand>;
// Atomic swap: "swp<order><size> $Rs, $Rt, [$Rn]" with $Rt as the output and
// $Rs, $Rn as inputs. Sz, Acq and Rel are supplied by the Swap multiclass;
// opc is fixed to 0b000 and Inst{15} to 1, which is what separates SWP from
// the LD<OP> encodings (see the encoding table above).
// mayLoad/mayStore/hasSideEffects are stated explicitly, as on the sibling
// BaseCASEncoding and BaseLDOPregister classes: the pattern list is empty, so
// these properties cannot be derived from a selection pattern.
let Predicates = [HasLSE], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
class BaseSWP<string order, string size, RegisterClass RC>
: I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size,
"\t$Rs, $Rt, [$Rn]","",[]>,
Sched<[WriteAtomic]> {
bits<2> Sz;
bit Acq;
bit Rel;
bits<5> Rs;
bits<3> opc = 0b000;
bits<5> Rn;
bits<5> Rt;
let Inst{31-30} = Sz;
let Inst{29-24} = 0b111000;
let Inst{23} = Acq;
let Inst{22} = Rel;
let Inst{21} = 0b1;
let Inst{20-16} = Rs;
let Inst{15} = 0b1;
let Inst{14-12} = opc;
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
// Repeats the predicate list already applied by the enclosing `let`.
let Predicates = [HasLSE];
// Defines the B/H/W/X size variants of SWP for one memory ordering. Sz is
// 0b00/0b01/0b10/0b11 respectively; B, H and W use GPR32, X uses GPR64.
multiclass Swap<bits<1> Acq, bits<1> Rel, string order> {
let Sz = 0b00, Acq = Acq, Rel = Rel in def B : BaseSWP<order, "b", GPR32>;
let Sz = 0b01, Acq = Acq, Rel = Rel in def H : BaseSWP<order, "h", GPR32>;
let Sz = 0b10, Acq = Acq, Rel = Rel in def W : BaseSWP<order, "", GPR32>;
let Sz = 0b11, Acq = Acq, Rel = Rel in def X : BaseSWP<order, "", GPR64>;
// Atomic load-and-operate: "ld<op><order><size> $Rs, $Rt, [$Rn]" with $Rt as
// the output. Same layout as BaseSWP except that Inst{15} is 0 and opc is
// not defaulted - Sz, Acq, Rel and opc all come from the LDOPregister
// multiclass.
let Predicates = [HasLSE], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
class BaseLDOPregister<string op, string order, string size, RegisterClass RC>
: I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size,
"\t$Rs, $Rt, [$Rn]","",[]>,
Sched<[WriteAtomic]> {
bits<2> Sz;
bit Acq;
bit Rel;
bits<5> Rs;
bits<3> opc;
bits<5> Rn;
bits<5> Rt;
let Inst{31-30} = Sz;
let Inst{29-24} = 0b111000;
let Inst{23} = Acq;
let Inst{22} = Rel;
let Inst{21} = 0b1;
let Inst{20-16} = Rs;
let Inst{15} = 0b0;
let Inst{14-12} = opc;
let Inst{11-10} = 0b00;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
// Repeats the predicate list already applied by the enclosing `let`.
let Predicates = [HasLSE];
// Defines the B/H/W/X size variants of one LD<op> instruction for one
// memory ordering. Sz is 0b00/0b01/0b10/0b11 respectively; B, H and W use
// GPR32, X uses GPR64. opc, Acq and Rel are copied into the record fields of
// the same name.
multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel,
string order> {
let Sz = 0b00, Acq = Acq, Rel = Rel, opc = opc in
def B : BaseLDOPregister<op, order, "b", GPR32>;
let Sz = 0b01, Acq = Acq, Rel = Rel, opc = opc in
def H : BaseLDOPregister<op, order, "h", GPR32>;
let Sz = 0b10, Acq = Acq, Rel = Rel, opc = opc in
def W : BaseLDOPregister<op, order, "", GPR32>;
let Sz = 0b11, Acq = Acq, Rel = Rel, opc = opc in
def X : BaseLDOPregister<op, order, "", GPR64>;
// Differing SrcRHS and DstRHS allow you to cover CLR & SUB by giving a more
// complex DAG for DstRHS.
// Emits one Pat per memory ordering, matching the PatFrag named
// <op>_<size>_<ordering> and selecting the instruction named
// <inst><ordering-suffix><suffix>:
//   monotonic -> "", acquire -> "A", release -> "L",
//   acq_rel -> "AL", seq_cst -> "AL" (same instruction as acq_rel).
let Predicates = [HasLSE] in
multiclass LDOPregister_patterns_ord_dag<string inst, string suffix, string op,
string size, dag SrcRHS, dag DstRHS> {
def : Pat<(!cast<PatFrag>(op#"_"#size#"_monotonic") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # suffix) DstRHS, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_acquire") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "A" # suffix) DstRHS, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_release") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "L" # suffix) DstRHS, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, SrcRHS),
(!cast<Instruction>(inst # "AL" # suffix) DstRHS, GPR64sp:$Rn)>;
// Convenience wrapper: the same dag is used for the matched operand and for
// the operand handed to the instruction.
multiclass LDOPregister_patterns_ord<string inst, string suffix, string op,
string size, dag RHS> {
defm : LDOPregister_patterns_ord_dag<inst, suffix, op, size, RHS, RHS>;
// Wrapper for the case where the operand handed to the instruction (RHS)
// differs from the matched operand (LHS); LHS is forwarded as SrcRHS and RHS
// as DstRHS.
multiclass LDOPregister_patterns_ord_mod<string inst, string suffix, string op,
string size, dag LHS, dag RHS> {
defm : LDOPregister_patterns_ord_dag<inst, suffix, op, size, LHS, RHS>;
// Emits the per-ordering patterns for all four sizes of one LD<op>
// instruction. The 64-bit case uses an i64 GPR64 operand; the 32-, 16- and
// 8-bit cases all use an i32 GPR32 operand.
multiclass LDOPregister_patterns<string inst, string op> {
defm : LDOPregister_patterns_ord<inst, "X", op, "64", (i64 GPR64:$Rm)>;
defm : LDOPregister_patterns_ord<inst, "W", op, "32", (i32 GPR32:$Rm)>;
defm : LDOPregister_patterns_ord<inst, "H", op, "16", (i32 GPR32:$Rm)>;
defm : LDOPregister_patterns_ord<inst, "B", op, "8", (i32 GPR32:$Rm)>;
// Like LDOPregister_patterns, but the value given to the atomic instruction
// is first passed through a modifier instruction: (<mod>Xrr XZR, $Rm) for
// the 64-bit case and (<mod>Wrr WZR, $Rm) for the 32-, 16- and 8-bit cases.
multiclass LDOPregister_patterns_mod<string inst, string op, string mod> {
defm : LDOPregister_patterns_ord_mod<inst, "X", op, "64",
(i64 GPR64:$Rm),
(i64 (!cast<Instruction>(mod#Xrr) XZR, GPR64:$Rm))>;
defm : LDOPregister_patterns_ord_mod<inst, "W", op, "32",
(i32 GPR32:$Rm),
(i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>;
defm : LDOPregister_patterns_ord_mod<inst, "H", op, "16",
(i32 GPR32:$Rm),
(i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>;
defm : LDOPregister_patterns_ord_mod<inst, "B", op, "8",
(i32 GPR32:$Rm),
(i32 (!cast<Instruction>(mod#Wrr) WZR, GPR32:$Rm))>;
// Emits one Pat per memory ordering for compare-and-swap, matching the
// PatFrag named <op>_<size>_<ordering> on (address, OLD, NEW) and selecting
// <inst><ordering-suffix><suffix> with operands (OLD, NEW, address):
//   monotonic -> "", acquire -> "A", release -> "L",
//   acq_rel -> "AL", seq_cst -> "AL" (same instruction as acq_rel).
let Predicates = [HasLSE] in
multiclass CASregister_patterns_ord_dag<string inst, string suffix, string op,
string size, dag OLD, dag NEW> {
def : Pat<(!cast<PatFrag>(op#"_"#size#"_monotonic") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # suffix) OLD, NEW, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_acquire") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "A" # suffix) OLD, NEW, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_release") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "L" # suffix) OLD, NEW, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_acq_rel") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>;
def : Pat<(!cast<PatFrag>(op#"_"#size#"_seq_cst") GPR64sp:$Rn, OLD, NEW),
(!cast<Instruction>(inst # "AL" # suffix) OLD, NEW, GPR64sp:$Rn)>;
// Thin forwarding layer over CASregister_patterns_ord_dag; passes every
// parameter through unchanged.
multiclass CASregister_patterns_ord<string inst, string suffix, string op,
                                    string size, dag OLD, dag NEW> {
  defm : CASregister_patterns_ord_dag<inst, suffix, op, size, OLD, NEW>;
}
// Instantiates the compare-and-swap patterns for all four access sizes. The
// 64-bit form uses GPR64 old/new values; the 32-, 16- and 8-bit forms use
// GPR32.
multiclass CASregister_patterns<string inst, string op> {
  defm : CASregister_patterns_ord<inst, "X", op, "64",
                        (i64 GPR64:$Rold), (i64 GPR64:$Rnew)>;
  defm : CASregister_patterns_ord<inst, "W", op, "32",
                        (i32 GPR32:$Rold), (i32 GPR32:$Rnew)>;
  defm : CASregister_patterns_ord<inst, "H", op, "16",
                        (i32 GPR32:$Rold), (i32 GPR32:$Rnew)>;
  defm : CASregister_patterns_ord<inst, "B", op, "8",
                        (i32 GPR32:$Rold), (i32 GPR32:$Rnew)>;
}
// Assembly alias of the form "<asm>\t$Rs, [$Rn]" that expands to `inst` with
// the fixed register `Reg` as its first operand, followed by $Rs and the base
// address $Rn. Guarded by HasLSE.
let Predicates = [HasLSE] in
class BaseSTOPregister<string asm, RegisterClass OP, Register Reg,
Instruction inst> :
InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn)>;
// Defines the eight store-form aliases for one atomic operation: the release
// ("l"-suffixed mnemonic) and plain variants, each in byte, halfword, 32-bit
// and 64-bit sizes. 64-bit forms use GPR64/XZR; all others use GPR32/WZR.
multiclass STOPregister<string asm, string instr> {
  def : BaseSTOPregister<asm # "lb", GPR32, WZR,
                    !cast<Instruction>(instr # "LB")>;
  def : BaseSTOPregister<asm # "lh", GPR32, WZR,
                    !cast<Instruction>(instr # "LH")>;
  def : BaseSTOPregister<asm # "l",  GPR32, WZR,
                    !cast<Instruction>(instr # "LW")>;
  def : BaseSTOPregister<asm # "l",  GPR64, XZR,
                    !cast<Instruction>(instr # "LX")>;
  def : BaseSTOPregister<asm # "b",  GPR32, WZR,
                    !cast<Instruction>(instr # "B")>;
  def : BaseSTOPregister<asm # "h",  GPR32, WZR,
                    !cast<Instruction>(instr # "H")>;
  def : BaseSTOPregister<asm,        GPR32, WZR,
                    !cast<Instruction>(instr # "W")>;
  def : BaseSTOPregister<asm,        GPR64, XZR,
                    !cast<Instruction>(instr # "X")>;
}
// Allow the size specifier tokens to be upper case, not just lower.
// Each entry pairs an upper-case spelling with its lower-case equivalent.
def : TokenAlias<".4B", ".4b">; // Add dot product
def : TokenAlias<".8B", ".8b">;
def : TokenAlias<".4H", ".4h">;
def : TokenAlias<".2S", ".2s">;
def : TokenAlias<".1D", ".1d">;
def : TokenAlias<".16B", ".16b">;
def : TokenAlias<".8H", ".8h">;
def : TokenAlias<".4S", ".4s">;
def : TokenAlias<".2D", ".2d">;
def : TokenAlias<".1Q", ".1q">;
def : TokenAlias<".2H", ".2h">;
def : TokenAlias<".B", ".b">;
def : TokenAlias<".H", ".h">;
def : TokenAlias<".S", ".s">;
def : TokenAlias<".D", ".d">;
def : TokenAlias<".Q", ".q">;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 5139ae5ccaf1..08f80c9aa361 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1,6909 +1,6938 @@
//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file contains the AArch64 implementation of the TargetInstrInfo class.
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>
using namespace llvm;
#include ""
// Hidden debug options that restrict the displacement width assumed for each
// class of conditional branch. The defaults are 14 bits for TB[N]Z and 19 bits
// for CB[N]Z and Bcc.
static cl::opt<unsigned> TBZDisplacementBits(
"aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
static cl::opt<unsigned> CBZDisplacementBits(
"aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
static cl::opt<unsigned>
BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
cl::desc("Restrict range of Bcc instructions (DEBUG)"));
/// Construct the instruction info for subtarget \p STI. The generated base
/// class is told which opcodes mark call-frame setup/destroy and catch-return;
/// the register info is initialised from the subtarget's target triple.
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                          AArch64::CATCHRET),
      RI(STI.getTargetTriple()), Subtarget(STI) {}
/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be.  This returns the maximum number of bytes.
///
/// Inline assembly is measured from its assembly string, meta-instructions
/// contribute zero bytes, and a handful of pseudo-instructions have fixed or
/// operand-derived sizes; everything else is a single 4-byte instruction.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  {
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
  }

  // Meta-instructions emit no code.
  if (MI.isMetaInstruction())
    return 0;

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  //        before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();
  switch (Desc.getOpcode()) {
  default:
    // Anything not explicitly designated otherwise is a normal 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its shadow
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case AArch64::TLSDESC_CALLSEQ:
    // This gets lowered to an instruction sequence which takes 16 bytes
    NumBytes = 16;
    break;
  case AArch64::SpeculationBarrierISBDSBEndBB:
    // This gets lowered to 2 4-byte instructions.
    NumBytes = 8;
    break;
  case AArch64::SpeculationBarrierSBEndBB:
    // This gets lowered to 1 4-byte instructions.
    NumBytes = 4;
    break;
  case AArch64::JumpTableDest32:
  case AArch64::JumpTableDest16:
  case AArch64::JumpTableDest8:
    NumBytes = 12;
    break;
  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  case TargetOpcode::BUNDLE:
    NumBytes = getInstBundleLength(MI);
    break;
  }

  return NumBytes;
}
/// Return the total size in bytes of the instructions bundled under \p MI,
/// i.e. the sum of getInstSizeInBytes over every following instruction that
/// reports isInsideBundle(). Nested bundles are not expected.
unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  // Start from the instruction after the bundle header itself.
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }
  return Size;
}
/// Decompose the conditional branch \p LastInst into its destination block
/// (\p Target) and a condition descriptor appended to \p Cond.
///
/// Encoding of \p Cond, as consumed by instantiateCondBranch/insertSelect:
///   Bcc:      { condition-code }
///   CB[N]Z:   { -1, opcode, register }
///   TB[N]Z:   { -1, opcode, register, bit-number }
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    // A leading -1 marks a folded compare-and-branch.
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
    break;
  }
}
/// Return the number of displacement bits available to branch opcode \p Opc.
/// Unconditional B is reported as 64; the conditional forms use the
/// command-line-tunable widths declared at the top of this file.
static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return 64;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  }
}
/// Return true if byte offset \p BrOffset fits in the signed displacement
/// field of branch opcode \p BranchOp. The offset is divided by 4 before the
/// range check.
bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}
/// Return the destination block of direct branch \p MI. The block operand
/// index depends on the form: 0 for B, 1 for CB[N]Z and Bcc, 2 for TB[N]Z.
MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  }
}
// Branch analysis.
/// Analyse the terminators of \p MBB. Returns false when the block's control
/// flow was understood (filling \p TBB, \p FBB and \p Cond as appropriate) and
/// true when it cannot be handled. With \p AllowModify set, redundant
/// trailing unconditional branches are erased along the way.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now the only terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      } else {
        SecondLastInst = &*I;
        SecondLastOpc = SecondLastInst->getOpcode();
      }
    }
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it.  The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}
/// Invert the branch condition described by \p Cond in place. A regular Bcc
/// has its condition code inverted; a folded compare-and-branch has its
/// opcode swapped for the opposite-sense opcode. Always returns false.
bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;
    }
  }

  return false;
}
/// Remove the branch instructions at the end of \p MBB and return how many
/// were removed (0, 1 or 2). When \p BytesRemoved is non-null it receives the
/// number of code bytes removed (4 per branch) on the paths that remove
/// something.
unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  // Only a conditional branch may precede the one just removed.
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}
/// Append to \p MBB the conditional branch to \p TBB described by \p Cond.
/// A first element other than -1 denotes a regular Bcc; otherwise Cond[1] is
/// the compare-and-branch opcode, Cond[2] its register operand and, when
/// present, Cond[3] an additional immediate operand.
void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch
    // Note that we use addOperand instead of addReg to keep the flags.
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
    if (Cond.size() > 3)
      MIB.addImm(Cond[3].getImm());
    MIB.addMBB(TBB);
  }
}
/// Append branch code to \p MBB. With no \p FBB a single branch to \p TBB is
/// emitted (unconditional when \p Cond is empty, conditional otherwise);
/// with an \p FBB a conditional branch to \p TBB is followed by an
/// unconditional branch to \p FBB. Returns the number of instructions
/// inserted and, if \p BytesAdded is non-null, stores their size in bytes.
unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}
// Find the original register that VReg is copied from.
// Follows the chain of full copies defining VReg until reaching a register
// that is either not virtual or not defined by a full copy.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (Register::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}
// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
unsigned *NewVReg = nullptr) {
VReg = removeCopies(MRI, VReg);
if (!Register::isVirtualRegister(VReg))
return 0;
bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
const MachineInstr *DefMI = MRI.getVRegDef(VReg);
unsigned Opc = 0;
unsigned SrcOpNum = 0;
switch (DefMI->getOpcode()) {
case AArch64::ADDSXri:
case AArch64::ADDSWri:
// if NZCV is used, do not fold.
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
return 0;
// fall-through to ADDXri and ADDWri.
case AArch64::ADDXri:
case AArch64::ADDWri:
// add x, 1 -> csinc.
if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
DefMI->getOperand(3).getImm() != 0)
return 0;
SrcOpNum = 1;
Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
case AArch64::ORNXrr:
case AArch64::ORNWrr: {
// not x -> csinv, represented as orn dst, xzr, src.
unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
return 0;
SrcOpNum = 2;
Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
case AArch64::SUBSXrr:
case AArch64::SUBSWrr:
// if NZCV is used, do not fold.
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
return 0;
// fall-through to SUBXrr and SUBWrr.
case AArch64::SUBXrr:
case AArch64::SUBWrr: {
// neg x -> csneg, represented as sub dst, xzr, src.
unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
return 0;
SrcOpNum = 2;
Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
return 0;
assert(Opc && SrcOpNum && "Missing parameters");
if (NewVReg)
*NewVReg = DefMI->getOperand(SrcOpNum).getReg();
return Opc;
/// Return true if a select of \p TrueReg / \p FalseReg into \p DstReg under
/// \p Cond can be emitted, filling in the estimated latencies. GPR classes
/// and scalar FPR32/FPR64 classes are accepted; anything else is rejected.
bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       Register DstReg, Register TrueReg,
                                       Register FalseReg, int &CondCycles,
                                       int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Also need to check the dest regclass, in case we're trying to optimize
  // something like:
  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    // An operand that folds into the select costs nothing extra.
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}
/// Insert before \p I a select that writes \p TrueReg to \p DstReg when
/// \p Cond holds and \p FalseReg otherwise. For compare-and-branch style
/// conditions a flag-setting compare/test is emitted first. For GPR
/// destinations, a simple defining instruction of either input may be folded
/// into a csinc/csinv/csneg.
void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual register into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}
/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
/// The immediate (operand 1 of \p MI) is first truncated to its low
/// \p BitSize bits and then checked for encodability as a logical immediate.
static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
  uint64_t Imm = MI.getOperand(1).getImm();
  // Shift up then down to clear everything above bit BitSize-1.
  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
  uint64_t Encoding;
  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
}
// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
/// Return true if \p MI is considered as cheap as a register move. Subtargets
/// without custom handling defer to the generic MachineInstr query; otherwise
/// feature-gated, subtarget-specific and generic opcode checks are applied in
/// that order.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (!Subtarget.hasCustomCheapAsMoveHandling())
    return MI.isAsCheapAsAMove();

  const unsigned Opcode = MI.getOpcode();

  // Firstly, check cases gated by features.

  if (Subtarget.hasZeroCycleZeroingFP()) {
    if (Opcode == AArch64::FMOVH0 ||
        Opcode == AArch64::FMOVS0 ||
        Opcode == AArch64::FMOVD0)
      return true;
  }

  if (Subtarget.hasZeroCycleZeroingGP()) {
    if (Opcode == TargetOpcode::COPY &&
        (MI.getOperand(1).getReg() == AArch64::WZR ||
         MI.getOperand(1).getReg() == AArch64::XZR))
      return true;
  }

  // Secondly, check cases specific to sub-targets.

  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;

    return MI.isAsCheapAsAMove();
  }

  // Finally, check generic cases.

  switch (Opcode) {
  default:
    return false;

  // add/sub on register without shift
  case AArch64::ADDWri:
  case AArch64::ADDXri:
  case AArch64::SUBWri:
  case AArch64::SUBXri:
    return (MI.getOperand(3).getImm() == 0);

  // logical ops on immediate
  case AArch64::ANDWri:
  case AArch64::ANDXri:
  case AArch64::EORWri:
  case AArch64::EORXri:
  case AArch64::ORRWri:
  case AArch64::ORRXri:
    return true;

  // logical ops on register without shift
  case AArch64::ANDWrr:
  case AArch64::ANDXrr:
  case AArch64::BICWrr:
  case AArch64::BICXrr:
  case AArch64::EONWrr:
  case AArch64::EONXrr:
  case AArch64::EORWrr:
  case AArch64::EORXrr:
  case AArch64::ORNWrr:
  case AArch64::ORNXrr:
  case AArch64::ORRWrr:
  case AArch64::ORRXrr:
    return true;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV
  case AArch64::MOVi32imm:
    return canBeExpandedToORR(MI, 32);
  case AArch64::MOVi64imm:
    return canBeExpandedToORR(MI, 64);
  }

  llvm_unreachable("Unknown opcode to check as cheap as a move!");
}
/// Return true if the shift/extend applied by \p MI is one of the forms this
/// function classifies as fast; the name indicates the classification is for
/// Falkor. Opcodes not listed return false. For each listed group the
/// decision is made from the shift/extend immediate in operand 3.
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    // Register-offset memory ops: fast only when operand 3 (the signed-extend
    // flag) is zero.
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}
/// Return true if \p MI is one of the SEH_* pseudo-instructions listed below.
bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
    return true;
  }
}
/// Return true if \p MI is a 32 -> 64 bit sign or zero extension, reporting
/// its source and destination registers and the sub-register index
/// (sub_32) through the output parameters.
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}
/// Return true if the memory accesses of \p MIa and \p MIb provably do not
/// overlap: same base operand, same scalable-ness of the offset, and the
/// lower access ends at or before the higher one begins. Returns false
/// whenever this cannot be established.
bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  unsigned WidthA = 0, WidthB = 0;
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base, offset from the base and width. Width
  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8).  If
  // base are identical, and the offset of a lower memory access +
  // the width doesn't overlap the offset of a higher memory access,
  // then the memory accesses are different.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      // Keep the arithmetic in 64 bits so large offsets are not truncated.
      int64_t LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int64_t HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      int64_t LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowOffset + LowWidth <= HighOffset)
        return true;
    }
  }
  return false;
}
/// Return true if \p MI must not be scheduled across. In addition to the
/// target-independent boundaries this covers CSDB hints, DSB, ISB and the SEH
/// pseudo-instructions.
bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;
  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    // Other hints are not barriers by themselves.
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  default:;
  }
  return isSEHInstruction(MI);
}
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
///
/// For the immediate forms CmpValue is reduced to 0 or 1 (immediate is zero /
/// non-zero) rather than carrying the actual immediate.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int &CmpMask,
                                      int &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    // FIXME: In order to convert CmpValue to 0 or 1
    CmpValue = MI.getOperand(2).getImm() != 0;
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the others xxxS
    // instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    // FIXME:The return val type of decodeLogicalImmediate is uint64_t,
    // while the type of CmpValue is int. When converting uint64_t to int,
    // the high 32 bits of uint64_t will be lost.
    // In fact it causes a bug in spec2006-483.xalancbmk
    // CmpValue is only used to compare with zero in OptimizeCompareInstr
    CmpValue = AArch64_AM::decodeLogicalImmediate(
                   MI.getOperand(2).getImm(),
                   MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
    return true;
  }

  return false;
}
/// Make every register operand of \p Instr satisfy the register-class
/// constraint of its operand slot. Virtual registers are constrained in
/// place; physical registers are only checked. Returns false as soon as an
/// operand cannot satisfy its constraint, true otherwise.
static bool UpdateOperandRegClass(MachineInstr &Instr) {
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Register::isPhysicalRegister(Reg)) {
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}
/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
///
/// The immediate and shifted-register forms are left unchanged when \p MI
/// defines WZR or XZR (see the comment at the top of the body).
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}
/// Bit flags selecting which kind of NZCV access
/// areCFlagsAccessedBetweenInstrs looks for. AK_All is the union of the
/// write bit and the read bit.
enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
/// Note: If From and To are from different blocks it's assumed CC are accessed
/// on the path.
///
/// \param From          earlier instruction; asserted to precede \p To.
/// \param To            later instruction; the scan starts just above it.
/// \param TRI           register info passed to the NZCV read/modify queries.
/// \param AccessToCheck which accesses count: writes, reads, or both
///                      (defaults to AK_All).
/// The function also answers true, conservatively, when \p To is the first
/// instruction of its block.
// NOTE(review): closing braces are not visible in this copy.
static bool areCFlagsAccessedBetweenInstrs(
MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
// Early exit if To is at the beginning of the BB.
if (To == To->getParent()->begin())
return true;
// Check whether the instructions are in the same basic block
// If not, assume the condition flags might get modified somewhere.
if (To->getParent() != From->getParent())
return true;
// From must be above To.
assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
[From](MachineInstr &MI) {
return MI.getIterator() == From;
}) != To->getParent()->rend());
// We iterate backward starting at \p To until we hit \p From.
for (const MachineInstr &Instr :
instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
// A hit is any instruction performing one of the requested access kinds.
if (((AccessToCheck & AK_Write) &&
Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
return true;
return false;
/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It can be truly compare
/// instruction
/// when there are no uses of its destination register.
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
/// condition code or an instruction which can be converted into such an
/// instruction.
/// Only comparison with zero is supported.
///
/// \p CmpValue is asserted to be 0 or 1. \p CmpMask is not read in the
/// visible part of the body.
// NOTE(review): several lines of this function are not visible in this copy:
// the statement(s) that apply MCID to CmpInstr after it is looked up, and the
// closing braces. Confirm against the upstream file.
bool AArch64InstrInfo::optimizeCompareInstr(
MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
int CmpValue, const MachineRegisterInfo *MRI) const {
// Replace SUBSWrr with SUBWrr if NZCV is not used.
// The `true` argument asks for an NZCV def operand that is marked dead.
int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
if (DeadNZCVIdx != -1) {
if (CmpInstr.definesRegister(AArch64::WZR) ||
CmpInstr.definesRegister(AArch64::XZR)) {
return true;
unsigned Opc = CmpInstr.getOpcode();
unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
// No non-flag-setting equivalent was chosen: nothing to rewrite.
if (NewOpc == Opc)
return false;
const MCInstrDesc &MCID = get(NewOpc);
bool succeeded = UpdateOperandRegClass(CmpInstr);
assert(succeeded && "Some operands reg class are incompatible!");
return true;
// Continue only if we have a "ri" where immediate is zero.
// FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare
// function.
assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
if (CmpValue != 0 || SrcReg2 != 0)
return false;
// CmpInstr is a Compare instruction if destination register is not used.
if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
return false;
return substituteCmpToZero(CmpInstr, SrcReg, MRI);
/// Get opcode of S version of Instr.
/// If Instr is S version its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
/// or we are not interested in it.
///
/// Covered here: ADD/SUB (rr, ri), ADC/SBC and AND (ri), in W and X widths.
// NOTE(review): the `default:` branch that would return
// INSTRUCTION_LIST_END and the closing braces are not visible in this copy.
static unsigned sForm(MachineInstr &Instr) {
switch (Instr.getOpcode()) {
// Already flag-setting: return the opcode unchanged.
case AArch64::ADDSWrr:
case AArch64::ADDSWri:
case AArch64::ADDSXrr:
case AArch64::ADDSXri:
case AArch64::SUBSWrr:
case AArch64::SUBSWri:
case AArch64::SUBSXrr:
case AArch64::SUBSXri:
return Instr.getOpcode();
// Non-flag-setting opcodes mapped to their S counterparts.
case AArch64::ADDWrr:
return AArch64::ADDSWrr;
case AArch64::ADDWri:
return AArch64::ADDSWri;
case AArch64::ADDXrr:
return AArch64::ADDSXrr;
case AArch64::ADDXri:
return AArch64::ADDSXri;
case AArch64::ADCWr:
return AArch64::ADCSWr;
case AArch64::ADCXr:
return AArch64::ADCSXr;
case AArch64::SUBWrr:
return AArch64::SUBSWrr;
case AArch64::SUBWri:
return AArch64::SUBSWri;
case AArch64::SUBXrr:
return AArch64::SUBSXrr;
case AArch64::SUBXri:
return AArch64::SUBSXri;
case AArch64::SBCWr:
return AArch64::SBCSWr;
case AArch64::SBCXr:
return AArch64::SBCSXr;
case AArch64::ANDWri:
return AArch64::ANDSWri;
case AArch64::ANDXri:
return AArch64::ANDSXri;
/// Check if AArch64::NZCV should be alive in successors of MBB.
/// \returns true as soon as one successor block lists NZCV as a live-in,
/// false if none does.
// NOTE(review): the closing brace is not visible in this copy.
static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
for (auto *BB : MBB->successors())
if (BB->isLiveIn(AArch64::NZCV))
return true;
return false;
namespace {
/// One boolean per condition flag (N, Z, C, V), all initially false.
/// Filled in by getUsedNZCV and merged with operator|=.
// NOTE(review): the closing braces of operator|= and of the struct are not
// visible in this copy.
struct UsedNZCV {
bool N = false;
bool Z = false;
bool C = false;
bool V = false;
UsedNZCV() = default;
/// Flag-wise OR of \p UsedFlags into this object; returns *this.
UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
this->N |= UsedFlags.N;
this->Z |= UsedFlags.Z;
this->C |= UsedFlags.C;
this->V |= UsedFlags.V;
return *this;
} // end anonymous namespace
/// Find a condition code used by the instruction.
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
///
/// The condition code is read as an immediate operand located relative to the
/// instruction's NZCV use operand: two positions before it for Bcc, one
/// position before it for the conditional-select opcodes listed below.
// NOTE(review): the `default:` label in front of the first `return` and the
// closing braces are not visible in this copy.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
switch (Instr.getOpcode()) {
return AArch64CC::Invalid;
case AArch64::Bcc: {
int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
// Guards the Idx - 2 operand access below.
assert(Idx >= 2);
return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
case AArch64::CSINVWr:
case AArch64::CSINVXr:
case AArch64::CSINCWr:
case AArch64::CSINCXr:
case AArch64::CSELWr:
case AArch64::CSELXr:
case AArch64::CSNEGWr:
case AArch64::CSNEGXr:
case AArch64::FCSELSrrr:
case AArch64::FCSELDrrr: {
int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
// Guards the Idx - 1 operand access below.
assert(Idx >= 1);
return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
/// Translate a condition code into the set of NZCV flags it depends on.
/// \p CC must not be AArch64CC::Invalid (asserted).
// NOTE(review): no `break;`, `LLVM_FALLTHROUGH` or `default:` lines are
// visible between the case groups in this copy, so it cannot be told from
// here which groups intentionally fall through into the next one (the
// per-case comments suggest HI/LS -> HS/LO and GT/LE -> GE/LT do) - confirm
// against the upstream file. The closing braces are also not visible.
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
assert(CC != AArch64CC::Invalid);
UsedNZCV UsedFlags;
switch (CC) {
case AArch64CC::EQ: // Z set
case AArch64CC::NE: // Z clear
UsedFlags.Z = true;
case AArch64CC::HI: // Z clear and C set
case AArch64CC::LS: // Z set or C clear
UsedFlags.Z = true;
case AArch64CC::HS: // C set
case AArch64CC::LO: // C clear
UsedFlags.C = true;
case AArch64CC::MI: // N set
case AArch64CC::PL: // N clear
UsedFlags.N = true;
case AArch64CC::VS: // V set
case AArch64CC::VC: // V clear
UsedFlags.V = true;
case AArch64CC::GT: // Z clear, N and V the same
case AArch64CC::LE: // Z set, N and V differ
UsedFlags.Z = true;
case AArch64CC::GE: // N and V the same
case AArch64CC::LT: // N and V differ
UsedFlags.N = true;
UsedFlags.V = true;
return UsedFlags;
/// True if \p Opcode is the 32-bit or 64-bit ADDS register-immediate opcode
/// (ADDSWri / ADDSXri).
// NOTE(review): the closing brace is not visible in this copy.
static bool isADDSRegImm(unsigned Opcode) {
return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
/// True if \p Opcode is the 32-bit or 64-bit SUBS register-immediate opcode
/// (SUBSWri / SUBSXri).
// NOTE(review): the closing brace is not visible in this copy.
static bool isSUBSRegImm(unsigned Opcode) {
return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
/// Check if CmpInstr can be substituted by MI.
/// CmpInstr can be substituted:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are from the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI opcode is the S form there must be no defs of flags between
/// MI and CmpInstr
/// or if MI opcode is not the S form there must be neither defs of flags
/// nor uses of flags between MI and CmpInstr.
/// - and C/V flags are not used after CmpInstr
///
/// \p MI must have an S form (asserted via sForm).
// NOTE(review): the range expression of the `for` loop below is truncated in
// this copy (only its tail, ending at the block's instr_end(), is visible),
// and the statement after the final `modifiesRegister` check and the closing
// braces are missing. Confirm against the upstream file.
static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
const TargetRegisterInfo *TRI) {
assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
const unsigned CmpOpcode = CmpInstr->getOpcode();
if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
return false;
if (MI->getParent() != CmpInstr->getParent())
return false;
if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
return false;
// When MI already is the S form only intervening flag writes matter; when it
// still has to be converted, intervening reads matter as well.
AccessKind AccessToCheck = AK_Write;
if (sForm(*MI) != MI->getOpcode())
AccessToCheck = AK_All;
if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
return false;
// Accumulate which flags are read by the instructions visited by the loop.
UsedNZCV NZCVUsedAfterCmp;
for (const MachineInstr &Instr :
CmpInstr->getParent()->instr_end())) {
if (Instr.readsRegister(AArch64::NZCV, TRI)) {
AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
return false;
NZCVUsedAfterCmp |= getUsedNZCV(CC);
if (Instr.modifiesRegister(AArch64::NZCV, TRI))
// Substitution is allowed only if neither C nor V was found to be used.
return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
/// Substitute an instruction comparing to zero with another instruction
/// which produces needed condition flags.
/// Return true on success.
///
/// Fails (returns false) when \p SrcReg has no unique defining instruction,
/// when that instruction has no S form, or when canInstrSubstituteCmpInstr
/// rejects the pair.
// NOTE(review): NewOpc is computed but the statements that install it on MI
// and that remove CmpInstr are not visible in this copy; the closing brace is
// missing too. Confirm against the upstream file.
bool AArch64InstrInfo::substituteCmpToZero(
MachineInstr &CmpInstr, unsigned SrcReg,
const MachineRegisterInfo *MRI) const {
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
if (!MI)
return false;
const TargetRegisterInfo *TRI = &getRegisterInfo();
unsigned NewOpc = sForm(*MI);
if (NewOpc == AArch64::INSTRUCTION_LIST_END)
return false;
if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
return false;
// Update the instruction to set NZCV.
bool succeeded = UpdateOperandRegClass(*MI);
assert(succeeded && "Some operands reg class are incompatible!");
// Record on MI that it now defines NZCV.
MI->addRegisterDefined(AArch64::NZCV, TRI);
return true;
/// Expand the LOAD_STACK_GUARD and CATCHRET pseudo instructions.
/// Any other opcode is left untouched and false is returned; true is returned
/// after an expansion.
///
/// For LOAD_STACK_GUARD the emitted sequence depends on how the referenced
/// global is classified (GOT access) and on the code model (Large, Tiny, or
/// the remaining ADRP + load case), with separate ILP32 variants that load
/// through the 32-bit sub-register.
// NOTE(review): this copy is missing many lines of the function - initializer
// expressions (TII, GV), trailing operands of several BuildMI chains, the
// statements that erase MI, and the closing braces. The comments below only
// describe what is visible; confirm against the upstream file.
bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
MI.getOpcode() != AArch64::CATCHRET)
return false;
MachineBasicBlock &MBB = *MI.getParent();
auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
auto TRI = Subtarget.getRegisterInfo();
DebugLoc DL = MI.getDebugLoc();
if (MI.getOpcode() == AArch64::CATCHRET) {
// Skip to the first instruction before the epilog.
const TargetInstrInfo *TII =
MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
auto MBBI = MachineBasicBlock::iterator(MI);
// Walk backwards over instructions flagged FrameDestroy, stopping at the
// start of the block.
MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
FirstEpilogSEH != MBB.begin())
FirstEpilogSEH = std::prev(FirstEpilogSEH);
if (FirstEpilogSEH != MBB.begin())
FirstEpilogSEH = std::next(FirstEpilogSEH);
// Insert an ADRP and an ADDXri, both defining X0, at that position.
BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
.addReg(AArch64::X0, RegState::Define)
BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
.addReg(AArch64::X0, RegState::Define)
return true;
// LOAD_STACK_GUARD: operand 0 is the destination register.
Register Reg = MI.getOperand(0).getReg();
const GlobalValue *GV =
const TargetMachine &TM = MBB.getParent()->getTarget();
unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
const unsigned char MO_NC = AArch64II::MO_NC;
// Global is reached through the GOT: LOADgot, then a load through Reg.
if ((OpFlags & AArch64II::MO_GOT) != 0) {
BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
.addGlobalAddress(GV, 0, OpFlags);
if (Subtarget.isTargetILP32()) {
unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
.addDef(Reg32, RegState::Dead)
.addUse(Reg, RegState::Kill)
.addDef(Reg, RegState::Implicit);
} else {
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
// Large code model: build the address with MOVZ + three MOVK (G0..G3), then
// load through Reg.
} else if (TM.getCodeModel() == CodeModel::Large) {
assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
.addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, AArch64II::MO_G3)
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
// Tiny code model: a single ADR of the global.
} else if (TM.getCodeModel() == CodeModel::Tiny) {
BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
.addGlobalAddress(GV, 0, OpFlags);
// Remaining case: ADRP of the page, then a load using the page offset.
} else {
BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
.addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
if (Subtarget.isTargetILP32()) {
unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
.addDef(Reg32, RegState::Dead)
.addUse(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, LoFlags)
.addDef(Reg, RegState::Implicit);
} else {
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
.addGlobalAddress(GV, 0, LoFlags)
return true;
// Return true if this instruction simply sets its single destination register
// to zero. This is equivalent to a register rename of the zero-register.
//
// Recognized forms: MOVZ with immediate 0, AND-immediate whose source is
// WZR/XZR, and a COPY whose source is WZR.
// NOTE(review): the `default:`/`break;` lines and closing braces are not
// visible in this copy. The COPY case compares against WZR only; whether a
// copy from XZR is meant to be excluded cannot be told from here.
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
switch (MI.getOpcode()) {
case AArch64::MOVZWi:
case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
// Operand 2 is expected to be zero as well (asserted).
assert(MI.getDesc().getNumOperands() == 3 &&
MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
return true;
case AArch64::ANDWri: // and Rd, Rzr, #imm
return MI.getOperand(1).getReg() == AArch64::WZR;
case AArch64::ANDXri:
return MI.getOperand(1).getReg() == AArch64::XZR;
case TargetOpcode::COPY:
return MI.getOperand(1).getReg() == AArch64::WZR;
return false;
// Return true if this instruction simply renames a general register without
// modifying bits.
//
// Recognized forms: a COPY whose destination is a GPR, ORRXrs with XZR as the
// first source, and ADDXri with a zero immediate.
// NOTE(review): the second half of the COPY case's return expression, the
// `default:`/`break;` lines and the closing braces are not visible in this
// copy.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
switch (MI.getOpcode()) {
case TargetOpcode::COPY: {
// GPR32 copies will be lowered to ORRXrs
Register DstReg = MI.getOperand(0).getReg();
return (AArch64::GPR32RegClass.contains(DstReg) ||
case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
if (MI.getOperand(1).getReg() == AArch64::XZR) {
// Operand 3 is expected to be zero (asserted).
assert(MI.getDesc().getNumOperands() == 4 &&
MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
return true;
case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
if (MI.getOperand(2).getImm() == 0) {
// Operand 3 is expected to be zero (asserted).
assert(MI.getDesc().getNumOperands() == 4 &&
MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
return true;
return false;
// Return true if this instruction simply renames a floating-point/vector
// register without modifying bits.
//
// Recognized forms: a COPY whose destination is an FPR, and ORRv16i8 whose
// two source operands are the same register.
// NOTE(review): the second half of the COPY case's return expression, the
// `default:`/`break;` lines and the closing braces are not visible in this
// copy.
bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
switch (MI.getOpcode()) {
case TargetOpcode::COPY: {
// FPR64 copies will be lowered to ORR.16b
Register DstReg = MI.getOperand(0).getReg();
return (AArch64::FPR64RegClass.contains(DstReg) ||
case AArch64::ORRv16i8:
if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
"invalid ORRv16i8 operands");
return true;
return false;
/// If \p MI is one of the listed scaled loads whose address is a frame index
/// with a zero immediate offset, and operand 0 carries no sub-register index,
/// store the frame index in \p FrameIndex and return the register in
/// operand 0. Otherwise return 0.
// NOTE(review): the `default:`/`break;` lines and closing braces are not
// visible in this copy.
unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
switch (MI.getOpcode()) {
case AArch64::LDRWui:
case AArch64::LDRXui:
case AArch64::LDRBui:
case AArch64::LDRHui:
case AArch64::LDRSui:
case AArch64::LDRDui:
case AArch64::LDRQui:
// Operand layout used here: 0 = register, 1 = base, 2 = immediate.
if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
FrameIndex = MI.getOperand(1).getIndex();
return MI.getOperand(0).getReg();
return 0;
/// If \p MI is one of the listed opcodes whose address is a frame index with
/// a zero immediate offset, and operand 0 carries no sub-register index,
/// store the frame index in \p FrameIndex and return the register in
/// operand 0. Otherwise return 0.
// NOTE(review): LDR_PXI is, going by its name, a load, yet it is listed in
// this store predicate and not in isLoadFromStackSlot - looks suspicious,
// confirm whether it belongs here.
// NOTE(review): the `default:`/`break;` lines and closing braces are not
// visible in this copy.
unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
switch (MI.getOpcode()) {
case AArch64::STRWui:
case AArch64::STRXui:
case AArch64::STRBui:
case AArch64::STRHui:
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
case AArch64::LDR_PXI:
case AArch64::STR_PXI:
// Operand layout used here: 0 = register, 1 = base, 2 = immediate.
if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
FrameIndex = MI.getOperand(1).getIndex();
return MI.getOperand(0).getReg();
return 0;
/// Check all MachineMemOperands for a hint to suppress pairing.
/// \returns true if any memory operand of \p MI has the MOSuppressPair flag.
// NOTE(review): the closing `});` and brace are not visible in this copy.
bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
return MMO->getFlags() & MOSuppressPair;
/// Set a flag on the first MachineMemOperand to suppress pairing.
// NOTE(review): only the empty-memoperands check is visible in this copy; the
// statement it guards, the statement that sets the flag and the closing brace
// are missing. Confirm against the upstream file.
void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
if (MI.memoperands_empty())
/// Check all MachineMemOperands for a hint that the load/store is strided.
/// \returns true if any memory operand of \p MI has the MOStridedAccess flag.
// NOTE(review): the closing `});` and brace are not visible in this copy.
bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
return MMO->getFlags() & MOStridedAccess;
/// Return true if \p Opc is one of the STUR*/LDUR* opcodes listed below,
/// false for every other opcode.
// NOTE(review): the `default:` label in front of `return false;` and the
// closing braces are not visible in this copy.
bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
switch (Opc) {
return false;
// Stores.
case AArch64::STURSi:
case AArch64::STURDi:
case AArch64::STURQi:
case AArch64::STURBBi:
case AArch64::STURHHi:
case AArch64::STURWi:
case AArch64::STURXi:
// Loads.
case AArch64::LDURSi:
case AArch64::LDURDi:
case AArch64::LDURQi:
case AArch64::LDURWi:
case AArch64::LDURXi:
case AArch64::LDURSWi:
case AArch64::LDURHHi:
case AArch64::LDURBBi:
case AArch64::LDURSBWi:
case AArch64::LDURSHWi:
return true;
/// Map a scaled-immediate load/store/prefetch opcode (the *ui forms) to its
/// LDUR/STUR/PRFUM counterpart.
/// \returns the mapped opcode, or an empty Optional when \p Opc is not in the
/// table.
// NOTE(review): the closing braces are not visible in this copy.
Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
switch (Opc) {
default: return {};
case AArch64::PRFMui: return AArch64::PRFUMi;
case AArch64::LDRXui: return AArch64::LDURXi;
case AArch64::LDRWui: return AArch64::LDURWi;
case AArch64::LDRBui: return AArch64::LDURBi;
case AArch64::LDRHui: return AArch64::LDURHi;
case AArch64::LDRSui: return AArch64::LDURSi;
case AArch64::LDRDui: return AArch64::LDURDi;
case AArch64::LDRQui: return AArch64::LDURQi;
case AArch64::LDRBBui: return AArch64::LDURBBi;
case AArch64::LDRHHui: return AArch64::LDURHHi;
case AArch64::LDRSBXui: return AArch64::LDURSBXi;
case AArch64::LDRSBWui: return AArch64::LDURSBWi;
case AArch64::LDRSHXui: return AArch64::LDURSHXi;
case AArch64::LDRSHWui: return AArch64::LDURSHWi;
case AArch64::LDRSWui: return AArch64::LDURSWi;
case AArch64::STRXui: return AArch64::STURXi;
case AArch64::STRWui: return AArch64::STURWi;
case AArch64::STRBui: return AArch64::STURBi;
case AArch64::STRHui: return AArch64::STURHi;
case AArch64::STRSui: return AArch64::STURSi;
case AArch64::STRDui: return AArch64::STURDi;
case AArch64::STRQui: return AArch64::STURQi;
case AArch64::STRBBui: return AArch64::STURBBi;
case AArch64::STRHHui: return AArch64::STURHHi;
/// Return the operand index of the immediate offset for \p Opc: 3 for the
/// first group of opcodes below (pair, LDG/STGPi and SVE *_IMM forms), 2 for
/// the second group.
// NOTE(review): the `default:` label in front of the first `return 2;` and
// the closing braces are not visible in this copy.
unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
switch (Opc) {
return 2;
case AArch64::LDPXi:
case AArch64::LDPDi:
case AArch64::STPXi:
case AArch64::STPDi:
case AArch64::LDNPXi:
case AArch64::LDNPDi:
case AArch64::STNPXi:
case AArch64::STNPDi:
case AArch64::LDPQi:
case AArch64::STPQi:
case AArch64::LDNPQi:
case AArch64::STNPQi:
case AArch64::LDPWi:
case AArch64::LDPSi:
case AArch64::STPWi:
case AArch64::STPSi:
case AArch64::LDNPWi:
case AArch64::LDNPSi:
case AArch64::STNPWi:
case AArch64::STNPSi:
case AArch64::LDG:
case AArch64::STGPi:
case AArch64::LD1B_IMM:
case AArch64::LD1H_IMM:
case AArch64::LD1W_IMM:
case AArch64::LD1D_IMM:
case AArch64::ST1B_IMM:
case AArch64::ST1H_IMM:
case AArch64::ST1W_IMM:
case AArch64::ST1D_IMM:
case AArch64::LD1B_H_IMM:
case AArch64::LD1SB_H_IMM:
case AArch64::LD1H_S_IMM:
case AArch64::LD1SH_S_IMM:
case AArch64::LD1W_D_IMM:
case AArch64::LD1SW_D_IMM:
case AArch64::ST1B_H_IMM:
case AArch64::ST1H_S_IMM:
case AArch64::ST1W_D_IMM:
case AArch64::LD1B_S_IMM:
case AArch64::LD1SB_S_IMM:
case AArch64::LD1H_D_IMM:
case AArch64::LD1SH_D_IMM:
case AArch64::ST1B_S_IMM:
case AArch64::ST1H_D_IMM:
case AArch64::LD1B_D_IMM:
case AArch64::LD1SB_D_IMM:
case AArch64::ST1B_D_IMM:
return 3;
case AArch64::ADDG:
case AArch64::STGOffset:
case AArch64::LDR_PXI:
case AArch64::STR_PXI:
return 2;
/// Return true if the opcode of \p MI is one of the scaled or unscaled
/// S/D/Q/W/X (and LDRSW/LDURSW) loads and stores listed below, false
/// otherwise.
// NOTE(review): the `default:` label in front of `return false;` and the
// closing braces are not visible in this copy.
bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
switch (MI.getOpcode()) {
return false;
// Scaled instructions.
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
case AArch64::STRXui:
case AArch64::STRWui:
case AArch64::LDRSui:
case AArch64::LDRDui:
case AArch64::LDRQui:
case AArch64::LDRXui:
case AArch64::LDRWui:
case AArch64::LDRSWui:
// Unscaled instructions.
case AArch64::STURSi:
case AArch64::STURDi:
case AArch64::STURQi:
case AArch64::STURWi:
case AArch64::STURXi:
case AArch64::LDURSi:
case AArch64::LDURDi:
case AArch64::LDURQi:
case AArch64::LDURWi:
case AArch64::LDURXi:
case AArch64::LDURSWi:
return true;
/// Map a non-flag-setting ADD/AND/BIC/SUB opcode to its flag-setting (S)
/// counterpart.
/// \param Opc      opcode to convert; opcodes not listed below hit
///                 llvm_unreachable.
/// \param Is64Bit  output: set to false for the W (32-bit) opcodes and true
///                 for the X (64-bit) opcodes.
/// \returns the flag-setting opcode.
// NOTE(review): the `default:` label in front of llvm_unreachable and the
// closing braces are not visible in this copy.
unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
bool &Is64Bit) {
switch (Opc) {
llvm_unreachable("Opcode has no flag setting equivalent!");
// 32-bit cases:
case AArch64::ADDWri:
Is64Bit = false;
return AArch64::ADDSWri;
case AArch64::ADDWrr:
Is64Bit = false;
return AArch64::ADDSWrr;
case AArch64::ADDWrs:
Is64Bit = false;
return AArch64::ADDSWrs;
case AArch64::ADDWrx:
Is64Bit = false;
return AArch64::ADDSWrx;
case AArch64::ANDWri:
Is64Bit = false;
return AArch64::ANDSWri;
case AArch64::ANDWrr:
Is64Bit = false;
return AArch64::ANDSWrr;
case AArch64::ANDWrs:
Is64Bit = false;
return AArch64::ANDSWrs;
case AArch64::BICWrr:
Is64Bit = false;
return AArch64::BICSWrr;
case AArch64::BICWrs:
Is64Bit = false;
return AArch64::BICSWrs;
case AArch64::SUBWri:
Is64Bit = false;
return AArch64::SUBSWri;
case AArch64::SUBWrr:
Is64Bit = false;
return AArch64::SUBSWrr;
case AArch64::SUBWrs:
Is64Bit = false;
return AArch64::SUBSWrs;
case AArch64::SUBWrx:
Is64Bit = false;
return AArch64::SUBSWrx;
// 64-bit cases:
case AArch64::ADDXri:
Is64Bit = true;
return AArch64::ADDSXri;
case AArch64::ADDXrr:
Is64Bit = true;
return AArch64::ADDSXrr;
case AArch64::ADDXrs:
Is64Bit = true;
return AArch64::ADDSXrs;
case AArch64::ADDXrx:
Is64Bit = true;
return AArch64::ADDSXrx;
case AArch64::ANDXri:
Is64Bit = true;
return AArch64::ANDSXri;
case AArch64::ANDXrr:
Is64Bit = true;
return AArch64::ANDSXrr;
case AArch64::ANDXrs:
Is64Bit = true;
return AArch64::ANDSXrs;
case AArch64::BICXrr:
Is64Bit = true;
return AArch64::BICSXrr;
case AArch64::BICXrs:
Is64Bit = true;
return AArch64::BICSXrs;
case AArch64::SUBXri:
Is64Bit = true;
return AArch64::SUBSXri;
case AArch64::SUBXrr:
Is64Bit = true;
return AArch64::SUBSXrr;
case AArch64::SUBXrs:
Is64Bit = true;
return AArch64::SUBSXrs;
case AArch64::SUBXrx:
Is64Bit = true;
return AArch64::SUBSXrx;
// Is this a candidate for ld/st merging or pairing? For example, we don't
// touch volatiles or load/stores that have a hint to avoid pair formation.
//
// Operand 1 of MI is expected to be the base (register or frame index) and
// operand 2 the offset; the function returns false unless operand 2 is an
// immediate.
// NOTE(review): the right-hand sides of the NeedsWinCFI initializer and of
// the FrameSetup check are truncated in this copy, and the closing braces are
// missing. Confirm against the upstream file.
bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
// If this is a volatile load/store, don't mess with it.
if (MI.hasOrderedMemoryRef())
return false;
// Make sure this is a reg/fi+imm (as opposed to an address reloc).
assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
"Expected a reg or frame index operand.");
if (!MI.getOperand(2).isImm())
return false;
// Can't merge/pair if the instruction modifies the base register.
// e.g., ldr x0, [x0]
// This case will never occur with an FI base.
if (MI.getOperand(1).isReg()) {
Register BaseReg = MI.getOperand(1).getReg();
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (MI.modifiesRegister(BaseReg, TRI))
return false;
// Check if this load/store has a hint to avoid pair formation.
// MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
if (isLdStPairSuppressed(MI))
return false;
// Do not pair any callee-save store/reload instructions in the
// prologue/epilogue if the CFI information encoded the operations as separate
// instructions, as that will cause the size of the actual prologue to mismatch
// with the prologue size recorded in the Windows CFI.
const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
bool NeedsWinCFI = MAI->usesWindowsCFI() &&
if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
return false;
// On some CPUs quad load/store pairs are slower than two single load/stores.
if (Subtarget.isPaired128Slow()) {
switch (MI.getOpcode()) {
case AArch64::LDURQi:
case AArch64::STURQi:
case AArch64::LDRQui:
case AArch64::STRQui:
return false;
return true;
/// Multi-operand wrapper around getMemOperandWithOffsetWidth.
/// Returns false when \p LdSt neither loads nor stores, or when the
/// single-operand query fails; otherwise \p Offset, \p OffsetIsScalable and
/// \p Width hold the values that query produced and true is returned.
// NOTE(review): the statement that appends BaseOp to \p BaseOps and the
// closing brace are not visible in this copy; as printed, BaseOps is never
// written. Confirm against the upstream file.
bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
const TargetRegisterInfo *TRI) const {
if (!LdSt.mayLoadOrStore())
return false;
const MachineOperand *BaseOp;
if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
Width, TRI))
return false;
return true;
/// Decompose a base + immediate load/store.
/// \param LdSt             memory instruction (asserted to load or store).
/// \param BaseOp           output: the base operand (register or frame index).
/// \param Offset           output: immediate operand multiplied by the known
///                         minimum size of the opcode's scale.
/// \param OffsetIsScalable output: whether that scale is scalable.
/// \param Width            output: width reported by getMemOpInfo.
/// \param TRI              not read in the visible part of the body.
/// \returns false for instructions without 3 or 4 explicit operands, for
/// opcodes getMemOpInfo rejects, or when the base is neither a register nor a
/// frame index.
// NOTE(review): the last clause of both operand-shape conditions and the
// closing braces are not visible in this copy.
bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
bool &OffsetIsScalable, unsigned &Width,
const TargetRegisterInfo *TRI) const {
assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
// Handle only loads/stores with base register followed by immediate offset.
if (LdSt.getNumExplicitOperands() == 3) {
// Non-paired instruction (e.g., ldr x1, [x0, #8]).
if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
return false;
} else if (LdSt.getNumExplicitOperands() == 4) {
// Paired instruction (e.g., ldp x1, x2, [x0, #8]).
if (!LdSt.getOperand(1).isReg() ||
(!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
return false;
} else
return false;
// Get the scaling factor for the instruction and set the width for the
// instruction.
TypeSize Scale(0U, false);
// The offset bounds reported by getMemOpInfo are not needed here.
int64_t Dummy1, Dummy2;
// If this returns false, then it's an instruction we don't want to handle.
if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
return false;
// Compute the offset. Offset is calculated as the immediate operand
// multiplied by the scaling factor. Unscaled instructions have scaling factor
// set to 1.
if (LdSt.getNumExplicitOperands() == 3) {
BaseOp = &LdSt.getOperand(1);
Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize();
} else {
assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
BaseOp = &LdSt.getOperand(2);
Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize();
OffsetIsScalable = Scale.isScalable();
if (!BaseOp->isReg() && !BaseOp->isFI())
return false;
return true;
/// Return a reference to the last explicit operand of \p LdSt, which is
/// asserted to be an immediate (the offset). \p LdSt is asserted to be a
/// memory operation.
// NOTE(review): the closing brace is not visible in this copy.
MachineOperand &
AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
return OfsOp;
bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
unsigned &Width, int64_t &MinOffset,
int64_t &MaxOffset) {
const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8;
switch (Opcode) {
// Not a memory operation or something we want to handle.
Scale = TypeSize::Fixed(0);
Width = 0;
MinOffset = MaxOffset = 0;
return false;
case AArch64::STRWpost:
case AArch64::LDRWpost:
Width = 32;
Scale = TypeSize::Fixed(4);
MinOffset = -256;
MaxOffset = 255;
case AArch64::LDURQi:
case AArch64::STURQi:
Width = 16;
Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
case AArch64::PRFUMi:
case AArch64::LDURXi:
case AArch64::LDURDi:
case AArch64::STURXi:
case AArch64::STURDi:
Width = 8;
Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
case AArch64::LDURWi:
case AArch64::LDURSi:
case AArch64::LDURSWi:
case AArch64::STURWi:
case AArch64::STURSi:
Width = 4;
Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
case AArch64::LDURHi:
case AArch64::LDURHHi:
case AArch64::LDURSHXi:
case AArch64::LDURSHWi:
case AArch64::STURHi:
case AArch64::STURHHi:
Width = 2;
Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
case AArch64::LDURBi:
case AArch64::LDURBBi:
case AArch64::LDURSBXi:
case AArch64::LDURSBWi:
case AArch64::STURBi:
case AArch64::STURBBi:
Width = 1;
Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
case AArch64::LDPQi:
case AArch64::LDNPQi:
case AArch64::STPQi:
case AArch64::STNPQi:
Scale = TypeSize::Fixed(16);
Width = 32;
MinOffset = -64;
MaxOffset = 63;
case AArch64::LDRQui:
case AArch64::STRQui:
Scale = TypeSize::Fixed(16);
Width = 16;
MinOffset = 0;
MaxOffset = 4095;
case AArch64::LDPXi:
case AArch64::LDPDi:
case AArch64::LDNPXi:
case AArch64::LDNPDi:
case AArch64::STPXi:
case AArch64::STPDi:
case AArch64::STNPXi:
case AArch64::STNPDi:
Scale = TypeSize::Fixed(8);
Width = 16;
MinOffset = -64;
MaxOffset = 63;
case AArch64::PRFMui:
case AArch64::LDRXui:
case AArch64::LDRDui:
case AArch64::STRXui:
case AArch64::STRDui:
Scale = TypeSize::Fixed(8);
Width = 8;
MinOffset = 0;
MaxOffset = 4095;
case AArch64::LDPWi:
case AArch64::LDPSi:
case AArch64::LDNPWi:
case AArch64::LDNPSi:
case AArch64::STPWi:
case AArch64::STPSi:
case AArch64::STNPWi:
case AArch64::STNPSi:
Scale = TypeSize::Fixed(4);
Width = 8;
MinOffset = -64;
MaxOffset = 63;
case AArch64::LDRWui:
case AArch64::LDRSui:
case AArch64::LDRSWui:
case AArch64::STRWui:
case AArch64::STRSui:
Scale = TypeSize::Fixed(4);
Width = 4;
MinOffset = 0;
MaxOffset = 4095;
case AArch64::LDRHui:
case AArch64::LDRHHui:
case AArch64::LDRSHWui:
case AArch64::LDRSHXui:
case AArch64::STRHui:
case AArch64::STRHHui:
Scale = TypeSize::Fixed(2);
Width = 2;
MinOffset = 0;
MaxOffset = 4095;
case AArch64::LDRBui:
case AArch64::LDRBBui:
case AArch64::LDRSBWui:
case AArch64::LDRSBXui:
case AArch64::STRBui:
case AArch64::STRBBui:
Scale = TypeSize::Fixed(1);
Width = 1;
MinOffset = 0;
MaxOffset = 4095;
case AArch64::ADDG:
Scale = TypeSize::Fixed(16);
Width = 0;
MinOffset = 0;
MaxOffset = 63;
case AArch64::TAGPstack:
Scale = TypeSize::Fixed(16);
Width = 0;
// TAGP with a negative offset turns into SUBP, which has a maximum offset
// of 63 (not 64!).
MinOffset = -63;
MaxOffset = 63;
case AArch64::LDG:
case AArch64::STGOffset:
case AArch64::STZGOffset:
Scale = TypeSize::Fixed(16);
Width = 16;
MinOffset = -256;
MaxOffset = 255;
case AArch64::STR_ZZZZXI:
case AArch64::LDR_ZZZZXI:
Scale = TypeSize::Scalable(16);
Width = SVEMaxBytesPerVector * 4;
MinOffset = -256;
MaxOffset = 252;
case AArch64::STR_ZZZXI:
case AArch64::LDR_ZZZXI:
Scale = TypeSize::Scalable(16);
Width = SVEMaxBytesPerVector * 3;
MinOffset = -256;
MaxOffset = 253;
case AArch64::STR_ZZXI:
case AArch64::LDR_ZZXI:
Scale = TypeSize::Scalable(16);
Width = SVEMaxBytesPerVector * 2;
MinOffset = -256;
MaxOffset = 254;
case AArch64::LDR_PXI:
case AArch64::STR_PXI:
Scale = TypeSize::Scalable(2);
Width = SVEMaxBytesPerVector / 8;
MinOffset = -256;
MaxOffset = 255;
case AArch64::LDR_ZXI:
case AArch64::STR_ZXI:
Scale = TypeSize::Scalable(16);
Width = SVEMaxBytesPerVector;
MinOffset = -256;
MaxOffset = 255;
case AArch64::LD1B_IMM:
case AArch64::LD1H_IMM:
case AArch64::LD1W_IMM:
case AArch64::LD1D_IMM:
case AArch64::ST1B_IMM:
case AArch64::ST1H_IMM:
case AArch64::ST1W_IMM:
case AArch64::ST1D_IMM:
// A full vectors worth of data
// Width = mbytes * elements
Scale = TypeSize::Scalable(16);
Width = SVEMaxBytesPerVector;
MinOffset = -8;
MaxOffset = 7;
case AArch64::LD1B_H_IMM:
case AArch64::LD1SB_H_IMM:
case AArch64::LD1H_S_IMM:
case AArch64::LD1SH_S_IMM:
case AArch64::LD1W_D_IMM:
case AArch64::LD1SW_D_IMM:
case AArch64::ST1B_H_IMM:
case AArch64::ST1H_S_IMM:
case AArch64::ST1W_D_IMM:
// A half vector worth of data
// Width = mbytes * elements
Scale = TypeSize::Scalable(8);
Width = SVEMaxBytesPerVector / 2;
MinOffset = -8;
MaxOffset = 7;
case AArch64::LD1B_S_IMM:
case AArch64::LD1SB_S_IMM:
case AArch64::LD1H_D_IMM:
case AArch64::LD1SH_D_IMM:
case AArch64::ST1B_S_IMM:
case AArch64::ST1H_D_IMM:
// A quarter vector worth of data
// Width = mbytes * elements
Scale = TypeSize::Scalable(4);
Width = SVEMaxBytesPerVector / 4;
MinOffset = -8;
MaxOffset = 7;
case AArch64::LD1B_D_IMM:
case AArch64::LD1SB_D_IMM:
case AArch64::ST1B_D_IMM:
// An eighth vector worth of data
// Width = mbytes * elements
Scale = TypeSize::Scalable(2);
Width = SVEMaxBytesPerVector / 8;
MinOffset = -8;
MaxOffset = 7;
case AArch64::ST2GOffset:
case AArch64::STZ2GOffset:
Scale = TypeSize::Fixed(16);
Width = 32;
MinOffset = -256;
MaxOffset = 255;
case AArch64::STGPi:
Scale = TypeSize::Fixed(16);
Width = 16;
MinOffset = -64;
MaxOffset = 63;
return true;
/// Scaling factor for unscaled load or store.
///
/// Maps the load/store opcode \p Opc to its memory scale in bytes, which is
/// 1, 2, 4, 8 or 16 depending on the case group the opcode belongs to.
/// Callers in this file divide byte offsets by the returned value.
///
/// Passing an opcode that has no entry below is a programming error and
/// reaches llvm_unreachable.
int AArch64InstrInfo::getMemScale(unsigned Opc) {
  switch (Opc) {
  default:
    // Every opcode a caller may pass must be listed explicitly below.
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::LDPQi:
  case AArch64::STPQi:
  case AArch64::STGOffset:
  case AArch64::STZGOffset:
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
  case AArch64::STGPi:
    return 16;
  }
}
/// Scale the unscaled offsets.
///
/// Rewrites \p Offset in place from a byte offset into a multiple of the
/// memory scale that AArch64InstrInfo::getMemScale reports for \p Opc.
///
/// \returns false, leaving \p Offset unchanged, if the unscaled offset can't
/// be scaled because it is not a multiple of that scale; true otherwise.
static bool scaleOffset(unsigned Opc, int64_t &Offset) {
  const int Scale = AArch64InstrInfo::getMemScale(Opc);

  // If the byte-offset isn't a multiple of the stride, we can't scale this
  // offset.
  if (Offset % Scale != 0)
    return false;

  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  Offset /= Scale;
  return true;
}
/// Decide, from the opcodes alone, whether two loads/stores may be paired.
///
/// Identical opcodes always qualify.  In addition, a 32-bit zero-extending
/// load (LDRWui / LDURWi) may be paired with a sign-extending one
/// (LDRSWui / LDURSWi) and vice versa.  Every other combination is rejected.
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
  if (FirstOpc == SecondOpc)
    return true;

  // We can also pair sign-ext and zero-ext instructions.
  switch (FirstOpc) {
  default:
    return false;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
  }

  // These instructions can't be paired based on their opcodes.
  return false;
}
/// Decide whether two frame-index based accesses should be clustered.
///
/// \p Offset1 / \p Offset2 are the (already scaled) immediate offsets of the
/// two accesses and \p Opcode1 / \p Opcode2 their opcodes.
///
/// When both \p FI1 and \p FI2 are fixed stack objects, each object offset is
/// converted to scaled units with AArch64InstrInfo::getMemScale, the
/// instruction offset is added, and the result is true only if the second
/// access lands exactly one slot after the first.  An object offset that is
/// not a multiple of its scale rejects the pair.
///
/// For every other combination the accesses are clustered only if they use
/// the same frame index.
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
                            int64_t Offset1, unsigned Opcode1, int FI2,
                            int64_t Offset2, unsigned Opcode2) {
  // Accesses through fixed stack object frame indices may access a different
  // fixed stack slot. Check that the object offsets + offsets match.
  if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
    int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
    int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
    assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");

    // Convert to scaled object offsets.
    const int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
    if (ObjectOffset1 % Scale1 != 0)
      return false;
    ObjectOffset1 /= Scale1;

    const int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
    if (ObjectOffset2 % Scale2 != 0)
      return false;
    ObjectOffset2 /= Scale2;

    ObjectOffset1 += Offset1;
    ObjectOffset2 += Offset2;
    return ObjectOffset1 + 1 == ObjectOffset2;
  }

  return FI1 == FI2;
}
/// Detect opportunities for ldp/stp formation.
/// Only called for LdSt for which getMemOperandWithOffset returns true.
///
/// \p BaseOps1 and \p BaseOps2 must each hold exactly one base operand
/// (asserted below); the two memory instructions examined are the parents of
/// those operands.  The result is true only when every check below passes and
/// the second access sits exactly one scaled slot after the first.
/// NOTE(review): \p NumBytes is not referenced in the lines visible here.
bool AArch64InstrInfo::shouldClusterMemOps(
ArrayRef<const MachineOperand *> BaseOps1,
ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
unsigned NumBytes) const {
assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
const MachineOperand &BaseOp1 = *BaseOps1.front();
const MachineOperand &BaseOp2 = *BaseOps2.front();
const MachineInstr &FirstLdSt = *BaseOp1.getParent();
const MachineInstr &SecondLdSt = *BaseOp2.getParent();
// A register base can never be clustered with a frame-index base.
if (BaseOp1.getType() != BaseOp2.getType())
return false;
assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
"Only base registers and frame indices are supported.");
// Check for both base regs and base FI.
if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
return false;
// Only cluster up to a single pair.
if (NumLoads > 2)
return false;
if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
return false;
// Can we pair these instructions based on their opcodes?
unsigned FirstOpc = FirstLdSt.getOpcode();
unsigned SecondOpc = SecondLdSt.getOpcode();
if (!canPairLdStOpc(FirstOpc, SecondOpc))
return false;
// Can't merge volatiles or load/stores that have a hint to avoid pair
// formation, for example.
// NOTE(review): the condition below ends in a dangling `||`; its second
// operand appears to be missing from this listing -- confirm against upstream.
if (!isCandidateToMergeOrPair(FirstLdSt) ||
return false;
// isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
// Unscaled opcodes carry byte offsets, so those are converted to scaled
// units first; an offset that cannot be scaled rejects the pair.
int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
return false;
int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
return false;
// Pairwise instructions have a 7-bit signed offset field.
if (Offset1 > 63 || Offset1 < -64)
return false;
// The caller should already have ordered First/SecondLdSt by offset.
// Note: except for non-equal frame index bases
if (BaseOp1.isFI()) {
assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
"Caller should have ordered offsets.");
// NOTE(review): the initializer of MFI appears to have been lost from this
// listing; as written the declaration runs straight into the return.
const MachineFrameInfo &MFI =
return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
BaseOp2.getIndex(), Offset2, SecondOpc);
assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
// Register bases: cluster only accesses to adjacent scaled slots.
return Offset1 + 1 == Offset2;
/// Append a register operand for sub-register \p SubIdx of \p Reg to \p MIB.
///
/// - With no sub-register index, \p Reg itself is added.
/// - For a physical register, the sub-register is resolved through \p TRI and
///   added as a plain register operand.
/// - Otherwise \p Reg is added together with \p SubIdx as its sub-register
///   index.
///
/// \p State is passed through unchanged as the operand's register-state
/// flags.  \returns the builder reference returned by the addReg call.
static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
                                            unsigned Reg, unsigned SubIdx,
                                            unsigned State,
                                            const TargetRegisterInfo *TRI) {
  if (!SubIdx)
    return MIB.addReg(Reg, State);

  if (Register::isPhysicalRegister(Reg))
    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);

  return MIB.addReg(Reg, State, SubIdx);
}
/// Report whether the distance from \p SrcReg to \p DestReg, taken modulo 32,
/// is smaller than \p NumRegs.
///
/// copyPhysRegTuple passes register encodings here and copies the tuple in
/// descending order when this returns true.  With \p NumRegs == 0 the result
/// is always false.
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
                                        unsigned NumRegs) {
  // We really want the positive remainder mod 32 here, that happens to be
  // easily obtainable with a mask.
  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
}
/// Copy the register tuple \p SrcReg into \p DestReg one sub-register at a
/// time, emitting one \p Opcode instruction per entry of \p Indices before
/// \p I in \p MBB.
///
/// Each emitted instruction defines the destination sub-register and reads
/// the source sub-register twice; only the second read carries the kill state
/// derived from \p KillSrc.  The sub-registers are normally copied in
/// ascending order, but when forwardCopyWillClobberTuple reports an overlap
/// between the two encodings they are copied in descending order instead.
void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator I,
                                        const DebugLoc &DL, MCRegister DestReg,
                                        MCRegister SrcReg, bool KillSrc,
                                        unsigned Opcode,
                                        ArrayRef<unsigned> Indices) const {
  assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
  unsigned NumRegs = Indices.size();

  // Default to a forward walk over the sub-registers; switch to a backward
  // walk when a forward walk would clobber part of the source tuple.
  int SubReg = 0, End = NumRegs, Incr = 1;
  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
    SubReg = NumRegs - 1;
    End = -1;
    Incr = -1;
  }

  for (; SubReg != End; SubReg += Incr) {
    const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
    AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
  }
}
// Copy a general-purpose register sequence SrcReg into DestReg by emitting
// one Opcode instruction per entry of Indices, in ascending order, before I
// in MBB.  In asserts-enabled builds both encodings are checked to be
// multiples of the tuple size.
// NOTE(review): ZeroReg is not referenced in the lines visible here and the
// #ifndef below has no visible #endif; the listing looks truncated -- confirm
// against upstream before relying on this text.
void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
DebugLoc DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc,
unsigned Opcode, unsigned ZeroReg,
llvm::ArrayRef<unsigned> Indices) const {
const TargetRegisterInfo *TRI = &getRegisterInfo();
unsigned NumRegs = Indices.size();
#ifndef NDEBUG
uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
"GPR reg sequences should not be able to overlap");
// One instruction per sub-register: define the destination part, then read
// the matching source part with the kill state derived from KillSrc.
for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
// Emit, before I in MBB, the instruction(s) that copy SrcReg into DestReg.
// The expansion is chosen by testing which register classes contain the two
// registers; KillSrc is translated into a kill flag on the source operand.
// Register pairs that match none of the cases reach llvm_unreachable.
// NOTE(review): this listing appears to have lost lines (closing braces,
// early returns and some trailing operands) -- confirm details against
// upstream before relying on the exact control flow.
void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc) const {
// Copies within GPR32sp; WZR is additionally accepted as a source.
if (AArch64::GPR32spRegClass.contains(DestReg) &&
(AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
// If either operand is WSP, expand to ADD #0.
if (Subtarget.hasZeroCycleRegMove()) {
// Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
MCRegister DestRegX = TRI->getMatchingSuperReg(
DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
MCRegister SrcRegX = TRI->getMatchingSuperReg(
SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
// This instruction is reading and writing X registers. This may upset
// the register scavenger and machine verifier, so we need to indicate
// that we are reading an undefined value from SrcRegX, but a proper
// value from SrcReg.
BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
.addReg(SrcRegX, RegState::Undef)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
} else {
BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else {
if (Subtarget.hasZeroCycleRegMove()) {
// Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
MCRegister DestRegX = TRI->getMatchingSuperReg(
DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
MCRegister SrcRegX = TRI->getMatchingSuperReg(
SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
// This instruction is reading and writing X registers. This may upset
// the register scavenger and machine verifier, so we need to indicate
// that we are reading an undefined value from SrcRegX, but a proper
// value from SrcReg.
BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
.addReg(SrcRegX, RegState::Undef)
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
} else {
// Otherwise, expand to ORR WZR.
BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
// Copy a Predicate register by ORRing with itself.
if (AArch64::PPRRegClass.contains(DestReg) &&
AArch64::PPRRegClass.contains(SrcReg)) {
assert(Subtarget.hasSVE() && "Unexpected SVE register.");
BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
.addReg(SrcReg) // Pg
.addReg(SrcReg, getKillRegState(KillSrc));
// Copy a Z register by ORRing with itself.
if (AArch64::ZPRRegClass.contains(DestReg) &&
AArch64::ZPRRegClass.contains(SrcReg)) {
assert(Subtarget.hasSVE() && "Unexpected SVE register.");
BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
+ // Copy a Z register pair by copying the individual sub-registers.
+ if (AArch64::ZPR2RegClass.contains(DestReg) &&
+ AArch64::ZPR2RegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
+ Indices);
+ return;
+ }
+ // Copy a Z register triple by copying the individual sub-registers.
+ if (AArch64::ZPR3RegClass.contains(DestReg) &&
+ AArch64::ZPR3RegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
+ AArch64::zsub2};
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
+ Indices);
+ return;
+ }
+ // Copy a Z register quad by copying the individual sub-registers.
+ if (AArch64::ZPR4RegClass.contains(DestReg) &&
+ AArch64::ZPR4RegClass.contains(SrcReg)) {
+ static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
+ AArch64::zsub2, AArch64::zsub3};
+ copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
+ Indices);
+ return;
+ }
// Copies within GPR64sp; XZR is additionally accepted as a source.
if (AArch64::GPR64spRegClass.contains(DestReg) &&
(AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
// If either operand is SP, expand to ADD #0.
BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else {
// Otherwise, expand to ORR XZR.
BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
// Copy a DDDD register quad by copying the individual sub-registers.
if (AArch64::DDDDRegClass.contains(DestReg) &&
AArch64::DDDDRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
AArch64::dsub2, AArch64::dsub3};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
// Copy a DDD register triple by copying the individual sub-registers.
if (AArch64::DDDRegClass.contains(DestReg) &&
AArch64::DDDRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
// Copy a DD register pair by copying the individual sub-registers.
if (AArch64::DDRegClass.contains(DestReg) &&
AArch64::DDRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
// Copy a QQQQ register quad by copying the individual sub-registers.
if (AArch64::QQQQRegClass.contains(DestReg) &&
AArch64::QQQQRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
AArch64::qsub2, AArch64::qsub3};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
// Copy a QQQ register triple by copying the individual sub-registers.
if (AArch64::QQQRegClass.contains(DestReg) &&
AArch64::QQQRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
// Copy a QQ register pair by copying the individual sub-registers.
if (AArch64::QQRegClass.contains(DestReg) &&
AArch64::QQRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
// Copy an X sequential register pair through copyGPRRegTuple.
if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
AArch64::XZR, Indices);
// Copy a W sequential register pair through copyGPRRegTuple.
if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
AArch64::WZR, Indices);
// FPR128 copies use ORRv16i8 when NEON is available; otherwise they are
// expanded to an STRQpre / LDRQpre pair that addresses SP.
if (AArch64::FPR128RegClass.contains(DestReg) &&
AArch64::FPR128RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
BuildMI(MBB, I, DL, get(AArch64::STRQpre))
.addReg(AArch64::SP, RegState::Define)
.addReg(SrcReg, getKillRegState(KillSrc))
BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
.addReg(AArch64::SP, RegState::Define)
.addReg(DestReg, RegState::Define)
// FPR64 / FPR32 / FPR16 / FPR8 copies: with NEON both registers are first
// replaced by a matching super-register and copied with ORRv16i8; without
// NEON an FMOV is used (FPR16 and FPR8 widen through a super-register first).
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::FPR64RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
if (Subtarget.hasNEON()) {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
// Copies between GPR64 and FPR64.
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::GPR64RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
if (AArch64::GPR64RegClass.contains(DestReg) &&
AArch64::FPR64RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
// Copies between GPR32 and FPR32.
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::GPR32RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
if (AArch64::GPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
// Copies to and from NZCV go through MSR / MRS and require the other
// register to be in GPR64 (asserted).
if (DestReg == AArch64::NZCV) {
assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
BuildMI(MBB, I, DL, get(AArch64::MSR))
.addReg(SrcReg, getKillRegState(KillSrc))
.addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
if (SrcReg == AArch64::NZCV) {
assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
.addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
llvm_unreachable("unimplemented reg-to-reg copy");
// Build, before InsertBefore in MBB, an MCID instruction whose first two
// operands are the two halves of SrcReg selected by SubIdx0 and SubIdx1.
// For a physical SrcReg the halves are resolved to concrete sub-registers via
// TRI and the sub-register indices are cleared; otherwise SrcReg is used for
// both operands with the indices attached.  Both operands carry the kill
// state derived from IsKill.
// NOTE(review): FI and MMO are not referenced in the lines visible here; the
// tail of the builder chain looks to be missing from this listing.
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const MCInstrDesc &MCID,
Register SrcReg, bool IsKill,
unsigned SubIdx0, unsigned SubIdx1, int FI,
MachineMemOperand *MMO) {
Register SrcReg0 = SrcReg;
Register SrcReg1 = SrcReg;
if (Register::isPhysicalRegister(SrcReg)) {
SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
SubIdx0 = 0;
SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
SubIdx1 = 0;
BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
.addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
.addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
// Spill SrcReg (of register class RC) to stack slot FI, inserting the store
// before MBBI in MBB.  The store opcode is picked from the spill size of RC
// and then from the register class itself; SVE classes additionally tag the
// slot with TargetStackID::SVEVector.  The sequential-pair classes are
// delegated to storeRegPairToStackSlot.
// NOTE(review): no break/return statements are visible between the case
// groups and the final builder chain is cut short in this listing -- confirm
// the exact control flow against upstream.
void AArch64InstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FI, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
// Memory operand describing a store covering the whole stack object.
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
// Opc stays 0 until a matching class is found (asserted after the switch).
// Offset is cleared for the ST1* opcodes selected below.
unsigned Opc = 0;
bool Offset = true;
unsigned StackID = TargetStackID::Default;
switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
Opc = AArch64::STRBui;
case 2:
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
Opc = AArch64::STRHui;
else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_PXI;
StackID = TargetStackID::SVEVector;
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::STRWui;
// Virtual registers are constrained to GPR32; WSP itself must never be
// stored this way (asserted).
if (Register::isVirtualRegister(SrcReg))
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
assert(SrcReg != AArch64::WSP);
} else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
Opc = AArch64::STRSui;
case 8:
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::STRXui;
// Same treatment as the 32-bit case, for GPR64 and SP.
if (Register::isVirtualRegister(SrcReg))
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
assert(SrcReg != AArch64::SP);
} else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
Opc = AArch64::STRDui;
} else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
get(AArch64::STPWi), SrcReg, isKill,
AArch64::sube32, AArch64::subo32, FI, MMO);
case 16:
if (AArch64::FPR128RegClass.hasSubClassEq(RC))
Opc = AArch64::STRQui;
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Twov1d;
Offset = false;
} else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
get(AArch64::STPXi), SrcReg, isKill,
AArch64::sube64, AArch64::subo64, FI, MMO);
} else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_ZXI;
StackID = TargetStackID::SVEVector;
case 24:
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Threev1d;
Offset = false;
case 32:
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Fourv1d;
Offset = false;
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Twov2d;
Offset = false;
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_ZZXI;
StackID = TargetStackID::SVEVector;
case 48:
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Threev2d;
Offset = false;
} else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_ZZZXI;
StackID = TargetStackID::SVEVector;
case 64:
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Fourv2d;
Offset = false;
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
Opc = AArch64::STR_ZZZZXI;
StackID = TargetStackID::SVEVector;
assert(Opc && "Unknown register class");
// Record which kind of stack object this slot holds.
MFI.setStackID(FI, StackID);
const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
.addReg(SrcReg, getKillRegState(isKill))
// NOTE(review): the statement guarded by this condition is not visible here.
if (Offset)
// Build, before InsertBefore in MBB, an MCID instruction whose first two
// operands define the two halves of DestReg selected by SubIdx0 and SubIdx1.
// For a physical DestReg the halves are resolved to concrete sub-registers
// via TRI, the sub-register indices are cleared and the undef flag is
// dropped; otherwise both operands name DestReg with the indices attached and
// are marked undef in addition to define.
// NOTE(review): FI and MMO are not referenced in the lines visible here; the
// tail of the builder chain looks to be missing from this listing.
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const MCInstrDesc &MCID,
Register DestReg, unsigned SubIdx0,
unsigned SubIdx1, int FI,
MachineMemOperand *MMO) {
Register DestReg0 = DestReg;
Register DestReg1 = DestReg;
bool IsUndef = true;
if (Register::isPhysicalRegister(DestReg)) {
DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
SubIdx0 = 0;
DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
SubIdx1 = 0;
IsUndef = false;
BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
.addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
.addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
// Reload DestReg (of register class RC) from stack slot FI, inserting the
// load before MBBI in MBB.  The load opcode is picked from the spill size of
// RC and then from the register class itself; SVE classes additionally tag
// the slot with TargetStackID::SVEVector.  The sequential-pair classes are
// delegated to loadRegPairFromStackSlot.
// NOTE(review): no break/return statements are visible between the case
// groups and the final builder chain is cut short in this listing -- confirm
// the exact control flow against upstream.
void AArch64InstrInfo::loadRegFromStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
int FI, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
// Memory operand describing a load covering the whole stack object.
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
// Opc stays 0 until a matching class is found (asserted after the switch).
// Offset is cleared for the LD1* opcodes selected below.
unsigned Opc = 0;
bool Offset = true;
unsigned StackID = TargetStackID::Default;
switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRBui;
case 2:
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRHui;
else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_PXI;
StackID = TargetStackID::SVEVector;
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDRWui;
// Virtual registers are constrained to GPR32; WSP itself must never be
// reloaded this way (asserted).
if (Register::isVirtualRegister(DestReg))
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
assert(DestReg != AArch64::WSP);
} else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRSui;
case 8:
if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDRXui;
// Same treatment as the 32-bit case, for GPR64 and SP.
if (Register::isVirtualRegister(DestReg))
MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
assert(DestReg != AArch64::SP);
} else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
Opc = AArch64::LDRDui;
} else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
get(AArch64::LDPWi), DestReg, AArch64::sube32,
AArch64::subo32, FI, MMO);
case 16:
if (AArch64::FPR128RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRQui;
else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Twov1d;
Offset = false;
} else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
get(AArch64::LDPXi), DestReg, AArch64::sube64,
AArch64::subo64, FI, MMO);
} else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_ZXI;
StackID = TargetStackID::SVEVector;
case 24:
if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Threev1d;
Offset = false;
case 32:
if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Fourv1d;
Offset = false;
} else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Twov2d;
Offset = false;
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_ZZXI;
StackID = TargetStackID::SVEVector;
case 48:
if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Threev2d;
Offset = false;
} else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_ZZZXI;
StackID = TargetStackID::SVEVector;
case 64:
if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Fourv2d;
Offset = false;
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
Opc = AArch64::LDR_ZZZZXI;
StackID = TargetStackID::SVEVector;
assert(Opc && "Unknown register class");
// Record which kind of stack object this slot holds.
MFI.setStackID(FI, StackID);
const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
.addReg(DestReg, getDefRegState(true))
// NOTE(review): the statement guarded by this condition is not visible here.
if (Offset)
// Return true if any non-debug instruction in a range that starts at the
// instruction following DefMI either modifies or reads the NZCV register.
// NOTE(review): the end of the range is not visible in this listing (the
// call is cut off after its first argument); presumably it is UseMI, which is
// otherwise unused here -- confirm against upstream.
bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
const MachineInstr &UseMI,
const TargetRegisterInfo *TRI) {
return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
[TRI](const MachineInstr &I) {
return I.modifiesRegister(AArch64::NZCV, TRI) ||
I.readsRegister(AArch64::NZCV, TRI);
// Helper function to emit a frame offset adjustment from a given
// pointer (SrcReg), stored into DestReg. This function is explicit
// in that it requires the opcode.
//
// The adjustment is emitted as one or more Opc instructions, each covering as
// much of the remaining Offset as the opcode's immediate can encode, until
// Offset reaches zero.  Flag is set on every emitted instruction.  When
// NeedsWinCFI is set, SEH pseudo-instructions are emitted alongside and
// *HasWinCFI (if non-null) is set to true.
// NOTE(review): several statements in this listing look truncated (missing
// operands, braces and break statements) -- confirm against upstream.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, int64_t Offset, unsigned Opc,
const TargetInstrInfo *TII,
MachineInstr::MIFlag Flag, bool NeedsWinCFI,
bool *HasWinCFI) {
int Sign = 1;
// MaxEncoding is the largest immediate the opcode accepts; ShiftSize is the
// optional left shift that may be applied to it (0 when there is none).
unsigned MaxEncoding, ShiftSize;
switch (Opc) {
case AArch64::ADDXri:
case AArch64::ADDSXri:
case AArch64::SUBXri:
case AArch64::SUBSXri:
MaxEncoding = 0xfff;
ShiftSize = 12;
case AArch64::ADDVL_XXI:
case AArch64::ADDPL_XXI:
MaxEncoding = 31;
ShiftSize = 0;
// Negative offsets are emitted as a negated immediate with a limit of 32.
if (Offset < 0) {
MaxEncoding = 32;
Sign = -1;
Offset = -Offset;
llvm_unreachable("Unsupported opcode");
// FIXME: If the offset won't fit in 24-bits, compute the offset into a
// scratch register. If DestReg is a virtual register, use it as the
// scratch register; otherwise, create a new virtual register (to be
// replaced by the scavenger at the end of PEI). That case can be optimized
// slightly if DestReg is SP which is always 16-byte aligned, so the scratch
// register can be loaded with offset%8 and the add/sub can use an extending
// instruction with LSL#3.
// Currently the function handles any offsets but generates a poor sequence
// of code.
// assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
// Intermediate results are accumulated in TmpReg; XZR cannot hold them, so a
// virtual register is created in that case.
Register TmpReg = DestReg;
if (TmpReg == AArch64::XZR)
TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
do {
// Take the largest chunk of the remaining offset that one instruction can
// encode, using the shifted form when it exceeds the unshifted range.
uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
unsigned LocalShiftSize = 0;
if (ThisVal > MaxEncoding) {
ThisVal = ThisVal >> ShiftSize;
LocalShiftSize = ShiftSize;
assert((ThisVal >> ShiftSize) <= MaxEncoding &&
"Encoding cannot handle value that big");
Offset -= ThisVal << LocalShiftSize;
// The final instruction of the sequence writes the real destination.
if (Offset == 0)
TmpReg = DestReg;
auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
.addImm(Sign * (int)ThisVal);
if (ShiftSize)
MBI = MBI.addImm(
AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
MBI = MBI.setMIFlag(Flag);
if (NeedsWinCFI) {
assert(Sign == 1 && "SEH directives should always have a positive sign");
int Imm = (int)(ThisVal << LocalShiftSize);
if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
(SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
if (HasWinCFI)
*HasWinCFI = true;
if (Imm == 0)
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
assert(Offset == 0 && "Expected remaining offset to be zero to "
"emit a single SEH directive");
} else if (DestReg == AArch64::SP) {
if (HasWinCFI)
*HasWinCFI = true;
assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
if (HasWinCFI)
*HasWinCFI = true;
// Subsequent iterations continue from the partial result.
SrcReg = TmpReg;
} while (Offset);
/// Emit, before \p MBBI in \p MBB, the instructions that adjust \p SrcReg by
/// \p Offset and store the result into \p DestReg.
///
/// \p Offset is split by StackOffset::getForFrameOffset into a fixed byte
/// count, a number of predicate vectors and a number of data vectors.  Each
/// non-zero part is materialized by its own emitFrameOffsetAdj call, using
/// ADD/SUB for bytes, ADDVL for data vectors and ADDPL for predicate vectors;
/// after the first part has been emitted the remaining parts are applied on
/// top of \p DestReg.  A zero offset with distinct registers still emits the
/// fixed part, which acts as a simple 'mov'.
///
/// \param Flag        MI flag forwarded to every emitted instruction.
/// \param SetNZCV     Select the flag-setting ADDS/SUBS forms for the fixed
///                    part; asserted to be unsupported with SVE vectors.
/// \param NeedsWinCFI Forwarded to emitFrameOffsetAdj; asserted to be
///                    unsupported with SVE vectors.
/// \param HasWinCFI   Forwarded for the fixed part only; the scalable parts
///                    pass nullptr.
void llvm::emitFrameOffset(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
                           unsigned DestReg, unsigned SrcReg,
                           StackOffset Offset, const TargetInstrInfo *TII,
                           MachineInstr::MIFlag Flag, bool SetNZCV,
                           bool NeedsWinCFI, bool *HasWinCFI) {
  int64_t Bytes, NumPredicateVectors, NumDataVectors;
  Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors);

  // First emit non-scalable frame offsets, or a simple 'mov'.
  if (Bytes || (!Offset && SrcReg != DestReg)) {
    assert((DestReg != AArch64::SP || Bytes % 16 == 0) &&
           "SP increment/decrement not 16-byte aligned");
    unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
    if (Bytes < 0) {
      // Negative adjustments are emitted as a subtraction of the magnitude.
      Bytes = -Bytes;
      Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
    }
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
                       NeedsWinCFI, HasWinCFI);
    SrcReg = DestReg;
  }

  assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
         "SetNZCV not supported with SVE vectors");
  assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
         "WinCFI not supported with SVE vectors");

  if (NumDataVectors) {
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
                       AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
    SrcReg = DestReg;
  }

  if (NumPredicateVectors) {
    assert(DestReg != AArch64::SP && "Unaligned access to SP");
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
                       AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
  }
}
// Tries to fold a stack-slot access for FrameIndex into the COPY instruction
// MI, where Ops lists the operand indices being folded.
// Returns the spill store or fill load that replaces the COPY, or nullptr
// when nothing was folded. The first case below only constrains a register
// class and then also returns nullptr.
MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
LiveIntervals *LIS, VirtRegMap *VRM) const {
// This is a bit of a hack. Consider this instruction:
// %0 = COPY %sp; GPR64all:%0
// We explicitly chose GPR64all for the virtual register so such a copy might
// be eliminated by RegisterCoalescer. However, that may not be possible, and
// %0 may even spill. We can't spill %sp, and since it is in the GPR64all
// register class, TargetInstrInfo::foldMemoryOperand() is going to try.
// To prevent that, we are going to constrain the %0 register class here.
// <rdar://problem/11522048>
if (MI.isFullCopy()) {
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
// Copy from SP into a virtual register: constrain the destination.
if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
return nullptr;
// Copy from a virtual register into SP: constrain the source.
if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
return nullptr;
// Handle the case where a copy is being spilled or filled but the source
// and destination register class don't match. For example:
// %0 = COPY %xzr; GPR64common:%0
// In this case we can still safely fold away the COPY and generate the
// following spill code:
// STRXui %xzr, %stack.0
// This also eliminates spilled cross register class COPYs (e.g. between x and
// d regs) of the same size. For example:
// %0 = COPY %1; GPR64:%0, FPR64:%1
// will be filled as
// LDRDui %0, fi<#0>
// instead of
// LDRXui %Temp, fi<#0>
// %0 = FMOV %Temp
if (MI.isCopy() && Ops.size() == 1 &&
// Make sure we're only folding the explicit COPY defs/uses.
(Ops[0] == 0 || Ops[0] == 1)) {
// Folding operand 0 (the def) is treated as a spill, operand 1 (the use) as
// a fill.
bool IsSpill = Ops[0] == 0;
bool IsFill = !IsSpill;
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
MachineBasicBlock &MBB = *MI.getParent();
const MachineOperand &DstMO = MI.getOperand(0);
const MachineOperand &SrcMO = MI.getOperand(1);
Register DstReg = DstMO.getReg();
Register SrcReg = SrcMO.getReg();
// This is slightly expensive to compute for physical regs since
// getMinimalPhysRegClass is slow.
auto getRegClass = [&](unsigned Reg) {
return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
: TRI.getMinimalPhysRegClass(Reg);
// Plain COPY without subregisters: store the source or load the destination
// directly, using that register's own class.
if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
"Mismatched register size in non subreg COPY");
if (IsSpill)
storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
getRegClass(SrcReg), &TRI);
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
getRegClass(DstReg), &TRI);
// Step back from InsertPt to return the instruction in front of it,
// presumably the store/load that was just inserted.
return &*--InsertPt;
// Handle cases like spilling def of:
// %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
// where the physical register source can be widened and stored to the full
// virtual reg destination stack slot, in this case producing:
// STRXui %xzr, %stack.0
if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
assert(SrcMO.getSubReg() == 0 &&
"Unexpected subreg on physical register");
// SpillRC is the wider class to store with and SpillSubreg the subregister
// index SrcReg occupies inside it; SpillRC stays null when the
// combination is not handled.
const TargetRegisterClass *SpillRC;
unsigned SpillSubreg;
switch (DstMO.getSubReg()) {
SpillRC = nullptr;
case AArch64::sub_32:
case AArch64::ssub:
if (AArch64::GPR32RegClass.contains(SrcReg)) {
SpillRC = &AArch64::GPR64RegClass;
SpillSubreg = AArch64::sub_32;
} else if (AArch64::FPR32RegClass.contains(SrcReg)) {
SpillRC = &AArch64::FPR64RegClass;
SpillSubreg = AArch64::ssub;
} else
SpillRC = nullptr;
case AArch64::dsub:
if (AArch64::FPR64RegClass.contains(SrcReg)) {
SpillRC = &AArch64::FPR128RegClass;
SpillSubreg = AArch64::dsub;
} else
SpillRC = nullptr;
// Only fold when a super-register of SrcReg exists in the wider class.
if (SpillRC)
if (unsigned WidenedSrcReg =
TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
FrameIndex, SpillRC, &TRI);
return &*--InsertPt;
// Handle cases like filling use of:
// %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
// where we can load the full virtual reg source stack slot, into the subreg
// destination, in this case producing:
// LDRWui %0:sub_32<def,read-undef>, %stack.0
if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
// FillRC is the register class used for the load, chosen from the
// destination subregister index; null when the index is not handled.
const TargetRegisterClass *FillRC;
switch (DstMO.getSubReg()) {
FillRC = nullptr;
case AArch64::sub_32:
FillRC = &AArch64::GPR32RegClass;
case AArch64::ssub:
FillRC = &AArch64::FPR32RegClass;
case AArch64::dsub:
FillRC = &AArch64::FPR64RegClass;
if (FillRC) {
assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
TRI.getRegSizeInBits(*FillRC) &&
"Mismatched regclass size on folded subreg COPY");
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
MachineInstr &LoadMI = *--InsertPt;
MachineOperand &LoadDst = LoadMI.getOperand(0);
assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
return &LoadMI;
// Cannot fold.
return nullptr;
// Works out how much of the frame offset SOffset can be encoded in the
// immediate operand of the memory instruction MI.
// \param SOffset [in,out] the offset to fold; on return it holds the part
// that could not be encoded.
// \param OutUseUnscaledOp [out, optional] whether the unscaled opcode should
// be used.
// \param OutUnscaledOp [out, optional] that unscaled opcode, when one exists.
// \param EmittableOffset [out, optional] the immediate value that can be
// encoded.
// \returns AArch64FrameOffsetCannotUpdate for the opcodes listed in the
// switch; otherwise AArch64FrameOffsetCanUpdate, combined with
// AArch64FrameOffsetIsLegal when no offset remains in SOffset.
int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
StackOffset &SOffset,
bool *OutUseUnscaledOp,
unsigned *OutUnscaledOp,
int64_t *EmittableOffset) {
// Set output values in case of early exit.
if (EmittableOffset)
*EmittableOffset = 0;
if (OutUseUnscaledOp)
*OutUseUnscaledOp = false;
if (OutUnscaledOp)
*OutUnscaledOp = 0;
// Exit early for structured vector spills/fills as they can't take an
// immediate offset.
switch (MI.getOpcode()) {
case AArch64::LD1Twov2d:
case AArch64::LD1Threev2d:
case AArch64::LD1Fourv2d:
case AArch64::LD1Twov1d:
case AArch64::LD1Threev1d:
case AArch64::LD1Fourv1d:
case AArch64::ST1Twov2d:
case AArch64::ST1Threev2d:
case AArch64::ST1Fourv2d:
case AArch64::ST1Twov1d:
case AArch64::ST1Threev1d:
case AArch64::ST1Fourv1d:
case AArch64::IRG:
case AArch64::IRGstack:
case AArch64::STGloop:
case AArch64::STZGloop:
return AArch64FrameOffsetCannotUpdate;
// Get the min/max offset and the scale.
TypeSize ScaleValue(0U, false);
unsigned Width;
int64_t MinOff, MaxOff;
if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
// Construct the complete offset.
// A scalable scale means the instruction addresses in scalable units, so the
// scalable part of SOffset is the one being folded; otherwise the fixed part.
bool IsMulVL = ScaleValue.isScalable();
unsigned Scale = ScaleValue.getKnownMinSize();
int64_t Offset = IsMulVL ? SOffset.getScalableBytes() : SOffset.getBytes();
const MachineOperand &ImmOpnd =
Offset += ImmOpnd.getImm() * Scale;
// If the offset doesn't match the scale, we rewrite the instruction to
// use the unscaled instruction instead. Likewise, if we have a negative
// offset and there is an unscaled op to use.
Optional<unsigned> UnscaledOp =
bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
if (useUnscaledOp &&
!AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
Scale = ScaleValue.getKnownMinSize();
assert(IsMulVL == ScaleValue.isScalable() &&
"Unscaled opcode has different value for scalable");
// Split the offset into the immediate (in units of Scale) and what is left.
int64_t Remainder = Offset % Scale;
assert(!(Remainder && useUnscaledOp) &&
"Cannot have remainder when using unscaled op");
assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
int64_t NewOffset = Offset / Scale;
if (MinOff <= NewOffset && NewOffset <= MaxOff)
Offset = Remainder;
else {
// Out of range: encode the nearest limit and leave the rest in Offset.
// NOTE(review): Offset - NewOffset * Scale already contains Remainder
// (Offset == Quotient * Scale + Remainder), so adding Remainder again looks
// like it counts the remainder twice -- confirm against upstream.
NewOffset = NewOffset < 0 ? MinOff : MaxOff;
Offset = Offset - NewOffset * Scale + Remainder;
if (EmittableOffset)
*EmittableOffset = NewOffset;
if (OutUseUnscaledOp)
*OutUseUnscaledOp = useUnscaledOp;
if (OutUnscaledOp && UnscaledOp)
*OutUnscaledOp = *UnscaledOp;
// Write the remaining offset back into the component that was folded and
// keep the other component of SOffset unchanged.
if (IsMulVL)
SOffset = StackOffset(Offset, MVT::nxv1i8) +
StackOffset(SOffset.getBytes(), MVT::i8);
SOffset = StackOffset(Offset, MVT::i8) +
StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8);
return AArch64FrameOffsetCanUpdate |
(SOffset ? 0 : AArch64FrameOffsetIsLegal);
// Rewrites the frame-index operand of MI (at operand index FrameRegIdx, with
// its immediate at FrameRegIdx + 1) in terms of FrameReg and Offset.
// \param Offset [in,out] the offset to apply. The ADD path resets it to zero;
// the other path hands it to isAArch64FrameOffsetLegal(), which updates it.
// \returns true on the ADD path. On the other path the visible code returns
// !Offset when the offset can be updated and false otherwise.
bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned FrameReg, StackOffset &Offset,
const AArch64InstrInfo *TII) {
unsigned Opcode = MI.getOpcode();
unsigned ImmIdx = FrameRegIdx + 1;
// ADD (immediate): fold the existing immediate into Offset and re-emit the
// whole computation with emitFrameOffset(), keeping the flag-setting form
// for ADDSXri.
if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8);
emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
MI.getOperand(0).getReg(), FrameReg, Offset, TII,
MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
Offset = StackOffset();
return true;
// Other instructions: ask how much of Offset fits in the immediate.
int64_t NewOffset;
unsigned UnscaledOp;
bool UseUnscaledOp;
int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
&UnscaledOp, &NewOffset);
if (Status & AArch64FrameOffsetCanUpdate) {
if (Status & AArch64FrameOffsetIsLegal)
// Replace the FrameIndex with FrameReg.
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
if (UseUnscaledOp)
return !Offset;
return false;
// NOTE(review): only the signature is visible here. Presumably this fills
// NopInst with the target's no-op instruction -- confirm against the full
// definition.
void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
// AArch64 supports MachineCombiner, so this always returns true.
bool AArch64InstrInfo::useMachineCombiner() const { return true; }
// True when Opc is one of the flag-setting ADDS/SUBS opcodes listed below
// (32-bit and 64-bit, register and immediate forms); false for any other
// opcode.
static bool isCombineInstrSettingFlag(unsigned Opc) {
switch (Opc) {
case AArch64::ADDSWrr:
case AArch64::ADDSWri:
case AArch64::ADDSXrr:
case AArch64::ADDSXri:
case AArch64::SUBSWrr:
case AArch64::SUBSXrr:
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
case AArch64::SUBSWri:
case AArch64::SUBSXri:
return true;
return false;
// 32b Opcodes that can be combined with a MUL.
// Returns true for the 32-bit ADD/SUB opcodes listed below, including their
// flag-setting variants, and false for any other opcode.
static bool isCombineInstrCandidate32(unsigned Opc) {
switch (Opc) {
case AArch64::ADDWrr:
case AArch64::ADDWri:
case AArch64::SUBWrr:
case AArch64::ADDSWrr:
case AArch64::ADDSWri:
case AArch64::SUBSWrr:
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
case AArch64::SUBWri:
case AArch64::SUBSWri:
return true;
return false;
// 64b Opcodes that can be combined with a MUL.
// Returns true for the 64-bit scalar ADD/SUB opcodes listed below, including
// their flag-setting variants. Despite the name, the list also contains the
// vector ADD/SUB opcodes (v8i8 ... v4i32). Returns false for any other opcode.
static bool isCombineInstrCandidate64(unsigned Opc) {
switch (Opc) {
case AArch64::ADDXrr:
case AArch64::ADDXri:
case AArch64::SUBXrr:
case AArch64::ADDSXrr:
case AArch64::ADDSXri:
case AArch64::SUBSXrr:
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
case AArch64::SUBXri:
case AArch64::SUBSXri:
case AArch64::ADDv8i8:
case AArch64::ADDv16i8:
case AArch64::ADDv4i16:
case AArch64::ADDv8i16:
case AArch64::ADDv2i32:
case AArch64::ADDv4i32:
case AArch64::SUBv8i8:
case AArch64::SUBv16i8:
case AArch64::SUBv4i16:
case AArch64::SUBv8i16:
case AArch64::SUBv2i32:
case AArch64::SUBv4i32:
return true;
return false;
// FP Opcodes that can be combined with a FMUL.
// For the scalar and vector FADD/FSUB opcodes listed below the answer depends
// on the target options of the function containing Inst: true only when
// UnsafeFPMath is set or AllowFPOpFusion is FPOpFusion::Fast. Any other
// opcode yields false.
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
switch (Inst.getOpcode()) {
case AArch64::FADDHrr:
case AArch64::FADDSrr:
case AArch64::FADDDrr:
case AArch64::FADDv4f16:
case AArch64::FADDv8f16:
case AArch64::FADDv2f32:
case AArch64::FADDv2f64:
case AArch64::FADDv4f32:
case AArch64::FSUBHrr:
case AArch64::FSUBSrr:
case AArch64::FSUBDrr:
case AArch64::FSUBv4f16:
case AArch64::FSUBv8f16:
case AArch64::FSUBv2f32:
case AArch64::FSUBv2f64:
case AArch64::FSUBv4f32:
// NOTE(review): Options is declared by value, so the TargetOptions object is
// copied on every call; a const reference would presumably be enough.
TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
return (Options.UnsafeFPMath ||
Options.AllowFPOpFusion == FPOpFusion::Fast);
return false;
// Opcodes that can be combined with a MUL: true when Opc is accepted by
// either isCombineInstrCandidate32() or isCombineInstrCandidate64().
static bool isCombineInstrCandidate(unsigned Opc) {
return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
// Utility routine that checks if \param MO is defined by an
// \param CombineOpc instruction in the basic block \param MBB.
// The defining instruction must be the unique definition of a virtual
// register, must live in MBB, and its result must have exactly one
// non-debug use.
// \param ZeroReg register expected as operand 3 of the defining instruction.
// \param CheckZeroReg when true, additionally require operand 3 to be ZeroReg.
// \returns true when all of the checks pass.
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
unsigned CombineOpc, unsigned ZeroReg = 0,
bool CheckZeroReg = false) {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineInstr *MI = nullptr;
// Only virtual registers are looked up; MI stays null for anything else.
if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
MI = MRI.getUniqueVRegDef(MO.getReg());
// And it needs to be in the trace (otherwise, it won't have a depth).
if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
return false;
// Must only be used by the user we combine with.
if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
return false;
if (CheckZeroReg) {
assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
// The third input reg must be zero.
if (MI->getOperand(3).getReg() != ZeroReg)
return false;
return true;
// Is \param MO defined by an integer multiply and can be combined?
// Forwards to canCombine() with the zero-register check enabled, so the
// defining \param MulOpc instruction must have \param ZeroReg as operand 3.
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
unsigned MulOpc, unsigned ZeroReg) {
return canCombine(MBB, MO, MulOpc, ZeroReg, true);
// Is \param MO defined by a floating-point multiply and can be combined?
// Forwards to canCombine() without the zero-register check.
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
unsigned MulOpc) {
return canCombine(MBB, MO, MulOpc);
// Reports whether Inst may be treated as associative and commutative.
// For the FADD, FMUL and FMULX opcodes listed below the result is the
// UnsafeFPMath target option of the containing function; every other opcode
// yields false.
// TODO: There are many more machine instruction opcodes to match:
// 1. Other data types (integer, vectors)
// 2. Other math / logic operations (xor, or)
// 3. Other forms of the same operation (intrinsics and other variants)
bool AArch64InstrInfo::isAssociativeAndCommutative(
const MachineInstr &Inst) const {
switch (Inst.getOpcode()) {
case AArch64::FADDDrr:
case AArch64::FADDSrr:
case AArch64::FADDv2f32:
case AArch64::FADDv2f64:
case AArch64::FADDv4f32:
case AArch64::FMULDrr:
case AArch64::FMULSrr:
case AArch64::FMULX32:
case AArch64::FMULX64:
case AArch64::FMULXv2f32:
case AArch64::FMULXv2f64:
case AArch64::FMULXv4f32:
case AArch64::FMULv2f32:
case AArch64::FMULv2f64:
case AArch64::FMULv4f32:
return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
return false;
/// Find instructions that can be turned into madd.
/// Root must be an integer or vector ADD/SUB accepted by
/// isCombineInstrCandidate(). For each opcode the operands named below are
/// checked for a defining multiply that canCombine() accepts.
/// \param Root the ADD/SUB instruction to inspect.
/// \param Patterns [out] list of combiner patterns for the matches.
/// \returns true when at least one operand matched.
static bool getMaddPatterns(MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) {
unsigned Opc = Root.getOpcode();
MachineBasicBlock &MBB = *Root.getParent();
bool Found = false;
if (!isCombineInstrCandidate(Opc))
return false;
// A flag-setting root is matched through its non-flag-setting opcode.
if (isCombineInstrSettingFlag(Opc)) {
int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
// When NZCV is live bail out.
if (Cmp_NZCV == -1)
return false;
unsigned NewOpc = convertToNonFlagSettingOpc(Root);
// When opcode can't change bail out.
// CHECKME: do we miss any cases for opcode conversion?
if (NewOpc == Opc)
return false;
Opc = NewOpc;
// Scalar helper: the multiply is matched as a MADD whose operand 3 is the
// zero register (see canCombineWithMUL()).
auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
MachineCombinerPattern Pattern) {
if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
Found = true;
// Vector helper: the multiply is matched by opcode only.
auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
Found = true;
typedef MachineCombinerPattern MCP;
switch (Opc) {
case AArch64::ADDWrr:
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
"ADDWrr does not have register operands");
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
case AArch64::ADDXrr:
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
case AArch64::SUBWrr:
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
case AArch64::SUBXrr:
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
// Immediate forms have a single register operand, so only operand 1 is tried.
case AArch64::ADDWri:
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
case AArch64::ADDXri:
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
case AArch64::SUBWri:
setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
case AArch64::SUBXri:
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
// Vector forms: the 16-bit and 32-bit element types also try the indexed
// multiply variants.
case AArch64::ADDv8i8:
setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
case AArch64::ADDv16i8:
setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
case AArch64::ADDv4i16:
setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
case AArch64::ADDv8i16:
setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
case AArch64::ADDv2i32:
setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
case AArch64::ADDv4i32:
setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
case AArch64::SUBv8i8:
setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
case AArch64::SUBv16i8:
setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
case AArch64::SUBv4i16:
setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
case AArch64::SUBv8i16:
setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
case AArch64::SUBv2i32:
setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
case AArch64::SUBv4i32:
setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
return Found;
/// Floating-Point Support
/// Find floating-point add/subtract instructions whose operand is produced
/// by an FMUL/FNMUL, so the pair can be turned into a fused multiply-add or
/// multiply-subtract.
/// \param Root the FADD/FSUB instruction to inspect; anything rejected by
/// isCombineInstrCandidateFP() returns false immediately.
/// \param Patterns [out] list of combiner patterns for the matches.
/// \returns true when at least one operand matched.
static bool getFMAPatterns(MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) {
if (!isCombineInstrCandidateFP(Root))
return false;
MachineBasicBlock &MBB = *Root.getParent();
bool Found = false;
// True when operand number Operand of Root is defined by a combinable
// instruction with opcode Opcode (see canCombineWithFMUL()).
auto Match = [&](int Opcode, int Operand,
MachineCombinerPattern Pattern) -> bool {
if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
return true;
return false;
typedef MachineCombinerPattern MCP;
// Where two Match() calls are joined with ||, the second alternative is only
// tried when the first one did not match.
switch (Root.getOpcode()) {
assert(false && "Unsupported FP instruction in combiner\n");
case AArch64::FADDHrr:
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
"FADDHrr does not have register operands");
Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
case AArch64::FADDSrr:
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
"FADDSrr does not have register operands");
Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
case AArch64::FADDDrr:
Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
case AArch64::FADDv4f16:
Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
case AArch64::FADDv8f16:
Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
case AArch64::FADDv2f32:
Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
case AArch64::FADDv2f64:
Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
case AArch64::FADDv4f32:
Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
// Scalar subtracts additionally try a negated multiply (FNMUL) as operand 1.
case AArch64::FSUBHrr:
Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
case AArch64::FSUBSrr:
Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
case AArch64::FSUBDrr:
Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
// Vector subtracts check operand 2 before operand 1.
case AArch64::FSUBv4f16:
Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
case AArch64::FSUBv8f16:
Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
case AArch64::FSUBv2f32:
Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
case AArch64::FSUBv2f64:
Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
case AArch64::FSUBv4f32:
Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
return Found;
/// Return true when a code sequence can improve throughput. It
/// should be called only for instructions in loops.
/// \param Pattern - combiner pattern
/// \returns true for the floating-point fused multiply patterns and the
/// vector integer multiply-add/subtract patterns listed below; false for any
/// other pattern.
bool AArch64InstrInfo::isThroughputPattern(
MachineCombinerPattern Pattern) const {
switch (Pattern) {
// Scalar floating-point multiply-add / multiply-subtract.
case MachineCombinerPattern::FMULADDH_OP1:
case MachineCombinerPattern::FMULADDH_OP2:
case MachineCombinerPattern::FMULSUBH_OP1:
case MachineCombinerPattern::FMULSUBH_OP2:
case MachineCombinerPattern::FMULADDS_OP1:
case MachineCombinerPattern::FMULADDS_OP2:
case MachineCombinerPattern::FMULSUBS_OP1:
case MachineCombinerPattern::FMULSUBS_OP2:
case MachineCombinerPattern::FMULADDD_OP1:
case MachineCombinerPattern::FMULADDD_OP2:
case MachineCombinerPattern::FMULSUBD_OP1:
case MachineCombinerPattern::FMULSUBD_OP2:
case MachineCombinerPattern::FNMULSUBH_OP1:
case MachineCombinerPattern::FNMULSUBS_OP1:
case MachineCombinerPattern::FNMULSUBD_OP1:
// Floating-point FMLA patterns (plain and indexed).
case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
case MachineCombinerPattern::FMLAv4f16_OP2:
case MachineCombinerPattern::FMLAv4f16_OP1:
case MachineCombinerPattern::FMLAv8f16_OP1:
case MachineCombinerPattern::FMLAv8f16_OP2:
case MachineCombinerPattern::FMLAv2f32_OP2:
case MachineCombinerPattern::FMLAv2f32_OP1:
case MachineCombinerPattern::FMLAv2f64_OP1:
case MachineCombinerPattern::FMLAv2f64_OP2:
case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
case MachineCombinerPattern::FMLAv4f32_OP1:
case MachineCombinerPattern::FMLAv4f32_OP2:
case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
// Floating-point FMLS patterns (plain and indexed).
case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
case MachineCombinerPattern::FMLSv4f16_OP1:
case MachineCombinerPattern::FMLSv4f16_OP2:
case MachineCombinerPattern::FMLSv8f16_OP1:
case MachineCombinerPattern::FMLSv8f16_OP2:
case MachineCombinerPattern::FMLSv2f32_OP2:
case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
case MachineCombinerPattern::FMLSv4f32_OP2:
// Vector integer multiply-add / multiply-subtract.
case MachineCombinerPattern::MULADDv8i8_OP1:
case MachineCombinerPattern::MULADDv8i8_OP2:
case MachineCombinerPattern::MULADDv16i8_OP1:
case MachineCombinerPattern::MULADDv16i8_OP2:
case MachineCombinerPattern::MULADDv4i16_OP1:
case MachineCombinerPattern::MULADDv4i16_OP2:
case MachineCombinerPattern::MULADDv8i16_OP1:
case MachineCombinerPattern::MULADDv8i16_OP2:
case MachineCombinerPattern::MULADDv2i32_OP1:
case MachineCombinerPattern::MULADDv2i32_OP2:
case MachineCombinerPattern::MULADDv4i32_OP1:
case MachineCombinerPattern::MULADDv4i32_OP2:
case MachineCombinerPattern::MULSUBv8i8_OP1:
case MachineCombinerPattern::MULSUBv8i8_OP2:
case MachineCombinerPattern::MULSUBv16i8_OP1:
case MachineCombinerPattern::MULSUBv16i8_OP2:
case MachineCombinerPattern::MULSUBv4i16_OP1:
case MachineCombinerPattern::MULSUBv4i16_OP2:
case MachineCombinerPattern::MULSUBv8i16_OP1:
case MachineCombinerPattern::MULSUBv8i16_OP2:
case MachineCombinerPattern::MULSUBv2i32_OP1:
case MachineCombinerPattern::MULSUBv2i32_OP2:
case MachineCombinerPattern::MULSUBv4i32_OP1:
case MachineCombinerPattern::MULSUBv4i32_OP2:
// Vector integer multiply-add / multiply-subtract, indexed multiply.
case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
return true;
} // end switch (Pattern)
return false;
/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Pattern vector. Pattern should be sorted in priority order since the
/// pattern evaluator stops checking as soon as it finds a faster sequence.
/// The integer multiply-add patterns are tried first, then the floating-point
/// ones; if neither matches, the generic TargetInstrInfo implementation is
/// consulted.
bool AArch64InstrInfo::getMachineCombinerPatterns(
MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
// Integer patterns
if (getMaddPatterns(Root, Patterns))
return true;
// Floating point patterns
if (getFMAPatterns(Root, Patterns))
return true;
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
// Operand layout selector for genFusedMultiply(): Default emits the two
// multiplicands followed by the addend, while Indexed and Accumulator emit
// the addend first and the multiplicands after it.
enum class FMAInstKind { Default, Indexed, Accumulator };
/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
/// F|MUL I=A,B,0
/// F|ADD R,I,C
/// ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind of fma instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
/// \returns the multiply instruction that defines operand IdxMulOpd of Root.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
const TargetInstrInfo *TII, MachineInstr &Root,
SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
unsigned MaddOpc, const TargetRegisterClass *RC,
FMAInstKind kind = FMAInstKind::Default,
const Register *ReplacedAddend = nullptr) {
assert(IdxMulOpd == 1 || IdxMulOpd == 2);
// The operand of Root that is not the multiply result is the addend.
unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
Register ResultReg = Root.getOperand(0).getReg();
// SrcReg0/SrcReg1 are the multiplicands taken from MUL; SrcReg2 is the addend.
Register SrcReg0 = MUL->getOperand(1).getReg();
bool Src0IsKill = MUL->getOperand(1).isKill();
Register SrcReg1 = MUL->getOperand(2).getReg();
bool Src1IsKill = MUL->getOperand(2).isKill();
unsigned SrcReg2;
bool Src2IsKill;
if (ReplacedAddend) {
// If we just generated a new addend, we must be its only use.
SrcReg2 = *ReplacedAddend;
Src2IsKill = true;
} else {
SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
// Constrain every virtual register involved to the class RC.
if (Register::isVirtualRegister(ResultReg))
MRI.constrainRegClass(ResultReg, RC);
if (Register::isVirtualRegister(SrcReg0))
MRI.constrainRegClass(SrcReg0, RC);
if (Register::isVirtualRegister(SrcReg1))
MRI.constrainRegClass(SrcReg1, RC);
if (Register::isVirtualRegister(SrcReg2))
MRI.constrainRegClass(SrcReg2, RC);
// Build the fused instruction; the operand order depends on kind.
MachineInstrBuilder MIB;
if (kind == FMAInstKind::Default)
MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
.addReg(SrcReg0, getKillRegState(Src0IsKill))
.addReg(SrcReg1, getKillRegState(Src1IsKill))
.addReg(SrcReg2, getKillRegState(Src2IsKill));
else if (kind == FMAInstKind::Indexed)
MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
.addReg(SrcReg2, getKillRegState(Src2IsKill))
.addReg(SrcReg0, getKillRegState(Src0IsKill))
.addReg(SrcReg1, getKillRegState(Src1IsKill))
else if (kind == FMAInstKind::Accumulator)
MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
.addReg(SrcReg2, getKillRegState(Src2IsKill))
.addReg(SrcReg0, getKillRegState(Src0IsKill))
.addReg(SrcReg1, getKillRegState(Src1IsKill));
assert(false && "Invalid FMA instruction kind \n");
// Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
return MUL;
/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
/// Forwards all of its arguments unchanged to genFusedMultiply and returns
/// that call's result.
/// NOTE(review): the call below stops at a trailing comma in this listing;
/// presumably the missing argument is FMAInstKind::Accumulator - confirm
/// against upstream.
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyAcc(
MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
/// genNeg - Helper to generate an intermediate negation of the second operand
/// of Root
/// \param MnegOpc opcode of the negate instruction to build
/// \param RC register class of the freshly created result register
/// \param [in,out] InstrIdxForVirtReg receives an entry mapping the new
/// register to index 0
/// \return the new virtual register defined by the negate instruction
/// NOTE(review): the source operand of the negate and the insertion into
/// InsInstrs are not visible in this listing (the BuildMI expression has no
/// terminator) - confirm against upstream.
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
const TargetInstrInfo *TII, MachineInstr &Root,
SmallVectorImpl<MachineInstr *> &InsInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
unsigned MnegOpc, const TargetRegisterClass *RC) {
// Fresh register of class RC that will hold the negated value.
Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB =
BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR)
// Record index 0 as the position of the instruction defining NewVR.
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
return NewVR;
/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
/// instructions with an additional negation of the accumulator
/// Only IdxMulOpd == 1 is accepted (asserted). The register returned by
/// genNeg is passed to genFusedMultiply as the replaced addend, and the
/// Accumulator operand layout is requested.
/// \param MaddOpc opcode of the fused multiply-accumulate instruction
/// \param MnegOpc opcode of the negate instruction built by genNeg
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyAccNeg(
MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
assert(IdxMulOpd == 1);
// Build the negation first; its result register becomes the new addend.
Register NewVR =
genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
FMAInstKind::Accumulator, &NewVR);
/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
/// instructions.
/// Forwards all of its arguments unchanged to genFusedMultiply and returns
/// that call's result.
/// NOTE(review): the call below stops at a trailing comma in this listing;
/// presumably the missing argument is FMAInstKind::Indexed - confirm against
/// upstream.
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdx(
MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
/// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply
/// accumulate instructions with an additional negation of the accumulator
/// Only IdxMulOpd == 1 is accepted (asserted). The register returned by
/// genNeg is passed to genFusedMultiply as the replaced addend, and the
/// Indexed operand layout is requested.
/// \param MaddOpc opcode of the fused multiply-accumulate instruction
/// \param MnegOpc opcode of the negate instruction built by genNeg
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdxNeg(
MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
assert(IdxMulOpd == 1);
// Build the negation first; its result register becomes the new addend.
Register NewVR =
genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
FMAInstKind::Indexed, &NewVR);
/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
/// MUL I=A,B,0
/// ADD R,I,Imm
/// ==> ORR V, ZR, Imm
/// ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode for the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
/// \param RC Register class of operands
/// \return the MUL instruction that defines operand IdxMulOpd of Root.
///
/// NOTE(review): in this listing the builder chain below has no terminator
/// and nothing is appended to InsInstrs; presumably the VR operand and the
/// insertion were lost - confirm against upstream.
static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
const TargetInstrInfo *TII, MachineInstr &Root,
SmallVectorImpl<MachineInstr *> &InsInstrs,
unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
const TargetRegisterClass *RC) {
assert(IdxMulOpd == 1 || IdxMulOpd == 2);
// MUL is the unique definition of the Root operand selected by IdxMulOpd.
MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
Register ResultReg = Root.getOperand(0).getReg();
// The two multiplicands, together with their kill flags, come from MUL.
Register SrcReg0 = MUL->getOperand(1).getReg();
bool Src0IsKill = MUL->getOperand(1).isKill();
Register SrcReg1 = MUL->getOperand(2).getReg();
bool Src1IsKill = MUL->getOperand(2).isKill();
// Constrain every virtual register involved to the register class RC.
if (Register::isVirtualRegister(ResultReg))
MRI.constrainRegClass(ResultReg, RC);
if (Register::isVirtualRegister(SrcReg0))
MRI.constrainRegClass(SrcReg0, RC);
if (Register::isVirtualRegister(SrcReg1))
MRI.constrainRegClass(SrcReg1, RC);
if (Register::isVirtualRegister(VR))
MRI.constrainRegClass(VR, RC);
MachineInstrBuilder MIB =
BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
.addReg(SrcReg0, getKillRegState(Src0IsKill))
.addReg(SrcReg1, getKillRegState(Src1IsKill))
// Insert the MADD
return MUL;
/// When getMachineCombinerPatterns() finds potential patterns,
/// this function generates the instructions that could replace the
/// original code sequence
/// \param Root the instruction the pattern was matched on
/// \param Pattern the combiner pattern selecting which replacement to build
/// \param [out] InsInstrs receives the replacement instructions
/// \param [out] DelInstrs instructions to be deleted
/// \param [out] InstrIdxForVirtReg entries are added for virtual registers
/// created by the replacement sequence
///
/// For every pattern the switch picks the fused opcode (Opc) and register
/// class (RC) and delegates to genFusedMultiply, genMaddR or one of the
/// genFusedMultiply* helpers; MUL receives the helper's return value.
///
/// NOTE(review): in this listing most case bodies have no visible `break;`
/// or closing brace and several calls stop at a trailing comma; presumably
/// lines were lost from this listing - confirm against upstream before
/// editing.
void AArch64InstrInfo::genAlternativeCodeSequence(
MachineInstr &Root, MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
MachineBasicBlock &MBB = *Root.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
MachineInstr *MUL;
const TargetRegisterClass *RC;
unsigned Opc;
switch (Pattern) {
// Reassociate instructions.
TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
DelInstrs, InstrIdxForVirtReg);
// Scalar integer multiply-add / multiply-subtract patterns.
case MachineCombinerPattern::MULADDW_OP1:
case MachineCombinerPattern::MULADDX_OP1:
// MUL I=A,B,0
// ADD R,I,C
// ==> MADD R,A,B,C
// --- Create(MADD);
if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::MULADDW_OP2:
case MachineCombinerPattern::MULADDX_OP2:
// MUL I=A,B,0
// ADD R,C,I
// ==> MADD R,A,B,C
// --- Create(MADD);
if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULADDWI_OP1:
case MachineCombinerPattern::MULADDXI_OP1: {
// MUL I=A,B,0
// ADD R,I,Imm
// ==> ORR V, ZR, Imm
// ==> MADD R,A,B,V
// --- Create(MADD);
const TargetRegisterClass *OrrRC;
unsigned BitSize, OrrOpc, ZeroReg;
if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
OrrOpc = AArch64::ORRWri;
OrrRC = &AArch64::GPR32spRegClass;
BitSize = 32;
ZeroReg = AArch64::WZR;
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
OrrOpc = AArch64::ORRXri;
OrrRC = &AArch64::GPR64spRegClass;
BitSize = 64;
ZeroReg = AArch64::XZR;
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
Register NewVR = MRI.createVirtualRegister(OrrRC);
uint64_t Imm = Root.getOperand(2).getImm();
// Operand 3, when it is an immediate, is applied as a left shift to Imm.
if (Root.getOperand(3).isImm()) {
unsigned Val = Root.getOperand(3).getImm();
Imm = Imm << Val;
uint64_t UImm = SignExtend64(Imm, BitSize);
uint64_t Encoding;
// The ORR is only built when the value is encodable as a logical immediate.
if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
case MachineCombinerPattern::MULSUBW_OP1:
case MachineCombinerPattern::MULSUBX_OP1: {
// MUL I=A,B,0
// SUB R,I, C
// ==> SUB V, 0, C
// ==> MADD R,A,B,V // = -C + A*B
// --- Create(MADD);
const TargetRegisterClass *SubRC;
unsigned SubOpc, ZeroReg;
if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
SubOpc = AArch64::SUBWrr;
SubRC = &AArch64::GPR32spRegClass;
ZeroReg = AArch64::WZR;
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
SubOpc = AArch64::SUBXrr;
SubRC = &AArch64::GPR64spRegClass;
ZeroReg = AArch64::XZR;
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
Register NewVR = MRI.createVirtualRegister(SubRC);
// SUB NewVR, 0, C
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
case MachineCombinerPattern::MULSUBW_OP2:
case MachineCombinerPattern::MULSUBX_OP2:
// MUL I=A,B,0
// SUB R,C,I
// ==> MSUB R,A,B,C (computes C - A*B)
// --- Create(MSUB);
if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
Opc = AArch64::MSUBWrrr;
RC = &AArch64::GPR32RegClass;
} else {
Opc = AArch64::MSUBXrrr;
RC = &AArch64::GPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULSUBWI_OP1:
case MachineCombinerPattern::MULSUBXI_OP1: {
// MUL I=A,B,0
// SUB R,I, Imm
// ==> ORR V, ZR, -Imm
// ==> MADD R,A,B,V // = -Imm + A*B
// --- Create(MADD);
const TargetRegisterClass *OrrRC;
unsigned BitSize, OrrOpc, ZeroReg;
if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
OrrOpc = AArch64::ORRWri;
OrrRC = &AArch64::GPR32spRegClass;
BitSize = 32;
ZeroReg = AArch64::WZR;
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
OrrOpc = AArch64::ORRXri;
OrrRC = &AArch64::GPR64spRegClass;
BitSize = 64;
ZeroReg = AArch64::XZR;
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
Register NewVR = MRI.createVirtualRegister(OrrRC);
uint64_t Imm = Root.getOperand(2).getImm();
if (Root.getOperand(3).isImm()) {
unsigned Val = Root.getOperand(3).getImm();
Imm = Imm << Val;
// The immediate is negated before sign extension, matching the -Imm above.
uint64_t UImm = SignExtend64(-Imm, BitSize);
uint64_t Encoding;
if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
// Vector integer multiply-add: both operand positions use the MLA opcode.
case MachineCombinerPattern::MULADDv8i8_OP1:
Opc = AArch64::MLAv8i8;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::MULADDv8i8_OP2:
Opc = AArch64::MLAv8i8;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULADDv16i8_OP1:
Opc = AArch64::MLAv16i8;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::MULADDv16i8_OP2:
Opc = AArch64::MLAv16i8;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULADDv4i16_OP1:
Opc = AArch64::MLAv4i16;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::MULADDv4i16_OP2:
Opc = AArch64::MLAv4i16;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULADDv8i16_OP1:
Opc = AArch64::MLAv8i16;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::MULADDv8i16_OP2:
Opc = AArch64::MLAv8i16;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULADDv2i32_OP1:
Opc = AArch64::MLAv2i32;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::MULADDv2i32_OP2:
Opc = AArch64::MLAv2i32;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULADDv4i32_OP1:
Opc = AArch64::MLAv4i32;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::MULADDv4i32_OP2:
Opc = AArch64::MLAv4i32;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
// Vector integer multiply-subtract: the _OP1 forms go through
// genFusedMultiplyAccNeg with a NEG opcode and the MLA opcode; the _OP2
// forms use the MLS opcode directly.
case MachineCombinerPattern::MULSUBv8i8_OP1:
Opc = AArch64::MLAv8i8;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
case MachineCombinerPattern::MULSUBv8i8_OP2:
Opc = AArch64::MLSv8i8;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULSUBv16i8_OP1:
Opc = AArch64::MLAv16i8;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
case MachineCombinerPattern::MULSUBv16i8_OP2:
Opc = AArch64::MLSv16i8;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULSUBv4i16_OP1:
Opc = AArch64::MLAv4i16;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
case MachineCombinerPattern::MULSUBv4i16_OP2:
Opc = AArch64::MLSv4i16;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULSUBv8i16_OP1:
Opc = AArch64::MLAv8i16;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
case MachineCombinerPattern::MULSUBv8i16_OP2:
Opc = AArch64::MLSv8i16;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULSUBv2i32_OP1:
Opc = AArch64::MLAv2i32;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
case MachineCombinerPattern::MULSUBv2i32_OP2:
Opc = AArch64::MLSv2i32;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULSUBv4i32_OP1:
Opc = AArch64::MLAv4i32;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
case MachineCombinerPattern::MULSUBv4i32_OP2:
Opc = AArch64::MLSv4i32;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
// Indexed vector integer multiply-add: both operand positions use the
// indexed MLA opcode via genFusedMultiplyIdx.
case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
Opc = AArch64::MLAv4i16_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
Opc = AArch64::MLAv4i16_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
Opc = AArch64::MLAv8i16_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
Opc = AArch64::MLAv8i16_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
Opc = AArch64::MLAv2i32_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
Opc = AArch64::MLAv2i32_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
Opc = AArch64::MLAv4i32_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
Opc = AArch64::MLAv4i32_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
// Indexed vector integer multiply-subtract: the _OP1 forms go through
// genFusedMultiplyIdxNeg with a NEG opcode and the indexed MLA opcode; the
// _OP2 forms use the indexed MLS opcode directly.
case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
Opc = AArch64::MLAv4i16_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
Opc = AArch64::MLSv4i16_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
Opc = AArch64::MLAv8i16_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
Opc = AArch64::MLSv8i16_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
Opc = AArch64::MLAv2i32_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
Opc = AArch64::MLSv2i32_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
Opc = AArch64::MLAv4i32_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
Opc = AArch64::MLSv4i32_indexed;
RC = &AArch64::FPR128RegClass;
MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
// Floating Point Support
// Scalar FMADD forms use the default operand layout of genFusedMultiply.
case MachineCombinerPattern::FMULADDH_OP1:
Opc = AArch64::FMADDHrrr;
RC = &AArch64::FPR16RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::FMULADDS_OP1:
Opc = AArch64::FMADDSrrr;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::FMULADDD_OP1:
Opc = AArch64::FMADDDrrr;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::FMULADDH_OP2:
Opc = AArch64::FMADDHrrr;
RC = &AArch64::FPR16RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::FMULADDS_OP2:
Opc = AArch64::FMADDSrrr;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::FMULADDD_OP2:
Opc = AArch64::FMADDDrrr;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
// FMLA forms. NOTE(review): the FMAInstKind argument of the calls below is
// not visible in this listing (each call stops after RC) - confirm upstream.
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
Opc = AArch64::FMLAv1i32_indexed;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
Opc = AArch64::FMLAv1i32_indexed;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
Opc = AArch64::FMLAv1i64_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
Opc = AArch64::FMLAv1i64_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FMLAv4i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
case MachineCombinerPattern::FMLAv4f16_OP1:
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FMLAv4f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FMLAv4i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLAv4f16_OP2:
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FMLAv4f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
case MachineCombinerPattern::FMLAv2f32_OP1:
RC = &AArch64::FPR64RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
Opc = AArch64::FMLAv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
} else {
Opc = AArch64::FMLAv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
case MachineCombinerPattern::FMLAv2f32_OP2:
RC = &AArch64::FPR64RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
Opc = AArch64::FMLAv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
} else {
Opc = AArch64::FMLAv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLAv8i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
case MachineCombinerPattern::FMLAv8f16_OP1:
RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLAv8f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLAv8i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLAv8f16_OP2:
RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLAv8f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
case MachineCombinerPattern::FMLAv2f64_OP1:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
Opc = AArch64::FMLAv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
} else {
Opc = AArch64::FMLAv2f64;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
case MachineCombinerPattern::FMLAv2f64_OP2:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
Opc = AArch64::FMLAv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
} else {
Opc = AArch64::FMLAv2f64;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
case MachineCombinerPattern::FMLAv4f32_OP1:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
Opc = AArch64::FMLAv4i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
} else {
Opc = AArch64::FMLAv4f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
case MachineCombinerPattern::FMLAv4f32_OP2:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
Opc = AArch64::FMLAv4i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
} else {
Opc = AArch64::FMLAv4f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
// Scalar floating-point multiply-subtract: FMULSUB*_OP1 selects FNMSUB,
// FNMULSUB*_OP1 selects FNMADD and FMULSUB*_OP2 selects FMSUB.
case MachineCombinerPattern::FMULSUBH_OP1:
Opc = AArch64::FNMSUBHrrr;
RC = &AArch64::FPR16RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::FMULSUBS_OP1:
Opc = AArch64::FNMSUBSrrr;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::FMULSUBD_OP1:
Opc = AArch64::FNMSUBDrrr;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::FNMULSUBH_OP1:
Opc = AArch64::FNMADDHrrr;
RC = &AArch64::FPR16RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::FNMULSUBS_OP1:
Opc = AArch64::FNMADDSrrr;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::FNMULSUBD_OP1:
Opc = AArch64::FNMADDDrrr;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
case MachineCombinerPattern::FMULSUBH_OP2:
Opc = AArch64::FMSUBHrrr;
RC = &AArch64::FPR16RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::FMULSUBS_OP2:
Opc = AArch64::FMSUBSrrr;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
case MachineCombinerPattern::FMULSUBD_OP2:
Opc = AArch64::FMSUBDrrr;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
// FMLS forms. The _OP2 patterns select an FMLS opcode directly; the _OP1
// patterns build an FNEG into a new register, hand that register to
// genFusedMultiply as the replaced addend and select an FMLA opcode.
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
Opc = AArch64::FMLSv1i32_indexed;
RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
Opc = AArch64::FMLSv1i64_indexed;
RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLSv4f16_OP1:
case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
RC = &AArch64::FPR64RegClass;
Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR)
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
Opc = AArch64::FMLAv4f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator, &NewVR);
} else {
Opc = AArch64::FMLAv4i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed, &NewVR);
case MachineCombinerPattern::FMLSv4f16_OP2:
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FMLSv4f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
RC = &AArch64::FPR64RegClass;
Opc = AArch64::FMLSv4i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLSv2f32_OP2:
case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
RC = &AArch64::FPR64RegClass;
if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
Opc = AArch64::FMLSv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
} else {
Opc = AArch64::FMLSv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLSv8f16_OP1:
case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
RC = &AArch64::FPR128RegClass;
Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR)
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
Opc = AArch64::FMLAv8f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator, &NewVR);
} else {
Opc = AArch64::FMLAv8i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed, &NewVR);
case MachineCombinerPattern::FMLSv8f16_OP2:
RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLSv8f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLSv8i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
Opc = AArch64::FMLSv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
} else {
Opc = AArch64::FMLSv2f64;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLSv4f32_OP2:
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
Opc = AArch64::FMLSv4i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
} else {
Opc = AArch64::FMLSv4f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
case MachineCombinerPattern::FMLSv2f32_OP1:
case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
RC = &AArch64::FPR64RegClass;
Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
Opc = AArch64::FMLAv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed, &NewVR);
} else {
Opc = AArch64::FMLAv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator, &NewVR);
case MachineCombinerPattern::FMLSv4f32_OP1:
case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
RC = &AArch64::FPR128RegClass;
Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
Opc = AArch64::FMLAv4i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed, &NewVR);
} else {
Opc = AArch64::FMLAv4f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator, &NewVR);
case MachineCombinerPattern::FMLSv2f64_OP1:
case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
RC = &AArch64::FPR128RegClass;
Register NewVR = MRI.createVirtualRegister(RC);
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
Opc = AArch64::FMLAv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed, &NewVR);
} else {
Opc = AArch64::FMLAv2f64;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator, &NewVR);
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
/// Replace csinc-branch sequence by simple conditional branch
/// Examples:
/// 1. \code
/// csinc w9, wzr, wzr, <condition code>
/// tbnz w9, #0, 0x44
/// \endcode
/// to
/// \code
/// b.<inverted condition code>
/// \endcode
/// 2. \code
/// csinc w9, wzr, wzr, <condition code>
/// tbz w9, #0, 0x44
/// \endcode
/// to
/// \code
/// b.<condition code>
/// \endcode
/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
/// compare's constant operand is power of 2.
/// Examples:
/// \code
/// and w8, w8, #0x400
/// cbnz w8, L1
/// \endcode
/// to
/// \code
/// tbnz w8, #10, L1
/// \endcode
/// \param MI Conditional Branch
/// \return True when the simple conditional branch is generated
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
bool IsNegativeBranch = false;
bool IsTestAndBranch = false;
unsigned TargetBBInMI = 0;
switch (MI.getOpcode()) {
llvm_unreachable("Unknown branch instruction?");
case AArch64::Bcc:
return false;
case AArch64::CBZW:
case AArch64::CBZX:
TargetBBInMI = 1;
case AArch64::CBNZW:
case AArch64::CBNZX:
TargetBBInMI = 1;
IsNegativeBranch = true;
case AArch64::TBZW:
case AArch64::TBZX:
TargetBBInMI = 2;
IsTestAndBranch = true;
case AArch64::TBNZW:
case AArch64::TBNZX:
TargetBBInMI = 2;
IsNegativeBranch = true;
IsTestAndBranch = true;
// So we increment a zero register and test for bits other
// than bit 0? Conservatively bail out in case the verifier
// missed this case.
if (IsTestAndBranch && MI.getOperand(1).getImm())
return false;
// Find Definition.
assert(MI.getParent() && "Incomplete machine instruciton\n");
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
Register VReg = MI.getOperand(0).getReg();
if (!Register::isVirtualRegister(VReg))
return false;
MachineInstr *DefMI = MRI->getVRegDef(VReg);
// Look through COPY instructions to find definition.
while (DefMI->isCopy()) {
Register CopyVReg = DefMI->getOperand(1).getReg();
if (!MRI->hasOneNonDBGUse(CopyVReg))
return false;
if (!MRI->hasOneDef(CopyVReg))
return false;
DefMI = MRI->getVRegDef(CopyVReg);
switch (DefMI->getOpcode()) {
return false;
// Fold AND into a TBZ/TBNZ if constant operand is power of 2.
case AArch64::ANDWri:
case AArch64::ANDXri: {
if (IsTestAndBranch)
return false;
if (DefMI->getParent() != MBB)
return false;
if (!MRI->hasOneNonDBGUse(VReg))
return false;
bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
if (!isPowerOf2_64(Mask))
return false;
MachineOperand &MO = DefMI->getOperand(1);
Register NewReg = MO.getReg();
if (!Register::isVirtualRegister(NewReg))
return false;
assert(!MRI->def_empty(NewReg) && "Register must be defined.");
MachineBasicBlock &RefToMBB = *MBB;
MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
DebugLoc DL = MI.getDebugLoc();
unsigned Imm = Log2_64(Mask);
unsigned Opc = (Imm < 32)
? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
: (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
// Register lives on to the CBZ now.
// For immediate smaller than 32, we need to use the 32-bit
// variant (W) in all cases. Indeed the 64-bit variant does not
// allow to encode them.
// Therefore, if the input register is 64-bit, we need to take the
// 32-bit sub-part.
if (!Is32Bit && Imm < 32)
return true;
// Look for CSINC
case AArch64::CSINCWr:
case AArch64::CSINCXr: {
if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
DefMI->getOperand(2).getReg() == AArch64::WZR) &&
!(DefMI->getOperand(1).getReg() == AArch64::XZR &&
DefMI->getOperand(2).getReg() == AArch64::XZR))
return false;
if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
return false;
AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
// Convert only when the condition code is not modified between
// the CSINC and the branch. The CC may be used by other
// instructions in between.
if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
return false;
MachineBasicBlock &RefToMBB = *MBB;
MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
DebugLoc DL = MI.getDebugLoc();
if (IsNegativeBranch)
CC = AArch64CC::getInvertedCondCode(CC);
BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
return true;
/// Split the operand target flags \p TF into two parts.
///
/// \param TF Combined target flags of a machine operand.
/// \return A pair whose first member holds the bits of \p TF selected by
/// AArch64II::MO_FRAGMENT and whose second member holds all remaining bits.
std::pair<unsigned, unsigned>
AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
  const unsigned FragmentBits = AArch64II::MO_FRAGMENT;
  const unsigned InsideFragment = TF & FragmentBits;
  const unsigned OutsideFragment = TF & ~FragmentBits;
  return std::make_pair(InsideFragment, OutsideFragment);
}
/// Return the table that pairs each direct operand target flag with the
/// name used for it when serialized.
///
/// \return A view over a function-local static array of (flag, name) pairs.
ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
  // One entry per line; the order of the entries is kept as-is.
  static const std::pair<unsigned, const char *> DirectFlagNames[] = {
      {AArch64II::MO_PAGE, "aarch64-page"},
      {AArch64II::MO_PAGEOFF, "aarch64-pageoff"},
      {AArch64II::MO_G3, "aarch64-g3"},
      {AArch64II::MO_G2, "aarch64-g2"},
      {AArch64II::MO_G1, "aarch64-g1"},
      {AArch64II::MO_G0, "aarch64-g0"},
      {AArch64II::MO_HI12, "aarch64-hi12"}};
  return makeArrayRef(DirectFlagNames);
}
/// Return the table that pairs each bitmask operand target flag with the
/// name used for it when serialized.
///
/// \return A view over a function-local static array of (flag, name) pairs.
ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
  // The order of the entries is kept as-is.
  static const std::pair<unsigned, const char *> BitmaskFlagNames[] = {
      {AArch64II::MO_COFFSTUB, "aarch64-coffstub"},
      {AArch64II::MO_GOT, "aarch64-got"},
      {AArch64II::MO_NC, "aarch64-nc"},
      {AArch64II::MO_S, "aarch64-s"},
      {AArch64II::MO_TLS, "aarch64-tls"},
      {AArch64II::MO_DLLIMPORT, "aarch64-dllimport"},
      {AArch64II::MO_PREL, "aarch64-prel"},
      {AArch64II::MO_TAGGED, "aarch64-tagged"}};
  return makeArrayRef(BitmaskFlagNames);
}
/// Return the table that pairs each target-specific memory-operand flag with
/// the name used for it when serialized.
///
/// \return A view over a function-local static array of (flag, name) pairs.
ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
  using FlagAndName = std::pair<MachineMemOperand::Flags, const char *>;
  static const FlagAndName MemFlagNames[] = {
      {MOSuppressPair, "aarch64-suppress-pair"},
      {MOStridedAccess, "aarch64-strided-access"}};
  return makeArrayRef(MemFlagNames);
}
/// Constants defining how certain sequences should be outlined.
/// This encompasses how an outlined function should be called, and what kind of
/// frame should be emitted for that outlined function.
///
/// NOTE(review): the two-column diagrams below were rebuilt from the prose and
/// the overhead figures given for each variant; confirm them against the
/// upstream file.
///
/// \p MachineOutlinerDefault implies that the function should be called with
/// a save and restore of LR to the stack.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? Yes
///
/// \p MachineOutlinerTailCall implies that the function is being created from
/// a sequence of instructions ending in a return.
///
/// That is,
///
/// I1                             OUTLINED_FUNCTION:
/// I2 --> B OUTLINED_FUNCTION     I1
/// RET                            I2
///                                RET
///
/// * Call construction overhead: 1 (B)
/// * Frame construction overhead: 0 (Return included in sequence)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerNoLRSave implies that the function should be called using
/// a BL instruction, but doesn't require LR to be saved and restored. This
/// happens when LR is known to be dead.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3                                I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerThunk implies that the function is being created from
/// a sequence of instructions ending in a call. The outlined function is
/// called with a BL instruction, and the outlined function tail-calls the
/// original call destination.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// BL f                              I2
///                                   B f
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 0
/// * Requires stack fixups? No
///
/// \p MachineOutlinerRegSave implies that the function should be called with a
/// save and restore of LR to an available register. This allows us to avoid
/// stack fixups. Note that this outlining variant is compatible with the
/// NoLRSave case.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? No
enum MachineOutlinerClass {
MachineOutlinerDefault, /// Emit a save, restore, call, and return.
MachineOutlinerTailCall, /// Only emit a branch.
MachineOutlinerNoLRSave, /// Emit a call and return.
MachineOutlinerThunk, /// Emit a call and tail-call.
MachineOutlinerRegSave /// Same as default, but save to a register.
/// Per-basic-block bit flags used by the outliner hooks in this file. The
/// values are distinct bits so that they can be OR-ed into, and AND-ed across,
/// a single unsigned flags word.
enum MachineOutlinerMBBFlags {
// Set when LR is not available in the block and no spare register was found
// to save it to.
LRUnavailableSomewhere = 0x2,
// Set when the block contains at least one call instruction.
HasCalls = 0x4,
// Set when W16, W17 and NZCV are all available in the block.
UnsafeRegsDead = 0x8
/// Search AArch64::GPR64RegClass for a register that can hold LR across the
/// candidate sequence \p C.
///
/// \param C Outlining candidate; its liveness info must already be set (see
/// the assertion on C.LRUWasSet).
/// \return The first register that passes every check in the loop, or 0 if
/// there is none.
// NOTE(review): the return-type line and the operand of the static_cast are
// not visible in this listing.
AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
assert(C.LRUWasSet && "LRU wasn't set?");
MachineFunction *MF = C.getMF();
const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
// Check if there is an available register across the sequence that we can
// use.
for (unsigned Reg : AArch64::GPR64RegClass) {
// A register qualifies when it is not reserved, is none of LR/X16/X17, and is
// available both in C.LRU and in C.UsedInSequence.
if (!ARI->isReservedReg(*MF, Reg) &&
Reg != AArch64::LR && // LR is not reserved, but don't use it.
Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
Reg != AArch64::X17 && // Ditto for X17.
C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
return Reg;
// No suitable register. Return 0.
return 0u;
/// Check whether the functions containing candidates \p a and \p b agree on
/// the "sign-return-address" function attribute.
///
/// \return true when neither function has the attribute, when both have it
/// with equal values, or when only one has it and its value is "none".
// NOTE(review): the initialisers of the StringRef locals and the closing
// braces of the if-blocks are not visible in this listing.
static bool
outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
const outliner::Candidate &b) {
const Function &Fa = a.getMF()->getFunction();
const Function &Fb = b.getMF()->getFunction();
// If none of the functions have the "sign-return-address" attribute their
// signing behaviour is equal
if (!Fa.hasFnAttribute("sign-return-address") &&
!Fb.hasFnAttribute("sign-return-address")) {
return true;
// If both functions have the "sign-return-address" attribute their signing
// behaviour is equal, if the values of the attributes are equal
if (Fa.hasFnAttribute("sign-return-address") &&
Fb.hasFnAttribute("sign-return-address")) {
StringRef ScopeA =
StringRef ScopeB =
return ScopeA.equals(ScopeB);
// If function B doesn't have the "sign-return-address" attribute but A does,
// the functions' signing behaviour is equal if A's value for
// "sign-return-address" is "none" and vice versa.
if (Fa.hasFnAttribute("sign-return-address")) {
StringRef ScopeA =
return ScopeA.equals("none");
if (Fb.hasFnAttribute("sign-return-address")) {
StringRef ScopeB =
return ScopeB.equals("none");
// Every combination of "has attribute"/"lacks attribute" was handled above.
llvm_unreachable("Unkown combination of sign-return-address attributes");
/// Check whether the functions containing candidates \p a and \p b agree on
/// the "sign-return-address-key" function attribute.
///
/// \return true when neither function has the attribute, when both have it
/// with equal values, or when only one has it and its value compares equal to
/// "a_key" ignoring case.
// NOTE(review): the initialisers of the StringRef locals and the closing
// braces of the if-blocks are not visible in this listing.
static bool
outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
const outliner::Candidate &b) {
const Function &Fa = a.getMF()->getFunction();
const Function &Fb = b.getMF()->getFunction();
// If none of the functions have the "sign-return-address-key" attribute
// their keys are equal
if (!Fa.hasFnAttribute("sign-return-address-key") &&
!Fb.hasFnAttribute("sign-return-address-key")) {
return true;
// If both functions have the "sign-return-address-key" attribute their
// keys are equal if the values of "sign-return-address-key" are equal
if (Fa.hasFnAttribute("sign-return-address-key") &&
Fb.hasFnAttribute("sign-return-address-key")) {
StringRef KeyA =
StringRef KeyB =
return KeyA.equals(KeyB);
// If B doesn't have the "sign-return-address-key" attribute, both keys are
// equal, if function a has the default key (a_key)
if (Fa.hasFnAttribute("sign-return-address-key")) {
StringRef KeyA =
return KeyA.equals_lower("a_key");
// The mirrored case: only B carries the attribute.
if (Fb.hasFnAttribute("sign-return-address-key")) {
StringRef KeyB =
return KeyB.equals_lower("a_key");
// Every combination of "has attribute"/"lacks attribute" was handled above.
llvm_unreachable("Unkown combination of sign-return-address-key attributes");
/// Check whether the two candidates agree on v8.3a support.
///
/// \return true when hasV8_3aOps() yields the same value for SubtargetA and
/// SubtargetB.
// NOTE(review): the initialisers of SubtargetA and SubtargetB are not visible
// in this listing; presumably they are the subtargets of the functions that
// contain \p a and \p b. Confirm against upstream.
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
const outliner::Candidate &b) {
const AArch64Subtarget &SubtargetA =
const AArch64Subtarget &SubtargetB =
return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
/// Decide whether and how a set of repeated instruction sequences can be
/// outlined.
///
/// \param RepeatedSequenceLocs Candidate occurrences of the same sequence.
/// The vector may be replaced with a reduced set, and call info is set on the
/// candidates that remain.
/// \return An outliner::OutlinedFunction built from the remaining candidates,
/// the sequence size, the frame-construction byte count and the chosen frame
/// ID, or a default-constructed outliner::OutlinedFunction when outlining is
/// rejected.
// NOTE(review): several statements in this listing end mid-expression
// (initialisers, lambda terminators, erase calls, closing braces); confirm
// the exact code against the upstream file.
outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
// Total size in bytes of the first candidate's instructions.
unsigned SequenceSize =
std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
[this](unsigned Sum, const MachineInstr &MI) {
return Sum + getInstSizeInBytes(MI);
unsigned NumBytesToCreateFrame = 0;
// We only allow outlining for functions having exactly matching return
// address signing attributes, i.e., all share the same value for the
// attribute "sign-return-address" and all share the same type of key they
// are signed with.
// Additionally we require all functions to simultaneously either support
// v8.3a features or not. Otherwise an outlined function could get signed
// using dedicated v8.3 instructions and a call from a function that doesn't
// support v8.3 instructions would therefore be invalid.
if (std::adjacent_find(
RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
[](const outliner::Candidate &a, const outliner::Candidate &b) {
// Return true if a and b are non-equal w.r.t. return address
// signing or support of v8.3a features
if (outliningCandidatesSigningScopeConsensus(a, b) &&
outliningCandidatesSigningKeyConsensus(a, b) &&
outliningCandidatesV8_3OpsConsensus(a, b)) {
return false;
return true;
}) != RepeatedSequenceLocs.end()) {
return outliner::OutlinedFunction();
// Since at this point all candidates agree on their return address signing
// picking just one is fine. If the candidate functions potentially sign their
// return addresses, the outlined function should do the same. Note that in
// the case of "sign-return-address"="non-leaf" this is an assumption: It is
// not certainly true that the outlined function will have to sign its return
// address but this decision is made later, when the decision to outline
// has already been made.
// The same holds for the number of additional instructions we need: On
// v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
// necessary. However, at this point we don't know if the outlined function
// will have a RET instruction so we assume the worst.
const Function &FCF = FirstCand.getMF()->getFunction();
const TargetRegisterInfo &TRI = getRegisterInfo();
if (FCF.hasFnAttribute("sign-return-address")) {
// One PAC and one AUT instructions
NumBytesToCreateFrame += 8;
// We have to check if sp modifying instructions would get outlined.
// If so we only allow outlining if sp is unchanged overall, so matching
// sub and add instructions are okay to outline, all other sp modifications
// are not
auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
// Net immediate added to SP by the ADD/SUB instructions seen so far.
int SPValue = 0;
MachineBasicBlock::iterator MBBI = C.front();
for (;;) {
if (MBBI->modifiesRegister(AArch64::SP, &TRI)) {
switch (MBBI->getOpcode()) {
case AArch64::ADDXri:
case AArch64::ADDWri:
assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
assert(MBBI->getOperand(2).isImm() &&
"Expected operand to be immediate");
assert(MBBI->getOperand(1).isReg() &&
"Expected operand to be a register");
// Check if the add just increments sp. If so, we search for
// matching sub instructions that decrement sp. If not, the
// modification is illegal
if (MBBI->getOperand(1).getReg() == AArch64::SP)
SPValue += MBBI->getOperand(2).getImm();
return true;
case AArch64::SUBXri:
case AArch64::SUBWri:
assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
assert(MBBI->getOperand(2).isImm() &&
"Expected operand to be immediate");
assert(MBBI->getOperand(1).isReg() &&
"Expected operand to be a register");
// Check if the sub just decrements sp. If so, we search for
// matching add instructions that increment sp. If not, the
// modification is illegal
if (MBBI->getOperand(1).getReg() == AArch64::SP)
SPValue -= MBBI->getOperand(2).getImm();
return true;
return true;
if (MBBI == C.back())
// A non-zero net change means SP is not restored by the sequence.
if (SPValue)
return true;
return false;
// Remove candidates with illegal stack modifying instructions
// If the sequence doesn't have enough candidates left, then we're done.
if (RepeatedSequenceLocs.size() < 2)
return outliner::OutlinedFunction();
// Properties about candidate MBBs that hold for all of them.
unsigned FlagsSetInAll = 0xF;
// Compute liveness information for each candidate, and set FlagsSetInAll.
std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
[&FlagsSetInAll](outliner::Candidate &C) {
FlagsSetInAll &= C.Flags;
// According to the AArch64 Procedure Call Standard, the following are
// undefined on entry/exit from a function call:
// * Registers x16, x17, (and thus w16, w17)
// * Condition codes (and thus the NZCV register)
// Because of this, we can't outline any sequence of instructions where one
// of these registers is live into/across it. Thus, we need to delete those
// candidates.
auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
// If the unsafe registers in this block are all dead, then we don't need
// to compute liveness here.
if (C.Flags & UnsafeRegsDead)
return false;
LiveRegUnits LRU = C.LRU;
return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
// Are there any candidates where those registers are live?
if (!(FlagsSetInAll & UnsafeRegsDead)) {
// Erase every candidate that violates the restrictions above. (It could be
// true that we have viable candidates, so it's not worth bailing out in
// the case that, say, 1 out of 20 candidates violate the restrictions.)
// If the sequence doesn't have enough candidates left, then we're done.
if (RepeatedSequenceLocs.size() < 2)
return outliner::OutlinedFunction();
// At this point, we have only "safe" candidates to outline. Figure out
// frame + call instruction information.
unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
// Helper lambda which sets call information for every candidate.
auto SetCandidateCallInfo =
[&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
for (outliner::Candidate &C : RepeatedSequenceLocs)
C.setCallInfo(CallID, NumBytesForCall);
// Start from the default frame; the branches below may pick another one.
unsigned FrameID = MachineOutlinerDefault;
NumBytesToCreateFrame += 4;
// True if any candidate's function has the "branch-target-enforcement"
// attribute.
bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
// We check to see if CFI Instructions are present, and if they are
// we find the number of CFI Instructions in the candidates.
unsigned CFICount = 0;
MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
const std::vector<MCCFIInstruction> &CFIInstructions =
if (MBBI->isCFIInstruction()) {
unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex();
MCCFIInstruction CFI = CFIInstructions[CFIIndex];
// We compare the number of found CFI Instructions to the number of CFI
// instructions in the parent function for each candidate. We must check this
// since if we outline one of the CFI instructions in a function, we have to
// outline them all for correctness. If we do not, the address offsets will be
// incorrect between the two sections of the program.
for (outliner::Candidate &C : RepeatedSequenceLocs) {
std::vector<MCCFIInstruction> CFIInstructions =
if (CFICount > 0 && CFICount != CFIInstructions.size())
return outliner::OutlinedFunction();
// Returns true if an instruction is safe to fix up, false otherwise.
auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
if (MI.isCall())
return true;
// Instructions that neither read nor write SP need no fix-up.
if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
!MI.readsRegister(AArch64::SP, &TRI))
return true;
// Any modification of SP will break our code to save/restore LR.
// FIXME: We could handle some instructions which add a constant
// offset to SP, with a bit more work.
if (MI.modifiesRegister(AArch64::SP, &TRI))
return false;
// At this point, we have a stack instruction that we might need to
// fix up. We'll handle it if it's a load or store.
if (MI.mayLoadOrStore()) {
const MachineOperand *Base; // Filled with the base operand of MI.
int64_t Offset; // Filled with the offset of MI.
bool OffsetIsScalable;
// Does it allow us to offset the base operand and is the base the
// register SP?
if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
!Base->isReg() || Base->getReg() != AArch64::SP)
return false;
// Fix-up code below assumes bytes.
if (OffsetIsScalable)
return false;
// Find the minimum/maximum offset for this instruction and check
// if fixing it up would be in range.
int64_t MinOffset,
MaxOffset; // Unscaled offsets for the instruction.
TypeSize Scale(0U, false); // The scale to multiply the offsets by.
unsigned DummyWidth;
getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
Offset += 16; // Update the offset to what it would be if we outlined.
if (Offset < MinOffset * (int64_t)Scale.getFixedSize() ||
Offset > MaxOffset * (int64_t)Scale.getFixedSize())
return false;
// It's in range, so we can outline it.
return true;
// FIXME: Add handling for instructions like "add x0, sp, #8".
// We can't fix it up, so don't outline it.
return false;
// True if it's possible to fix up each stack instruction in this sequence.
// Important for frames/call variants that modify the stack.
bool AllStackInstrsSafe = std::all_of(
FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
// If the last instruction in any candidate is a terminator, then we should
// tail call all of the candidates.
if (RepeatedSequenceLocs[0].back()->isTerminator()) {
FrameID = MachineOutlinerTailCall;
NumBytesToCreateFrame = 0;
SetCandidateCallInfo(MachineOutlinerTailCall, 4);
// A sequence ending in BL, or in BLR/BLRNoIP when no candidate function has
// the "branch-target-enforcement" attribute, is outlined as a thunk.
else if (LastInstrOpcode == AArch64::BL ||
((LastInstrOpcode == AArch64::BLR ||
LastInstrOpcode == AArch64::BLRNoIP) &&
!HasBTI)) {
// FIXME: Do we need to check if the code after this uses the value of LR?
FrameID = MachineOutlinerThunk;
NumBytesToCreateFrame = 0;
SetCandidateCallInfo(MachineOutlinerThunk, 4);
else {
// We need to decide how to emit calls + frames. We can always emit the same
// frame if we don't need to save to the stack. If we have to save to the
// stack, then we need a different frame.
unsigned NumBytesNoStackCalls = 0;
std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
// Check if we have to save LR.
for (outliner::Candidate &C : RepeatedSequenceLocs) {
// If we have a noreturn caller, then we're going to be conservative and
// say that we have to save LR. If we don't have a ret at the end of the
// block, then we can't reason about liveness accurately.
// FIXME: We can probably do better than always disabling this in
// noreturn functions by fixing up the liveness info.
bool IsNoReturn =
// Is LR available? If so, we don't need a save.
if (C.LRU.available(AArch64::LR) && !IsNoReturn) {
NumBytesNoStackCalls += 4;
C.setCallInfo(MachineOutlinerNoLRSave, 4);
// Is an unused register available? If so, we won't modify the stack, so
// we can outline with the same frame type as those that don't save LR.
else if (findRegisterToSaveLRTo(C)) {
NumBytesNoStackCalls += 12;
C.setCallInfo(MachineOutlinerRegSave, 12);
// Is SP used in the sequence at all? If not, we don't have to modify
// the stack, so we are guaranteed to get the same frame.
else if (C.UsedInSequence.available(AArch64::SP)) {
NumBytesNoStackCalls += 12;
C.setCallInfo(MachineOutlinerDefault, 12);
// If we outline this, we need to modify the stack. Pretend we don't
// outline this by saving all of its bytes.
else {
NumBytesNoStackCalls += SequenceSize;
// If there are no places where we have to save LR, then note that we
// don't have to update the stack. Otherwise, give every candidate the
// default call type, as long as it's safe to do so.
if (!AllStackInstrsSafe ||
NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
RepeatedSequenceLocs = CandidatesWithoutStackFixups;
FrameID = MachineOutlinerNoLRSave;
} else {
SetCandidateCallInfo(MachineOutlinerDefault, 12);
// If we dropped all of the candidates, bail out here.
if (RepeatedSequenceLocs.size() < 2) {
return outliner::OutlinedFunction();
// Does every candidate's MBB contain a call? If so, then we might have a call
// in the range.
if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
// Check if the range contains a call. These require a save + restore of the
// link register.
bool ModStackToSaveLR = false;
// The range [front, back) leaves out the last instruction, which is
// handled separately below.
if (std::any_of(FirstCand.front(), FirstCand.back(),
[](const MachineInstr &MI) { return MI.isCall(); }))
ModStackToSaveLR = true;
// Handle the last instruction separately. If this is a tail call, then the
// last instruction is a call. We don't want to save + restore in this case.
// However, it could be possible that the last instruction is a call without
// it being valid to tail call this sequence. We should consider this as
// well.
else if (FrameID != MachineOutlinerThunk &&
FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
ModStackToSaveLR = true;
if (ModStackToSaveLR) {
// We can't fix up the stack. Bail out.
if (!AllStackInstrsSafe) {
return outliner::OutlinedFunction();
// Save + restore LR.
NumBytesToCreateFrame += 8;
// If we have CFI instructions, we can only outline if the outlined section
// can be a tail call
if (FrameID != MachineOutlinerTailCall && CFICount > 0)
return outliner::OutlinedFunction();
return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
NumBytesToCreateFrame, FrameID);
/// Decide whether \p MF may be used as a source of outlining candidates.
///
/// \param MF Machine function under consideration.
/// \param OutlineFromLinkOnceODRs When false, functions with linkonce_odr
/// linkage are rejected.
/// \return true only if none of the disqualifying conditions below holds.
bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
    MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
  const Function &Fn = MF.getFunction();

  // The linker can deduplicate linkonce_odr functions; leave them alone unless
  // the caller opted in. The linkage is only queried when the caller did not.
  const bool LinkerMayDeduplicate =
      OutlineFromLinkOnceODRs ? false : Fn.hasLinkOnceODRLinkage();
  if (LinkerMayDeduplicate)
    return false;

  // A function with a section marking is skipped: the program could expect all
  // of its code to live in the named section.
  // FIXME: Allow outlining from multiple functions with the same section
  // marking.
  if (Fn.hasSection())
    return false;

  // The outliner may modify the stack, which is unsafe in the presence of a
  // red zone. An unknown red-zone state is treated the same as "has one".
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  if (FuncInfo == nullptr)
    return false;
  if (FuncInfo->hasRedZone().getValueOr(true))
    return false;

  // FIXME: Teach the outliner to generate/handle Windows unwind info.
  const bool UsesWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();

  // With every other check passed, MF is safe exactly when it does not use
  // Windows CFI.
  return !UsesWinCFI;
}
/// Decide whether instructions may be outlined from \p MBB and record
/// block-level facts in \p Flags.
///
/// \param MBB Basic block under consideration; its function must track
/// liveness (asserted below).
/// \param Flags In/out bit set of MachineOutlinerMBBFlags; bits are only ever
/// OR-ed in here.
/// \return false if W16, W17 or NZCV is available inside the block but not
/// available at the later check; true otherwise.
// NOTE(review): the statement that adds the block's live-outs to LRU, the
// operand of the static_cast, and the loop exit after `CanSaveLR = true` are
// not visible in this listing.
bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
unsigned &Flags) const {
// Check if LR is available through all of the MBB. If it's not, then set
// a flag.
assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
"Suitable Machine Function for outlining must track liveness");
LiveRegUnits LRU(getRegisterInfo());
// Accumulate register usage over every instruction, walking the block in
// reverse.
std::for_each(MBB.rbegin(), MBB.rend(),
[&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
// Check if each of the unsafe registers are available...
bool W16AvailableInBlock = LRU.available(AArch64::W16);
bool W17AvailableInBlock = LRU.available(AArch64::W17);
bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);
// If all of these are dead (and not live out), we know we don't have to check
// them later.
if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
// Now, add the live outs to the set.
// If any of these registers is available in the MBB, but also a live out of
// the block, then we know outlining is unsafe.
if (W16AvailableInBlock && !LRU.available(AArch64::W16))
return false;
if (W17AvailableInBlock && !LRU.available(AArch64::W17))
return false;
if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
return false;
// Check if there's a call inside this MachineBasicBlock. If there is, then
// set a flag.
if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
Flags |= MachineOutlinerMBBFlags::HasCalls;
MachineFunction *MF = MBB.getParent();
// In the event that we outline, we may have to save LR. If there is an
// available register in the MBB, then we'll always save LR there. Check if
// this is true.
bool CanSaveLR = false;
const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
// Check if there is an available register across the sequence that we can
// use.
for (unsigned Reg : AArch64::GPR64RegClass) {
// A register qualifies when it is not reserved, is none of LR/X16/X17, and is
// available in LRU.
if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
CanSaveLR = true;
// Check if we have a register we can save LR to, and if LR was used
// somewhere. If both of those things are true, then we need to evaluate the
// safety of outlining stack instructions later.
if (!CanSaveLR && !LRU.available(AArch64::LR))
Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
return true;
/// Classify the instruction at \p MIT for the outliner.
///
/// \param MIT Iterator to the instruction being classified.
/// \param Flags Flags word supplied by the caller; no use of it is visible in
/// this listing.
/// \return One of the outliner::InstrType values Legal, LegalTerminator,
/// Illegal or Invisible, chosen by the checks below in order.
// NOTE(review): the return-type line, part of the operand condition at
// `MOP.isFI() ||`, and the switch/loop exits are not visible in this listing;
// confirm against the upstream file.
AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
unsigned Flags) const {
MachineInstr &MI = *MIT;
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
// Don't outline anything used for return address signing. The outlined
// function will get signed later if needed
switch (MI.getOpcode()) {
case AArch64::PACIASP:
case AArch64::PACIBSP:
case AArch64::AUTIASP:
case AArch64::AUTIBSP:
case AArch64::RETAA:
case AArch64::RETAB:
case AArch64::EMITBKEY:
return outliner::InstrType::Illegal;
// Don't outline LOHs.
if (FuncInfo->getLOHRelated().count(&MI))
return outliner::InstrType::Illegal;
// We can only outline these if we will tail call the outlined function, or
// fix up the CFI offsets. Currently, CFI instructions are outlined only if
// in a tail call.
// FIXME: If the proper fixups for the offset are implemented, this should be
// possible.
if (MI.isCFIInstruction())
return outliner::InstrType::Legal;
// Don't allow debug values to impact outlining type.
if (MI.isDebugInstr() || MI.isIndirectDebugValue())
return outliner::InstrType::Invisible;
// At this point, KILL instructions don't really tell us much so we can go
// ahead and skip over them.
if (MI.isKill())
return outliner::InstrType::Invisible;
// Is this a terminator for a basic block?
if (MI.isTerminator()) {
// Is this the end of a function?
if (MI.getParent()->succ_empty())
return outliner::InstrType::Legal;
// It's not, so don't outline it.
return outliner::InstrType::Illegal;
// Make sure none of the operands are un-outlinable.
for (const MachineOperand &MOP : MI.operands()) {
if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
return outliner::InstrType::Illegal;
// If it uses LR or W30 explicitly, then don't touch it.
if (MOP.isReg() && !MOP.isImplicit() &&
(MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
return outliner::InstrType::Illegal;
// Special cases for instructions that can always be outlined, but will fail
// the later tests. e.g, ADRPs, which are PC-relative use LR, but can always
// be outlined because they don't require a *specific* value to be in LR.
if (MI.getOpcode() == AArch64::ADRP)
return outliner::InstrType::Legal;
// If MI is a call we might be able to outline it. We don't want to outline
// any calls that rely on the position of items on the stack. When we outline
// something containing a call, we have to emit a save and restore of LR in
// the outlined function. Currently, this always happens by saving LR to the
// stack. Thus, if we outline, say, half the parameters for a function call
// plus the call, then we'll break the callee's expectations for the layout
// of the stack.
// FIXME: Allow calls to functions which construct a stack frame, as long
// as they don't access arguments on the stack.
// FIXME: Figure out some way to analyze functions defined in other modules.
// We should be able to compute the memory usage based on the IR calling
// convention, even if we can't see the definition.
if (MI.isCall()) {
// Get the function associated with the call. Look at each operand and find
// the one that represents the callee and get its name.
const Function *Callee = nullptr;
for (const MachineOperand &MOP : MI.operands()) {
if (MOP.isGlobal()) {
// Callee stays null when the global operand is not a Function.
Callee = dyn_cast<Function>(MOP.getGlobal());
// Never outline calls to mcount. There isn't any rule that would require
// this, but the Linux kernel's "ftrace" feature depends on it.
if (Callee && Callee->getName() == "\01_mcount")
return outliner::InstrType::Illegal;
// If we don't know anything about the callee, assume it depends on the
// stack layout of the caller. In that case, it's only legal to outline
// as a tail-call. Explicitly list the call instructions we know about so we
// don't get unexpected results with call pseudo-instructions.
auto UnknownCallOutlineType = outliner::InstrType::Illegal;
if (MI.getOpcode() == AArch64::BLR ||
MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
if (!Callee)
return UnknownCallOutlineType;
// We have a function we have information about. Check if it's something
// we can safely outline.
MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
// We don't know what's going on with the callee at all. Don't touch it.
if (!CalleeMF)
return UnknownCallOutlineType;
// Check if we know anything about the callee saves on the function. If we
// don't, then don't touch it, since that implies that we haven't
// computed anything about its stack frame yet.
MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
MFI.getNumObjects() > 0)
return UnknownCallOutlineType;
// At this point, we can say that CalleeMF ought to not pass anything on the
// stack. Therefore, we can outline it.
return outliner::InstrType::Legal;
// Don't outline positions.
if (MI.isPosition())
return outliner::InstrType::Illegal;
// Don't touch the link register or W30.
if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
return outliner::InstrType::Illegal;
// Don't outline BTI instructions, because that will prevent the outlining
// site from being indirectly callable.
if (MI.getOpcode() == AArch64::HINT) {
int64_t Imm = MI.getOperand(0).getImm();
// HINT immediates 32, 34, 36 and 38 are the ones rejected here as BTI.
if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
return outliner::InstrType::Illegal;
return outliner::InstrType::Legal;
/// Walk every instruction in \p MBB looking for loads/stores that address
/// memory through SP with an immediate offset, and compute the immediate that
/// accounts for the extra 16 bytes mentioned in the comment near the end of
/// the loop body.
void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
for (MachineInstr &MI : MBB) {
// Outputs of getMemOperandWithOffsetWidth() below.
const MachineOperand *Base;
unsigned Width;
int64_t Offset;
bool OffsetIsScalable;
// Is this a load or store with an immediate offset with SP as the base?
if (!MI.mayLoadOrStore() ||
!getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
&RI) ||
(Base->isReg() && Base->getReg() != AArch64::SP))
// It is, so we have to fix it up.
TypeSize Scale(0U, false);
// Dummy1/Dummy2 only exist to receive getMemOpInfo() outputs that are not
// read again in the code shown here.
int64_t Dummy1, Dummy2;
MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
assert(Scale != 0 && "Unexpected opcode!");
assert(!OffsetIsScalable && "Expected offset to be a byte offset");
// We've pushed the return address to the stack, so add 16 to the offset.
// This is safe, since we already checked if it would overflow when we
// checked if this instruction was legal to outline.
// The adjusted byte offset is divided by Scale to obtain the new immediate.
int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize();
/// Add return-address signing to the outlined block \p MBB.
///
/// Nothing is done unless \p ShouldSignReturnAddr is set. Otherwise a
/// PACIASP (A key) or EMITBKEY + PACIBSP (B key) is built at the start of the
/// block together with a CFI instruction, and the first terminator is handled
/// with RETAA/RETAB (v8.3a and a RET terminator) or AUTIASP/AUTIBSP.
///
/// \param MF function that owns \p MBB; used to reach the subtarget.
/// \param ShouldSignReturnAddrWithAKey selects the A key when true, the B key
///        otherwise.
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
bool ShouldSignReturnAddr,
bool ShouldSignReturnAddrWithAKey) {
if (ShouldSignReturnAddr) {
// MBBPAC: insertion point for the signing instructions (block start).
// MBBAUT: first terminator of the block.
MachineBasicBlock::iterator MBBPAC = MBB.begin();
MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// Take the debug location from the terminator when there is one.
DebugLoc DL;
if (MBBAUT != MBB.end())
DL = MBBAUT->getDebugLoc();
// At the very beginning of the basic block we insert the following
// depending on the key type
// a_key: b_key:
if (ShouldSignReturnAddrWithAKey) {
BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP))
} else {
BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY))
BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP))
unsigned CFIIndex =
BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
// If v8.3a features are available we can replace a RET instruction by
// RETAA or RETAB and omit the AUT instructions
if (Subtarget.hasV8_3aOps() && MBBAUT != MBB.end() &&
MBBAUT->getOpcode() == AArch64::RET) {
TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA
: AArch64::RETAB))
} else {
TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP
: AArch64::AUTIBSP))
/// Build the frame of the outlined function whose single block is \p MBB.
///
/// Steps visible in this body, in order:
///  - tag tail-call outlined functions with the "Tail Call" outlining style;
///  - for thunks, turn the trailing BL/BLR/BLRNoIP into a tail-call pseudo;
///  - if the block contains a call that is not also a return, save LR with a
///    pre-indexed store, add CFI for the 16-byte adjustment, and restore LR
///    with a post-indexed load;
///  - derive the return-address signing settings from the function attributes
///    of the first candidate;
///  - for non-tail-call/thunk frames, append a RET;
///  - call signOutlinedFunction().
///
/// \param OF describes the outlined function (frame construction ID and
///        candidates).
void AArch64InstrInfo::buildOutlinedFrame(
MachineBasicBlock &MBB, MachineFunction &MF,
const outliner::OutlinedFunction &OF) const {
AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
if (OF.FrameConstructionID == MachineOutlinerTailCall)
FI->setOutliningStyle("Tail Call");
else if (OF.FrameConstructionID == MachineOutlinerThunk) {
// For thunk outlining, rewrite the last instruction from a call to a
// tail-call.
MachineInstr *Call = &*--MBB.instr_end();
unsigned TailOpcode;
if (Call->getOpcode() == AArch64::BL) {
TailOpcode = AArch64::TCRETURNdi;
} else {
assert(Call->getOpcode() == AArch64::BLR ||
Call->getOpcode() == AArch64::BLRNoIP);
TailOpcode = AArch64::TCRETURNriALL;
MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
MBB.insert(MBB.end(), TC);
// Stays true unless a non-tail call is found below; consulted later for the
// "non-leaf" signing scope.
bool IsLeafFunction = true;
// Is there a call in the outlined range?
auto IsNonTailCall = [](const MachineInstr &MI) {
return MI.isCall() && !MI.isReturn();
if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
// Fix up the instructions in the range, since we're going to modify the
// stack.
assert(OF.FrameConstructionID != MachineOutlinerDefault &&
"Can only fix up stack references once");
IsLeafFunction = false;
// LR has to be a live in so that we can save it.
if (!MBB.isLiveIn(AArch64::LR))
// It: where the LR save goes. Et: where the LR restore goes; moved before
// the last instruction for tail-call and thunk frames.
MachineBasicBlock::iterator It = MBB.begin();
MachineBasicBlock::iterator Et = MBB.end();
if (OF.FrameConstructionID == MachineOutlinerTailCall ||
OF.FrameConstructionID == MachineOutlinerThunk)
Et = std::prev(MBB.end());
// Insert a save before the outlined region
MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
.addReg(AArch64::SP, RegState::Define)
It = MBB.insert(It, STRXpre);
const TargetSubtargetInfo &STI = MF.getSubtarget();
const MCRegisterInfo *MRI = STI.getRegisterInfo();
unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
// Add a CFI saying the stack was moved 16 B down.
int64_t StackPosEntry =
MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
// Add a CFI saying that the LR that we want to find is now 16 B higher than
// before.
int64_t LRPosEntry =
MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
// Insert a restore before the terminator for the function.
MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
.addReg(AArch64::SP, RegState::Define)
.addReg(AArch64::LR, RegState::Define)
Et = MBB.insert(Et, LDRXpost);
// If a bunch of candidates reach this point they must agree on their return
// address signing. It is therefore enough to just consider the signing
// behaviour of one of them
const Function &CF = OF.Candidates.front().getMF()->getFunction();
bool ShouldSignReturnAddr = false;
if (CF.hasFnAttribute("sign-return-address")) {
StringRef Scope =
// "all" always signs; "non-leaf" signs only when a non-tail call was found.
if (Scope.equals("all"))
ShouldSignReturnAddr = true;
else if (Scope.equals("non-leaf") && !IsLeafFunction)
ShouldSignReturnAddr = true;
// a_key is the default
bool ShouldSignReturnAddrWithAKey = true;
if (CF.hasFnAttribute("sign-return-address-key")) {
const StringRef Key =
// Key can either be a_key or b_key
assert((Key.equals_lower("a_key") || Key.equals_lower("b_key")) &&
"Return address signing key must be either a_key or b_key");
ShouldSignReturnAddrWithAKey = Key.equals_lower("a_key");
// If this is a tail call outlined function, then there's already a return.
if (OF.FrameConstructionID == MachineOutlinerTailCall ||
OF.FrameConstructionID == MachineOutlinerThunk) {
signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
// It's not a tail call, so we have to insert the return ourselves.
// LR has to be a live in so that we can return to it.
if (!MBB.isLiveIn(AArch64::LR))
MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
MBB.insert(MBB.end(), ret);
signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
// Did we have to modify the stack by saving the link register?
if (OF.FrameConstructionID != MachineOutlinerDefault)
// We modified the stack.
// Walk over the basic block and fix up all the stack accesses.
/// Insert the call to an outlined function at \p It in \p MBB for candidate
/// \p C and return an iterator to the inserted call.
///
/// The shape of the call site depends on C.CallConstructionID:
///  - MachineOutlinerTailCall: a single TCRETURNdi;
///  - MachineOutlinerNoLRSave / MachineOutlinerThunk: a single BL;
///  - MachineOutlinerRegSave: BL wrapped in ORRXrs moves that park LR in a
///    register chosen by findRegisterToSaveLRTo();
///  - otherwise: BL wrapped in STRXpre / LDRXpost of LR on the stack.
MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
MachineFunction &MF, const outliner::Candidate &C) const {
// Are we tail calling?
if (C.CallConstructionID == MachineOutlinerTailCall) {
// If yes, then we can just branch to the label.
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
return It;
// Are we saving the link register?
if (C.CallConstructionID == MachineOutlinerNoLRSave ||
C.CallConstructionID == MachineOutlinerThunk) {
// No, so just insert the call.
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
return It;
// We want to return the spot where we inserted the call.
MachineBasicBlock::iterator CallPt;
// Instructions for saving and restoring LR around the call instruction we're
// going to insert.
MachineInstr *Save;
MachineInstr *Restore;
// Can we save to a register?
if (C.CallConstructionID == MachineOutlinerRegSave) {
// FIXME: This logic should be sunk into a target-specific interface so that
// we don't have to recompute the register.
unsigned Reg = findRegisterToSaveLRTo(C);
assert(Reg != 0 && "No callee-saved register available?");
// Save and restore LR from that register.
Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
} else {
// We have the default case. Save and restore from SP.
Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
.addReg(AArch64::SP, RegState::Define)
Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
.addReg(AArch64::SP, RegState::Define)
.addReg(AArch64::LR, RegState::Define)
// Save, call and restore are inserted in that order; CallPt remembers the
// call's position because It ends up pointing at the restore.
It = MBB.insert(It, Save);
// Insert the call.
It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
CallPt = It;
It = MBB.insert(It, Restore);
return CallPt;
/// Returns true exactly when the IR function behind \p MF reports
/// hasMinSize().
bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
MachineFunction &MF) const {
return MF.getFunction().hasMinSize();
/// Recognize the ORR forms that act as a register move.
///
/// \returns a DestSourcePair built from operands 0 and 2 when \p MI is
/// ORRWrs/ORRXrs with WZR/XZR in operand 1 and a zero immediate in operand 3;
/// None otherwise.
AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
// AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
// and zero immediate operands used as an alias for mov instruction.
if (MI.getOpcode() == AArch64::ORRWrs &&
MI.getOperand(1).getReg() == AArch64::WZR &&
MI.getOperand(3).getImm() == 0x0) {
return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
if (MI.getOpcode() == AArch64::ORRXrs &&
MI.getOperand(1).getReg() == AArch64::XZR &&
MI.getOperand(3).getImm() == 0x0) {
return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
return None;
/// Describe \p MI as "Reg = SrcReg + Offset" when it is an immediate ADD/SUB
/// (including the flag-setting variants) that defines \p Reg.
///
/// \returns the register in operand 1 paired with the immediate in operand 2,
/// negated for the SUB opcodes and shifted left by the amount in operand 3;
/// None when operand 0 is not \p Reg or the opcode is not handled.
Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI,
Register Reg) const {
// Sign becomes -1 for the subtract opcodes.
int Sign = 1;
int64_t Offset = 0;
// TODO: Handle cases where Reg is a super- or sub-register of the
// destination register.
const MachineOperand &Op0 = MI.getOperand(0);
if (!Op0.isReg() || Reg != Op0.getReg())
return None;
switch (MI.getOpcode()) {
return None;
case AArch64::SUBWri:
case AArch64::SUBXri:
case AArch64::SUBSWri:
case AArch64::SUBSXri:
Sign *= -1;
case AArch64::ADDSWri:
case AArch64::ADDSXri:
case AArch64::ADDWri:
case AArch64::ADDXri: {
// TODO: Third operand can be global address (usually some string).
if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
return None;
Offset = MI.getOperand(2).getImm() * Sign;
int Shift = MI.getOperand(3).getImm();
assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
Offset = Offset << Shift;
return RegImmPair{MI.getOperand(1).getReg(), Offset};
/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
/// the destination register then, if possible, describe the value in terms of
/// the source register.
///
/// \returns None when \p MI is not a copy according to TII->isCopyInstr(), or
/// when none of the overlap cases below applies.
static Optional<ParamLoadedValue>
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
auto DestSrc = TII->isCopyInstr(MI);
if (!DestSrc)
return None;
Register DestReg = DestSrc->Destination->getReg();
Register SrcReg = DestSrc->Source->getReg();
// Empty DIExpression attached to every value returned below.
auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
// If the described register is the destination, just return the source.
if (DestReg == DescribedReg)
return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
// ORRWrs zero-extends to 64-bits, so we need to consider such cases.
if (MI.getOpcode() == AArch64::ORRWrs &&
TRI->isSuperRegister(DestReg, DescribedReg))
return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
// We may need to describe the lower part of a ORRXrs move.
if (MI.getOpcode() == AArch64::ORRXrs &&
TRI->isSubRegister(DestReg, DescribedReg)) {
Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
// Any remaining overlap between DestReg and DescribedReg is unexpected.
assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
"Unhandled ORR[XW]rs copy case");
return None;
/// Describe the value that \p MI loads into \p Reg.
///
/// MOVZWi/MOVZXi are described as the shifted immediate, ORRWrs/ORRXrs are
/// delegated to describeORRLoadedValue(), and the final return defers to
/// TargetInstrInfo::describeLoadedValue().
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
Register Reg) const {
const MachineFunction *MF = MI.getMF();
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
switch (MI.getOpcode()) {
case AArch64::MOVZWi:
case AArch64::MOVZXi: {
// MOVZWi may be used for producing zero-extended 32-bit immediates in
// 64-bit parameters, so we need to consider super-registers.
if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
return None;
if (!MI.getOperand(1).isImm())
return None;
// Operand 1 is the immediate and operand 2 its left-shift amount.
int64_t Immediate = MI.getOperand(1).getImm();
int Shift = MI.getOperand(2).getImm();
return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
case AArch64::ORRWrs:
case AArch64::ORRXrs:
return describeORRLoadedValue(MI, Reg, this, TRI);
return TargetInstrInfo::describeLoadedValue(MI, Reg);
/// Returns the bits of the TSFlags of opcode \p Opc selected by
/// AArch64::ElementSizeMask.
uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
return get(Opc).TSFlags & AArch64::ElementSizeMask;
/// Pick the indirect-call opcode for \p MF: AArch64::BLRNoIP when the
/// subtarget's hardenSlsBlr() is set, AArch64::BLR otherwise.
unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
return AArch64::BLRNoIP;
return AArch64::BLR;
#include ""
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 886158ca4490..83a488afc797 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1,679 +1,705 @@
//===- AArch64RegisterInfo.cpp - AArch64 Register Information -------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file contains the AArch64 implementation of the TargetRegisterInfo
// class.
#include "AArch64RegisterInfo.h"
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64StackOffset.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
#include ""
/// Construct the register info for triple \p TT, passing AArch64::LR to the
/// generated AArch64GenRegisterInfo base and storing the triple in TT.
AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
: AArch64GenRegisterInfo(AArch64::LR), TT(TT) {
/// Returns true when the IR function behind \p MF has a ScalableVectorType
/// return type or at least one argument of ScalableVectorType.
+static bool hasSVEArgsOrReturn(const MachineFunction *MF) {
+ const Function &F = MF->getFunction();
+ return isa<ScalableVectorType>(F.getReturnType()) ||
+ any_of(F.args(), [](const Argument &Arg) {
+ return isa<ScalableVectorType>(Arg.getType());
+ });
/// Select the callee-saved register list for \p MF.
///
/// The checks run in the order written: GHC and AnyReg conventions first,
/// then Darwin (delegated to getDarwinCalleeSavedRegs()), then Windows and the
/// remaining calling conventions. The diff adds a check that returns the SVE
/// list for functions with scalable-vector arguments or return value, just
/// before the CSR_AArch64_AAPCS_SaveList fallback.
///
/// \param MF must not be null (asserted).
const MCPhysReg *
AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
if (MF->getFunction().getCallingConv() == CallingConv::GHC)
// GHC set of callee saved regs is empty as all those regs are
// used for passing STG regs around
return CSR_AArch64_NoRegs_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_SaveList;
// Darwin has its own CSR_AArch64_AAPCS_SaveList, which means most CSR save
// lists depending on that will need to have their Darwin variant as well.
if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin())
return getDarwinCalleeSavedRegs(MF);
if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
return CSR_Win_AArch64_CFGuard_Check_SaveList;
if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows())
return CSR_Win_AArch64_AAPCS_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
return CSR_AArch64_AAVPCS_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall)
return CSR_AArch64_SVE_AAPCS_SaveList;
if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
return CSR_AArch64_AAPCS_SwiftError_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
return CSR_AArch64_RT_MostRegs_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::Win64)
// This is for OSes other than Windows; Windows is a separate case further
// above.
return CSR_AArch64_AAPCS_X18_SaveList;
+ if (hasSVEArgsOrReturn(MF))
+ return CSR_AArch64_SVE_AAPCS_SaveList;
return CSR_AArch64_AAPCS_SaveList;
/// Select the callee-saved register list for \p MF on Darwin targets.
///
/// Asserts that \p MF is non-null and that its subtarget is Darwin. The
/// message strings below state that the CFGuard_Check and SVE_VectorCall
/// conventions are unsupported on Darwin; the fallback is
/// CSR_Darwin_AArch64_AAPCS_SaveList.
const MCPhysReg *
AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
assert(MF->getSubtarget<AArch64Subtarget>().isTargetDarwin() &&
"Invalid subtarget for getDarwinCalleeSavedRegs");
if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
"Calling convention CFGuard_Check is unsupported on Darwin.");
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
return CSR_Darwin_AArch64_AAVPCS_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall)
"Calling convention SVE_VectorCall is unsupported on Darwin.");
// CXX_FAST_TLS picks between two lists depending on isSplitCSR().
if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS)
return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR()
? CSR_Darwin_AArch64_CXX_TLS_PE_SaveList
: CSR_Darwin_AArch64_CXX_TLS_SaveList;
if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
return CSR_Darwin_AArch64_AAPCS_SwiftError_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
return CSR_Darwin_AArch64_RT_MostRegs_SaveList;
return CSR_Darwin_AArch64_AAPCS_SaveList;
/// Returns CSR_Darwin_AArch64_CXX_TLS_ViaCopy_SaveList for CXX_FAST_TLS
/// functions that satisfy the condition below, and nullptr otherwise.
///
/// \param MF must not be null (asserted).
const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
return CSR_Darwin_AArch64_CXX_TLS_ViaCopy_SaveList;
return nullptr;
/// Build an updated callee-saved register list for \p MF in UpdatedCSRs.
///
/// Walks the zero-terminated list from getCalleeSavedRegs() and then every
/// register index of GPR64commonRegClass, testing each index with the
/// subtarget's isXRegCustomCalleeSaved().
void AArch64RegisterInfo::UpdateCustomCalleeSavedRegs(
MachineFunction &MF) const {
const MCPhysReg *CSRs = getCalleeSavedRegs(&MF);
SmallVector<MCPhysReg, 32> UpdatedCSRs;
// Iterate until the terminating zero entry.
for (const MCPhysReg *I = CSRs; *I; ++I)
for (size_t i = 0; i < AArch64::GPR64commonRegClass.getNumRegs(); ++i) {
if (MF.getSubtarget<AArch64Subtarget>().isXRegCustomCalleeSaved(i)) {
// Register lists are zero-terminated.
/// Returns the sub-class of \p RC that supports sub-register index \p Idx.
///
/// GPR32all and GPR64all combined with AArch64::hsub are special-cased to
/// FPR32 and FPR64 respectively; every other query is forwarded to the
/// TableGen-generated implementation.
const TargetRegisterClass *
AArch64RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
unsigned Idx) const {
// edge case for GPR/FPR register classes
if (RC == &AArch64::GPR32allRegClass && Idx == AArch64::hsub)
return &AArch64::FPR32RegClass;
else if (RC == &AArch64::GPR64allRegClass && Idx == AArch64::hsub)
return &AArch64::FPR64RegClass;
// Forward to TableGen's default version.
return AArch64GenRegisterInfo::getSubClassWithSubReg(RC, Idx);
/// Select the call-preserved register mask for calling convention \p CC on
/// Darwin targets.
///
/// Asserts that the subtarget of \p MF is Darwin. The message strings below
/// state that SVE_VectorCall and CFGuard_Check are unsupported on Darwin; the
/// fallback is CSR_Darwin_AArch64_AAPCS_RegMask.
const uint32_t *
AArch64RegisterInfo::getDarwinCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
assert(MF.getSubtarget<AArch64Subtarget>().isTargetDarwin() &&
"Invalid subtarget for getDarwinCallPreservedMask");
if (CC == CallingConv::CXX_FAST_TLS)
return CSR_Darwin_AArch64_CXX_TLS_RegMask;
if (CC == CallingConv::AArch64_VectorCall)
return CSR_Darwin_AArch64_AAVPCS_RegMask;
if (CC == CallingConv::AArch64_SVE_VectorCall)
"Calling convention SVE_VectorCall is unsupported on Darwin.");
if (CC == CallingConv::CFGuard_Check)
"Calling convention CFGuard_Check is unsupported on Darwin.");
if (MF.getSubtarget<AArch64Subtarget>()
->supportSwiftError() &&
return CSR_Darwin_AArch64_AAPCS_SwiftError_RegMask;
if (CC == CallingConv::PreserveMost)
return CSR_Darwin_AArch64_RT_MostRegs_RegMask;
return CSR_Darwin_AArch64_AAPCS_RegMask;
/// Select the call-preserved register mask for calling convention \p CC.
///
/// Most results come in two flavours; the _SCS_ variant is returned when the
/// function behind \p MF carries the ShadowCallStack attribute. Darwin targets
/// are delegated to getDarwinCallPreservedMask() and reject ShadowCallStack
/// with report_fatal_error().
const uint32_t *
AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
// SCS: whether the ShadowCallStack function attribute is present.
bool SCS = MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack);
if (CC == CallingConv::GHC)
// This is academic because all GHC calls are (supposed to be) tail calls
return SCS ? CSR_AArch64_NoRegs_SCS_RegMask : CSR_AArch64_NoRegs_RegMask;
if (CC == CallingConv::AnyReg)
return SCS ? CSR_AArch64_AllRegs_SCS_RegMask : CSR_AArch64_AllRegs_RegMask;
// All the following calling conventions are handled differently on Darwin.
if (MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) {
if (SCS)
report_fatal_error("ShadowCallStack attribute not supported on Darwin.");
return getDarwinCallPreservedMask(MF, CC);
if (CC == CallingConv::AArch64_VectorCall)
return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask;
if (CC == CallingConv::AArch64_SVE_VectorCall)
return SCS ? CSR_AArch64_SVE_AAPCS_SCS_RegMask
: CSR_AArch64_SVE_AAPCS_RegMask;
if (CC == CallingConv::CFGuard_Check)
return CSR_Win_AArch64_CFGuard_Check_RegMask;
if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
return SCS ? CSR_AArch64_AAPCS_SwiftError_SCS_RegMask
: CSR_AArch64_AAPCS_SwiftError_RegMask;
if (CC == CallingConv::PreserveMost)
return SCS ? CSR_AArch64_RT_MostRegs_SCS_RegMask
: CSR_AArch64_RT_MostRegs_RegMask;
return SCS ? CSR_AArch64_AAPCS_SCS_RegMask : CSR_AArch64_AAPCS_RegMask;
/// Returns the register mask preserved across TLS calls: the Darwin mask when
/// the triple is a Darwin OS, otherwise the ELF mask (the triple is asserted
/// to use the ELF binary format in that case).
const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
if (TT.isOSDarwin())
return CSR_Darwin_AArch64_TLS_RegMask;
assert(TT.isOSBinFormatELF() && "Invalid target");
return CSR_AArch64_TLS_ELF_RegMask;
/// Replace \p *Mask with a copy that additionally marks the custom
/// callee-saved X registers as preserved.
///
/// A fresh mask is allocated from \p MF, the current mask is copied into it,
/// and for every GPR64common register the subtarget reports as custom
/// callee-saved, the bit of each register yielded by the sub-register iterator
/// is set. \p *Mask is then repointed at the new mask.
void AArch64RegisterInfo::UpdateCustomCallPreservedMask(MachineFunction &MF,
const uint32_t **Mask) const {
uint32_t *UpdatedMask = MF.allocateRegMask();
// RegMaskSize is the number of mask elements (see the sizeof scaling below).
unsigned RegMaskSize = MachineOperand::getRegMaskSize(getNumRegs());
memcpy(UpdatedMask, *Mask, sizeof(UpdatedMask[0]) * RegMaskSize);
for (size_t i = 0; i < AArch64::GPR64commonRegClass.getNumRegs(); ++i) {
if (MF.getSubtarget<AArch64Subtarget>().isXRegCustomCalleeSaved(i)) {
for (MCSubRegIterator SubReg(AArch64::GPR64commonRegClass.getRegister(i),
this, true);
SubReg.isValid(); ++SubReg) {
// See TargetRegisterInfo::getCallPreservedMask for how to interpret the
// register mask.
// One bit per register: word index is Reg / 32, bit index is Reg % 32.
UpdatedMask[*SubReg / 32] |= 1u << (*SubReg % 32);
*Mask = UpdatedMask;
/// Returns CSR_AArch64_NoRegs_RegMask.
const uint32_t *AArch64RegisterInfo::getNoPreservedMask() const {
return CSR_AArch64_NoRegs_RegMask;
/// Returns the "this-return" preserved mask: the Darwin variant when the
/// subtarget of \p MF is Darwin, the generic AAPCS variant otherwise.
///
/// \param CC must not be CallingConv::GHC (asserted).
const uint32_t *
AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
// This should return a register mask that is the same as that returned by
// getCallPreservedMask but that additionally preserves the register used for
// the first i64 argument (which must also be the register used to return a
// single i64 return value)
// In case that the calling convention does not use the same register for
// both, the function should return NULL (does not currently apply)
assert(CC != CallingConv::GHC && "should not be GHC calling convention.");
if (MF.getSubtarget<AArch64Subtarget>().isTargetDarwin())
return CSR_Darwin_AArch64_AAPCS_ThisReturn_RegMask;
return CSR_AArch64_AAPCS_ThisReturn_RegMask;
/// Returns CSR_AArch64_StackProbe_Windows_RegMask.
const uint32_t *AArch64RegisterInfo::getWindowsStackProbePreservedMask() const {
return CSR_AArch64_StackProbe_Windows_RegMask;
/// Compute the set of reserved registers for \p MF.
///
/// Registers are marked through markSuperRegs(): always WSP and WZR; W29 when
/// the function has a frame pointer or the triple is Darwin; every
/// GPR32common register the subtarget reports as reserved; W19 when a base
/// pointer is used; W16 under the SpeculativeLoadHardening attribute.
///
/// \returns a BitVector with getNumRegs() entries.
AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
// FIXME: avoid re-calculating this every time.
BitVector Reserved(getNumRegs());
markSuperRegs(Reserved, AArch64::WSP);
markSuperRegs(Reserved, AArch64::WZR);
if (TFI->hasFP(MF) || TT.isOSDarwin())
markSuperRegs(Reserved, AArch64::W29);
for (size_t i = 0; i < AArch64::GPR32commonRegClass.getNumRegs(); ++i) {
if (MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(i))
markSuperRegs(Reserved, AArch64::GPR32commonRegClass.getRegister(i));
if (hasBasePointer(MF))
markSuperRegs(Reserved, AArch64::W19);
// SLH uses register W16/X16 as the taint register.
if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
markSuperRegs(Reserved, AArch64::W16);
return Reserved;
/// Returns whether \p Reg is reserved in \p MF.
///
/// Note that this recomputes the full reserved set through getReservedRegs()
/// on every call.
bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
MCRegister Reg) const {
return getReservedRegs(MF)[Reg];
/// Returns true if isReservedReg() holds for at least one register visited
/// from AArch64::GPR64argRegClass.
bool AArch64RegisterInfo::isAnyArgRegReserved(const MachineFunction &MF) const {
return std::any_of(std::begin(*AArch64::GPR64argRegClass.MC),
[this, &MF](MCPhysReg r){return isReservedReg(MF, r);});
/// Report, through the LLVMContext of the function behind \p MF, a
/// DiagnosticInfoUnsupported saying that calls are not supported when an
/// argument register is reserved.
void AArch64RegisterInfo::emitReservedArgRegCallError(
const MachineFunction &MF) const {
const Function &F = MF.getFunction();
F.getContext().diagnose(DiagnosticInfoUnsupported{F, "AArch64 doesn't support"
" function calls if any of the argument registers is reserved."});
/// Returns true exactly when \p PhysReg is not reserved in \p MF.
bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF,
MCRegister PhysReg) const {
return !isReservedReg(MF, PhysReg);
/// Returns true only for the zero registers WZR and XZR.
bool AArch64RegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
return PhysReg == AArch64::WZR || PhysReg == AArch64::XZR;
/// Returns GPR64spRegClass; neither \p MF nor \p Kind is consulted.
const TargetRegisterClass *
AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
unsigned Kind) const {
return &AArch64::GPR64spRegClass;
/// Returns the class to use for cross-class copies of \p RC: GPR64 for the
/// CCR class, and \p RC itself for every other class.
const TargetRegisterClass *
AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
if (RC == &AArch64::CCRRegClass)
return &AArch64::GPR64RegClass; // Only MSR & MRS copy NZCV.
return RC;
/// Returns the register used as base pointer, which is always X19.
unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
/// Decide whether \p MF should use a base pointer.
///
/// Only functions with variable-sized objects or EH funclets are candidates.
/// Among those, stack realignment forces a base pointer. The lines added by
/// the diff also force one on SVE subtargets when the SVE stack size is not
/// yet calculated or is non-zero. Otherwise the decision is the local-frame
/// size heuristic at the end. All other functions return false.
bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
// In the presence of variable sized objects or funclets, if the fixed stack
// size is large enough that referencing from the FP won't result in things
// being in range relatively often, we can use a base pointer to allow access
// from the other direction like the SP normally works.
// Furthermore, if both variable sized objects are present, and the
// stack needs to be dynamically re-aligned, the base pointer is the only
// reliable way to reference the locals.
if (MFI.hasVarSizedObjects() || MF.hasEHFunclets()) {
if (needsStackRealignment(MF))
return true;
+ if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ // Frames that have variable sized objects and scalable SVE objects,
+ // should always use a basepointer.
+ if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE())
+ return true;
+ }
// Conservatively estimate whether the negative offset from the frame
// pointer will be sufficient to reach. If a function has a smallish
// frame, it's less likely to have lots of spills and callee saved
// space, so it's all more likely to be within range of the frame pointer.
// If it's wrong, we'll materialize the constant and still get to the
// object; it's just suboptimal. Negative offsets use the unscaled
// load/store instructions, which have a 9-bit signed immediate.
return MFI.getLocalFrameSize() >= 256;
return false;
/// Returns AArch64::FP when frame lowering reports a frame pointer for
/// \p MF, and AArch64::SP otherwise.
AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP;
/// Always returns true, regardless of \p MF.
bool AArch64RegisterInfo::requiresRegisterScavenging(
const MachineFunction &MF) const {
return true;
/// Always returns true, regardless of \p MF.
bool AArch64RegisterInfo::requiresVirtualBaseRegisters(
const MachineFunction &MF) const {
return true;
/// Decide whether the emergency spill slot should sit near FP.
///
/// The diff below changes the result from "has a frame pointer" to "has a
/// frame pointer, needs no stack realignment, and has no SVE stack area". On
/// SVE subtargets the SVE stack size must already have been calculated
/// (asserted).
AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
// This function indicates whether the emergency spillslot should be placed
// close to the beginning of the stackframe (closer to FP) or the end
// (closer to SP).
// The beginning works most reliably if we have a frame pointer.
+ // In the presence of any non-constant space between FP and locals,
+ // (e.g. in case of stack realignment or a scalable SVE area), it is
+ // better to use SP or BP.
const AArch64FrameLowering &TFI = *getFrameLowering(MF);
- return TFI.hasFP(MF);
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ assert((!MF.getSubtarget<AArch64Subtarget>().hasSVE() ||
+ AFI->hasCalculatedStackSizeSVE()) &&
+ "Expected SVE area to be calculated by this point");
+ return TFI.hasFP(MF) && !needsStackRealignment(MF) && !AFI->getStackSizeSVE();
/// Always returns true, regardless of \p MF.
bool AArch64RegisterInfo::requiresFrameIndexScavenging(
const MachineFunction &MF) const {
return true;
/// Returns true when the frame of \p MF cannot be eliminated: frame-pointer
/// elimination is disabled and the function adjusts the stack, or the
/// function has variable-sized objects, or its frame address is taken.
AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI.adjustsStack())
return true;
return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken();
/// needsFrameBaseReg - Returns true if the instruction's frame index
/// reference would be better served by a base register other than FP
/// or SP. Used by LocalStackFrameAllocation to determine which frame index
/// references it should create new base registers for.
///
/// \param MI instruction containing a frame-index operand (asserted).
/// \param Offset offset of the reference; per the comment below it is
///        relative to SP at function entry.
/// \returns false for instructions that neither load nor store, or when an
///          estimated FP- or SP-relative offset is legal, or when even offset
///          0 from SP is illegal; true otherwise.
bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
int64_t Offset) const {
// Scan for the frame-index operand; the loop exists only for its assert.
for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i)
assert(i < MI->getNumOperands() &&
"Instr doesn't have FrameIndex operand!");
// It's the load/store FI references that cause issues, as it can be difficult
// to materialize the offset if it won't fit in the literal field. Estimate
// based on the size of the local frame and some conservative assumptions
// about the rest of the stack frame (note, this is pre-regalloc, so
// we don't know everything for certain yet) whether this offset is likely
// to be out of range of the immediate. Return true if so.
// We only generate virtual base registers for loads and stores, so
// return false for everything else.
if (!MI->mayLoad() && !MI->mayStore())
return false;
// Without a virtual base register, if the function has variable sized
// objects, all fixed-size local references will be via the frame pointer,
// Approximate the offset and see if it's legal for the instruction.
// Note that the incoming offset is based on the SP value at function entry,
// so it'll be negative.
MachineFunction &MF = *MI->getParent()->getParent();
const AArch64FrameLowering *TFI = getFrameLowering(MF);
MachineFrameInfo &MFI = MF.getFrameInfo();
// Estimate an offset from the frame pointer.
// Conservatively assume all GPR callee-saved registers get pushed.
// FP, LR, X19-X28, D8-D15. 64-bits each.
int64_t FPOffset = Offset - 16 * 20;
// Estimate an offset from the stack pointer.
// The incoming offset is relating to the SP at the start of the function,
// but when we access the local it'll be relative to the SP after local
// allocation, so adjust our SP-relative offset by that allocation size.
Offset += MFI.getLocalFrameSize();
// Assume that we'll have at least some spill slots allocated.
// FIXME: This is a total SWAG number. We should run some statistics
// and pick a real one.
Offset += 128; // 128 bytes of spill slots
// If there is a frame pointer, try using it.
// The FP is only available if there is no dynamic realignment. We
// don't know for sure yet whether we'll need that, so we guess based
// on whether there are any local variables that would trigger it.
if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, AArch64::FP, FPOffset))
return false;
// If we can reference via the stack pointer or base pointer, try that.
// FIXME: This (and the code that resolves the references) can be improved
// to only disallow SP relative references in the live range of
// the VLA(s). In practice, it's unclear how much difference that
// would make, but it may be worth doing.
if (isFrameOffsetLegal(MI, AArch64::SP, Offset))
return false;
// If even offset 0 is illegal, we don't want a virtual base register.
if (!isFrameOffsetLegal(MI, AArch64::SP, 0))
return false;
// The offset likely isn't legal; we want to allocate a virtual base register.
return true;
bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
Register BaseReg,
int64_t Offset) const {
assert(MI && "Unable to get the legal offset for nil instruction.");
StackOffset SaveOffset(Offset, MVT::i8);
return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal;
/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
/// at the beginning of the basic block.
void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
Register BaseReg,
int FrameIdx,
int64_t Offset) const {
MachineBasicBlock::iterator Ins = MBB->begin();
DebugLoc DL; // Defaults to "unknown"
if (Ins != MBB->end())
DL = Ins->getDebugLoc();
const MachineFunction &MF = *MBB->getParent();
const AArch64InstrInfo *TII =
const MCInstrDesc &MCID = TII->get(AArch64::ADDXri);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF));
unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
BuildMI(*MBB, Ins, DL, MCID, BaseReg)
void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const {
// ARM doesn't need the general 64-bit offsets
StackOffset Off(Offset, MVT::i8);
unsigned i = 0;
while (!MI.getOperand(i).isFI()) {
assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
const MachineFunction *MF = MI.getParent()->getParent();
const AArch64InstrInfo *TII =
bool Done = rewriteAArch64FrameIndex(MI, i, BaseReg, Off, TII);
assert(Done && "Unable to resolve frame index!");
// Create a scratch register for the frame index elimination in an instruction.
// This function has special handling of stack tagging loop pseudos, in which
// case it can also change the instruction opcode (but not the operands).
static Register
createScratchRegisterForInstruction(MachineInstr &MI,
const AArch64InstrInfo *TII) {
// ST*Gloop have a reserved scratch register in operand 1. Use it, and also
// replace the instruction with the writeback variant because it will now
// satisfy the operand constraints for it.
if (MI.getOpcode() == AArch64::STGloop) {
return MI.getOperand(1).getReg();
} else if (MI.getOpcode() == AArch64::STZGloop) {
return MI.getOperand(1).getReg();
} else {
return MI.getMF()->getRegInfo().createVirtualRegister(
void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64InstrInfo *TII =
const AArch64FrameLowering *TFI = getFrameLowering(MF);
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
bool Tagged =
MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED;
Register FrameReg;
// Special handling of dbg_value, stackmap and patchpoint instructions.
if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
StackOffset Offset =
TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
Offset += StackOffset(MI.getOperand(FIOperandNum + 1).getImm(), MVT::i8);
MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getBytes());
if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) {
MachineOperand &FI = MI.getOperand(FIOperandNum);
int Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex);
StackOffset Offset;
if (MI.getOpcode() == AArch64::TAGPstack) {
// TAGPstack must use the virtual frame register in its 3rd operand.
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
FrameReg = MI.getOperand(3).getReg();
Offset = {MFI.getObjectOffset(FrameIndex) +
} else if (Tagged) {
StackOffset SPOffset = {
MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(), MVT::i8};
if (MFI.hasVarSizedObjects() ||
isAArch64FrameOffsetLegal(MI, SPOffset, nullptr, nullptr, nullptr) !=
(AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal)) {
// Can't update to SP + offset in place. Precalculate the tagged pointer
// in a scratch register.
Offset = TFI->resolveFrameIndexReference(
MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
Register ScratchReg =
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset,
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg)
.ChangeToRegister(ScratchReg, false, false, true);
FrameReg = AArch64::SP;
Offset = {MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize(),
} else {
Offset = TFI->resolveFrameIndexReference(
MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true);
// Modify MI as necessary to handle as much of 'Offset' as possible
if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
"Emergency spill slot is out of reach");
// If we get here, the immediate doesn't fit into the instruction. We folded
// as much as possible above. Handle the rest, providing a register that is
// SP+LargeImm.
Register ScratchReg = createScratchRegisterForInstruction(MI, TII);
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
const AArch64FrameLowering *TFI = getFrameLowering(MF);
switch (RC->getID()) {
return 0;
case AArch64::GPR32RegClassID:
case AArch64::GPR32spRegClassID:
case AArch64::GPR32allRegClassID:
case AArch64::GPR64spRegClassID:
case AArch64::GPR64allRegClassID:
case AArch64::GPR64RegClassID:
case AArch64::GPR32commonRegClassID:
case AArch64::GPR64commonRegClassID:
return 32 - 1 // XZR/SP
- (TFI->hasFP(MF) || TT.isOSDarwin()) // FP
- MF.getSubtarget<AArch64Subtarget>().getNumXRegisterReserved()
- hasBasePointer(MF); // X19
case AArch64::FPR8RegClassID:
case AArch64::FPR16RegClassID:
case AArch64::FPR32RegClassID:
case AArch64::FPR64RegClassID:
case AArch64::FPR128RegClassID:
return 32;
case AArch64::DDRegClassID:
case AArch64::DDDRegClassID:
case AArch64::DDDDRegClassID:
case AArch64::QQRegClassID:
case AArch64::QQQRegClassID:
case AArch64::QQQQRegClassID:
return 32;
case AArch64::FPR128_loRegClassID:
case AArch64::FPR64_loRegClassID:
case AArch64::FPR16_loRegClassID:
return 16;
unsigned AArch64RegisterInfo::getLocalAddressRegister(
const MachineFunction &MF) const {
const auto &MFI = MF.getFrameInfo();
if (!MF.hasEHFunclets() && !MFI.hasVarSizedObjects())
return AArch64::SP;
else if (needsStackRealignment(MF))
return getBaseRegister();
return getFrameRegister(MF);
diff --git a/llvm/lib/Target/AArch64/ b/llvm/lib/Target/AArch64/
index 28a54e6f7d79..3449a8bd16d2 100644
--- a/llvm/lib/Target/AArch64/
+++ b/llvm/lib/Target/AArch64/
@@ -1,2583 +1,2605 @@
//=- - AArch64 SVE Instructions -*- tablegen -*-----=//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// AArch64 Scalable Vector Extension (SVE) Instruction definitions.
// For predicated nodes where the entire operation is controlled by a governing
// predicate, please stick to a similar naming convention as used for the
// ISD nodes:
// SDNode <=> AArch64ISD
// -------------------------------
// _m<n> <=> _MERGE_OP<n>
// _mt <=> _MERGE_PASSTHRU
// _z <=> _MERGE_ZERO
// _p <=> _PRED
// Given the context of this file, it is not strictly necessary to use _p to
// distinguish predicated from unpredicated nodes given that most SVE
// instructions are predicated.
// Contiguous loads - node definitions
def SDT_AArch64_LD1 : SDTypeProfile<1, 3, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
def AArch64ld1_z : SDNode<"AArch64ISD::LD1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1s_z : SDNode<"AArch64ISD::LD1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
// Non-faulting & first-faulting loads - node definitions
def AArch64ldnf1_z : SDNode<"AArch64ISD::LDNF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_z : SDNode<"AArch64ISD::LDFF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldnf1s_z : SDNode<"AArch64ISD::LDNF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_z : SDNode<"AArch64ISD::LDFF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
// Contiguous load and replicate - node definitions
def SDT_AArch64_LD1Replicate : SDTypeProfile<1, 2, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
def AArch64ld1rq_z : SDNode<"AArch64ISD::LD1RQ_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1ro_z : SDNode<"AArch64ISD::LD1RO_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;
// Gather loads - node definitions
def SDT_AArch64_GATHER_SV : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
def SDT_AArch64_GATHER_VS : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
def AArch64ld1_gather_z : SDNode<"AArch64ISD::GLD1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1_gather_scaled_z : SDNode<"AArch64ISD::GLD1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1_gather_uxtw_z : SDNode<"AArch64ISD::GLD1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1_gather_sxtw_z : SDNode<"AArch64ISD::GLD1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1_gather_imm_z : SDNode<"AArch64ISD::GLD1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_z : SDNode<"AArch64ISD::GLD1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_scaled_z : SDNode<"AArch64ISD::GLD1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_uxtw_z : SDNode<"AArch64ISD::GLD1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_sxtw_z : SDNode<"AArch64ISD::GLD1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ld1s_gather_imm_z : SDNode<"AArch64ISD::GLD1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ldff1_gather_z : SDNode<"AArch64ISD::GLDFF1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1_gather_imm_z : SDNode<"AArch64ISD::GLDFF1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_z : SDNode<"AArch64ISD::GLDFF1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldff1s_gather_imm_z : SDNode<"AArch64ISD::GLDFF1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
def AArch64ldnt1_gather_z : SDNode<"AArch64ISD::GLDNT1_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
def AArch64ldnt1s_gather_z : SDNode<"AArch64ISD::GLDNT1S_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
// Contiguous stores - node definitions
def SDT_AArch64_ST1 : SDTypeProfile<0, 4, [
SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>,
SDTCVecEltisVT<2,i1>, SDTCisSameNumEltsAs<0,2>
def AArch64st1 : SDNode<"AArch64ISD::ST1_PRED", SDT_AArch64_ST1, [SDNPHasChain, SDNPMayStore]>;
// Scatter stores - node definitions
def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
def SDT_AArch64_SCATTER_VS : SDTypeProfile<0, 5, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
def AArch64st1_scatter : SDNode<"AArch64ISD::SST1_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;
def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;
// AArch64 SVE/SVE2 - the remaining node definitions
def sve_rdvl_imm : ComplexPattern<i32, 1, "SelectRDVLImm<-32, 31, 16>">;
def sve_cnth_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 8>">;
def sve_cntw_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 4>">;
def sve_cntd_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 2>">;
def sve_cnth_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -8>">;
def sve_cntw_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -4>">;
def sve_cntd_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -2>">;
def SDT_AArch64Reduce : SDTypeProfile<1, 2, [SDTCisVec<1>, SDTCisVec<2>]>;
def AArch64faddv_p : SDNode<"AArch64ISD::FADDV_PRED", SDT_AArch64Reduce>;
def AArch64fmaxv_p : SDNode<"AArch64ISD::FMAXV_PRED", SDT_AArch64Reduce>;
def AArch64fmaxnmv_p : SDNode<"AArch64ISD::FMAXNMV_PRED", SDT_AArch64Reduce>;
def AArch64fminv_p : SDNode<"AArch64ISD::FMINV_PRED", SDT_AArch64Reduce>;
def AArch64fminnmv_p : SDNode<"AArch64ISD::FMINNMV_PRED", SDT_AArch64Reduce>;
def AArch64smaxv_p : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>;
def AArch64umaxv_p : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>;
def AArch64sminv_p : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>;
def AArch64uminv_p : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>;
def AArch64orv_p : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>;
def AArch64eorv_p : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>;
def AArch64andv_p : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>;
def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>;
def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>;
def SDT_AArch64Arith : SDTypeProfile<1, 3, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>
def SDT_AArch64FMA : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4>,
SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4>
// Predicated operations with the result of inactive lanes being unspecified.
def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>;
def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>;
def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>;
def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
// Merging op1 into the inactive lanes.
def AArch64smin_m1 : SDNode<"AArch64ISD::SMIN_MERGE_OP1", SDT_AArch64Arith>;
def AArch64umin_m1 : SDNode<"AArch64ISD::UMIN_MERGE_OP1", SDT_AArch64Arith>;
def AArch64smax_m1 : SDNode<"AArch64ISD::SMAX_MERGE_OP1", SDT_AArch64Arith>;
def AArch64umax_m1 : SDNode<"AArch64ISD::UMAX_MERGE_OP1", SDT_AArch64Arith>;
def AArch64lsl_m1 : SDNode<"AArch64ISD::SHL_MERGE_OP1", SDT_AArch64Arith>;
def AArch64lsr_m1 : SDNode<"AArch64ISD::SRL_MERGE_OP1", SDT_AArch64Arith>;
def AArch64asr_m1 : SDNode<"AArch64ISD::SRA_MERGE_OP1", SDT_AArch64Arith>;
def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>;
def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>;
def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>;
def AArch64fadda_p : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>;
def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>;
def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>]>;
def AArch64dup_mt : SDNode<"AArch64ISD::DUP_MERGE_PASSTHRU", SDT_AArch64DUP_PRED>;
def SDT_IndexVector : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<2>]>;
def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>;
def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>;
let Predicates = [HasSVE] in {
defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
defm RDFFR_P : sve_int_rdffr_unpred<"rdffr", int_aarch64_sve_rdffr>;
def SETFFR : sve_int_setffr<"setffr", int_aarch64_sve_setffr>;
def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>;
defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add, null_frag>;
defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub, null_frag>;
defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>;
defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>;
defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>;
defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>;
defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and", and>;
defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>;
defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>;
defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", null_frag>;
defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", int_aarch64_sve_add, DestructiveBinaryComm>;
defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">;
defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>;
defm ADD_ZPZZ : sve_int_bin_pred_bhsd<AArch64add_p>;
let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_add>;
defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_sub>;
defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_subr>;
defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>;
defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>;
defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>;
defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>;
defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add, null_frag>;
defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub, null_frag>;
defm SUBR_ZI : sve_int_arith_imm0_subr<0b011, "subr", sub>;
defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>;
defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>;
defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>;
defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>;
defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>;
defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>;
defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla>;
defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls>;
// SVE predicated integer reductions.
defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", int_aarch64_sve_saddv>;
defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", int_aarch64_sve_uaddv, int_aarch64_sve_saddv>;
defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_p>;
defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_p>;
defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_p>;
defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_p>;
defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_p>;
defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_p>;
defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_p>;
defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn", or>;
defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>;
defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>;
defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_m1>;
defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_m1>;
defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_m1>;
defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_m1>;
defm MUL_ZI : sve_int_arith_imm2<"mul", mul>;
defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>;
defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", int_aarch64_sve_smulh>;
defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", int_aarch64_sve_umulh>;
// Add unpredicated alternative for the mul instruction.
def : Pat<(mul nxv16i8:$Op1, nxv16i8:$Op2),
(MUL_ZPmZ_B (PTRUE_B 31), $Op1, $Op2)>;
def : Pat<(mul nxv8i16:$Op1, nxv8i16:$Op2),
(MUL_ZPmZ_H (PTRUE_H 31), $Op1, $Op2)>;
def : Pat<(mul nxv4i32:$Op1, nxv4i32:$Op2),
(MUL_ZPmZ_S (PTRUE_S 31), $Op1, $Op2)>;
def : Pat<(mul nxv2i64:$Op1, nxv2i64:$Op2),
(MUL_ZPmZ_D (PTRUE_D 31), $Op1, $Op2)>;
defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", "SDIV_ZPZZ", int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev, "SDIVR_ZPmZ">;
defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", "UDIV_ZPZZ", int_aarch64_sve_udiv, DestructiveBinaryCommWithRev, "UDIVR_ZPmZ">;
defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", "SDIVR_ZPZZ", int_aarch64_sve_sdivr, DestructiveBinaryCommWithRev, "SDIV_ZPmZ", /*isReverseInstr*/ 1>;
defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", "UDIVR_ZPZZ", int_aarch64_sve_udivr, DestructiveBinaryCommWithRev, "UDIV_ZPmZ", /*isReverseInstr*/ 1>;
defm SDIV_ZPZZ : sve_int_bin_pred_sd<AArch64sdiv_p>;
defm UDIV_ZPZZ : sve_int_bin_pred_sd<AArch64udiv_p>;
defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>;
defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>;
defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>;
defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>;
defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb", int_aarch64_sve_sxtb>;
defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb", int_aarch64_sve_uxtb>;
defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth", int_aarch64_sve_sxth>;
defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth", int_aarch64_sve_uxth>;
defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw", int_aarch64_sve_sxtw>;
defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw", int_aarch64_sve_uxtw>;
defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>;
defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>;
defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", int_aarch64_sve_cls>;
defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", int_aarch64_sve_clz>;
defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>;
let Predicates = [HasSVE, HasBF16] in {
def : SVE_3_Op_Pat<nxv8i16, int_aarch64_sve_cnt, nxv8i16, nxv8i1, nxv8bf16, !cast<Instruction>(CNT_ZPmZ_H)>;
defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", int_aarch64_sve_cnot>;
defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", int_aarch64_sve_not>;
defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", int_aarch64_sve_fabs>;
defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", int_aarch64_sve_fneg>;
defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", AArch64smax_m1>;
defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", AArch64umax_m1>;
defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", AArch64smin_m1>;
defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", AArch64umin_m1>;
defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", int_aarch64_sve_sabd>;
defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", int_aarch64_sve_uabd>;
defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", int_aarch64_sve_frecpe_x>;
defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", int_aarch64_sve_frsqrte_x>;
defm FADD_ZPmI : sve_fp_2op_i_p_zds<0b000, "fadd", sve_fpimm_half_one>;
defm FSUB_ZPmI : sve_fp_2op_i_p_zds<0b001, "fsub", sve_fpimm_half_one>;
defm FMUL_ZPmI : sve_fp_2op_i_p_zds<0b010, "fmul", sve_fpimm_half_two>;
defm FSUBR_ZPmI : sve_fp_2op_i_p_zds<0b011, "fsubr", sve_fpimm_half_one>;
defm FMAXNM_ZPmI : sve_fp_2op_i_p_zds<0b100, "fmaxnm", sve_fpimm_zero_one>;
defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", sve_fpimm_zero_one>;
defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>;
defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>;
defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", int_aarch64_sve_fadd, DestructiveBinaryComm>;
defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", int_aarch64_sve_fsub, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">;
defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", int_aarch64_sve_fmul, DestructiveBinaryComm>;
defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>;
defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>;
defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>;
defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", "FMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>;
defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", "FMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>;
defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", "FABD_ZPZZ", int_aarch64_sve_fabd, DestructiveBinaryComm>;
defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>;
defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", "FMULX_ZPZZ", int_aarch64_sve_fmulx, DestructiveBinaryComm>;
defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", "FDIVR_ZPZZ", int_aarch64_sve_fdivr, DestructiveBinaryCommWithRev, "FDIV_ZPmZ", /*isReverseInstr*/ 1>;
defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ">;
defm FADD_ZPZZ : sve_fp_bin_pred_hfd<AArch64fadd_p>;
let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fadd>;
defm FSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsub>;
defm FMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmul>;
defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsubr>;
defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmaxnm>;
defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fminnm>;
defm FMAX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmax>;
defm FMIN_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmin>;
defm FABD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fabd>;
defm FMULX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmulx>;
defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdivr>;
defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdiv>;
defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>;
defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub>;
defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul>;
defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>;
defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", int_aarch64_sve_frecps_x>;
defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", int_aarch64_sve_frsqrts_x>;
defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel", int_aarch64_sve_ftssel_x>;
defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>;
defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>;
defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", int_aarch64_sve_fmla>;
defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls", int_aarch64_sve_fmls>;
defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", int_aarch64_sve_fnmla>;
defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", int_aarch64_sve_fnmls>;
defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad>;
defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb>;
defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad>;
defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb>;
// Add patterns for FMA where disabled lanes are undef.
// FIXME: Implement a pseudo so we can choose a better instruction after
// regalloc.
// Note the operand reordering: the node's third vector operand ($Op3) is
// passed to FMLA directly after the predicate, ahead of $Op1 and $Op2.
// One pattern per element type (f16 / f32 / f64).
def : Pat<(nxv8f16 (AArch64fma_p nxv8i1:$P, nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3)),
(FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
def : Pat<(nxv4f32 (AArch64fma_p nxv4i1:$P, nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3)),
(FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>;
def : Pat<(nxv2f64 (AArch64fma_p nxv2i1:$P, nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3)),
(FMLA_ZPmZZ_D $P, $Op3, $Op1, $Op2)>;
// FTMAD, plus the "by indexed element" variants of FMLA/FMLS/FCMLA/FMUL.
// The indexed records are tied to the *_lane intrinsics; FMLA and FMLS share a
// multiclass and differ only in the leading 1-bit selector.
defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;
defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>;
defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls", int_aarch64_sve_fmls_lane>;
defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla", int_aarch64_sve_fcmla_lane>;
defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>;
// SVE floating point reductions.
// FADDA uses sve_fp_2op_p_vd; the remaining reductions all use
// sve_fp_fast_red. Each record is tied to the matching AArch64*_p node.
defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>;
defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", AArch64faddv_p>;
defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_p>;
defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_p>;
defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", AArch64fmaxv_p>;
defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", AArch64fminv_p>;
// Use more efficient NEON instructions to extract elements within the NEON
// part (first 128 bits) of an SVE register.
// Only element index 0 is matched here. The scalar is produced by two nested
// EXTRACT_SUBREG nodes: first zsub (typed as the v8f16/v4f32/v2f64 vector),
// then hsub/ssub/dsub for the f16/f32/f64 scalar.
def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)),
(f16 (EXTRACT_SUBREG (v8f16 (EXTRACT_SUBREG ZPR:$Zs, zsub)), hsub))>;
def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)),
(f32 (EXTRACT_SUBREG (v4f32 (EXTRACT_SUBREG ZPR:$Zs, zsub)), ssub))>;
def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
(f64 (EXTRACT_SUBREG (v2f64 (EXTRACT_SUBREG ZPR:$Zs, zsub)), dsub))>;
// Splat immediate (unpredicated)
// These multiclasses take only the mnemonic; no selection node is passed in
// here.
defm DUP_ZI : sve_int_dup_imm<"dup">;
defm FDUP_ZI : sve_int_dup_fpimm<"fdup">;
defm DUPM_ZI : sve_int_dup_mask_imm<"dupm">;
// Splat immediate (predicated)
// CPY has separate merging (ZPmI) and zeroing (ZPzI) records; FCPY has only a
// merging (ZPmI) record here.
defm CPY_ZPmI : sve_int_dup_imm_pred_merge<"cpy">;
defm CPY_ZPzI : sve_int_dup_imm_pred_zero<"cpy">;
defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">;
// Splat scalar register (unpredicated, GPR or vector + element index)
// DUP_ZR is tied to the AArch64dup node; DUP_ZZI is given no node here.
defm DUP_ZR : sve_int_perm_dup_r<"dup", AArch64dup>;
defm DUP_ZZI : sve_int_perm_dup_i<"dup">;
// Splat scalar register (predicated)
// Both the _r and _v CPY multiclasses are tied to the same AArch64dup_mt node.
defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_mt>;
defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>;
// bfloat16 form of the predicated scalar splat: AArch64dup_mt on nxv8bf16 is
// selected to CPY_ZPmV_H. The passthru vector becomes the first operand,
// followed by the predicate and the scalar. Requires both SVE and BF16.
let Predicates = [HasSVE, HasBF16] in {
  def : Pat<(nxv8bf16 (AArch64dup_mt nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)),
            (CPY_ZPmV_H $passthru, $pg, $splat)>;
}
// Duplicate FP scalar into all vector elements
// The FP scalar is first placed in the low lane of an otherwise undefined
// vector register (INSERT_SUBREG into IMPLICIT_DEF via hsub/ssub/dsub), and
// lane 0 of that register is then broadcast with the indexed DUP.
// The unpacked types (nxv4f16, nxv2f16, nxv2f32) reuse the same instruction as
// the packed type with the same scalar element type.
def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))),
          (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
def : Pat<(nxv4f16 (AArch64dup (f16 FPR16:$src))),
          (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
def : Pat<(nxv2f16 (AArch64dup (f16 FPR16:$src))),
          (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
def : Pat<(nxv4f32 (AArch64dup (f32 FPR32:$src))),
          (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
def : Pat<(nxv2f32 (AArch64dup (f32 FPR32:$src))),
          (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))),
          (DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>;
// bfloat16 scalars live in FPR16 as well, so they take the same H-sized path;
// this pattern additionally requires BF16.
let Predicates = [HasSVE, HasBF16] in {
  def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))),
            (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
}
// Duplicate +0.0 into all vector elements
// A floating-point zero splat is selected to the integer immediate DUP with
// both operands 0. Unpacked types (nxv4f16, nxv2f16, nxv2f32) use the same
// instruction as the packed type with the same element size.
def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
def : Pat<(nxv4f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
def : Pat<(nxv2f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>;
// bfloat16 zero splat: selected to the H-sized integer immediate DUP with both
// operands 0. Requires both SVE and BF16.
let Predicates = [HasSVE, HasBF16] in {
  def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
}
// Duplicate Int immediate into all vector elements
// SVE8BitLslImm splits the constant into two i32 operands ($a, $b), which are
// forwarded unchanged to DUP_ZI_*.
// NOTE(review): presumably $a is the 8-bit value and $b the shift — confirm
// against the SVE8BitLslImm definition.
// The B/H/S forms match an i32 scalar; the D form matches an i64 scalar.
def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
(DUP_ZI_B $a, $b)>;
def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
(DUP_ZI_H $a, $b)>;
def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
(DUP_ZI_S $a, $b)>;
def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))),
(DUP_ZI_D $a, $b)>;
// Duplicate FP immediate into all vector elements
// AddedComplexity = 2 raises the priority of these patterns relative to
// patterns with default complexity, so a splat of an encodable FP immediate
// selects FDUP.
// Unpacked types (nxv4f16, nxv2f16, nxv2f32) use the same instruction as the
// packed type with the same element size.
let AddedComplexity = 2 in {
  def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)),
            (FDUP_ZI_H fpimm16:$imm8)>;
  def : Pat<(nxv4f16 (AArch64dup fpimm16:$imm8)),
            (FDUP_ZI_H fpimm16:$imm8)>;
  def : Pat<(nxv2f16 (AArch64dup fpimm16:$imm8)),
            (FDUP_ZI_H fpimm16:$imm8)>;
  def : Pat<(nxv4f32 (AArch64dup fpimm32:$imm8)),
            (FDUP_ZI_S fpimm32:$imm8)>;
  def : Pat<(nxv2f32 (AArch64dup fpimm32:$imm8)),
            (FDUP_ZI_S fpimm32:$imm8)>;
  def : Pat<(nxv2f64 (AArch64dup fpimm64:$imm8)),
            (FDUP_ZI_D fpimm64:$imm8)>;
}
// Select elements from either vector (predicated)
// SEL is tied to the generic vselect node; SPLICE is tied to its intrinsic.
defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;
defm SPLICE_ZPZ : sve_int_perm_splice<"splice", int_aarch64_sve_splice>;
// bfloat16 forms of vselect and splice, mapped onto the H-sized SEL and SPLICE
// instructions. Requires both SVE and BF16.
let Predicates = [HasSVE, HasBF16] in {
  def : SVE_3_Op_Pat<nxv8bf16, vselect, nxv8i1, nxv8bf16, nxv8bf16, SEL_ZPZZ_H>;
  def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_splice, nxv8i1, nxv8bf16, nxv8bf16, SPLICE_ZPZ_H>;
}
// Permutes: COMPACT, INSR (from a scalar register, _ZR, or a vector register,
// _ZV — both tied to AArch64insr) and EXT with an immediate.
defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>;
defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>;
defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>;
defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>;
// bfloat16 form of AArch64insr, mapped onto the H-sized INSR_ZV instruction.
// Requires both SVE and BF16.
let Predicates = [HasSVE, HasBF16] in {
  def : SVE_2_Op_Pat<nxv8bf16, AArch64insr, nxv8bf16, bf16, INSR_ZV_H>;
}
// Reversal operations. RBIT/REVB/REVH/REVW are predicated (_ZPmZ) and tied to
// intrinsics; REVB is additionally given the generic bswap node. REV exists
// for both predicate (_PP) and vector (_ZZ) registers, tied to AArch64rev.
defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>;
defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>;
defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>;
defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", int_aarch64_sve_revw>;
defm REV_PP : sve_int_perm_reverse_p<"rev", AArch64rev>;
defm REV_ZZ : sve_int_perm_reverse_z<"rev", AArch64rev>;
// bfloat16 form of AArch64rev, mapped onto the H-sized REV_ZZ instruction.
// Requires both SVE and BF16.
let Predicates = [HasSVE, HasBF16] in {
  def : SVE_1_Op_Pat<nxv8bf16, AArch64rev, nxv8bf16, REV_ZZ_H>;
}
// Vector unpacks (signed/unsigned, lo/hi) share one multiclass and are
// distinguished by the 2-bit selector; predicate unpacks use a 1-bit selector.
defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>;
defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>;
defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>;
defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi", AArch64uunpkhi>;
defm PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo", int_aarch64_sve_punpklo>;
defm PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi", int_aarch64_sve_punpkhi>;
// MOVPRFX in zeroing-predicated, merging-predicated and unpredicated forms.
// The unpredicated form is a plain `def` (a single record) rather than `defm`.
defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">;
defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">;
def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>;
defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>;
// Predicate break operations. The variants whose mnemonic ends in "s"
// (brkpas, brkpbs, brkns, brkas, brkbs) are given null_frag, i.e. no selection
// pattern is attached to them here; the others are tied to their intrinsics.
// BRKA/BRKB have both zeroing (_PPzP) and merging (_PPmP) records.
defm BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa", int_aarch64_sve_brkpa_z>;
defm BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas", null_frag>;
defm BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb", int_aarch64_sve_brkpb_z>;
defm BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs", null_frag>;
defm BRKN_PPzP : sve_int_brkn<0b0, "brkn", int_aarch64_sve_brkn_z>;
defm BRKNS_PPzP : sve_int_brkn<0b1, "brkns", null_frag>;
defm BRKA_PPzP : sve_int_break_z<0b000, "brka", int_aarch64_sve_brka_z>;
defm BRKA_PPmP : sve_int_break_m<0b001, "brka", int_aarch64_sve_brka>;
defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas", null_frag>;
defm BRKB_PPzP : sve_int_break_z<0b100, "brkb", int_aarch64_sve_brkb_z>;
defm BRKB_PPmP : sve_int_break_m<0b101, "brkb", int_aarch64_sve_brkb>;
defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>;
// Predicate test / false / first / next. PTEST and PFALSE are single records
// (`def`) with no selection node passed in; PFIRST and PNEXT are tied to their
// intrinsics.
def PTEST_PP : sve_int_ptest<0b010000, "ptest">;
def PFALSE : sve_int_pfalse<0b000000, "pfalse">;
defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>;
defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>;
// Predicate logical operations, all from sve_int_pred_log and distinguished by
// the 4-bit selector. AND/EOR/ORR additionally pass the generic and/xor/or
// node as a fourth argument; SEL is tied to vselect. The variants whose
// mnemonic ends in "s" (ands, bics, eors, orrs, orns, nors, nands) are given
// null_frag, i.e. no selection pattern is attached to them here.
defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z, and>;
defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic_z>;
defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z, xor>;
defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>;
defm ANDS_PPzPP : sve_int_pred_log<0b0100, "ands", null_frag>;
defm BICS_PPzPP : sve_int_pred_log<0b0101, "bics", null_frag>;
defm EORS_PPzPP : sve_int_pred_log<0b0110, "eors", null_frag>;
defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z, or>;
defm ORN_PPzPP : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn_z>;
defm NOR_PPzPP : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor_z>;
defm NAND_PPzPP : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand_z>;
defm ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs", null_frag>;
defm ORNS_PPzPP : sve_int_pred_log<0b1101, "orns", null_frag>;
defm NORS_PPzPP : sve_int_pred_log<0b1110, "nors", null_frag>;
defm NANDS_PPzPP : sve_int_pred_log<0b1111, "nands", null_frag>;
// CLASTA/CLASTB in three result flavours: _RPZ and _VPZ are both tied to the
// AArch64clast{a,b}_n nodes, while _ZPZ is tied to the intrinsics. The leading
// 0/1 argument selects between the A and B variants.
defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta", AArch64clasta_n>;
defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb", AArch64clastb_n>;
defm CLASTA_VPZ : sve_int_perm_clast_vz<0, "clasta", AArch64clasta_n>;
defm CLASTB_VPZ : sve_int_perm_clast_vz<1, "clastb", AArch64clastb_n>;
defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>;
defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>;
// bfloat16 forms of CLASTA/CLASTB. The scalar-result (bf16) patterns map onto
// the H-sized _VPZ instructions and the vector-result (nxv8bf16) patterns onto
// the H-sized _ZPZ instructions. Requires both SVE and BF16.
let Predicates = [HasSVE, HasBF16] in {
  def : SVE_3_Op_Pat<bf16, AArch64clasta_n, nxv8i1, bf16, nxv8bf16, CLASTA_VPZ_H>;
  def : SVE_3_Op_Pat<bf16, AArch64clastb_n, nxv8i1, bf16, nxv8bf16, CLASTB_VPZ_H>;
  def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clasta, nxv8i1, nxv8bf16, nxv8bf16, CLASTA_ZPZ_H>;
  def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clastb, nxv8i1, nxv8bf16, nxv8bf16, CLASTB_ZPZ_H>;
}
// LASTA/LASTB in _RPZ and _VPZ flavours; both flavours are tied to the same
// AArch64last{a,b} nodes. The leading 0/1 argument selects A versus B.
defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>;
defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>;
defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>;
defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>;
// bfloat16 forms of LASTA/LASTB, mapped onto the H-sized _VPZ instructions.
// Requires both SVE and BF16.
let Predicates = [HasSVE, HasBF16] in {
  def : SVE_2_Op_Pat<bf16, AArch64lasta, nxv8i1, nxv8bf16, LASTA_VPZ_H>;
  def : SVE_2_Op_Pat<bf16, AArch64lastb, nxv8i1, nxv8bf16, LASTB_VPZ_H>;
}
// contiguous load with reg+immediate
// Sixteen records covering every value of the 4-bit selector. The mnemonic
// names the memory element (b/h/w/d) and the Z_*/ZPR* arguments name the
// vector element size.
// NOTE(review): the ld1s* mnemonics are presumably the sign-extending forms —
// confirm against the ISA reference.
defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>;
defm LD1B_H_IMM : sve_mem_cld_si<0b0001, "ld1b", Z_h, ZPR16>;
defm LD1B_S_IMM : sve_mem_cld_si<0b0010, "ld1b", Z_s, ZPR32>;
defm LD1B_D_IMM : sve_mem_cld_si<0b0011, "ld1b", Z_d, ZPR64>;
defm LD1SW_D_IMM : sve_mem_cld_si<0b0100, "ld1sw", Z_d, ZPR64>;
defm LD1H_IMM : sve_mem_cld_si<0b0101, "ld1h", Z_h, ZPR16>;
defm LD1H_S_IMM : sve_mem_cld_si<0b0110, "ld1h", Z_s, ZPR32>;
defm LD1H_D_IMM : sve_mem_cld_si<0b0111, "ld1h", Z_d, ZPR64>;
defm LD1SH_D_IMM : sve_mem_cld_si<0b1000, "ld1sh", Z_d, ZPR64>;
defm LD1SH_S_IMM : sve_mem_cld_si<0b1001, "ld1sh", Z_s, ZPR32>;
defm LD1W_IMM : sve_mem_cld_si<0b1010, "ld1w", Z_s, ZPR32>;
defm LD1W_D_IMM : sve_mem_cld_si<0b1011, "ld1w", Z_d, ZPR64>;
defm LD1SB_D_IMM : sve_mem_cld_si<0b1100, "ld1sb", Z_d, ZPR64>;
defm LD1SB_S_IMM : sve_mem_cld_si<0b1101, "ld1sb", Z_s, ZPR32>;
defm LD1SB_H_IMM : sve_mem_cld_si<0b1110, "ld1sb", Z_h, ZPR16>;
defm LD1D_IMM : sve_mem_cld_si<0b1111, "ld1d", Z_d, ZPR64>;
// LD1R loads (splat scalar to vector)
// Two 2-bit selectors identify each record. The final argument is the
// immediate operand class; its suffix (s1/s2/s4/s8) follows the memory element
// size named by the mnemonic (b/h/w/d).
defm LD1RB_IMM : sve_mem_ld_dup<0b00, 0b00, "ld1rb", Z_b, ZPR8, uimm6s1>;
defm LD1RB_H_IMM : sve_mem_ld_dup<0b00, 0b01, "ld1rb", Z_h, ZPR16, uimm6s1>;
defm LD1RB_S_IMM : sve_mem_ld_dup<0b00, 0b10, "ld1rb", Z_s, ZPR32, uimm6s1>;
defm LD1RB_D_IMM : sve_mem_ld_dup<0b00, 0b11, "ld1rb", Z_d, ZPR64, uimm6s1>;
defm LD1RSW_IMM : sve_mem_ld_dup<0b01, 0b00, "ld1rsw", Z_d, ZPR64, uimm6s4>;
defm LD1RH_IMM : sve_mem_ld_dup<0b01, 0b01, "ld1rh", Z_h, ZPR16, uimm6s2>;
defm LD1RH_S_IMM : sve_mem_ld_dup<0b01, 0b10, "ld1rh", Z_s, ZPR32, uimm6s2>;
defm LD1RH_D_IMM : sve_mem_ld_dup<0b01, 0b11, "ld1rh", Z_d, ZPR64, uimm6s2>;
defm LD1RSH_D_IMM : sve_mem_ld_dup<0b10, 0b00, "ld1rsh", Z_d, ZPR64, uimm6s2>;
defm LD1RSH_S_IMM : sve_mem_ld_dup<0b10, 0b01, "ld1rsh", Z_s, ZPR32, uimm6s2>;
defm LD1RW_IMM : sve_mem_ld_dup<0b10, 0b10, "ld1rw", Z_s, ZPR32, uimm6s4>;
defm LD1RW_D_IMM : sve_mem_ld_dup<0b10, 0b11, "ld1rw", Z_d, ZPR64, uimm6s4>;
defm LD1RSB_D_IMM : sve_mem_ld_dup<0b11, 0b00, "ld1rsb", Z_d, ZPR64, uimm6s1>;
defm LD1RSB_S_IMM : sve_mem_ld_dup<0b11, 0b01, "ld1rsb", Z_s, ZPR32, uimm6s1>;
defm LD1RSB_H_IMM : sve_mem_ld_dup<0b11, 0b10, "ld1rsb", Z_h, ZPR16, uimm6s1>;
defm LD1RD_IMM : sve_mem_ld_dup<0b11, 0b11, "ld1rd", Z_d, ZPR64, uimm6s8>;
// LD1RQ loads (load quadword-vector and splat to scalable vector)
// The *_IMM records use the reg+immediate multiclass (sve_mem_ldqr_si); the
// others use the reg+reg multiclass (sve_mem_ldqr_ss), whose extra argument is
// the offset register class matching the element size (shifted8/16/32/64).
defm LD1RQ_B_IMM : sve_mem_ldqr_si<0b00, "ld1rqb", Z_b, ZPR8>;
defm LD1RQ_H_IMM : sve_mem_ldqr_si<0b01, "ld1rqh", Z_h, ZPR16>;
defm LD1RQ_W_IMM : sve_mem_ldqr_si<0b10, "ld1rqw", Z_s, ZPR32>;
defm LD1RQ_D_IMM : sve_mem_ldqr_si<0b11, "ld1rqd", Z_d, ZPR64>;
defm LD1RQ_B : sve_mem_ldqr_ss<0b00, "ld1rqb", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm LD1RQ_H : sve_mem_ldqr_ss<0b01, "ld1rqh", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;
// contiguous load with reg+reg addressing.
// Same selector/mnemonic/vector-element layout as the reg+immediate LD1
// records, plus a final offset register class. That class follows the memory
// element size named by the mnemonic (b -> shifted8, h -> shifted16,
// w -> shifted32, d -> shifted64), not the vector element size.
defm LD1B : sve_mem_cld_ss<0b0000, "ld1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm LD1B_H : sve_mem_cld_ss<0b0001, "ld1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
defm LD1B_S : sve_mem_cld_ss<0b0010, "ld1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
defm LD1B_D : sve_mem_cld_ss<0b0011, "ld1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
defm LD1SW_D : sve_mem_cld_ss<0b0100, "ld1sw", Z_d, ZPR64, GPR64NoXZRshifted32>;
defm LD1H : sve_mem_cld_ss<0b0101, "ld1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm LD1H_S : sve_mem_cld_ss<0b0110, "ld1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
defm LD1H_D : sve_mem_cld_ss<0b0111, "ld1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
defm LD1SH_D : sve_mem_cld_ss<0b1000, "ld1sh", Z_d, ZPR64, GPR64NoXZRshifted16>;
defm LD1SH_S : sve_mem_cld_ss<0b1001, "ld1sh", Z_s, ZPR32, GPR64NoXZRshifted16>;
defm LD1W : sve_mem_cld_ss<0b1010, "ld1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm LD1W_D : sve_mem_cld_ss<0b1011, "ld1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
defm LD1SB_D : sve_mem_cld_ss<0b1100, "ld1sb", Z_d, ZPR64, GPR64NoXZRshifted8>;
defm LD1SB_S : sve_mem_cld_ss<0b1101, "ld1sb", Z_s, ZPR32, GPR64NoXZRshifted8>;
defm LD1SB_H : sve_mem_cld_ss<0b1110, "ld1sb", Z_h, ZPR16, GPR64NoXZRshifted8>;
defm LD1D : sve_mem_cld_ss<0b1111, "ld1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
// non-faulting contiguous load with reg+immediate
// Uses the same 4-bit selectors and vector-element arguments as the LD1
// reg+immediate records, with ldnf1* mnemonics. Only reg+immediate records are
// defined in this group.
defm LDNF1B_IMM : sve_mem_cldnf_si<0b0000, "ldnf1b", Z_b, ZPR8>;
defm LDNF1B_H_IMM : sve_mem_cldnf_si<0b0001, "ldnf1b", Z_h, ZPR16>;
defm LDNF1B_S_IMM : sve_mem_cldnf_si<0b0010, "ldnf1b", Z_s, ZPR32>;
defm LDNF1B_D_IMM : sve_mem_cldnf_si<0b0011, "ldnf1b", Z_d, ZPR64>;
defm LDNF1SW_D_IMM : sve_mem_cldnf_si<0b0100, "ldnf1sw", Z_d, ZPR64>;
defm LDNF1H_IMM : sve_mem_cldnf_si<0b0101, "ldnf1h", Z_h, ZPR16>;
defm LDNF1H_S_IMM : sve_mem_cldnf_si<0b0110, "ldnf1h", Z_s, ZPR32>;
defm LDNF1H_D_IMM : sve_mem_cldnf_si<0b0111, "ldnf1h", Z_d, ZPR64>;
defm LDNF1SH_D_IMM : sve_mem_cldnf_si<0b1000, "ldnf1sh", Z_d, ZPR64>;
defm LDNF1SH_S_IMM : sve_mem_cldnf_si<0b1001, "ldnf1sh", Z_s, ZPR32>;
defm LDNF1W_IMM : sve_mem_cldnf_si<0b1010, "ldnf1w", Z_s, ZPR32>;
defm LDNF1W_D_IMM : sve_mem_cldnf_si<0b1011, "ldnf1w", Z_d, ZPR64>;
defm LDNF1SB_D_IMM : sve_mem_cldnf_si<0b1100, "ldnf1sb", Z_d, ZPR64>;
defm LDNF1SB_S_IMM : sve_mem_cldnf_si<0b1101, "ldnf1sb", Z_s, ZPR32>;
defm LDNF1SB_H_IMM : sve_mem_cldnf_si<0b1110, "ldnf1sb", Z_h, ZPR16>;
defm LDNF1D_IMM : sve_mem_cldnf_si<0b1111, "ldnf1d", Z_d, ZPR64>;
// First-faulting loads with reg+reg addressing.
// Note these take the GPR64shifted* offset classes rather than the
// GPR64NoXZRshifted* classes used by the LD1 reg+reg records.
// NOTE(review): presumably this permits XZR as the offset register — confirm
// against the register class definitions.
defm LDFF1B : sve_mem_cldff_ss<0b0000, "ldff1b", Z_b, ZPR8, GPR64shifted8>;
defm LDFF1B_H : sve_mem_cldff_ss<0b0001, "ldff1b", Z_h, ZPR16, GPR64shifted8>;
defm LDFF1B_S : sve_mem_cldff_ss<0b0010, "ldff1b", Z_s, ZPR32, GPR64shifted8>;
defm LDFF1B_D : sve_mem_cldff_ss<0b0011, "ldff1b", Z_d, ZPR64, GPR64shifted8>;
defm LDFF1SW_D : sve_mem_cldff_ss<0b0100, "ldff1sw", Z_d, ZPR64, GPR64shifted32>;
defm LDFF1H : sve_mem_cldff_ss<0b0101, "ldff1h", Z_h, ZPR16, GPR64shifted16>;
defm LDFF1H_S : sve_mem_cldff_ss<0b0110, "ldff1h", Z_s, ZPR32, GPR64shifted16>;
defm LDFF1H_D : sve_mem_cldff_ss<0b0111, "ldff1h", Z_d, ZPR64, GPR64shifted16>;
defm LDFF1SH_D : sve_mem_cldff_ss<0b1000, "ldff1sh", Z_d, ZPR64, GPR64shifted16>;
defm LDFF1SH_S : sve_mem_cldff_ss<0b1001, "ldff1sh", Z_s, ZPR32, GPR64shifted16>;
defm LDFF1W : sve_mem_cldff_ss<0b1010, "ldff1w", Z_s, ZPR32, GPR64shifted32>;
defm LDFF1W_D : sve_mem_cldff_ss<0b1011, "ldff1w", Z_d, ZPR64, GPR64shifted32>;
defm LDFF1SB_D : sve_mem_cldff_ss<0b1100, "ldff1sb", Z_d, ZPR64, GPR64shifted8>;
defm LDFF1SB_S : sve_mem_cldff_ss<0b1101, "ldff1sb", Z_s, ZPR32, GPR64shifted8>;
defm LDFF1SB_H : sve_mem_cldff_ss<0b1110, "ldff1sb", Z_h, ZPR16, GPR64shifted8>;
defm LDFF1D : sve_mem_cldff_ss<0b1111, "ldff1d", Z_d, ZPR64, GPR64shifted64>;
// LD(2|3|4) structured loads with reg+immediate
// First selector: element size (b/h/w/d -> 0b00..0b11). Second selector:
// register count (2/3/4 -> 0b01/0b10/0b11). The register-list operand
// (ZZ_*/ZZZ_*/ZZZZ_*) and the immediate class (simm4s2/simm4s3/simm4s4) both
// follow the register count.
defm LD2B_IMM : sve_mem_eld_si<0b00, 0b01, ZZ_b, "ld2b", simm4s2>;
defm LD3B_IMM : sve_mem_eld_si<0b00, 0b10, ZZZ_b, "ld3b", simm4s3>;
defm LD4B_IMM : sve_mem_eld_si<0b00, 0b11, ZZZZ_b, "ld4b", simm4s4>;
defm LD2H_IMM : sve_mem_eld_si<0b01, 0b01, ZZ_h, "ld2h", simm4s2>;
defm LD3H_IMM : sve_mem_eld_si<0b01, 0b10, ZZZ_h, "ld3h", simm4s3>;
defm LD4H_IMM : sve_mem_eld_si<0b01, 0b11, ZZZZ_h, "ld4h", simm4s4>;
defm LD2W_IMM : sve_mem_eld_si<0b10, 0b01, ZZ_s, "ld2w", simm4s2>;
defm LD3W_IMM : sve_mem_eld_si<0b10, 0b10, ZZZ_s, "ld3w", simm4s3>;
defm LD4W_IMM : sve_mem_eld_si<0b10, 0b11, ZZZZ_s, "ld4w", simm4s4>;
defm LD2D_IMM : sve_mem_eld_si<0b11, 0b01, ZZ_d, "ld2d", simm4s2>;
defm LD3D_IMM : sve_mem_eld_si<0b11, 0b10, ZZZ_d, "ld3d", simm4s3>;
defm LD4D_IMM : sve_mem_eld_si<0b11, 0b11, ZZZZ_d, "ld4d", simm4s4>;
// LD(2|3|4) structured loads (register + register)
// These are plain `def`s (one record each) rather than `defm`. Selectors and
// register-list operands match the reg+immediate forms; the last argument is
// the offset register class matching the element size (shifted8/16/32/64).
def LD2B : sve_mem_eld_ss<0b00, 0b01, ZZ_b, "ld2b", GPR64NoXZRshifted8>;
def LD3B : sve_mem_eld_ss<0b00, 0b10, ZZZ_b, "ld3b", GPR64NoXZRshifted8>;
def LD4B : sve_mem_eld_ss<0b00, 0b11, ZZZZ_b, "ld4b", GPR64NoXZRshifted8>;
def LD2H : sve_mem_eld_ss<0b01, 0b01, ZZ_h, "ld2h", GPR64NoXZRshifted16>;
def LD3H : sve_mem_eld_ss<0b01, 0b10, ZZZ_h, "ld3h", GPR64NoXZRshifted16>;
def LD4H : sve_mem_eld_ss<0b01, 0b11, ZZZZ_h, "ld4h", GPR64NoXZRshifted16>;
def LD2W : sve_mem_eld_ss<0b10, 0b01, ZZ_s, "ld2w", GPR64NoXZRshifted32>;
def LD3W : sve_mem_eld_ss<0b10, 0b10, ZZZ_s, "ld3w", GPR64NoXZRshifted32>;
def LD4W : sve_mem_eld_ss<0b10, 0b11, ZZZZ_s, "ld4w", GPR64NoXZRshifted32>;
def LD2D : sve_mem_eld_ss<0b11, 0b01, ZZ_d, "ld2d", GPR64NoXZRshifted64>;
def LD3D : sve_mem_eld_ss<0b11, 0b10, ZZZ_d, "ld3d", GPR64NoXZRshifted64>;
def LD4D : sve_mem_eld_ss<0b11, 0b11, ZZZZ_d, "ld4d", GPR64NoXZRshifted64>;
// Gathers using unscaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw]
// Each record supplies a sign-extending (sxtw) and a zero-extending (uxtw)
// selection node plus the matching operand classes; the last argument is the
// memory vector type. Byte gathers use the *8Only operand classes, wider
// elements use the plain *8 classes. ld1s*/ldff1s* mnemonics pair with the
// AArch64ld1s_*/AArch64ldff1s_* nodes.
defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
// Gathers using scaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1]
// Only halfword and word gathers appear here (there are no byte records in
// this group). The operand classes carry the element size (*16 / *32) and the
// selection nodes are the *_scaled_z variants.
defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
// Gathers using 32-bit pointers with scaled offset, e.g.
// ld1h z0.s, p0/z, [z0.s, #16]
// The immediate operand class follows the memory element size: imm0_31 for
// bytes, uimm5s2 for halfwords, uimm5s4 for words. All records use the
// *_gather_imm_z selection nodes.
defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv4i8>;
defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv4i8>;
defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv4i8>;
defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv4i8>;
defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv4i16>;
defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv4i16>;
defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv4i16>;
defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv4i16>;
defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv4i32>;
defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv4i32>;
// Gathers using 64-bit pointers with scaled offset, e.g.
// ld1h z0.d, p0/z, [z0.d, #16]
// 64-bit-element counterpart of the vector-plus-immediate gathers: the memory
// types are nxv2i*, and word (sw/w) and doubleword (uimm5s8) records are
// present in addition to bytes and halfwords.
defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv2i8>;
defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv2i8>;
defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv2i8>;
defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv2i8>;
defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv2i16>;
defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv2i16>;
defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv2i16>;
defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv2i16>;
defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm_z, nxv2i32>;
defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, AArch64ldff1s_gather_imm_z, nxv2i32>;
defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv2i32>;
defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv2i32>;
defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm_z, nxv2i64>;
defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, AArch64ldff1_gather_imm_z, nxv2i64>;
// Gathers using unscaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d]
// Offsets are already 64-bit, so a single selection node (no sxtw/uxtw pair)
// and no offset operand class are passed; only the memory vector type varies.
defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_z, nxv2i8>;
defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_z, nxv2i8>;
defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather_z, nxv2i8>;
defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_z, nxv2i8>;
defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_z, nxv2i16>;
defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_z, nxv2i16>;
defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather_z, nxv2i16>;
defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_z, nxv2i16>;
defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_z, nxv2i32>;
defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_z, nxv2i32>;
defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather_z, nxv2i32>;
defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_z, nxv2i32>;
defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather_z, nxv2i64>;
defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_z, nxv2i64>;
// Gathers using scaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
// There are no byte records in this group. The ZPR64ExtLSL* operand class
// carries the element size (16/32/64) and the selection nodes are the
// *_gather_scaled_z variants.
defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>;
defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>;
// Gathers using unscaled 32-bit offsets unpacked in 64-bits elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw]
// Each record supplies an sxtw/uxtw pair of selection nodes and ZPR64Ext*
// operand classes. Byte gathers use the *8Only classes; all wider elements use
// the plain *8 classes.
defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
// Gathers using scaled 32-bit offsets unpacked in 64-bits elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
// There are no byte records in this group. The ZPR64Ext{SXTW,UXTW}* operand
// classes carry the element size (16/32/64) and the selection nodes are the
// *_scaled_z variants.
defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
// Non-temporal contiguous loads (register + immediate)
// One record per element size; the 2-bit selector counts up with the element
// size (b/h/w/d -> 0b00..0b11).
defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>;
defm LDNT1H_ZRI : sve_mem_cldnt_si<0b01, "ldnt1h", Z_h, ZPR16>;
defm LDNT1W_ZRI : sve_mem_cldnt_si<0b10, "ldnt1w", Z_s, ZPR32>;
defm LDNT1D_ZRI : sve_mem_cldnt_si<0b11, "ldnt1d", Z_d, ZPR64>;
// Non-temporal contiguous loads (register + register)
// Same selectors as the register + immediate forms, plus the offset register
// class matching the element size (shifted8/16/32/64).
defm LDNT1B_ZRR : sve_mem_cldnt_ss<0b00, "ldnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm LDNT1H_ZRR : sve_mem_cldnt_ss<0b01, "ldnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm LDNT1W_ZRR : sve_mem_cldnt_ss<0b10, "ldnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm LDNT1D_ZRR : sve_mem_cldnt_ss<0b11, "ldnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
// contiguous store with immediates
// Two 2-bit selectors per record: the first follows the memory element size
// named by the mnemonic (b/h/w/d), the second follows the vector element size
// (Z_b/Z_h/Z_s/Z_d). Only combinations where the vector element is at least as
// wide as the memory element are defined.
defm ST1B_IMM : sve_mem_cst_si<0b00, 0b00, "st1b", Z_b, ZPR8>;
defm ST1B_H_IMM : sve_mem_cst_si<0b00, 0b01, "st1b", Z_h, ZPR16>;
defm ST1B_S_IMM : sve_mem_cst_si<0b00, 0b10, "st1b", Z_s, ZPR32>;
defm ST1B_D_IMM : sve_mem_cst_si<0b00, 0b11, "st1b", Z_d, ZPR64>;
defm ST1H_IMM : sve_mem_cst_si<0b01, 0b01, "st1h", Z_h, ZPR16>;
defm ST1H_S_IMM : sve_mem_cst_si<0b01, 0b10, "st1h", Z_s, ZPR32>;
defm ST1H_D_IMM : sve_mem_cst_si<0b01, 0b11, "st1h", Z_d, ZPR64>;
defm ST1W_IMM : sve_mem_cst_si<0b10, 0b10, "st1w", Z_s, ZPR32>;
defm ST1W_D_IMM : sve_mem_cst_si<0b10, 0b11, "st1w", Z_d, ZPR64>;
defm ST1D_IMM : sve_mem_cst_si<0b11, 0b11, "st1d", Z_d, ZPR64>;
// contiguous store with reg+reg addressing.
defm ST1B : sve_mem_cst_ss<0b0000, "st1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm ST1B_H : sve_mem_cst_ss<0b0001, "st1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
defm ST1B_S : sve_mem_cst_ss<0b0010, "st1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
defm ST1B_D : sve_mem_cst_ss<0b0011, "st1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
defm ST1H : sve_mem_cst_ss<0b0101, "st1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm ST1H_S : sve_mem_cst_ss<0b0110, "st1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
defm ST1H_D : sve_mem_cst_ss<0b0111, "st1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
defm ST1W : sve_mem_cst_ss<0b1010, "st1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
// Scatter stores. The same defm prefix (e.g. SST1H_D, SST1W) is reused by
// several groups below; presumably each multiclass appends its own suffix so
// the generated record names stay distinct -- confirm in the multiclass
// definitions, which are not part of this excerpt.
//
// Scatters using unpacked, unscaled 32-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, uxtw]
// Because the offset is unscaled, the h/w/d entries all use the ...8 offset
// operand classes; only st1b uses the ...8Only variants.
defm SST1B_D : sve_mem_64b_sst_sv_32_unscaled<0b000, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
defm SST1H_D : sve_mem_64b_sst_sv_32_unscaled<0b010, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
defm SST1W_D : sve_mem_64b_sst_sv_32_unscaled<0b100, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8,nxv2i32>;
defm SST1D : sve_mem_64b_sst_sv_32_unscaled<0b110, "st1d", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
// Scatters using packed, unscaled 32-bit offsets, e.g.
// st1h z0.s, p0, [x0, z0.s, uxtw]
defm SST1B_S : sve_mem_32b_sst_sv_32_unscaled<0b001, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
defm SST1H_S : sve_mem_32b_sst_sv_32_unscaled<0b011, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
defm SST1W : sve_mem_32b_sst_sv_32_unscaled<0b101, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
// Scatters using packed, scaled 32-bit offsets, e.g.
// st1h z0.s, p0, [x0, z0.s, uxtw #1]
// Scaled forms have no st1b entry and use offset operand classes sized to the
// stored element (…16, …32).
defm SST1H_S : sve_mem_32b_sst_sv_32_scaled<0b011, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
defm SST1W : sve_mem_32b_sst_sv_32_scaled<0b101, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
// Scatters using unpacked, scaled 32-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, uxtw #1]
defm SST1H_D : sve_mem_64b_sst_sv_32_scaled<0b010, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
defm SST1W_D : sve_mem_64b_sst_sv_32_scaled<0b100, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
defm SST1D : sve_mem_64b_sst_sv_32_scaled<0b110, "st1d", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
// Scatters using 32-bit pointers with offset, e.g.
// st1h z0.s, p0, [z0.s, #16]
// The immediate operand class steps with the element size
// (imm0_31, uimm5s2, uimm5s4).
defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", imm0_31, AArch64st1_scatter_imm, nxv4i8>;
defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv4i16>;
defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv4i32>;
// Scatters using 64-bit pointers with offset, e.g.
// st1h z0.d, p0, [z0.d, #16]
defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", imm0_31, AArch64st1_scatter_imm, nxv2i8>;
defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv2i16>;
defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv2i32>;
defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", uimm5s8, AArch64st1_scatter_imm, nxv2i64>;
// Scatters using unscaled 64-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d]
defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b", AArch64st1_scatter, nxv2i8>;
defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h", AArch64st1_scatter, nxv2i16>;
defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w", AArch64st1_scatter, nxv2i32>;
defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d", AArch64st1_scatter, nxv2i64>;
// Scatters using scaled 64-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, lsl #1]
// Unlike the groups above, these carry an explicit _SCALED suffix in the defm
// name and take a ZPR64ExtLSL* offset operand class.
defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>;
defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>;
defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>;
// ST(2|3|4) structured stores (register + immediate)
// Arguments are (element size, register-count field, register list, mnemonic,
// immediate class). The count field is 0b01/0b10/0b11 for 2/3/4 registers and
// the immediate class (simm4s2/3/4) steps with the same count.
defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>;
defm ST3B_IMM : sve_mem_est_si<0b00, 0b10, ZZZ_b, "st3b", simm4s3>;
defm ST4B_IMM : sve_mem_est_si<0b00, 0b11, ZZZZ_b, "st4b", simm4s4>;
defm ST2H_IMM : sve_mem_est_si<0b01, 0b01, ZZ_h, "st2h", simm4s2>;
defm ST3H_IMM : sve_mem_est_si<0b01, 0b10, ZZZ_h, "st3h", simm4s3>;
defm ST4H_IMM : sve_mem_est_si<0b01, 0b11, ZZZZ_h, "st4h", simm4s4>;
defm ST2W_IMM : sve_mem_est_si<0b10, 0b01, ZZ_s, "st2w", simm4s2>;
defm ST3W_IMM : sve_mem_est_si<0b10, 0b10, ZZZ_s, "st3w", simm4s3>;
defm ST4W_IMM : sve_mem_est_si<0b10, 0b11, ZZZZ_s, "st4w", simm4s4>;
defm ST2D_IMM : sve_mem_est_si<0b11, 0b01, ZZ_d, "st2d", simm4s2>;
defm ST3D_IMM : sve_mem_est_si<0b11, 0b10, ZZZ_d, "st3d", simm4s3>;
defm ST4D_IMM : sve_mem_est_si<0b11, 0b11, ZZZZ_d, "st4d", simm4s4>;
// ST(2|3|4) structured stores (register + register)
// Plain `def` records (sve_mem_est_ss is used as a class here, not a
// multiclass); the last argument is the index register class for the element
// size.
def ST2B : sve_mem_est_ss<0b00, 0b01, ZZ_b, "st2b", GPR64NoXZRshifted8>;
def ST3B : sve_mem_est_ss<0b00, 0b10, ZZZ_b, "st3b", GPR64NoXZRshifted8>;
def ST4B : sve_mem_est_ss<0b00, 0b11, ZZZZ_b, "st4b", GPR64NoXZRshifted8>;
def ST2H : sve_mem_est_ss<0b01, 0b01, ZZ_h, "st2h", GPR64NoXZRshifted16>;
def ST3H : sve_mem_est_ss<0b01, 0b10, ZZZ_h, "st3h", GPR64NoXZRshifted16>;
def ST4H : sve_mem_est_ss<0b01, 0b11, ZZZZ_h, "st4h", GPR64NoXZRshifted16>;
def ST2W : sve_mem_est_ss<0b10, 0b01, ZZ_s, "st2w", GPR64NoXZRshifted32>;
def ST3W : sve_mem_est_ss<0b10, 0b10, ZZZ_s, "st3w", GPR64NoXZRshifted32>;
def ST4W : sve_mem_est_ss<0b10, 0b11, ZZZZ_s, "st4w", GPR64NoXZRshifted32>;
def ST2D : sve_mem_est_ss<0b11, 0b01, ZZ_d, "st2d", GPR64NoXZRshifted64>;
def ST3D : sve_mem_est_ss<0b11, 0b10, ZZZ_d, "st3d", GPR64NoXZRshifted64>;
def ST4D : sve_mem_est_ss<0b11, 0b11, ZZZZ_d, "st4d", GPR64NoXZRshifted64>;
// Non-temporal contiguous stores (register + immediate)
// Mirrors the LDNT1*_ZRI loads: one record per element size, selected by the
// 2-bit first argument.
defm STNT1B_ZRI : sve_mem_cstnt_si<0b00, "stnt1b", Z_b, ZPR8>;
defm STNT1H_ZRI : sve_mem_cstnt_si<0b01, "stnt1h", Z_h, ZPR16>;
defm STNT1W_ZRI : sve_mem_cstnt_si<0b10, "stnt1w", Z_s, ZPR32>;
defm STNT1D_ZRI : sve_mem_cstnt_si<0b11, "stnt1d", Z_d, ZPR64>;
// Non-temporal contiguous stores (register + register)
// Adds the index register class matching the element size.
defm STNT1B_ZRR : sve_mem_cstnt_ss<0b00, "stnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm STNT1H_ZRR : sve_mem_cstnt_ss<0b01, "stnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
defm STNT1W_ZRR : sve_mem_cstnt_ss<0b10, "stnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm STNT1D_ZRR : sve_mem_cstnt_ss<0b11, "stnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
// Fill/Spill
// "ldr" records use the *_fill multiclasses and "str" records the *_spill
// ones; the _Z/_P part of the name matches the sve_mem_z_* / sve_mem_p_*
// multiclass used.
defm LDR_ZXI : sve_mem_z_fill<"ldr">;
defm LDR_PXI : sve_mem_p_fill<"ldr">;
defm STR_ZXI : sve_mem_z_spill<"str">;
defm STR_PXI : sve_mem_p_spill<"str">;
// Contiguous prefetch (register + immediate)
defm PRFB_PRI : sve_mem_prfm_si<0b00, "prfb">;
defm PRFH_PRI : sve_mem_prfm_si<0b01, "prfh">;
defm PRFW_PRI : sve_mem_prfm_si<0b10, "prfw">;
defm PRFD_PRI : sve_mem_prfm_si<0b11, "prfd">;
// Contiguous prefetch (register + register)
// NOTE: the word-sized record is named PRFS_PRR although its mnemonic is
// "prfw" (the reg+imm counterpart above is PRFW_PRI). The sve_prefetch
// instantiation for nxv4i1 refers to it by this name, so a rename would have
// to touch both places.
def PRFB_PRR : sve_mem_prfm_ss<0b001, "prfb", GPR64NoXZRshifted8>;
def PRFH_PRR : sve_mem_prfm_ss<0b011, "prfh", GPR64NoXZRshifted16>;
def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>;
def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>;
// Selection patterns that map a predicated prefetch node onto the contiguous
// prefetch instructions.
//   prefetch    - the node being matched.
//   PredTy      - value type of the governing predicate operand.
//   RegImmInst  - instruction used for base + immediate and for the fallback.
//   RegRegInst  - instruction used for base + index register.
//   scale       - not referenced by any pattern visible in this body.
//   AddrCP      - ComplexPattern that splits an address into base and index.
// Three patterns are emitted, tried from most to least specific via
// AddedComplexity (2, then 1, then the default).
multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, int scale, ComplexPattern AddrCP> {
// reg + imm: address matched by am_sve_indexed_s6 as base plus simm6s1 offset.
let AddedComplexity = 2 in {
def _reg_imm : Pat<(prefetch (PredTy PPR_3b:$gp), (am_sve_indexed_s6 GPR64sp:$base, simm6s1:$offset), (i32 sve_prfop:$prfop)),
(RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, simm6s1:$offset)>;
// reg + reg: address matched by AddrCP as base plus index register.
let AddedComplexity = 1 in {
def _reg_reg : Pat<(prefetch (PredTy PPR_3b:$gp), (AddrCP GPR64sp:$base, GPR64:$index), (i32 sve_prfop:$prfop)),
(RegRegInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, GPR64:$index)>;
// default fallback: any other address is used as the base of the reg+imm
// form with a zero offset.
def _default : Pat<(prefetch (PredTy PPR_3b:$gp), GPR64:$base, (i32 sve_prfop:$prfop)),
(RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, (i64 0))>;
// Instantiate the prefetch patterns for int_aarch64_sve_prf, once per
// predicate type. The predicate type picks the b/h/w/d instruction pair, and
// the scale (0..3) matches the lslN suffix of the addressing ComplexPattern.
// The nxv4i1 entry pairs PRFW_PRI with PRFS_PRR (the "prfw" reg+reg record).
defm : sve_prefetch<int_aarch64_sve_prf, nxv16i1, PRFB_PRI, PRFB_PRR, 0, am_sve_regreg_lsl0>;
defm : sve_prefetch<int_aarch64_sve_prf, nxv8i1, PRFH_PRI, PRFH_PRR, 1, am_sve_regreg_lsl1>;
defm : sve_prefetch<int_aarch64_sve_prf, nxv4i1, PRFW_PRI, PRFS_PRR, 2, am_sve_regreg_lsl2>;
defm : sve_prefetch<int_aarch64_sve_prf, nxv2i1, PRFD_PRI, PRFD_PRR, 3, am_sve_regreg_lsl3>;
// Gather prefetch using scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.s, uxtw #1]
// Arguments: size selector, mnemonic, sxtw/uxtw offset operand classes, and
// the sxtw/uxtw "_index" intrinsics to select from.
defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>;
defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>;
defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>;
defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>;
// Gather prefetch using unpacked, scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.d, uxtw #1]
// Same intrinsics as the packed group above, but with ZPR64Ext* operands.
defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>;
defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>;
defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>;
defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>;
// Gather prefetch using scaled 64-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.d, lsl #1]
defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_prfb_gather_index>;
defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_prfh_gather_index>;
defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_prfw_gather_index>;
defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_prfd_gather_index>;
// Gather prefetch using 32/64-bit pointers with offset, e.g.
// prfh pldl1keep, p0, [z0.s, #16]
// prfh pldl1keep, p0, [z0.d, #16]
// The _S_PZI and _D_PZI records share the same *_gather_scalar_offset
// intrinsic per mnemonic; the immediate class steps with the prefetch size
// (imm0_31, uimm5s2, uimm5s4, uimm5s8).
defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>;
defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>;
defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>;
defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>;
defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>;
defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>;
defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>;
defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>;
// Vector address generation ("adr"): four encodings selected by the 2-bit
// opcode -- sxtw and uxtw forms on .d, and lsl forms on .s and .d.
defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">;
defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">;
defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">;
defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">;
// Select the adrb/adrh/adrw/adrd intrinsics onto the lsl forms. The intrinsic
// picks the numeric suffix (_0, _1, _2, _3 in that order) and the operand type
// picks the _S (nxv4i32) or _D (nxv2i64) instruction.
def : Pat<(nxv4i32 (int_aarch64_sve_adrb nxv4i32:$Op1, nxv4i32:$Op2)),
(ADR_LSL_ZZZ_S_0 $Op1, $Op2)>;
def : Pat<(nxv4i32 (int_aarch64_sve_adrh nxv4i32:$Op1, nxv4i32:$Op2)),
(ADR_LSL_ZZZ_S_1 $Op1, $Op2)>;
def : Pat<(nxv4i32 (int_aarch64_sve_adrw nxv4i32:$Op1, nxv4i32:$Op2)),
(ADR_LSL_ZZZ_S_2 $Op1, $Op2)>;
def : Pat<(nxv4i32 (int_aarch64_sve_adrd nxv4i32:$Op1, nxv4i32:$Op2)),
(ADR_LSL_ZZZ_S_3 $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_adrb nxv2i64:$Op1, nxv2i64:$Op2)),
(ADR_LSL_ZZZ_D_0 $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_adrh nxv2i64:$Op1, nxv2i64:$Op2)),
(ADR_LSL_ZZZ_D_1 $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_adrw nxv2i64:$Op1, nxv2i64:$Op2)),
(ADR_LSL_ZZZ_D_2 $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_adrd nxv2i64:$Op1, nxv2i64:$Op2)),
(ADR_LSL_ZZZ_D_3 $Op1, $Op2)>;
// Table lookup and the zip/uzp/trn permutes on vector (ZZZ) and predicate
// (PPP) registers. The bf16 selection patterns are kept in separate
// `let Predicates = [HasSVE, HasBF16]` scopes and reuse the _H instructions.
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
let Predicates = [HasSVE, HasBF16] in {
// bf16 data indexed by an nxv8i16 index vector.
def : SVE_2_Op_Pat<nxv8bf16, AArch64tbl, nxv8bf16, nxv8i16, TBL_ZZZ_H>;
// Vector permutes: the 3-bit opcode runs zip1, zip2, uzp1, uzp2, trn1, trn2.
defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2", AArch64zip2>;
defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1", AArch64uzp1>;
defm UZP2_ZZZ : sve_int_perm_bin_perm_zz<0b011, "uzp2", AArch64uzp2>;
defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1", AArch64trn1>;
defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2", AArch64trn2>;
let Predicates = [HasSVE, HasBF16] in {
// bf16 forms of the six vector permutes above.
def : SVE_2_Op_Pat<nxv8bf16, AArch64zip1, nxv8bf16, nxv8bf16, ZIP1_ZZZ_H>;
def : SVE_2_Op_Pat<nxv8bf16, AArch64zip2, nxv8bf16, nxv8bf16, ZIP2_ZZZ_H>;
def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp1, nxv8bf16, nxv8bf16, UZP1_ZZZ_H>;
def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp2, nxv8bf16, nxv8bf16, UZP2_ZZZ_H>;
def : SVE_2_Op_Pat<nxv8bf16, AArch64trn1, nxv8bf16, nxv8bf16, TRN1_ZZZ_H>;
def : SVE_2_Op_Pat<nxv8bf16, AArch64trn2, nxv8bf16, nxv8bf16, TRN2_ZZZ_H>;
// Predicate permutes: same opcodes and selection nodes as the vector forms.
defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1", AArch64zip1>;
defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2", AArch64zip2>;
defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1", AArch64uzp1>;
defm UZP2_PPP : sve_int_perm_bin_perm_pp<0b011, "uzp2", AArch64uzp2>;
defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>;
defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>;
+ // Extract lo/hi halves of legal predicate types: index 0 selects ZIP1_PPP and the upper-half index selects ZIP2_PPP, each with PFALSE as second operand, using the source type's element-size suffix (_S/_H/_B).
+ def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))),
+ (ZIP1_PPP_S PPR:$Ps, (PFALSE))>;
+ def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))),
+ (ZIP2_PPP_S PPR:$Ps, (PFALSE))>;
+ def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
+ (ZIP1_PPP_H PPR:$Ps, (PFALSE))>;
+ def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))),
+ (ZIP2_PPP_H PPR:$Ps, (PFALSE))>;
+ def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
+ (ZIP1_PPP_B PPR:$Ps, (PFALSE))>;
+ def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
+ (ZIP2_PPP_B PPR:$Ps, (PFALSE))>;
+ // Concatenate two predicates: selected as UZP1_PPP on ($p1, $p2), with the suffix (_S/_H/_B) taken from the result type (nxv4i1/nxv8i1/nxv16i1).
+ def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)),
+ (UZP1_PPP_S $p1, $p2)>;
+ def : Pat<(nxv8i1 (concat_vectors nxv4i1:$p1, nxv4i1:$p2)),
+ (UZP1_PPP_H $p1, $p2)>;
+ def : Pat<(nxv16i1 (concat_vectors nxv8i1:$p1, nxv8i1:$p2)),
+ (UZP1_PPP_B $p1, $p2)>;
// Integer compares, vector vs vector. The last two arguments are a condition
// code and a second condition code that, for every ordered compare here, is
// the operand-swapped form of the first (SETUGE/SETULE, SETGT/SETLT, ...).
defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>;
defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", SETGT, SETLT>;
defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", SETEQ, SETEQ>;
defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", SETNE, SETNE>;
// Integer compares against a wide second operand, selected from the *_wide
// intrinsics. eq/ne use sve_int_cmp_0_wide; the ordered forms use
// sve_int_cmp_1_wide.
defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq", int_aarch64_sve_cmpeq_wide>;
defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne", int_aarch64_sve_cmpne_wide>;
defm CMPGE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b000, "cmpge", int_aarch64_sve_cmpge_wide>;
defm CMPGT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b001, "cmpgt", int_aarch64_sve_cmpgt_wide>;
defm CMPLT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b010, "cmplt", int_aarch64_sve_cmplt_wide>;
defm CMPLE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b011, "cmple", int_aarch64_sve_cmple_wide>;
defm CMPHS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b100, "cmphs", int_aarch64_sve_cmphs_wide>;
defm CMPHI_WIDE_PPzZZ : sve_int_cmp_1_wide<0b101, "cmphi", int_aarch64_sve_cmphi_wide>;
defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo", int_aarch64_sve_cmplo_wide>;
defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls", int_aarch64_sve_cmpls_wide>;
// Integer compares, vector vs immediate (signed forms).
defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, SETLE>;
defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, SETLT>;
defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, SETGT>;
defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, SETGE>;
defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq", SETEQ, SETEQ>;
// NOTE(review): the second condition code here is SETEQ, whereas the
// register form CMPNE_PPzZZ above passes SETNE/SETNE and every other entry
// pairs a condition with its operand-swapped equivalent. This looks
// inconsistent; confirm against the sve_int_scmp_vi definition (not visible
// in this excerpt) before relying on it or changing it.
defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, SETEQ>;
// Integer compares, vector vs immediate (unsigned forms).
defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, SETULE>;
defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, SETULT>;
defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>;
defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>;
// Floating-point compares, vector vs vector. The _cc multiclass takes both an
// intrinsic and a setcc-style condition fragment; facge/facgt take only the
// intrinsic.
defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge>;
defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt>;
defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq>;
defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone>;
defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>;
defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>;
defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>;
// Floating-point compares whose record names end in Z0; no selection operator
// is passed for these. Opcode 0b101 is not used in this list.
defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge">;
defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt">;
defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt">;
defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle">;
defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">;
defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">;
// While-compares producing a predicate. The _PWW records use
// sve_int_while4_rr and the _PXX records sve_int_while8_rr; both sets share
// the same four intrinsics and opcodes.
defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt>;
defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele>;
defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo", int_aarch64_sve_whilelo>;
defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels", int_aarch64_sve_whilels>;
defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt", int_aarch64_sve_whilelt>;
defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele", int_aarch64_sve_whilele>;
defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo", int_aarch64_sve_whilelo>;
defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels", int_aarch64_sve_whilels>;
// ctermeq/ctermne on GPR32 (_WW) and GPR64 (_XX) operands; the first bit
// selects the register width and the second selects eq vs ne.
def CTERMEQ_WW : sve_int_cterm<0b0, 0b0, "ctermeq", GPR32>;
def CTERMNE_WW : sve_int_cterm<0b0, 0b1, "ctermne", GPR32>;
def CTERMEQ_XX : sve_int_cterm<0b1, 0b0, "ctermeq", GPR64>;
def CTERMNE_XX : sve_int_cterm<0b1, 0b1, "ctermne", GPR64>;
// rdvl, addvl and addpl; addvl and addpl share a class and differ in one bit.
def RDVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdvl">;
def ADDVL_XXI : sve_int_arith_vl<0b0, "addvl">;
def ADDPL_XXI : sve_int_arith_vl<0b1, "addpl">;
// Element-count instructions (cntb/h/w/d) selected from their intrinsics, and
// cntp.
defm CNTB_XPiI : sve_int_count<0b000, "cntb", int_aarch64_sve_cntb>;
defm CNTH_XPiI : sve_int_count<0b010, "cnth", int_aarch64_sve_cnth>;
defm CNTW_XPiI : sve_int_count<0b100, "cntw", int_aarch64_sve_cntw>;
defm CNTD_XPiI : sve_int_count<0b110, "cntd", int_aarch64_sve_cntd>;
defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>;
// Scalar inc/dec by element count. In the 3-bit opcode the low bit selects
// inc (0) vs dec (1) and the upper two bits step through b/h/w/d. No
// intrinsic is attached to these records.
defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb">;
defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb">;
defm INCH_XPiI : sve_int_pred_pattern_a<0b010, "inch">;
defm DECH_XPiI : sve_int_pred_pattern_a<0b011, "dech">;
defm INCW_XPiI : sve_int_pred_pattern_a<0b100, "incw">;
defm DECW_XPiI : sve_int_pred_pattern_a<0b101, "decw">;
defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd">;
defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd">;
// Saturating scalar inc/dec by element count. For each element size there are
// eight records: signed/unsigned x inc/dec, first for the *_n32 intrinsics
// (sve_int_pred_pattern_b_s32 for signed, ..._b_u32 for unsigned) and then for
// the *_n64 intrinsics (..._b_x64 for both). The 5-bit opcode simply counts up
// through that order.
defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb", int_aarch64_sve_sqincb_n32>;
defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb", int_aarch64_sve_uqincb_n32>;
defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb", int_aarch64_sve_sqdecb_n32>;
defm UQDECB_WPiI : sve_int_pred_pattern_b_u32<0b00011, "uqdecb", int_aarch64_sve_uqdecb_n32>;
defm SQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00100, "sqincb", int_aarch64_sve_sqincb_n64>;
defm UQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00101, "uqincb", int_aarch64_sve_uqincb_n64>;
defm SQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00110, "sqdecb", int_aarch64_sve_sqdecb_n64>;
defm UQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00111, "uqdecb", int_aarch64_sve_uqdecb_n64>;
defm SQINCH_XPiWdI : sve_int_pred_pattern_b_s32<0b01000, "sqinch", int_aarch64_sve_sqinch_n32>;
defm UQINCH_WPiI : sve_int_pred_pattern_b_u32<0b01001, "uqinch", int_aarch64_sve_uqinch_n32>;
defm SQDECH_XPiWdI : sve_int_pred_pattern_b_s32<0b01010, "sqdech", int_aarch64_sve_sqdech_n32>;
defm UQDECH_WPiI : sve_int_pred_pattern_b_u32<0b01011, "uqdech", int_aarch64_sve_uqdech_n32>;
defm SQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01100, "sqinch", int_aarch64_sve_sqinch_n64>;
defm UQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01101, "uqinch", int_aarch64_sve_uqinch_n64>;
defm SQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01110, "sqdech", int_aarch64_sve_sqdech_n64>;
defm UQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01111, "uqdech", int_aarch64_sve_uqdech_n64>;
defm SQINCW_XPiWdI : sve_int_pred_pattern_b_s32<0b10000, "sqincw", int_aarch64_sve_sqincw_n32>;
defm UQINCW_WPiI : sve_int_pred_pattern_b_u32<0b10001, "uqincw", int_aarch64_sve_uqincw_n32>;
defm SQDECW_XPiWdI : sve_int_pred_pattern_b_s32<0b10010, "sqdecw", int_aarch64_sve_sqdecw_n32>;
defm UQDECW_WPiI : sve_int_pred_pattern_b_u32<0b10011, "uqdecw", int_aarch64_sve_uqdecw_n32>;
defm SQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10100, "sqincw", int_aarch64_sve_sqincw_n64>;
defm UQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10101, "uqincw", int_aarch64_sve_uqincw_n64>;
defm SQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10110, "sqdecw", int_aarch64_sve_sqdecw_n64>;
defm UQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10111, "uqdecw", int_aarch64_sve_uqdecw_n64>;
defm SQINCD_XPiWdI : sve_int_pred_pattern_b_s32<0b11000, "sqincd", int_aarch64_sve_sqincd_n32>;
defm UQINCD_WPiI : sve_int_pred_pattern_b_u32<0b11001, "uqincd", int_aarch64_sve_uqincd_n32>;
defm SQDECD_XPiWdI : sve_int_pred_pattern_b_s32<0b11010, "sqdecd", int_aarch64_sve_sqdecd_n32>;
defm UQDECD_WPiI : sve_int_pred_pattern_b_u32<0b11011, "uqdecd", int_aarch64_sve_uqdecd_n32>;
defm SQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11100, "sqincd", int_aarch64_sve_sqincd_n64>;
defm UQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11101, "uqincd", int_aarch64_sve_uqincd_n64>;
defm SQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11110, "sqdecd", int_aarch64_sve_sqdecd_n64>;
defm UQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11111, "uqdecd", int_aarch64_sve_uqdecd_n64>;
// Vector inc/dec by element count for h/w/d elements (there are no byte
// entries in this list). The saturating forms pass an intrinsic and a value
// type; the plain inc*/dec* forms pass neither.
defm SQINCH_ZPiI : sve_int_countvlv<0b01000, "sqinch", ZPR16, int_aarch64_sve_sqinch, nxv8i16>;
defm UQINCH_ZPiI : sve_int_countvlv<0b01001, "uqinch", ZPR16, int_aarch64_sve_uqinch, nxv8i16>;
defm SQDECH_ZPiI : sve_int_countvlv<0b01010, "sqdech", ZPR16, int_aarch64_sve_sqdech, nxv8i16>;
defm UQDECH_ZPiI : sve_int_countvlv<0b01011, "uqdech", ZPR16, int_aarch64_sve_uqdech, nxv8i16>;
defm INCH_ZPiI : sve_int_countvlv<0b01100, "inch", ZPR16>;
defm DECH_ZPiI : sve_int_countvlv<0b01101, "dech", ZPR16>;
defm SQINCW_ZPiI : sve_int_countvlv<0b10000, "sqincw", ZPR32, int_aarch64_sve_sqincw, nxv4i32>;
defm UQINCW_ZPiI : sve_int_countvlv<0b10001, "uqincw", ZPR32, int_aarch64_sve_uqincw, nxv4i32>;
defm SQDECW_ZPiI : sve_int_countvlv<0b10010, "sqdecw", ZPR32, int_aarch64_sve_sqdecw, nxv4i32>;
defm UQDECW_ZPiI : sve_int_countvlv<0b10011, "uqdecw", ZPR32, int_aarch64_sve_uqdecw, nxv4i32>;
defm INCW_ZPiI : sve_int_countvlv<0b10100, "incw", ZPR32>;
defm DECW_ZPiI : sve_int_countvlv<0b10101, "decw", ZPR32>;
defm SQINCD_ZPiI : sve_int_countvlv<0b11000, "sqincd", ZPR64, int_aarch64_sve_sqincd, nxv2i64>;
defm UQINCD_ZPiI : sve_int_countvlv<0b11001, "uqincd", ZPR64, int_aarch64_sve_uqincd, nxv2i64>;
defm SQDECD_ZPiI : sve_int_countvlv<0b11010, "sqdecd", ZPR64, int_aarch64_sve_sqdecd, nxv2i64>;
defm UQDECD_ZPiI : sve_int_countvlv<0b11011, "uqdecd", ZPR64, int_aarch64_sve_uqdecd, nxv2i64>;
defm INCD_ZPiI : sve_int_countvlv<0b11100, "incd", ZPR64>;
defm DECD_ZPiI : sve_int_countvlv<0b11101, "decd", ZPR64>;
// Scalar inc/dec by predicate count (…p mnemonics). The *_n32 intrinsics use
// the _s32/_u32 multiclasses and the *_n64 intrinsics use _x64; plain
// incp/decp pass no intrinsic.
defm SQINCP_XPWd : sve_int_count_r_s32<0b00000, "sqincp", int_aarch64_sve_sqincp_n32>;
defm SQINCP_XP : sve_int_count_r_x64<0b00010, "sqincp", int_aarch64_sve_sqincp_n64>;
defm UQINCP_WP : sve_int_count_r_u32<0b00100, "uqincp", int_aarch64_sve_uqincp_n32>;
defm UQINCP_XP : sve_int_count_r_x64<0b00110, "uqincp", int_aarch64_sve_uqincp_n64>;
defm SQDECP_XPWd : sve_int_count_r_s32<0b01000, "sqdecp", int_aarch64_sve_sqdecp_n32>;
defm SQDECP_XP : sve_int_count_r_x64<0b01010, "sqdecp", int_aarch64_sve_sqdecp_n64>;
defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp", int_aarch64_sve_uqdecp_n32>;
defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp", int_aarch64_sve_uqdecp_n64>;
defm INCP_XP : sve_int_count_r_x64<0b10000, "incp">;
defm DECP_XP : sve_int_count_r_x64<0b10100, "decp">;
// Vector inc/dec by predicate count; same opcode values as the scalar _XP
// group's s32/u32 entries and incp/decp.
defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp", int_aarch64_sve_sqincp>;
defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp", int_aarch64_sve_uqincp>;
defm SQDECP_ZP : sve_int_count_v<0b01000, "sqdecp", int_aarch64_sve_sqdecp>;
defm UQDECP_ZP : sve_int_count_v<0b01100, "uqdecp", int_aarch64_sve_uqdecp>;
defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
defm DECP_ZP : sve_int_count_v<0b10100, "decp">;
// "index" in its four operand combinations; the two-letter suffix names the
// operand kinds (R = register, I = immediate, going by the matching
// sve_int_index_rr/ir/ri/ii multiclass names). All four select from
// index_vector.
defm INDEX_RR : sve_int_index_rr<"index", index_vector>;
defm INDEX_IR : sve_int_index_ir<"index", index_vector>;
defm INDEX_RI : sve_int_index_ri<"index", index_vector>;
defm INDEX_II : sve_int_index_ii<"index", index_vector>;
// Unpredicated shifts
// The immediate forms are selected from the AArch64*_m1 nodes; the wide
// forms pass no selection operator. Opcode 0b10 is unused in both groups.
defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_m1>;
defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_m1>;
defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_m1>;
defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
// Predicated shifts
// The right-shift immediate forms pass a string naming a *_ZPZI record; only
// ASRD_ZPZI is defined in the block below, so the others are presumably
// defined elsewhere -- confirm. LSL_ZPmI passes no such name.
defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr", "ASR_ZPZI">;
defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr", "LSR_ZPZI">;
defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">;
defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>;
// Zeroing pseudo records, only available with UseExperimentalZeroingPseudos.
let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64asr_m1>;
defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsr_m1>;
defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsl_m1>;
defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>;
// Predicated vector-by-vector shifts. Each forward record names its *_ZPZZ
// record and its reversed-operand partner (e.g. ASR_ZPmZ <-> ASRR_ZPmZ).
defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", AArch64asr_m1, "ASRR_ZPmZ">;
defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", AArch64lsr_m1, "LSRR_ZPmZ">;
defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", AArch64lsl_m1, "LSLR_ZPmZ">;
// Reversed forms: no selection operator (null_frag), flagged as the reverse
// instruction, and pointing back at the forward record.
defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", "ASRR_ZPZZ", null_frag, "ASR_ZPmZ", /*isReverseInstr*/ 1>;
defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", /*isReverseInstr*/ 1>;
defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", /*isReverseInstr*/ 1>;
// Predicated shifts by a wide operand, selected from the *_wide intrinsics.
defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>;
defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>;
defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>;
// Predicated floating-point / integer conversions.
// Record names end in <Src>to<Dst> (H/S/D element sizes). Reading the
// arguments against those names: the first ZPR class is the source and the
// second the destination, followed by the intrinsic and then the result,
// predicate and source value types. The predicate type and the trailing
// ElementSize* follow the wider of the two element sizes (e.g. StoH uses
// nxv4i1/ElementSizeS, DtoH uses nxv2i1/ElementSizeD).
defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv4i1, nxv4f32, ElementSizeS>;
defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv4i1, nxv8f16, ElementSizeS>;
defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv2i1, nxv2f64, ElementSizeD>;
defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv2i1, nxv8f16, ElementSizeD>;
defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv2i1, nxv2f64, ElementSizeD>;
defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv2i1, nxv4f32, ElementSizeD>;
defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>;
defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>;
defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>;
defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>;
defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>;
defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>;
defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>;
defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>;
defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>;
defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>;
defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>;
defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>;
defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
// Predicated rounding (frint*), frecpx and fsqrt; the _HSD multiclass is given
// only an opcode, mnemonic and intrinsic. Opcode 0b00101 is not used here.
defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", int_aarch64_sve_frintn>;
defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", int_aarch64_sve_frintp>;
defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm", int_aarch64_sve_frintm>;
defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz", int_aarch64_sve_frintz>;
defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta", int_aarch64_sve_frinta>;
defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx", int_aarch64_sve_frintx>;
defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", int_aarch64_sve_frinti>;
defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", int_aarch64_sve_frecpx>;
defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", int_aarch64_sve_fsqrt>;
// SVE BFloat16 instructions. The group is gated on both HasBF16 and HasSVE.
let Predicates = [HasBF16, HasSVE] in {
// Dot product: vector and indexed (lane) forms.
defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>;
defm BFDOT_ZZI : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>;
// Matrix multiply-accumulate.
defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>;
// NOTE(review): the four records below are named BFMMLA_{B,T}_* but carry the
// bfmlalb/bfmlalt mnemonics and intrinsics. The record names look like a
// naming slip; they are kept as-is because other files may reference them.
defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>;
defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>;
defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>;
defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>;
// Conversions to bf16, bound to the fcvt_bf16f32 / fcvtnt_bf16f32 intrinsics.
defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>;
defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>;
// InstAliases
// Move and NOT spellings expressed through existing logical instructions.
// Every alias in this group passes 1 as the trailing InstAlias argument
// (presumably the emit/print priority -- confirm against the InstAlias class).
// Vector move: ORR of $Zn with itself.
def : InstAlias<"mov $Zd, $Zn",
(ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>;
// Merging predicate move: SEL with $Pd reused as the last source operand.
def : InstAlias<"mov $Pd, $Pg/m, $Pn",
(SEL_PPPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pd), 1>;
// Unpredicated predicate move: ORR with $Pn in the governing and both source slots.
def : InstAlias<"mov $Pd, $Pn",
(ORR_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
// Zeroing predicate move: AND of $Pn with itself under $Pg.
def : InstAlias<"mov $Pd, $Pg/z, $Pn",
(AND_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;
// Flag-setting variants of the two predicate moves above.
def : InstAlias<"movs $Pd, $Pn",
(ORRS_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
def : InstAlias<"movs $Pd, $Pg/z, $Pn",
(ANDS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;
// not/nots: EOR/EORS of $Pn with the governing predicate $Pg itself.
def : InstAlias<"not $Pd, $Pg/z, $Pn",
(EOR_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;
def : InstAlias<"nots $Pd, $Pg/z, $Pn",
(EORS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;
// Reversed-operand comparison aliases. Each alias takes "$Zm, $Zn" in the
// assembly string and maps to the opposite-direction compare with the two
// vector operands passed as "$Zn, $Zm", i.e. swapped:
//   cmple -> CMPGE, cmplo -> CMPHI, cmpls -> CMPHS, cmplt -> CMPGT,
//   facle -> FACGE, faclt -> FACGT, fcmle -> FCMGE, fcmlt -> FCMGT.
// All of them pass 0 as the trailing InstAlias argument, unlike the mov/not
// aliases, which pass 1.
// Integer compares: one alias per element size (B, H, S, D).
def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
(CMPGE_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
(CMPGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
(CMPGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
(CMPGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
(CMPHI_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
(CMPHI_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
(CMPHI_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
(CMPHI_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
(CMPHS_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
(CMPHS_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
(CMPHS_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
(CMPHS_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
(CMPGT_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
(CMPGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
(CMPGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
(CMPGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
// Floating-point absolute compares: H, S and D element sizes only.
def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
(FACGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
(FACGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
(FACGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
(FACGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
(FACGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
(FACGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
// Floating-point compares: H, S and D element sizes only.
def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
(FCMGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
(FCMGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
(FCMGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
// Pseudo instructions representing unpredicated LDR and STR for ZPR2,3,4.
// These get expanded to individual LDR_ZXI/STR_ZXI instructions in
// AArch64ExpandPseudoInsts.
// Each pseudo takes a GPR64sp base and a simm4s1 offset, has an empty
// selection pattern list and an empty scheduling list.
// Loads: the 2-, 3- and 4-register tuple is the single output operand.
let mayLoad = 1, hasSideEffects = 0 in {
def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
// Stores: no outputs; the register tuple is the first input operand.
let mayStore = 1, hasSideEffects = 0 in {
def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
// AArch64ptest selection: the same PTEST_PP instruction is used for all four
// predicate element types (nxv16i1, nxv8i1, nxv4i1, nxv2i1).
def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)),
(PTEST_PP PPR:$pg, PPR:$src)>;
def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)),
(PTEST_PP PPR:$pg, PPR:$src)>;
def : Pat<(AArch64ptest (nxv4i1 PPR:$pg), (nxv4i1 PPR:$src)),
(PTEST_PP PPR:$pg, PPR:$src)>;
def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)),
(PTEST_PP PPR:$pg, PPR:$src)>;
// LD1R of 128-bit masked data
// Base-only address: select the immediate form with an offset of 0.
def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
(LD1RQ_B_IMM $gp, $base, (i64 0))>;
def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
(LD1RQ_H_IMM $gp, $base, (i64 0))>;
def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
(LD1RQ_W_IMM $gp, $base, (i64 0))>;
def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
(LD1RQ_D_IMM $gp, $base, (i64 0))>;
// Base plus a simm4s16 immediate: fold the add into the instruction's
// immediate operand.
def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
(LD1RQ_B_IMM $gp, $base, simm4s16:$imm)>;
def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
(LD1RQ_H_IMM $gp, $base, simm4s16:$imm)>;
def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
(LD1RQ_W_IMM $gp, $base, simm4s16:$imm)>;
def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
(LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>;
// sext_inreg on scalable vectors: select the predicated SXTB/SXTH/SXTW
// instruction of the container element size, with an IMPLICIT_DEF first
// operand and a PTRUE (pattern 31) governing predicate.
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8), (SXTB_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (SXTH_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8), (SXTB_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8), (SXTB_ZPmZ_H (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>;
// General case that we ideally never want to match.
// Multiplies $scale by (RDVL #1 with UBFM #4, #63 applied) using MADD with
// XZR as the addend.
def : Pat<(vscale GPR64:$scale), (MADDXrrr (UBFMXri (RDVLI_XI 1), 4, 63), $scale, XZR)>;
// Cheaper forms for specific multipliers; AddedComplexity = 5 makes them
// preferred over the general pattern.
let AddedComplexity = 5 in {
// vscale * 1 and vscale * -1.
def : Pat<(vscale (i64 1)), (UBFMXri (RDVLI_XI 1), 4, 63)>;
def : Pat<(vscale (i64 -1)), (SBFMXri (RDVLI_XI -1), 4, 63)>;
// Multipliers accepted by the sve_rdvl_imm / sve_cnt{h,w,d}_imm operands map
// to a single RDVL or CNTH/CNTW/CNTD with pattern 31.
def : Pat<(vscale (sve_rdvl_imm i32:$imm)), (RDVLI_XI $imm)>;
def : Pat<(vscale (sve_cnth_imm i32:$imm)), (CNTH_XPiI 31, $imm)>;
def : Pat<(vscale (sve_cntw_imm i32:$imm)), (CNTW_XPiI 31, $imm)>;
def : Pat<(vscale (sve_cntd_imm i32:$imm)), (CNTD_XPiI 31, $imm)>;
// Negative counterparts: the count instruction's result is subtracted from XZR.
def : Pat<(vscale (sve_cnth_imm_neg i32:$imm)), (SUBXrs XZR, (CNTH_XPiI 31, $imm), 0)>;
def : Pat<(vscale (sve_cntw_imm_neg i32:$imm)), (SUBXrs XZR, (CNTW_XPiI 31, $imm), 0)>;
def : Pat<(vscale (sve_cntd_imm_neg i32:$imm)), (SUBXrs XZR, (CNTD_XPiI 31, $imm), 0)>;
// FIXME: BigEndian requires an additional REV instruction to satisfy the
// constraint that none of the bits change when stored to memory as one
// type, and reloaded as another type.
// Little-endian only: a bitconvert between any two of the seven full-width
// SVE data vector types (nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16,
// nxv4f32, nxv2f64) is selected to the source register itself, retyped.
// The patterns are grouped by destination type, six sources each.
let Predicates = [IsLE] in {
def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
// Little-endian bitconverts involving nxv8bf16, gated on HasBF16 and HasSVE.
// NOTE(review): the three patterns in this first group (nxv2i64 <- nxv8bf16,
// nxv8bf16 <- nxv2i64, nxv8bf16 <- nxv8i16) are repeated verbatim in the
// second group, which uses the same predicates in a different order. The
// first group looks redundant -- confirm before removing it.
let Predicates = [IsLE, HasBF16, HasSVE] in {
def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
// Full set: every other full-width data vector type to nxv8bf16, then
// nxv8bf16 to every other type.
let Predicates = [IsLE, HasSVE, HasBF16] in {
def : Pat<(nxv8bf16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv8bf16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv8bf16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv8i16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
def : Pat<(nxv4i32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
def : Pat<(nxv8f16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
def : Pat<(nxv4f32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
// reinterpret_cast between predicate types: every combination is selected to
// a COPY_TO_REGCLASS into PPR. The list includes the nxv16i1 -> nxv16i1
// identity case but no identity case for the other three types.
def : Pat<(nxv16i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv16i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv8i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv8i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv8i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv4i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv4i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv4i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv2i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
// Unpredicated AND of two predicate registers: select the zeroing-predicated
// AND_PPzPP with a PTRUE (pattern 31) of the matching element size as the
// governing predicate.
def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)),
(AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>;
def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)),
(AND_PPzPP (PTRUE_H 31), PPR:$Ps1, PPR:$Ps2)>;
def : Pat<(nxv4i1 (and PPR:$Ps1, PPR:$Ps2)),
(AND_PPzPP (PTRUE_S 31), PPR:$Ps1, PPR:$Ps2)>;
def : Pat<(nxv2i1 (and PPR:$Ps1, PPR:$Ps2)),
(AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>;
// Add more complex addressing modes here as required
// pred_load: selection patterns for a predicated (masked) contiguous load.
//   Ty         - result vector type.
//   PredTy     - governing predicate type.
//   Load       - load operator to match; its operands here are the address,
//                the predicate and a passthru that must match SVEDup0Undef.
//   RegRegInst - instruction used for the base + register-offset address.
//   RegImmInst - instruction used for base + immediate and base-only addresses.
//   AddrCP     - ComplexPattern that matches the base + register-offset address.
multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load,
Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
// reg + reg
let AddedComplexity = 1 in {
def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
(RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>;
// reg + imm
// Higher AddedComplexity than reg + reg, so this form is preferred when both match.
let AddedComplexity = 2 in {
def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
(RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>;
// Base-only address: immediate form with an offset of 0.
def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))),
(RegImmInst PPR:$gp, GPR64:$base, (i64 0))>;
// Masked-load instantiations of pred_load, grouped by element count. The
// am_sve_regreg_lsl{0,1,2,3} address pattern is chosen to match the memory
// element size (i8, i16, i32, i64) of each load.
// 2-element contiguous loads
defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>;
defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
// 4-element contiguous loads
defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous loads
defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
// The bf16 variant additionally requires HasBF16.
let Predicates = [HasBF16, HasSVE] in {
defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
// 16-element contiguous loads
defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B, LD1B_IMM, am_sve_regreg_lsl0>;
// pred_store: selection patterns for a predicated (masked) contiguous store.
//   Ty         - type of the stored vector.
//   PredTy     - governing predicate type.
//   Store      - store operator to match; its operands here are the value,
//                the address and the predicate.
//   RegRegInst - instruction used for the base + register-offset address.
//   RegImmInst - instruction used for base + immediate and base-only addresses.
//   AddrCP     - ComplexPattern that matches the base + register-offset address.
multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store,
Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
// reg + reg
let AddedComplexity = 1 in {
def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)),
(RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>;
// reg + imm
// Higher AddedComplexity than reg + reg, so this form is preferred when both match.
let AddedComplexity = 2 in {
def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)),
(RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>;
// Base-only address: immediate form with an offset of 0.
def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)),
(RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
// Masked-store instantiations of pred_store, grouped by element count. The
// am_sve_regreg_lsl{0,1,2,3} address pattern is chosen to match the memory
// element size (i8, i16, i32, i64) of each store.
// 2-element contiguous stores
defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D, ST1B_D_IMM, am_sve_regreg_lsl0>;
defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
// 4-element contiguous stores
defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S, ST1B_S_IMM, am_sve_regreg_lsl0>;
defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous stores
defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>;
defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
// The bf16 variant additionally requires HasBF16.
let Predicates = [HasBF16, HasSVE] in {
defm : pred_store<nxv8bf16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
// 16-element contiguous stores
defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B, ST1B_IMM, am_sve_regreg_lsl0>;
// Non-temporal masked loads and stores reuse the pred_load / pred_store
// multiclasses with the LDNT1*/STNT1* instructions. Only the integer types
// nxv16i8, nxv8i16, nxv4i32 and nxv2i64 are instantiated here.
defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRR, LDNT1B_ZRI, am_sve_regreg_lsl0>;
defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRR, LDNT1H_ZRI, am_sve_regreg_lsl1>;
defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRR, LDNT1W_ZRI, am_sve_regreg_lsl2>;
defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRR, LDNT1D_ZRI, am_sve_regreg_lsl3>;
defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRR, STNT1B_ZRI, am_sve_regreg_lsl0>;
defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRR, STNT1H_ZRI, am_sve_regreg_lsl1>;
defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRR, STNT1W_ZRI, am_sve_regreg_lsl2>;
defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRR, STNT1D_ZRI, am_sve_regreg_lsl3>;
// unpred_store: selection patterns for an unpredicated store of a scalable
// vector, implemented with a predicated store under an all-true predicate.
//   Store      - store fragment to match (value, address).
//   Ty         - type of the stored vector.
//   RegImmInst - scalar + immediate store instruction.
//   PTrue      - PTRUE instruction used (with pattern 31) to build the
//                governing predicate.
multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegImmInst,
Instruction PTrue> {
// Base + simm4s1 immediate matched through am_sve_indexed_s4.
let AddedComplexity = 1 in {
def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
(RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
// Address matched through am_sve_fi; highest AddedComplexity of the three.
let AddedComplexity = 2 in {
def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
(RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
// Base-only address: offset of 0.
def : Pat<(Store (Ty ZPR:$val), GPR64:$base),
(RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
// Unpredicated contiguous stores. Each instantiation pairs a store fragment
// and value type with a scalar + immediate ST1 instruction and the PTRUE of
// the matching container element size.
// Integer vectors, plain and truncating.
defm : unpred_store< store, nxv16i8, ST1B_IMM, PTRUE_B>;
defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H_IMM, PTRUE_H>;
defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S_IMM, PTRUE_S>;
defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv8i16, ST1H_IMM, PTRUE_H>;
defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S_IMM, PTRUE_S>;
defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv4i32, ST1W_IMM, PTRUE_S>;
defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv2i64, ST1D_IMM, PTRUE_D>;
// Floating-point vectors, including the unpacked nxv4f16/nxv2f16/nxv2f32 types.
defm : unpred_store< store, nxv8f16, ST1H_IMM, PTRUE_H>;
defm : unpred_store< store, nxv8bf16, ST1H_IMM, PTRUE_H>;
defm : unpred_store< store, nxv4f16, ST1H_S_IMM, PTRUE_S>;
defm : unpred_store< store, nxv2f16, ST1H_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv4f32, ST1W_IMM, PTRUE_S>;
// nxv2f32 uses the D-container ST1W_D_IMM/PTRUE_D pair, matching the nxv2f32
// entries of pred_store and unpred_load. This entry previously repeated
// nxv4f32, which gave nxv4f32 a second pattern with the wrong container size
// and left nxv2f32 without an unpredicated store pattern.
defm : unpred_store< store, nxv2f32, ST1W_D_IMM, PTRUE_D>;
defm : unpred_store< store, nxv2f64, ST1D_IMM, PTRUE_D>;
// unpred_load: selection patterns for an unpredicated load of a scalable
// vector, implemented with a predicated load under an all-true predicate.
//   Load       - load fragment to match.
//   Ty         - result vector type.
//   RegImmInst - scalar + immediate load instruction.
//   PTrue      - PTRUE instruction used (with pattern 31) to build the
//                governing predicate.
multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegImmInst,
Instruction PTrue> {
// Base + simm4s1 immediate matched through am_sve_indexed_s4.
let AddedComplexity = 1 in {
def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
(RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
// Address matched through am_sve_fi; highest AddedComplexity of the three.
let AddedComplexity = 2 in {
def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
(RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
// Base-only address: offset of 0.
def : Pat<(Ty (Load GPR64:$base)),
(RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
// Unpredicated contiguous loads. Zero-extending (zextloadvi*) and
// any-extending (extloadvi*) loads share the same LD1B/LD1H/LD1W
// instructions; sign-extending loads use the LD1SB/LD1SH/LD1SW variants.
// Byte-sized memory elements.
defm : unpred_load< load, nxv16i8, LD1B_IMM, PTRUE_B>;
defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
defm : unpred_load< extloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
defm : unpred_load< extloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
defm : unpred_load< extloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H_IMM, PTRUE_H>;
defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S_IMM, PTRUE_S>;
defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D_IMM, PTRUE_D>;
// Halfword-sized memory elements.
defm : unpred_load< load, nxv8i16, LD1H_IMM, PTRUE_H>;
defm : unpred_load<zextloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
defm : unpred_load<zextloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
defm : unpred_load< extloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
defm : unpred_load< extloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S_IMM, PTRUE_S>;
defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D_IMM, PTRUE_D>;
// Word- and doubleword-sized memory elements.
defm : unpred_load< load, nxv4i32, LD1W_IMM, PTRUE_S>;
defm : unpred_load<zextloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
defm : unpred_load< extloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv2i64, LD1D_IMM, PTRUE_D>;
// Floating-point vectors, including the unpacked nxv4f16/nxv2f16/nxv2f32 types.
defm : unpred_load< load, nxv8f16, LD1H_IMM, PTRUE_H>;
defm : unpred_load< load, nxv8bf16, LD1H_IMM, PTRUE_H>;
defm : unpred_load< load, nxv4f16, LD1H_S_IMM, PTRUE_S>;
defm : unpred_load< load, nxv2f16, LD1H_D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv4f32, LD1W_IMM, PTRUE_S>;
defm : unpred_load< load, nxv2f32, LD1W_D_IMM, PTRUE_D>;
defm : unpred_load< load, nxv2f64, LD1D_IMM, PTRUE_D>;
// unpred_store_predicate: patterns for storing a predicate register.
//   Ty    - predicate type being stored.
//   Store - store instruction taking (value, base, simm9 offset).
multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
// Address matched through am_sve_fi with a simm9 offset.
def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
(Store PPR:$val, GPR64sp:$base, simm9:$offset)>;
// Base-only address: offset of 0.
def _default : Pat<(store (Ty PPR:$Val), GPR64:$base),
(Store PPR:$Val, GPR64:$base, (i64 0))>;
// All four predicate types are stored with STR_PXI.
defm Pat_Store_P16 : unpred_store_predicate<nxv16i1, STR_PXI>;
defm Pat_Store_P8 : unpred_store_predicate<nxv8i1, STR_PXI>;
defm Pat_Store_P4 : unpred_store_predicate<nxv4i1, STR_PXI>;
defm Pat_Store_P2 : unpred_store_predicate<nxv2i1, STR_PXI>;
// unpred_load_predicate: patterns for loading a predicate register.
//   Ty   - predicate type being loaded.
//   Load - load instruction taking (base, simm9 offset).
multiclass unpred_load_predicate<ValueType Ty, Instruction Load> {
// Address matched through am_sve_fi with a simm9 offset.
def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))),
(Load GPR64sp:$base, simm9:$offset)>;
// Base-only address: offset of 0.
def _default : Pat<(Ty (load GPR64:$base)),
(Load GPR64:$base, (i64 0))>;
// All four predicate types are loaded with LDR_PXI.
defm Pat_Load_P16 : unpred_load_predicate<nxv16i1, LDR_PXI>;
defm Pat_Load_P8 : unpred_load_predicate<nxv8i1, LDR_PXI>;
defm Pat_Load_P4 : unpred_load_predicate<nxv4i1, LDR_PXI>;
defm Pat_Load_P2 : unpred_load_predicate<nxv2i1, LDR_PXI>;
// ld1: selection patterns for a predicated contiguous load node whose
// operands are (predicate, address, memory type).
//   RegRegInst - instruction used for the base + register-offset address.
//   RegImmInst - instruction used for base + immediate and base-only addresses.
//   Ty         - result vector type.
//   Load       - load node to match.
//   PredTy     - governing predicate type.
//   MemVT      - in-memory vector type, passed as the node's last operand.
//   AddrCP     - ComplexPattern that matches the base + register-offset address.
multiclass ld1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty,
SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
// reg + reg
let AddedComplexity = 1 in {
def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)),
(RegRegInst PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
// scalar + immediate (mul vl)
// Higher AddedComplexity than reg + reg, so this form is preferred when both match.
let AddedComplexity = 2 in {
def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)),
(RegImmInst PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
// base
def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
(RegImmInst PPR:$gp, GPR64sp:$base, (i64 0))>;
// Instantiations of ld1 for the AArch64ld1_z and AArch64ld1s_z nodes.
// AArch64ld1s_z is paired with the sign-extending LD1SB/LD1SH/LD1SW
// instructions, AArch64ld1_z with LD1B/LD1H/LD1W/LD1D.
// 2-element contiguous loads
defm : ld1<LD1B_D, LD1B_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
defm : ld1<LD1SB_D, LD1SB_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
defm : ld1<LD1H_D, LD1H_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
defm : ld1<LD1SH_D, LD1SH_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
defm : ld1<LD1W_D, LD1W_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
defm : ld1<LD1SW_D, LD1SW_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
defm : ld1<LD1D, LD1D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
defm : ld1<LD1D, LD1D_IMM, nxv2f64, AArch64ld1_z, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;
// 4-element contiguous loads
defm : ld1<LD1B_S, LD1B_S_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
defm : ld1<LD1SB_S, LD1SB_S_IMM, nxv4i32, AArch64ld1s_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
defm : ld1<LD1H_S, LD1H_S_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
defm : ld1<LD1SH_S, LD1SH_S_IMM, nxv4i32, AArch64ld1s_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
defm : ld1<LD1W, LD1W_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
defm : ld1<LD1W, LD1W_IMM, nxv4f32, AArch64ld1_z, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
// 8-element contiguous loads
defm : ld1<LD1B_H, LD1B_H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16, AArch64ld1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
defm : ld1<LD1H, LD1H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
defm : ld1<LD1H, LD1H_IMM, nxv8f16, AArch64ld1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
// The bf16 variant additionally requires HasBF16.
let Predicates = [HasBF16, HasSVE] in {
defm : ld1<LD1H, LD1H_IMM, nxv8bf16, AArch64ld1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
// 16-element contiguous loads
defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
// ldnf1: selection patterns for a non-faulting contiguous load node whose
// operands are (predicate, address, memory type). Only scalar + immediate
// and base-only addresses are matched; there is no reg + reg form here.
//   I      - scalar + immediate instruction to select.
//   Ty     - result vector type.
//   Load   - load node to match.
//   PredTy - governing predicate type.
//   MemVT  - in-memory vector type, passed as the node's last operand.
multiclass ldnf1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> {
// scalar + immediate (mul vl)
let AddedComplexity = 1 in {
def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)),
(I PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
// base
def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
(I PPR:$gp, GPR64sp:$base, (i64 0))>;
// Instantiations of ldnf1 for the AArch64ldnf1_z and AArch64ldnf1s_z nodes.
// AArch64ldnf1s_z is paired with the sign-extending LDNF1SB/LDNF1SH/LDNF1SW
// instructions.
// 2-element contiguous non-faulting loads
defm : ldnf1<LDNF1B_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i8>;
defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i8>;
defm : ldnf1<LDNF1H_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i16>;
defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i16>;
defm : ldnf1<LDNF1W_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i32>;
defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i32>;
defm : ldnf1<LDNF1D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i64>;
defm : ldnf1<LDNF1D_IMM, nxv2f64, AArch64ldnf1_z, nxv2i1, nxv2f64>;
// 4-element contiguous non-faulting loads
defm : ldnf1<LDNF1B_S_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i8>;
defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s_z, nxv4i1, nxv4i8>;
defm : ldnf1<LDNF1H_S_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i16>;
defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s_z, nxv4i1, nxv4i16>;
defm : ldnf1<LDNF1W_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i32>;
defm : ldnf1<LDNF1W_IMM, nxv4f32, AArch64ldnf1_z, nxv4i1, nxv4f32>;
// 8-element contiguous non-faulting loads
defm : ldnf1<LDNF1B_H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i8>;
defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s_z, nxv8i1, nxv8i8>;
defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i16>;
defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1_z, nxv8i1, nxv8f16>;
// The bf16 variant additionally requires HasBF16.
let Predicates = [HasBF16, HasSVE] in {
defm : ldnf1<LDNF1H_IMM, nxv8bf16, AArch64ldnf1_z, nxv8i1, nxv8bf16>;
// 16-element contiguous non-faulting loads
defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1_z, nxv16i1, nxv16i8>;
// ldff1: selection patterns for a first-faulting contiguous load node whose
// operands are (predicate, address, memory type). Only reg + reg and
// base-only addresses are matched; there is no scalar + immediate form here.
//   I      - reg + reg instruction to select.
//   Ty     - result vector type.
//   Load   - load node to match.
//   PredTy - governing predicate type.
//   MemVT  - in-memory vector type, passed as the node's last operand.
//   AddrCP - ComplexPattern that matches the base + register-offset address.
multiclass ldff1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
// reg + reg
let AddedComplexity = 1 in {
def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)),
(I PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
// Base
// The same reg + reg instruction is used, with XZR as the offset register.
def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
(I PPR:$gp, GPR64sp:$base, XZR)>;
// Instantiations of ldff1 for the AArch64ldff1_z and AArch64ldff1s_z nodes.
// AArch64ldff1s_z is paired with the sign-extending LDFF1SB/LDFF1SH/LDFF1SW
// instructions. This list also covers nxv2f32, which has no entry in the
// ld1 and ldnf1 lists.
// 2-element contiguous first faulting loads
defm : ldff1<LDFF1B_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
defm : ldff1<LDFF1H_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
defm : ldff1<LDFF1W_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
defm : ldff1<LDFF1SW_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
defm : ldff1<LDFF1D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
defm : ldff1<LDFF1W_D, nxv2f32, AArch64ldff1_z, nxv2i1, nxv2f32, am_sve_regreg_lsl2>;
defm : ldff1<LDFF1D, nxv2f64, AArch64ldff1_z, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;
// 4-element contiguous first faulting loads
defm : ldff1<LDFF1B_S, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
defm : ldff1<LDFF1H_S, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
defm : ldff1<LDFF1W, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
defm : ldff1<LDFF1W, nxv4f32, AArch64ldff1_z, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
// 8-element contiguous first faulting loads
defm : ldff1<LDFF1B_H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
// The bf16 variant additionally requires HasBF16.
let Predicates = [HasBF16, HasSVE] in {
defm : ldff1<LDFF1H, nxv8bf16, AArch64ldff1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
// 16-element contiguous first faulting loads
defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
// Selection patterns for one predicated store `Store` of a vector of type
// `Ty`, covering three addressing forms.
//   RegRegInst - instruction used for the reg + reg form.
//   RegImmInst - instruction used for the scalar + immediate and base forms.
//   Ty         - type of the stored vector ($vec).
//   Store      - SelectionDAG operator being matched.
//   PredTy     - type of the governing predicate operand ($gp).
//   MemVT      - memory type operand passed through to `Store`.
//   AddrCP     - ComplexPattern that splits an address into $base and $offset.
// NOTE(review): the closing braces of the `let` scopes and of the multiclass
// are not present in this excerpt.
multiclass st1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty,
SDPatternOperator Store, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
// reg + reg
let AddedComplexity = 1 in {
def : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), MemVT),
(RegRegInst ZPR:$vec, PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
// scalar + immediate (mul vl)
// Given a higher AddedComplexity (2) than the reg + reg form (1).
let AddedComplexity = 2 in {
def : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), MemVT),
(RegImmInst ZPR:$vec, PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
// base
// A plain base address is emitted through the immediate form with offset 0.
def : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp), MemVT),
(RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
// Instantiations of `st1`, grouped by the number of elements per vector.
// Argument order follows the multiclass: reg+reg instruction, reg+imm
// instruction, stored type, store node, predicate type, memory type,
// reg+reg address pattern. As with the loads, the am_sve_regreg_lslN suffix
// tracks the memory element width (lsl0 = 8-bit ... lsl3 = 64-bit).
// 2-element contiguous store
defm : st1<ST1B_D, ST1B_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
defm : st1<ST1H_D, ST1H_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
defm : st1<ST1W_D, ST1W_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
defm : st1<ST1D, ST1D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
// 4-element contiguous store
defm : st1<ST1B_S, ST1B_S_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
defm : st1<ST1H_S, ST1H_S_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
defm : st1<ST1W, ST1W_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
// 8-element contiguous store
defm : st1<ST1B_H, ST1B_H_IMM, nxv8i16, AArch64st1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
defm : st1<ST1H, ST1H_IMM, nxv8i16, AArch64st1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
// 16-element contiguous store
defm : st1<ST1B, ST1B_IMM, nxv16i8, AArch64st1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
// Insert scalar into lane 0 of an undef vector.
// The scalar already lives in an FPR, so it is placed in the matching
// subregister (ssub / dsub) of an IMPLICIT_DEF vector with INSERT_SUBREG.
def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)),
(INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)),
(INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)),
(INSERT_SUBREG (nxv4i32 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
def : Pat<(nxv2i64 (vector_insert (nxv2i64 (undef)), (i64 FPR64:$src), 0)),
(INSERT_SUBREG (nxv2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
// Insert scalar into vector[0]
// Integer scalars in a GPR are merged into $vec with CPY_ZPmR under the
// predicate produced by PTRUE_<size> 1.
// NOTE(review): operand 1 is presumably the predicate pattern that activates
// only the first element -- confirm against the PTRUE operand encoding.
def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)),
(CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>;
def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), (i32 GPR32:$src), 0)),
(CPY_ZPmR_H ZPR:$vec, (PTRUE_H 1), GPR32:$src)>;
def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), (i32 GPR32:$src), 0)),
(CPY_ZPmR_S ZPR:$vec, (PTRUE_S 1), GPR32:$src)>;
def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), (i64 GPR64:$src), 0)),
(CPY_ZPmR_D ZPR:$vec, (PTRUE_D 1), GPR64:$src)>;
// Floating-point scalars are first placed in a subregister (hsub / ssub /
// dsub) of an IMPLICIT_DEF vector, then SEL_ZPZZ chooses between that vector
// and $vec under the same PTRUE_<size> 1 predicate.
def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), 0)),
(SEL_ZPZZ_H (PTRUE_H 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), ZPR:$vec)>;
def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), 0)),
(SEL_ZPZZ_S (PTRUE_S 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), ZPR:$vec)>;
def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), 0)),
(SEL_ZPZZ_D (PTRUE_D 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), ZPR:$vec)>;
// Insert scalar into vector with scalar index
// NOTE(review): every pattern in this group is truncated in this excerpt: the
// result DAG ends on a trailing comma, has unbalanced parentheses and never
// reaches a closing `)>;`. Restore the missing operands from the complete
// file before relying on or editing these patterns.
// What is visible: the result is built on CPY_ZPmR_<size> with $vec as the
// first operand, and involves INDEX_II_<size> 0, 1 together with DUP_ZR_<size>
// of the index. For the B/H/S forms the 64-bit index is narrowed with
// EXTRACT_SUBREG ..., sub_32; the D form passes the 64-bit index directly.
def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), GPR32:$src, GPR64:$index)),
(CPY_ZPmR_B ZPR:$vec,
(INDEX_II_B 0, 1),
(DUP_ZR_B (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), GPR32:$src, GPR64:$index)),
(CPY_ZPmR_H ZPR:$vec,
(INDEX_II_H 0, 1),
(DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), GPR32:$src, GPR64:$index)),
(CPY_ZPmR_S ZPR:$vec,
(INDEX_II_S 0, 1),
(DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), GPR64:$src, GPR64:$index)),
(CPY_ZPmR_D ZPR:$vec,
(INDEX_II_D 0, 1),
(DUP_ZR_D GPR64:$index)),
// Insert FP scalar into vector with scalar index
// Same visible shape as the integer group, but built on CPY_ZPmV_<size>
// because the scalar is in an FPR. These patterns are truncated in the same
// way as the integer ones above.
def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)),
(CPY_ZPmV_H ZPR:$vec,
(INDEX_II_H 0, 1),
(DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)),
(CPY_ZPmV_S ZPR:$vec,
(INDEX_II_S 0, 1),
(DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), GPR64:$index)),
(CPY_ZPmV_D ZPR:$vec,
(INDEX_II_D 0, 1),
(DUP_ZR_D $index)),
// Extract element from vector with immediate index
// The indexed element is broadcast with DUP_ZZI_<size>, then the scalar is
// read back from the low subregister of the result (hsub / ssub / dsub,
// chosen by the scalar type). The sve_elm_idx_extdup_<size> operand restricts
// which immediate indices match.
def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)),
(EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>;
def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
(EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), ssub)>;
def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
(EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
(EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
// Floating-point element types use the same instructions as the integer
// types of equal width.
def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
(EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
(EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
(EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
// Extract element from vector with scalar index
// NOTE(review): only the source DAG of each pattern is present in this
// excerpt; the result DAG and the closing `)>;` are missing for all seven.
// Restore them from the complete file before relying on these patterns.
// The sources match vector_extract with a 64-bit GPR index, one per
// integer / floating-point element type.
def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
// Matrix-multiply and mixed-sign dot-product instructions, each group under
// its own predicate list.
// NOTE(review): the closing braces of these `let` scopes are not present in
// this excerpt, so where each scope ends cannot be confirmed from here.
// The leading binary / integer template argument is presumably an encoding
// field of the multiclass -- confirm against the multiclass definitions.
// Integer forms: predicate list [HasSVE, HasMatMulInt8].
let Predicates = [HasSVE, HasMatMulInt8] in {
defm SMMLA_ZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>;
defm UMMLA_ZZZ : sve_int_matmul<0b11, "ummla", int_aarch64_sve_ummla>;
defm USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla", int_aarch64_sve_usmmla>;
defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", int_aarch64_sve_usdot>;
defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>;
defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>;
// Single-precision FMMLA: predicate list [HasSVE, HasMatMulFP32].
let Predicates = [HasSVE, HasMatMulFP32] in {
defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32, int_aarch64_sve_fmmla, nxv4f32>;
// Double-precision FMMLA: predicate list [HasSVE, HasMatMulFP64].
let Predicates = [HasSVE, HasMatMulFP64] in {
defm FMMLA_ZZZ_D : sve_fp_matrix_mla<1, "fmmla", ZPR64, int_aarch64_sve_fmmla, nxv2f64>;
// LD1RO* loads, scalar + immediate addressing (sve_mem_ldor_si), one per
// element size; all select from the AArch64ld1ro_z node.
// NOTE(review): which `let Predicates` scope encloses the definitions below
// cannot be determined from this excerpt (closing braces are missing).
defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8, nxv16i8, nxv16i1, AArch64ld1ro_z>;
defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16, nxv8i16, nxv8i1, AArch64ld1ro_z>;
defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32, nxv4i32, nxv4i1, AArch64ld1ro_z>;
defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64, nxv2i64, nxv2i1, AArch64ld1ro_z>;
// LD1RO* loads, scalar + scalar addressing (sve_mem_ldor_ss). The offset
// register class and the am_sve_regreg_lslN pattern both follow the element
// size (shifted8 / lsl0 ... shifted64 / lsl3).
defm LD1RO_B : sve_mem_ldor_ss<0b00, "ld1rob", Z_b, ZPR8, GPR64NoXZRshifted8, nxv16i8, nxv16i1, AArch64ld1ro_z, am_sve_regreg_lsl0>;
defm LD1RO_H : sve_mem_ldor_ss<0b01, "ld1roh", Z_h, ZPR16, GPR64NoXZRshifted16, nxv8i16, nxv8i1, AArch64ld1ro_z, am_sve_regreg_lsl1>;
defm LD1RO_W : sve_mem_ldor_ss<0b10, "ld1row", Z_s, ZPR32, GPR64NoXZRshifted32, nxv4i32, nxv4i1, AArch64ld1ro_z, am_sve_regreg_lsl2>;
defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64, nxv2i64, nxv2i1, AArch64ld1ro_z, am_sve_regreg_lsl3>;
// Permutes selected from the *q intrinsics (zip1q, zip2q, uzp1q, uzp2q,
// trn1q, trn2q) via sve_int_perm_bin_perm_128_zz.
defm ZIP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1", int_aarch64_sve_zip1q>;
defm ZIP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2", int_aarch64_sve_zip2q>;
defm UZP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1", int_aarch64_sve_uzp1q>;
defm UZP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2", int_aarch64_sve_uzp2q>;
defm TRN1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1", int_aarch64_sve_trn1q>;
defm TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>;
// bf16 patterns for the same six instructions, added separately under the
// predicate list [HasSVE, HasMatMulFP64, HasBF16].
// NOTE(review): the closing brace of this `let` is not present in this excerpt.
let Predicates = [HasSVE, HasMatMulFP64, HasBF16] in {
def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip1q, nxv8bf16, nxv8bf16, ZIP1_ZZZ_Q>;
def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip2q, nxv8bf16, nxv8bf16, ZIP2_ZZZ_Q>;
def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp1q, nxv8bf16, nxv8bf16, UZP1_ZZZ_Q>;
def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp2q, nxv8bf16, nxv8bf16, UZP2_ZZZ_Q>;
def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn1q, nxv8bf16, nxv8bf16, TRN1_ZZZ_Q>;
def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn2q, nxv8bf16, nxv8bf16, TRN2_ZZZ_Q>;
let Predicates = [HasSVE2] in {
// Each defm below instantiates a multiclass with, in order, one or more
// binary literals (presumably opcode encoding fields -- confirm against the
// multiclass), the assembly mnemonic, and the operator used for selection.
// SVE2 integer multiply-add (indexed)
defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>;
defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>;
// SVE2 saturating multiply-add high (indexed)
defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah_lane>;
defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh_lane>;
// SVE2 saturating multiply-add high (vectors, unpredicated)
defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah>;
defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh>;
// SVE2 integer multiply (indexed)
defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul", int_aarch64_sve_mul_lane>;
// SVE2 saturating multiply high (indexed)
defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh", int_aarch64_sve_sqdmulh_lane>;
defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh", int_aarch64_sve_sqrdmulh_lane>;
// SVE2 signed saturating doubling multiply high (unpredicated)
defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh", int_aarch64_sve_sqdmulh>;
defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh", int_aarch64_sve_sqrdmulh>;
// SVE2 integer multiply vectors (unpredicated)
// MUL_ZZZ selects directly from the generic `mul` node.
defm MUL_ZZZ : sve2_int_mul<0b000, "mul", mul>;
// SMULH_ZZZ / UMULH_ZZZ pass null_frag, so the multiclass attaches no
// selection pattern; their patterns are written out explicitly elsewhere.
defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag>;
defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag>;
defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;
// Add patterns for unpredicated version of smulh and umulh.
// The smulh / umulh intrinsics take a governing predicate. When that
// predicate is AArch64ptrue 31, the predicate operand is dropped and the
// unpredicated SMULH_ZZZ_<size> / UMULH_ZZZ_<size> instruction is selected.
// NOTE(review): 31 is presumably the "all elements" predicate pattern --
// confirm against the ptrue pattern encoding.
// Signed multiply high, one pattern per element size.
def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
(SMULH_ZZZ_B $Op1, $Op2)>;
def : Pat<(nxv8i16 (int_aarch64_sve_smulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
(SMULH_ZZZ_H $Op1, $Op2)>;
def : Pat<(nxv4i32 (int_aarch64_sve_smulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
(SMULH_ZZZ_S $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_smulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
(SMULH_ZZZ_D $Op1, $Op2)>;
// Unsigned multiply high, one pattern per element size.
def : Pat<(nxv16i8 (int_aarch64_sve_umulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
(UMULH_ZZZ_B $Op1, $Op2)>;
def : Pat<(nxv8i16 (int_aarch64_sve_umulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
(UMULH_ZZZ_H $Op1, $Op2)>;
def : Pat<(nxv4i32 (int_aarch64_sve_umulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
(UMULH_ZZZ_S $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
(UMULH_ZZZ_D $Op1, $Op2)>;
// Complex integer instructions. The indexed (_ZZZI) forms select from the
// *_lane intrinsics; the vector (_ZZZ) forms select from the non-lane ones.
// SVE2 complex integer dot product (indexed)
defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>;
// SVE2 complex integer dot product
defm CDOT_ZZZ : sve2_cintx_dot<"cdot", int_aarch64_sve_cdot>;
// SVE2 complex integer multiply-add (indexed)
defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla", int_aarch64_sve_cmla_lane_x>;
// SVE2 complex saturating multiply-add (indexed)
defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_lane_x>;
// SVE2 complex integer multiply-add
defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla", int_aarch64_sve_cmla_x>;
defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_x>;
// Widening multiply and multiply-accumulate instructions. Names ending in
// B / T come in pairs; S / U prefixes pair signed and unsigned variants.
// The binary literal is presumably the opcode field of the multiclass.
// SVE2 integer multiply long (indexed)
defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb", int_aarch64_sve_smullb_lane>;
defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt", int_aarch64_sve_smullt_lane>;
defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb", int_aarch64_sve_umullb_lane>;
defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt", int_aarch64_sve_umullt_lane>;
// SVE2 saturating multiply (indexed)
defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb", int_aarch64_sve_sqdmullb_lane>;
defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt", int_aarch64_sve_sqdmullt_lane>;
// SVE2 integer multiply-add long (indexed)
defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb", int_aarch64_sve_smlalb_lane>;
defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt", int_aarch64_sve_smlalt_lane>;
defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb", int_aarch64_sve_umlalb_lane>;
defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt", int_aarch64_sve_umlalt_lane>;
defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb", int_aarch64_sve_smlslb_lane>;
defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt", int_aarch64_sve_smlslt_lane>;
defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb", int_aarch64_sve_umlslb_lane>;
defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt", int_aarch64_sve_umlslt_lane>;
// SVE2 integer multiply-add long (vectors, unpredicated)
defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb", int_aarch64_sve_smlalb>;
defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt", int_aarch64_sve_smlalt>;
defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb", int_aarch64_sve_umlalb>;
defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt", int_aarch64_sve_umlalt>;
defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb", int_aarch64_sve_smlslb>;
defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt", int_aarch64_sve_smlslt>;
defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb", int_aarch64_sve_umlslb>;
defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt", int_aarch64_sve_umlslt>;
// SVE2 saturating multiply-add long (indexed)
// These reuse the same indexed multiclass as the non-saturating forms above,
// with the remaining 0b01xx opcode values.
defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb", int_aarch64_sve_sqdmlalb_lane>;
defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt", int_aarch64_sve_sqdmlalt_lane>;
defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb", int_aarch64_sve_sqdmlslb_lane>;
defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt", int_aarch64_sve_sqdmlslt_lane>;
// SVE2 saturating multiply-add long (vectors, unpredicated)
defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb", int_aarch64_sve_sqdmlalb>;
defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt", int_aarch64_sve_sqdmlalt>;
defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb", int_aarch64_sve_sqdmlslb>;
defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt", int_aarch64_sve_sqdmlslt>;
// SVE2 saturating multiply-add interleaved long
defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt", int_aarch64_sve_sqdmlalbt>;
defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt", int_aarch64_sve_sqdmlslbt>;
// Predicated (_ZPmZ) integer arithmetic. Most groups share the
// sve2_int_arith_pred multiclass and differ only in the 6-bit literal
// (presumably the opcode field), mnemonic and intrinsic.
// SVE2 integer halving add/subtract (predicated)
defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd", int_aarch64_sve_shadd>;
defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd", int_aarch64_sve_uhadd>;
defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub", int_aarch64_sve_shsub>;
defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub", int_aarch64_sve_uhsub>;
defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", int_aarch64_sve_srhadd>;
defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", int_aarch64_sve_urhadd>;
defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", int_aarch64_sve_shsubr>;
defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", int_aarch64_sve_uhsubr>;
// SVE2 integer pairwise add and accumulate long
defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp", int_aarch64_sve_sadalp>;
defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp", int_aarch64_sve_uadalp>;
// SVE2 integer pairwise arithmetic
defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp", int_aarch64_sve_addp>;
defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp", int_aarch64_sve_smaxp>;
defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp", int_aarch64_sve_umaxp>;
defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp", int_aarch64_sve_sminp>;
defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp", int_aarch64_sve_uminp>;
// SVE2 integer unary operations (predicated)
// URECPE / URSQRTE use the `_s` multiclass variant; SQABS / SQNEG use the
// plain one.
defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe", int_aarch64_sve_urecpe>;
defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte", int_aarch64_sve_ursqrte>;
defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs", int_aarch64_sve_sqabs>;
defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg", int_aarch64_sve_sqneg>;
// SVE2 saturating add/subtract
defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd", int_aarch64_sve_sqadd>;
defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd", int_aarch64_sve_uqadd>;
defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub", int_aarch64_sve_sqsub>;
defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub", int_aarch64_sve_uqsub>;
defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd", int_aarch64_sve_suqadd>;
defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd", int_aarch64_sve_usqadd>;
defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr", int_aarch64_sve_sqsubr>;
defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr", int_aarch64_sve_uqsubr>;
// SVE2 saturating/rounding bitwise shift left (predicated)
// The reversed-operand forms (mnemonics ending in "r") pass null_frag, so
// no selection pattern is attached to them here.
defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl", int_aarch64_sve_srshl>;
defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl", int_aarch64_sve_urshl>;
defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr", null_frag>;
defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr", null_frag>;
defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl", int_aarch64_sve_sqshl>;
defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl", int_aarch64_sve_uqshl>;
defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl", int_aarch64_sve_sqrshl>;
defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl", int_aarch64_sve_uqrshl>;
defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr", null_frag>;
defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr", null_frag>;
defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr", null_frag>;
defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag>;
// Zeroing pseudo definitions (_ZPZI) for the immediate shifts, under the
// predicate list [HasSVE2, UseExperimentalZeroingPseudos].
// SQSHL / UQSHL pass null_frag, so no selection pattern is attached to them.
// NOTE(review): the closing brace of this `let` is not present in this
// excerpt.
let Predicates = [HasSVE2, UseExperimentalZeroingPseudos] in {
defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_srshr>;
defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_urshr>;
defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<int_aarch64_sve_sqshlu>;
// SVE2 predicated shifts
// The string argument after the mnemonic names the matching _ZPZI pseudo
// defined above. SQSHL / UQSHL take no intrinsic argument here.
defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">;
defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">;
defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>;
defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>;
defm SQSHLU_ZPmI : sve2_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>;
// Widening integer arithmetic. The "long" groups share
// sve2_wide_int_arith_long with a 5-bit literal; the "wide" group uses
// sve2_wide_int_arith_wide with a 3-bit literal (both presumably opcode
// fields -- confirm against the multiclass definitions).
// SVE2 integer add/subtract long
defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb", int_aarch64_sve_saddlb>;
defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt", int_aarch64_sve_saddlt>;
defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb", int_aarch64_sve_uaddlb>;
defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt", int_aarch64_sve_uaddlt>;
defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb", int_aarch64_sve_ssublb>;
defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt", int_aarch64_sve_ssublt>;
defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb", int_aarch64_sve_usublb>;
defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt", int_aarch64_sve_usublt>;
defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb", int_aarch64_sve_sabdlb>;
defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt", int_aarch64_sve_sabdlt>;
defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb", int_aarch64_sve_uabdlb>;
defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt", int_aarch64_sve_uabdlt>;
// SVE2 integer add/subtract wide
defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb", int_aarch64_sve_saddwb>;
defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt", int_aarch64_sve_saddwt>;
defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb", int_aarch64_sve_uaddwb>;
defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt", int_aarch64_sve_uaddwt>;
defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb", int_aarch64_sve_ssubwb>;
defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt", int_aarch64_sve_ssubwt>;
defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>;
defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>;
// SVE2 integer multiply long
defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>;
defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt", int_aarch64_sve_sqdmullt>;
defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb", int_aarch64_sve_smullb>;
defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt", int_aarch64_sve_smullt>;
defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb", int_aarch64_sve_umullb>;
defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt", int_aarch64_sve_umullt>;
// PMULLB / PMULLT use a dedicated multiclass and select from the *_pair
// intrinsics.
defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb", int_aarch64_sve_pmullb_pair>;
defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt", int_aarch64_sve_pmullt_pair>;
// Unpredicated shift-and-insert / accumulate instructions, complex add,
// absolute-difference accumulate and add/subtract with carry. Each group has
// its own multiclass; the binary literal is presumably its opcode field.
// SVE2 bitwise shift and insert
defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri", int_aarch64_sve_sri>;
defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli", int_aarch64_sve_sli>;
// SVE2 bitwise shift right and accumulate
defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", int_aarch64_sve_ssra>;
defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", int_aarch64_sve_usra>;
defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra>;
defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra>;
// SVE2 complex integer add
defm CADD_ZZI : sve2_int_cadd<0b0, "cadd", int_aarch64_sve_cadd_x>;
defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd", int_aarch64_sve_sqcadd_x>;
// SVE2 integer absolute difference and accumulate
defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", int_aarch64_sve_saba>;
defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", int_aarch64_sve_uaba>;
// SVE2 integer absolute difference and accumulate long
defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb", int_aarch64_sve_sabalb>;
defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt", int_aarch64_sve_sabalt>;
defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb", int_aarch64_sve_uabalb>;
defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt", int_aarch64_sve_uabalt>;
// SVE2 integer add/subtract long with carry
defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb", int_aarch64_sve_adclb>;
defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt", int_aarch64_sve_adclt>;
defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb", int_aarch64_sve_sbclb>;
defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt", int_aarch64_sve_sbclt>;
// Narrowing instructions. Every operation is defined twice, as a "bottom"
// (...B) and a "top" (...T) form, using parallel multiclasses and the same
// binary literal (presumably the opcode field) for both halves.
// SVE2 bitwise shift right narrow (bottom)
defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb", int_aarch64_sve_sqshrunb>;
defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb", int_aarch64_sve_sqrshrunb>;
defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb", int_aarch64_sve_shrnb>;
defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb", int_aarch64_sve_rshrnb>;
defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb", int_aarch64_sve_sqshrnb>;
defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb", int_aarch64_sve_sqrshrnb>;
defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb", int_aarch64_sve_uqshrnb>;
defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb", int_aarch64_sve_uqrshrnb>;
// SVE2 bitwise shift right narrow (top)
defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt", int_aarch64_sve_sqshrunt>;
defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt", int_aarch64_sve_sqrshrunt>;
defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt", int_aarch64_sve_shrnt>;
defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt", int_aarch64_sve_rshrnt>;
defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt", int_aarch64_sve_sqshrnt>;
defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt", int_aarch64_sve_sqrshrnt>;
defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt", int_aarch64_sve_uqshrnt>;
defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt", int_aarch64_sve_uqrshrnt>;
// SVE2 integer add/subtract narrow high part (bottom)
defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb", int_aarch64_sve_addhnb>;
defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb", int_aarch64_sve_raddhnb>;
defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb", int_aarch64_sve_subhnb>;
defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb", int_aarch64_sve_rsubhnb>;
// SVE2 integer add/subtract narrow high part (top)
defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt", int_aarch64_sve_addhnt>;
defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt", int_aarch64_sve_raddhnt>;
defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt", int_aarch64_sve_subhnt>;
defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt", int_aarch64_sve_rsubhnt>;
// SVE2 saturating extract narrow (bottom)
defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb", int_aarch64_sve_sqxtnb>;
defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb", int_aarch64_sve_uqxtnb>;
defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb", int_aarch64_sve_sqxtunb>;
// SVE2 saturating extract narrow (top)
defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt", int_aarch64_sve_sqxtnt>;
defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt", int_aarch64_sve_uqxtnt>;
defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>;
// Miscellaneous SVE2 integer instructions, one small group per multiclass.
// SVE2 character match
defm MATCH_PPzZZ : sve2_char_match<0b0, "match", int_aarch64_sve_match>;
defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch", int_aarch64_sve_nmatch>;
// SVE2 bitwise exclusive-or interleaved
defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt", int_aarch64_sve_eorbt>;
defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb", int_aarch64_sve_eortb>;
// SVE2 bitwise shift left long
defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb", int_aarch64_sve_sshllb>;
defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt", int_aarch64_sve_sshllt>;
defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb", int_aarch64_sve_ushllb>;
defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt", int_aarch64_sve_ushllt>;
// SVE2 integer add/subtract interleaved long
// Only the literals 0b00, 0b10 and 0b11 are instantiated in this group.
defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>;
defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt", int_aarch64_sve_ssublbt>;
defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb", int_aarch64_sve_ssubltb>;
// SVE2 histogram generation (segment)
// HISTSEG is a single record (`def` of a class), unlike the surrounding
// `defm` multiclass instantiations.
def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg", int_aarch64_sve_histseg>;
// SVE2 histogram generation (vector)
defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt", int_aarch64_sve_histcnt>;
// SVE2 floating-point base 2 logarithm as integer
defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>;
// SVE2 floating-point convert precision
// Unlike the other groups, these pass the intrinsic name as a string literal
// rather than as a record reference.
// NOTE(review): presumably the multiclass appends a type suffix to the string
// and resolves the full name itself -- confirm against the multiclass.
defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding_top<"fcvtxnt", "int_aarch64_sve_fcvtxnt">;
defm FCVTX_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtx", "int_aarch64_sve_fcvtx">;
defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt", "int_aarch64_sve_fcvtnt">;
defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt", "int_aarch64_sve_fcvtlt">;
// SVE2 floating-point pairwise operations
defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp", int_aarch64_sve_faddp>;
defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp", int_aarch64_sve_fmaxnmp>;
defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp", int_aarch64_sve_fminnmp>;
defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp", int_aarch64_sve_fmaxp>;
defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp", int_aarch64_sve_fminp>;
// SVE2 floating-point multiply-add long (indexed)
// The indexed and vector groups use the same 2-bit literals for the same
// mnemonics (fmlalb = 0b00 ... fmlslt = 0b11).
defm FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb", int_aarch64_sve_fmlalb_lane>;
defm FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt", int_aarch64_sve_fmlalt_lane>;
defm FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb", int_aarch64_sve_fmlslb_lane>;
defm FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt", int_aarch64_sve_fmlslt_lane>;
// SVE2 floating-point multiply-add long
defm FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb", int_aarch64_sve_fmlalb>;
defm FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt", int_aarch64_sve_fmlalt>;
defm FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb", int_aarch64_sve_fmlslb>;
defm FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt", int_aarch64_sve_fmlslt>;
// SVE2 bitwise ternary operations
defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", int_aarch64_sve_eor3>;
defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>;
defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl>;
defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>;
defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>;
defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>;
// SVE2 bitwise xor and rotate right by immediate
defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>;
// SVE2 extract vector (immediate offset, constructive)
// A single `def` (not a multiclass expansion); no intrinsic is attached here.
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
// SVE2 non-temporal gather loads
// Arguments: 5-bit opcode, mnemonic, gather node, and a scalable vector type
// (presumably the in-memory element type -- confirm against the multiclass).
// The ldnt1s* mnemonics are paired with AArch64ldnt1s_gather_z; all other
// mnemonics use AArch64ldnt1_gather_z.
// _S records use the 32-bit-pointer multiclass with nxv4* types.
defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv4i8>;
defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather_z, nxv4i8>;
defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv4i16>;
defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather_z, nxv4i16>;
defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather_z, nxv4i32>;
// _D records use the 64-bit-pointer multiclass with nxv2* types.
defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv2i8>;
defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather_z, nxv2i8>;
defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv2i16>;
defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather_z, nxv2i16>;
defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather_z, nxv2i32>;
defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather_z, nxv2i32>;
defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather_z, nxv2i64>;
// SVE2 vector splice (constructive)
// Only the mnemonic is supplied; no intrinsic is attached at this point.
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
// SVE2 non-temporal scatter stores
// Arguments: 3-bit opcode, mnemonic, scatter node (AArch64stnt1_scatter for
// every entry) and a scalable vector type (presumably the in-memory element
// type -- confirm against the multiclass).
// _S records use the 32-bit-pointer multiclass; their opcodes are all odd.
defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>;
defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>;
defm STNT1W_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>;
// _D records use the 64-bit-pointer multiclass; their opcodes are all even.
defm STNT1B_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>;
defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>;
defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>;
defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>;
// SVE2 table lookup (three sources)
defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>;
defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", int_aarch64_sve_tbx>;
// Extra bf16 selection patterns for the _H variants defined above, gated on
// HasSVE together with HasBF16. The tbl2 pattern packs $Op1/$Op2 into a ZPR2
// register pair (zsub0/zsub1) with REG_SEQUENCE.
// NOTE(review): the predicate list names HasSVE although the instructions sit
// in the SVE2 section -- confirm this is intended.
// NOTE(review): in this copy the tbl2 Pat ends after the REG_SEQUENCE operand;
// the remaining operand(s) and the closing "))>;" / "}" are not present here.
let Predicates = [HasSVE, HasBF16] in {
def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_tbx, nxv8bf16, nxv8bf16, nxv8i16, TBX_ZZZ_H>;
def : Pat<(nxv8bf16 (int_aarch64_sve_tbl2 nxv8bf16:$Op1, nxv8bf16:$Op2, nxv8i16:$Op3)),
(nxv8bf16 (TBL_ZZZZ_H (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, nxv8bf16:$Op2, zsub1),
// SVE2 integer compare scalar count and limit
// Each mnemonic is instantiated twice with the same opcode and intrinsic:
// the _PWW record through sve_int_while4_rr and the _PXX record through
// sve_int_while8_rr.
defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege", int_aarch64_sve_whilege>;
defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt", int_aarch64_sve_whilegt>;
defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs", int_aarch64_sve_whilehs>;
defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi", int_aarch64_sve_whilehi>;
defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege", int_aarch64_sve_whilege>;
defm WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt", int_aarch64_sve_whilegt>;
defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs", int_aarch64_sve_whilehs>;
defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi>;
// SVE2 pointer conflict compare
// Arguments: 1-bit opcode, mnemonic, and the intrinsic name passed as a
// quoted string (not a record reference).
defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">;
defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
// Instructions gated on the HasSVE2AES predicate.
let Predicates = [HasSVE2AES] in {
// SVE2 crypto destructive binary operations
// Arguments: 2-bit opcode, mnemonic, register class (ZPR8), intrinsic, type.
defm AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8, int_aarch64_sve_aese, nxv16i8>;
defm AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8, int_aarch64_sve_aesd, nxv16i8>;
// SVE2 crypto unary operations
defm AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc", int_aarch64_sve_aesmc>;
defm AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc", int_aarch64_sve_aesimc>;
// PMULLB and PMULLT instructions which operate with 64-bit source and
// 128-bit destination elements are enabled with crypto extensions, similar
// to NEON PMULL2 instruction.
// These _Q records are selected from the "_pair" intrinsics.
defm PMULLB_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11010, "pmullb", int_aarch64_sve_pmullb_pair>;
defm PMULLT_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11011, "pmullt", int_aarch64_sve_pmullt_pair>;
// Instructions gated on the HasSVE2SM4 predicate; both operate on ZPR32 /
// nxv4i32.
let Predicates = [HasSVE2SM4] in {
// SVE2 crypto constructive binary operations
defm SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32, int_aarch64_sve_sm4ekey, nxv4i32>;
// SVE2 crypto destructive binary operations
defm SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32, int_aarch64_sve_sm4e, nxv4i32>;
// Instruction gated on the HasSVE2SHA3 predicate; operates on ZPR64 / nxv2i64.
let Predicates = [HasSVE2SHA3] in {
// SVE2 crypto constructive binary operations
defm RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64, int_aarch64_sve_rax1, nxv2i64>;
// Instructions gated on the HasSVE2BitPerm predicate.
let Predicates = [HasSVE2BitPerm] in {
// SVE2 bitwise permute
// Arguments: 4-bit opcode, mnemonic, and the "_x" form of the intrinsic.
defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext", int_aarch64_sve_bext_x>;
defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep", int_aarch64_sve_bdep_x>;
defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp", int_aarch64_sve_bgrp_x>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index a005d1e65abe..c56a65b9e212 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -1,7831 +1,7835 @@
//=-- SVEInstrFormats.td - AArch64 SVE Instruction classes -*- tablegen -*--=//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// AArch64 Scalable Vector Extension (SVE) Instruction Class Definitions.
// Type profile for AArch64ISD::SETCC_MERGE_ZERO: 1 result, 4 operands.
// Indices 0-3 (the result and the first three operands) are vectors; indices
// 0 and 1 have i1 elements; indices 2 and 3 share one type; index 4 is OtherVT
// (presumably the condition code -- confirm).
// NOTE(review): the line closing this constraint list ("]>;") is not present
// in this copy.
def SDT_AArch64Setcc : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
SDTCVecEltisVT<0, i1>, SDTCVecEltisVT<1, i1>, SDTCisSameAs<2, 3>,
SDTCisVT<4, OtherVT>
def AArch64setcc_z : SDNode<"AArch64ISD::SETCC_MERGE_ZERO", SDT_AArch64Setcc>;
// Assembly-parser operand class for the predicate-pattern operand: names the
// parser, predicate and render hooks and the diagnostic reported on mismatch.
def SVEPatternOperand : AsmOperandClass {
let Name = "SVEPattern";
let ParserMethod = "tryParseSVEPattern";
let PredicateMethod = "isSVEPattern";
let RenderMethod = "addImmOperands";
let DiagnosticType = "InvalidSVEPattern";
// i32 immediate operand for predicate patterns; as an ISel leaf it only
// matches values in [0, 31] (unsigned compare against 32).
def sve_pred_enum : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) < 32);
}]> {
let PrintMethod = "printSVEPattern";
let ParserMatchClass = SVEPatternOperand;
// Assembly-parser operand class for SVE prefetch operations; the parser and
// printer hooks are the <true> instantiations of the prefetch templates.
def SVEPrefetchOperand : AsmOperandClass {
let Name = "SVEPrefetch";
let ParserMethod = "tryParsePrefetch<true>";
let PredicateMethod = "isPrefetch";
let RenderMethod = "addPrefetchOperands";
// i32 immediate operand for the prefetch operation; as an ISel leaf it only
// matches values in [0, 15].
def sve_prfop : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) <= 15);
}]> {
let PrintMethod = "printPrefetchOp<true>";
let ParserMatchClass = SVEPrefetchOperand;
// Assembly-parser operand class for an SVE logical immediate of the given
// element Width. The class name and the intN_t template argument of the
// predicate/render hooks are built by string concatenation from Width.
class SVELogicalImmOperand<int Width> : AsmOperandClass {
let Name = "SVELogicalImm" # Width;
let DiagnosticType = "LogicalSecondSource";
let PredicateMethod = "isLogicalImm<int" # Width # "_t>";
let RenderMethod = "addLogicalImmOperands<int" # Width # "_t>";
// The three operands below differ only in element width. Each
// MCOperandPredicate rejects non-immediates, decodes the encoded logical
// immediate as a 64-bit value and then checks it with
// isSVEMaskOfIdenticalElements<intN_t>.
// NOTE(review): the "}];" / "}" closing lines are not present in this copy.
def sve_logical_imm8 : Operand<i64> {
let ParserMatchClass = SVELogicalImmOperand<8>;
let PrintMethod = "printLogicalImm<int8_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int8_t>(Val);
def sve_logical_imm16 : Operand<i64> {
let ParserMatchClass = SVELogicalImmOperand<16>;
let PrintMethod = "printLogicalImm<int16_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int16_t>(Val);
def sve_logical_imm32 : Operand<i64> {
let ParserMatchClass = SVELogicalImmOperand<32>;
let PrintMethod = "printLogicalImm<int32_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int32_t>(Val);
// Assembly-parser operand class for the "preferred" logical-immediate form.
// It uses its own predicate (isSVEPreferredLogicalImm<intN_t>) but the same
// render hook as the plain logical-immediate operand class.
class SVEPreferredLogicalImmOperand<int Width> : AsmOperandClass {
let Name = "SVEPreferredLogicalImm" # Width;
let PredicateMethod = "isSVEPreferredLogicalImm<int" # Width # "_t>";
let RenderMethod = "addLogicalImmOperands<int" # Width # "_t>";
// 16/32/64-bit operands using the class above; all print through
// printSVELogicalImm<intN_t>.
// NOTE(review): in this copy each MCOperandPredicate ends on a dangling "&&";
// the second half of the condition and the closing "}];" / "}" lines are not
// present, so the full predicate cannot be read from here.
def sve_preferred_logical_imm16 : Operand<i64> {
let ParserMatchClass = SVEPreferredLogicalImmOperand<16>;
let PrintMethod = "printSVELogicalImm<int16_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int16_t>(Val) &&
def sve_preferred_logical_imm32 : Operand<i64> {
let ParserMatchClass = SVEPreferredLogicalImmOperand<32>;
let PrintMethod = "printSVELogicalImm<int32_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int32_t>(Val) &&
def sve_preferred_logical_imm64 : Operand<i64> {
let ParserMatchClass = SVEPreferredLogicalImmOperand<64>;
let PrintMethod = "printSVELogicalImm<int64_t>";
let MCOperandPredicate = [{
if (!MCOp.isImm())
return false;
int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
return AArch64_AM::isSVEMaskOfIdenticalElements<int64_t>(Val) &&
// Variant of the logical-immediate operand class with a "Not" suffix in its
// name. It shares the isLogicalImm<intN_t> predicate but renders through
// addLogicalImmNotOperands<intN_t> (presumably emitting the inverted
// immediate -- confirm in the asm parser).
class SVELogicalImmNotOperand<int Width> : AsmOperandClass {
let Name = "SVELogicalImm" # Width # "Not";
let DiagnosticType = "LogicalSecondSource";
let PredicateMethod = "isLogicalImm<int" # Width # "_t>";
let RenderMethod = "addLogicalImmNotOperands<int" # Width # "_t>";
// 8/16/32-bit operands using the class above; only ParserMatchClass is set.
def sve_logical_imm8_not : Operand<i64> {
let ParserMatchClass = SVELogicalImmNotOperand<8>;
def sve_logical_imm16_not : Operand<i64> {
let ParserMatchClass = SVELogicalImmNotOperand<16>;
def sve_logical_imm32_not : Operand<i64> {
let ParserMatchClass = SVELogicalImmNotOperand<32>;
// Assembly-parser operand class for an immediate with an optional shift.
//   ElementWidth - element size, used only to build the class name.
//   Infix        - "Cpy" or "AddSub" in the instances below; part of the name.
//   Predicate    - name of the asm-parser predicate method to use.
// The diagnostic type is derived from the generated Name ("Invalid" # Name).
class SVEShiftedImmOperand<int ElementWidth, string Infix, string Predicate>
: AsmOperandClass {
let Name = "SVE" # Infix # "Imm" # ElementWidth;
let DiagnosticType = "Invalid" # Name;
let RenderMethod = "addImmWithOptionalShiftOperands<8>";
let ParserMethod = "tryParseImmWithOptionalShift";
let PredicateMethod = Predicate;
// "Cpy" instances: predicate isSVECpyImm<intN_t>.
def SVECpyImmOperand8 : SVEShiftedImmOperand<8, "Cpy", "isSVECpyImm<int8_t>">;
def SVECpyImmOperand16 : SVEShiftedImmOperand<16, "Cpy", "isSVECpyImm<int16_t>">;
def SVECpyImmOperand32 : SVEShiftedImmOperand<32, "Cpy", "isSVECpyImm<int32_t>">;
def SVECpyImmOperand64 : SVEShiftedImmOperand<64, "Cpy", "isSVECpyImm<int64_t>">;
// "AddSub" instances: predicate isSVEAddSubImm<intN_t>.
def SVEAddSubImmOperand8 : SVEShiftedImmOperand<8, "AddSub", "isSVEAddSubImm<int8_t>">;
def SVEAddSubImmOperand16 : SVEShiftedImmOperand<16, "AddSub", "isSVEAddSubImm<int16_t>">;
def SVEAddSubImmOperand32 : SVEShiftedImmOperand<32, "AddSub", "isSVEAddSubImm<int32_t>">;
def SVEAddSubImmOperand64 : SVEShiftedImmOperand<64, "AddSub", "isSVEAddSubImm<int64_t>">;
// Machine operand for an 8-bit immediate with an optional shift. It is
// modelled as two i32 sub-operands (MIOperandInfo); going by the patterns that
// use it these are the immediate and the shift amount.
//   ElementWidth - template argument of the decoder method.
//   printType    - C++ type used as template argument of the print method.
//   OpndClass    - asm-parser operand class to match against.
class imm8_opt_lsl<int ElementWidth, string printType,
AsmOperandClass OpndClass>
: Operand<i32> {
let EncoderMethod = "getImm8OptLsl";
let DecoderMethod = "DecodeImm8OptLsl<" # ElementWidth # ">";
let PrintMethod = "printImm8OptLsl<" # printType # ">";
let ParserMatchClass = OpndClass;
let MIOperandInfo = (ops i32imm, i32imm);
// The cpy_* operands print through signed types (intN_t)...
def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8>;
def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16>;
def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32>;
def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64>;
// ...whereas the addsub_* operands print through unsigned types (uintN_t).
def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8>;
def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16>;
def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32>;
def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64>;
// ComplexPattern definitions: <result type, number of produced operands,
// C++ selection function, root nodes>.
// Add/sub immediates: produce two operands (used as $imm and $shift by the
// pattern helper classes in this file), one selector per element type.
def SVEAddSubImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8>", []>;
def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16>", []>;
def SVEAddSubImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32>", []>;
def SVEAddSubImm64Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i64>", []>;
// Logical immediates: produce a single i64 operand.
def SVELogicalImm8Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i8>", []>;
def SVELogicalImm16Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i16>", []>;
def SVELogicalImm32Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i32>", []>;
def SVELogicalImm64Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64>", []>;
// The only pattern in this group with a root-node list ([imm]).
def SVE8BitLslImm : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>;
// Unsigned / signed arithmetic immediates, one operand each.
def SVEArithUImmPat : ComplexPattern<i32, 1, "SelectSVEArithImm", []>;
def SVEArithSImmPat : ComplexPattern<i32, 1, "SelectSVESignedArithImm", []>;
// Shift immediate selected through SelectSVEShiftImm64<0, 64>.
def SVEShiftImm64 : ComplexPattern<i32, 1, "SelectSVEShiftImm64<0, 64>", []>;
// Assembly-parser operand class for a floating-point immediate that must be
// one of two exact values. ValA/ValB are spliced verbatim into the template
// arguments of the predicate and render hooks; Suffix only affects the name.
class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
let Name = "SVEExactFPImmOperand" # Suffix;
let DiagnosticType = "Invalid" # Name;
let ParserMethod = "tryParseFPImm<false>";
let PredicateMethod = "isExactFPImm<" # ValA # ", " # ValB # ">";
let RenderMethod = "addExactFPImmOperands<" # ValA # ", " # ValB # ">";
// Machine operand wrapping the class above; prints via printExactFPImm with
// the same two values.
class SVEExactFPImmOperand<string Suffix, string ValA, string ValB> : Operand<i32> {
let PrintMethod = "printExactFPImm<" # ValA # ", " # ValB # ">";
let ParserMatchClass = SVEExactFPImm<Suffix, ValA, ValB>;
// NOTE(review): in this copy each of the three defs below ends after its
// second template argument; the third argument (ValB) and the closing ">;"
// are not present, so the second permitted value cannot be read from here.
def sve_fpimm_half_one
: SVEExactFPImmOperand<"HalfOne", "AArch64ExactFPImm::half",
def sve_fpimm_half_two
: SVEExactFPImmOperand<"HalfTwo", "AArch64ExactFPImm::half",
def sve_fpimm_zero_one
: SVEExactFPImmOperand<"ZeroOne", "AArch64ExactFPImm::zero",
// i32 multiplier immediate used by the element-count instructions below. As an
// ISel leaf it only matches values in [1, 16]; custom encoder/decoder methods
// translate between that range and the instruction field.
def sve_incdec_imm : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
}]> {
let ParserMatchClass = Imm1_16Operand;
let EncoderMethod = "getSVEIncDecImm";
let DecoderMethod = "DecodeSVEIncDecImm";
// This allows i32 immediate extraction from i64 based arithmetic.
// The two patterns differ only in the last SelectCntImm template argument
// (false for the multiply form, true for the shift-left form).
def sve_cnt_mul_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, false>">;
def sve_cnt_shl_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, true>">;
// SVE PTrue - These are used extensively throughout the pattern matching so
// it's important we define them first.
//
// Instruction format that takes a predicate-pattern immediate and writes the
// predicate register $Pd; selected from (op $pattern).
//   sz8_64 - element-size field, placed in Inst{23-22}.
//   opc    - 3-bit opcode, split across Inst{18-17} and Inst{16}.
//   pprty  - predicate register operand class of $Pd.
//   vt     - value type of the result in the selection pattern.
//   op     - DAG operator the instruction is selected from.
class sve_int_ptrue<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty,
ValueType vt, SDPatternOperator op>
: I<(outs pprty:$Pd), (ins sve_pred_enum:$pattern),
asm, "\t$Pd, $pattern",
[(set (vt pprty:$Pd), (op sve_pred_enum:$pattern))]>, Sched<[]> {
bits<4> Pd;
bits<5> pattern;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b011;
let Inst{18-17} = opc{2-1};
let Inst{16} = opc{0};
let Inst{15-10} = 0b111000;
let Inst{9-5} = pattern;
let Inst{4} = 0b0;
let Inst{3-0} = Pd;
// Variants whose low opcode bit is set additionally define NZCV.
let Defs = !if(!eq (opc{0}, 1), [NZCV], []);
// Expands sve_int_ptrue for the four element sizes (_B/_H/_S/_D), pairing each
// predicate register class with its nxvNi1 type. Also adds, per size, an
// assembly alias that omits the pattern operand and fixes it to 0b11111.
multiclass sve_int_ptrue<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_ptrue<0b00, opc, asm, PPR8, nxv16i1, op>;
def _H : sve_int_ptrue<0b01, opc, asm, PPR16, nxv8i1, op>;
def _S : sve_int_ptrue<0b10, opc, asm, PPR32, nxv4i1, op>;
def _D : sve_int_ptrue<0b11, opc, asm, PPR64, nxv2i1, op>;
// Pattern-less aliases: "<asm> $Pd" == "<asm> $Pd, #0b11111".
def : InstAlias<asm # "\t$Pd",
(!cast<Instruction>(NAME # _B) PPR8:$Pd, 0b11111), 1>;
def : InstAlias<asm # "\t$Pd",
(!cast<Instruction>(NAME # _H) PPR16:$Pd, 0b11111), 1>;
def : InstAlias<asm # "\t$Pd",
(!cast<Instruction>(NAME # _S) PPR32:$Pd, 0b11111), 1>;
def : InstAlias<asm # "\t$Pd",
(!cast<Instruction>(NAME # _D) PPR64:$Pd, 0b11111), 1>;
// AArch64ISD::PTRUE node: one vector result, one i32 operand (index 1).
def SDT_AArch64PTrue : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
def AArch64ptrue : SDNode<"AArch64ISD::PTRUE", SDT_AArch64PTrue>;
// PTRUE is selected from the AArch64ptrue node; PTRUES is given null_frag and
// so has no selection pattern. PTRUES uses opcode 0b001, i.e. the variant for
// which sve_int_ptrue adds NZCV to Defs.
let Predicates = [HasSVE] in {
defm PTRUE : sve_int_ptrue<0b000, "ptrue", AArch64ptrue>;
defm PTRUES : sve_int_ptrue<0b001, "ptrues", null_frag>;
// SVE pattern match helpers.
// Naming used below: vtd/vt = result type, vtN = type of operand N,
// inst = instruction to emit, op = DAG operator being matched.

// (op $Op1) -> (inst $Op1).
class SVE_1_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
Instruction inst>
: Pat<(vtd (op vt1:$Op1)),
(inst $Op1)>;
// Matches op with the splatted (AArch64dup) immediate as the FIRST operand and
// the register as the second; still emits (inst $Op1, $imm, $shift).
class SVE_1_Op_Imm_OptLsl_Reverse_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))), (vt zprty:$Op1))),
(inst $Op1, i32:$imm, i32:$shift)>;
// As above with the operands in natural order: register first, splatted
// immediate (imm + shift from the complex pattern) second.
class SVE_1_Op_Imm_OptLsl_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))))),
(inst $Op1, i32:$imm, i32:$shift)>;
// Register op splatted immediate, where the complex pattern yields one i32.
class SVE_1_Op_Imm_Arith_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
(inst $Op1, i32:$imm)>;
// Predicated op whose predicate is (AArch64ptrue 31), with a splatted
// immediate of operand type ImmTy; the emitted instruction takes no predicate.
class SVE_1_Op_Imm_Shift_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
ZPRRegOp zprty, Operand ImmTy, Instruction inst>
: Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (ImmTy:$imm))))),
(inst $Op1, ImmTy:$imm)>;
// Same shape as the Shift_Pred helper, but the immediate comes from a complex
// pattern yielding one i32.
class SVE_1_Op_Imm_Arith_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
(inst $Op1, i32:$imm)>;
// Register op splatted logical immediate; the complex pattern yields one i64.
class SVE_1_Op_Imm_Log_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i64:$imm)))))),
(inst $Op1, i64:$imm)>;
// (op $Op1, $Op2) -> (inst $Op1, $Op2).
class SVE_2_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, Instruction inst>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
(inst $Op1, $Op2)>;
// Two-operand pattern whose instruction result is inserted into an undefined
// (IMPLICIT_DEF) value of type vtd at sub-register index `sub`.
class SVE_2_Op_Pat_Reduce_To_Neon<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, Instruction inst, SubRegIndex sub>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
(INSERT_SUBREG (vtd (IMPLICIT_DEF)), (inst $Op1, $Op2), sub)>;
// (op $Op1, $Op2, $Op3) -> (inst $Op1, $Op2, $Op3).
class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, Instruction inst>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)),
(inst $Op1, $Op2, $Op3)>;
// (op $Op1, $Op2, $Op3, $Op4) -> (inst $Op1, $Op2, $Op3, $Op4).
class SVE_4_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, ValueType vt4,
Instruction inst>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, vt4:$Op4)),
(inst $Op1, $Op2, $Op3, $Op4)>;
// N-operand patterns whose LAST operand is an immediate of operand type ImmTy
// (value type vtN); the other operands are passed through unchanged.
class SVE_2_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, Operand ImmTy, Instruction inst>
: Pat<(vtd (op vt1:$Op1, (vt2 ImmTy:$Op2))),
(inst $Op1, ImmTy:$Op2)>;
// Three-operand form; $Op3 is the immediate.
class SVE_3_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, Operand ImmTy,
Instruction inst>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2, (vt3 ImmTy:$Op3))),
(inst $Op1, $Op2, ImmTy:$Op3)>;
// Four-operand form; $Op4 is the immediate.
class SVE_4_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, ValueType vt4,
Operand ImmTy, Instruction inst>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))),
(inst $Op1, $Op2, $Op3, ImmTy:$Op4)>;
// Zero-operand complex patterns: they match a node via SelectDupZero /
// SelectDupZeroOrUndef and contribute no operands to the output.
def SVEDup0 : ComplexPattern<i64, 0, "SelectDupZero", []>;
def SVEDup0Undef : ComplexPattern<i64, 0, "SelectDupZeroOrUndef", []>;
// Patterns for a predicated op whose second operand is
// (vselect $Op1, $Op2, zero), where the select uses the same predicate $Op1
// as the op itself. AddedComplexity = 1 raises their matching priority.
let AddedComplexity = 1 in {
// NOTE(review): the source pattern casts to vtd twice ("(vtd (vtd ...").
class SVE_3_Op_Pat_SelZero<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, ValueType vt3, Instruction inst>
: Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))),
(inst $Op1, $Op2, $Op3)>;
// Same shape with an i32 immediate third operand; note that the parameter
// named vt3 is an Operand here, not a ValueType.
class SVE_3_Op_Pat_Shift_Imm_SelZero<ValueType vtd, SDPatternOperator op,
ValueType vt1, ValueType vt2,
Operand vt3, Instruction inst>
: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), (i32 (vt3:$Op3)))),
(inst $Op1, $Op2, vt3:$Op3)>;
// Common but less generic patterns.
// These select an unpredicated DAG op to a predicated instruction by
// materialising the governing predicate with (ptrue 31).
// One-operand form: the instruction's first operand is IMPLICIT_DEF.
class SVE_1_Op_AllActive_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
Instruction inst, Instruction ptrue>
: Pat<(vtd (op vt1:$Op1)),
(inst (IMPLICIT_DEF), (ptrue 31), $Op1)>;
// Two-operand form: the predicate is followed directly by $Op1 and $Op2.
class SVE_2_Op_AllActive_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
ValueType vt2, Instruction inst, Instruction ptrue>
: Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
(inst (ptrue 31), $Op1, $Op2)>;
// Pseudo -> Instruction mappings
// Relates records deriving from SVEPseudo2Instr that share a PseudoName: the
// key column is IsInstr == 0 and the value column is IsInstr == 1, so a lookup
// goes from the pseudo to the real instruction.
def getSVEPseudoMap : InstrMapping {
let FilterClass = "SVEPseudo2Instr";
let RowFields = ["PseudoName"];
let ColFields = ["IsInstr"];
let KeyCol = ["0"];
let ValueCols = [["1"]];
// Mix-in providing the two fields the mapping above reads.
//   name  - shared PseudoName linking a pseudo with its instruction.
//   instr - 1 for the real instruction, 0 for the pseudo.
class SVEPseudo2Instr<string name, bit instr> {
string PseudoName = name;
bit IsInstr = instr;
// Lookup e.g. DIV -> DIVR
// Key column isReverseInstr == 0, value column isReverseInstr == 1.
def getSVERevInstr : InstrMapping {
let FilterClass = "SVEInstr2Rev";
let RowFields = ["InstrName"];
let ColFields = ["isReverseInstr"];
let KeyCol = ["0"];
let ValueCols = [["1"]];
// Lookup e.g. DIVR -> DIV
// Same table as above with the key and value columns swapped.
def getSVENonRevInstr : InstrMapping {
let FilterClass = "SVEInstr2Rev";
let RowFields = ["InstrName"];
let ColFields = ["isReverseInstr"];
let KeyCol = ["1"];
let ValueCols = [["0"]];
// Mix-in providing the fields read by the two mappings above. InstrName is
// name1 when this record is the reversed form and name2 otherwise, so both
// members of a pair end up with the same row key provided name2 names the
// partner instruction.
class SVEInstr2Rev<string name1, string name2, bit name1IsReverseInstr> {
string InstrName = !if(name1IsReverseInstr, name1, name2);
bit isReverseInstr = name1IsReverseInstr;
// Pseudos for destructive operands
// Both classes register themselves in the pseudo map as the pseudo side
// (IsInstr = 0), carry no selection pattern and no scheduling info, and record
// `flags` in FalseLanes.
let hasNoSchedulingInfo = 1 in {
// Predicate + two vector sources -> separate destination $Zd.
class PredTwoOpPseudo<string name, ZPRRegOp zprty,
FalseLanesEnum flags = FalseLanesNone>
: SVEPseudo2Instr<name, 0>,
Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2), []> {
let FalseLanes = flags;
// Predicate + one vector source + immediate -> separate destination $Zd.
class PredTwoOpImmPseudo<string name, ZPRRegOp zprty, Operand immty,
FalseLanesEnum flags = FalseLanesNone>
: SVEPseudo2Instr<name, 0>,
Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> {
let FalseLanes = flags;
// SVE Predicate Misc Group
// Format with no inputs that writes predicate register $Pd (PPR8); it has no
// selection pattern. The 6-bit opc is scattered over Inst{23-22},
// Inst{18-16} and Inst{9}.
class sve_int_pfalse<bits<6> opc, string asm>
: I<(outs PPR8:$Pd), (ins),
asm, "\t$Pd",
[]>, Sched<[]> {
bits<4> Pd;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = opc{5-4};
let Inst{21-19} = 0b011;
let Inst{18-16} = opc{3-1};
let Inst{15-10} = 0b111001;
let Inst{9} = opc{0};
let Inst{8-4} = 0b00000;
let Inst{3-0} = Pd;
// Format taking a governing predicate $Pg and a predicate $Pn; it has no
// register outputs and no selection pattern, and its only declared effect is
// defining NZCV. The 6-bit opc is scattered over Inst{23-22}, Inst{18-16} and
// Inst{9}.
class sve_int_ptest<bits<6> opc, string asm>
: I<(outs), (ins PPRAny:$Pg, PPR8:$Pn),
asm, "\t$Pg, $Pn",
[]>, Sched<[]> {
bits<4> Pg;
bits<4> Pn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = opc{5-4};
let Inst{21-19} = 0b010;
let Inst{18-16} = opc{3-1};
let Inst{15-14} = 0b11;
let Inst{13-10} = Pg;
let Inst{9} = opc{0};
let Inst{8-5} = Pn;
let Inst{4-0} = 0b00000;
let Defs = [NZCV];
// Shared format for the pfirst/pnext multiclasses below: a destructive
// predicate operation where the output $Pdn is tied to the input $_Pdn and
// $Pg is the governing predicate. Defines NZCV; no selection pattern here.
//   sz8_64 - element-size field, Inst{23-22}.
//   opc    - 5-bit opcode, split across Inst{18-16} and Inst{10-9}.
//   pprty  - predicate register operand class for $Pdn/$_Pdn.
class sve_int_pfirst_next<bits<2> sz8_64, bits<5> opc, string asm,
PPRRegOp pprty>
: I<(outs pprty:$Pdn), (ins PPRAny:$Pg, pprty:$_Pdn),
asm, "\t$Pdn, $Pg, $_Pdn",
[]>, Sched<[]> {
bits<4> Pdn;
bits<4> Pg;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b011;
let Inst{18-16} = opc{4-2};
let Inst{15-11} = 0b11000;
let Inst{10-9} = opc{1-0};
let Inst{8-5} = Pg;
let Inst{4} = 0;
let Inst{3-0} = Pdn;
let Constraints = "$Pdn = $_Pdn";
let Defs = [NZCV];
// Only a _B (PPR8 / nxv16i1) variant is defined. Note that it passes 0b01 as
// the size field, whereas the _B variant of sve_int_pnext passes 0b00.
multiclass sve_int_pfirst<bits<5> opc, string asm, SDPatternOperator op> {
def _B : sve_int_pfirst_next<0b01, opc, asm, PPR8>;
def : SVE_2_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, !cast<Instruction>(NAME # _B)>;
// Expands sve_int_pfirst_next for all four element sizes and adds a
// two-operand selection pattern for each matching nxvNi1 predicate type.
multiclass sve_int_pnext<bits<5> opc, string asm, SDPatternOperator op> {
def _B : sve_int_pfirst_next<0b00, opc, asm, PPR8>;
def _H : sve_int_pfirst_next<0b01, opc, asm, PPR16>;
def _S : sve_int_pfirst_next<0b10, opc, asm, PPR32>;
def _D : sve_int_pfirst_next<0b11, opc, asm, PPR64>;
def : SVE_2_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i1, op, nxv8i1, nxv8i1, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i1, op, nxv4i1, nxv4i1, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i1, op, nxv2i1, nxv2i1, !cast<Instruction>(NAME # _D)>;
// SVE Predicate Count Group
// Destructive scalar format: general-purpose register $Rdn (tied to the input
// $_Rdn) combined with predicate $Pg. No selection pattern is attached here;
// the multiclasses below add them.
//   sz8_64 - element-size field, Inst{23-22}.
//   opc    - 5-bit opcode, split across Inst{18-16} and Inst{10-9}.
//   dty    - register operand class of the destination.
//   pprty  - predicate register operand class.
//   sty    - register operand class of the tied source (may differ from dty).
class sve_int_count_r<bits<2> sz8_64, bits<5> opc, string asm,
RegisterOperand dty, PPRRegOp pprty, RegisterOperand sty>
: I<(outs dty:$Rdn), (ins pprty:$Pg, sty:$_Rdn),
asm, "\t$Rdn, $Pg",
[]>, Sched<[]> {
bits<5> Rdn;
bits<4> Pg;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b101;
let Inst{18-16} = opc{4-2};
let Inst{15-11} = 0b10001;
let Inst{10-9} = opc{1-0};
let Inst{8-5} = Pg;
let Inst{4-0} = Rdn;
// Signed 32bit forms require their GPR operand printed.
// This replaces the asm string passed to I<> above: when opc bits 4 and 2-0
// are all zero the tied source $_Rdn is printed as a third operand.
let AsmString = !if(!eq(opc{4,2-0}, 0b0000),
!strconcat(asm, "\t$Rdn, $Pg, $_Rdn"),
!strconcat(asm, "\t$Rdn, $Pg"));
let Constraints = "$Rdn = $_Rdn";
// Variants with a 64-bit destination (GPR64z) and a GPR64as32 tied source.
// For each predicate type two patterns are added:
//  * i32 result: $Rn is widened with INSERT_SUBREG into an IMPLICIT_DEF, and
//    the low 32 bits of the instruction result are taken with EXTRACT_SUBREG.
//  * i64 (sext (i32 ...)): the 64-bit instruction result is used directly,
//    with no separate sign-extension emitted.
multiclass sve_int_count_r_s32<bits<5> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64as32>;
def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64as32>;
def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64as32>;
def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64as32>;
def : Pat<(i32 (op GPR32:$Rn, (nxv16i1 PPRAny:$Pg))),
(EXTRACT_SUBREG (!cast<Instruction>(NAME # _B) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32)), sub_32)>;
def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (nxv16i1 PPRAny:$Pg))))),
(!cast<Instruction>(NAME # _B) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32))>;
def : Pat<(i32 (op GPR32:$Rn, (nxv8i1 PPRAny:$Pg))),
(EXTRACT_SUBREG (!cast<Instruction>(NAME # _H) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32)), sub_32)>;
def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (nxv8i1 PPRAny:$Pg))))),
(!cast<Instruction>(NAME # _H) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32))>;
def : Pat<(i32 (op GPR32:$Rn, (nxv4i1 PPRAny:$Pg))),
(EXTRACT_SUBREG (!cast<Instruction>(NAME # _S) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32)), sub_32)>;
def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (nxv4i1 PPRAny:$Pg))))),
(!cast<Instruction>(NAME # _S) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32))>;
def : Pat<(i32 (op GPR32:$Rn, (nxv2i1 PPRAny:$Pg))),
(EXTRACT_SUBREG (!cast<Instruction>(NAME # _D) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32)), sub_32)>;
def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (nxv2i1 PPRAny:$Pg))))),
(!cast<Instruction>(NAME # _D) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32))>;
// Variants where destination and tied source are both GPR32z, so the i32
// patterns map straight onto the instruction without sub-register copies.
multiclass sve_int_count_r_u32<bits<5> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_count_r<0b00, opc, asm, GPR32z, PPR8, GPR32z>;
def _H : sve_int_count_r<0b01, opc, asm, GPR32z, PPR16, GPR32z>;
def _S : sve_int_count_r<0b10, opc, asm, GPR32z, PPR32, GPR32z>;
def _D : sve_int_count_r<0b11, opc, asm, GPR32z, PPR64, GPR32z>;
// Note the operand swap: the DAG op is (op $Rn, $Pg), the instruction takes
// ($Pg, $Rn).
def : Pat<(i32 (op GPR32:$Rn, (nxv16i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _B) PPRAny:$Pg, $Rn)>;
def : Pat<(i32 (op GPR32:$Rn, (nxv8i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _H) PPRAny:$Pg, $Rn)>;
def : Pat<(i32 (op GPR32:$Rn, (nxv4i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _S) PPRAny:$Pg, $Rn)>;
def : Pat<(i32 (op GPR32:$Rn, (nxv2i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _D) PPRAny:$Pg, $Rn)>;
// Variants where destination and tied source are both GPR64z, with i64
// patterns. `op` defaults to null_frag, so the patterns are optional for
// callers that only need the instruction definitions.
multiclass sve_int_count_r_x64<bits<5> opc, string asm,
SDPatternOperator op = null_frag> {
def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64z>;
def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64z>;
def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64z>;
def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64z>;
// Note the operand swap: the DAG op is (op $Rn, $Pg), the instruction takes
// ($Pg, $Rn).
def : Pat<(i64 (op GPR64:$Rn, (nxv16i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _B) PPRAny:$Pg, $Rn)>;
def : Pat<(i64 (op GPR64:$Rn, (nxv8i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _H) PPRAny:$Pg, $Rn)>;
def : Pat<(i64 (op GPR64:$Rn, (nxv4i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _S) PPRAny:$Pg, $Rn)>;
def : Pat<(i64 (op GPR64:$Rn, (nxv2i1 PPRAny:$Pg))),
(!cast<Instruction>(NAME # _D) PPRAny:$Pg, $Rn)>;
// Vector counterpart of sve_int_count_r: destructive format where vector $Zdn
// (tied to $_Zdn) is combined with predicate $Pm. No selection pattern here.
//   sz8_64 - element-size field, Inst{23-22}.
//   opc    - 5-bit opcode, split across Inst{18-16} and Inst{10-9}.
class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
ZPRRegOp zprty, PPRRegOp pprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, pprty:$Pm),
asm, "\t$Zdn, $Pm",
[]>, Sched<[]> {
bits<4> Pm;
bits<5> Zdn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b101;
let Inst{18-16} = opc{4-2};
let Inst{15-11} = 0b10000;
let Inst{10-9} = opc{1-0};
let Inst{8-5} = Pm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Expands sve_int_count_v for _H/_S/_D only (there is no _B variant), adds the
// matching two-operand selection patterns, and adds assembly aliases that
// accept the predicate as PPRAny instead of the sized predicate class.
multiclass sve_int_count_v<bits<5> opc, string asm,
SDPatternOperator op = null_frag> {
def _H : sve_int_count_v<0b01, opc, asm, ZPR16, PPR16>;
def _S : sve_int_count_v<0b10, opc, asm, ZPR32, PPR32>;
def _D : sve_int_count_v<0b11, opc, asm, ZPR64, PPR64>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, !cast<Instruction>(NAME # _D)>;
// The trailing 0 is the InstAlias emit priority (presumably parse-only, never
// chosen when printing -- confirm against the InstAlias definition).
def : InstAlias<asm # "\t$Zdn, $Pm",
(!cast<Instruction>(NAME # "_H") ZPR16:$Zdn, PPRAny:$Pm), 0>;
def : InstAlias<asm # "\t$Zdn, $Pm",
(!cast<Instruction>(NAME # "_S") ZPR32:$Zdn, PPRAny:$Pm), 0>;
def : InstAlias<asm # "\t$Zdn, $Pm",
(!cast<Instruction>(NAME # "_D") ZPR64:$Zdn, PPRAny:$Pm), 0>;
// Format producing a 64-bit scalar $Rd from a governing predicate $Pg and a
// sized predicate $Pn. No selection pattern here.
//   sz8_64 - element-size field, Inst{23-22}.
//   opc    - 4-bit opcode, split across Inst{18-16} and Inst{9}.
class sve_int_pcount_pred<bits<2> sz8_64, bits<4> opc, string asm,
PPRRegOp pprty>
: I<(outs GPR64:$Rd), (ins PPRAny:$Pg, pprty:$Pn),
asm, "\t$Rd, $Pg, $Pn",
[]>, Sched<[]> {
bits<4> Pg;
bits<4> Pn;
bits<5> Rd;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b100;
let Inst{18-16} = opc{3-1};
let Inst{15-14} = 0b10;
let Inst{13-10} = Pg;
let Inst{9} = opc{0};
let Inst{8-5} = Pn;
let Inst{4-0} = Rd;
// Expands sve_int_pcount_pred for all four element sizes and selects each
// from int_op applied to two predicates of the same nxvNi1 type, yielding i64.
multiclass sve_int_pcount_pred<bits<4> opc, string asm,
SDPatternOperator int_op> {
def _B : sve_int_pcount_pred<0b00, opc, asm, PPR8>;
def _H : sve_int_pcount_pred<0b01, opc, asm, PPR16>;
def _S : sve_int_pcount_pred<0b10, opc, asm, PPR32>;
def _D : sve_int_pcount_pred<0b11, opc, asm, PPR64>;
def : SVE_2_Op_Pat<i64, int_op, nxv16i1, nxv16i1, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<i64, int_op, nxv8i1, nxv8i1, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<i64, int_op, nxv4i1, nxv4i1, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<i64, int_op, nxv2i1, nxv2i1, !cast<Instruction>(NAME # _D)>;
// SVE Element Count Group
// Format producing a 64-bit scalar $Rd from a predicate-pattern immediate and
// a multiplier immediate ("mul $imm4"). No selection pattern here.
//   opc - 3-bit opcode, split across Inst{23-22} and Inst{10}.
class sve_int_count<bits<3> opc, string asm>
: I<(outs GPR64:$Rd), (ins sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
asm, "\t$Rd, $pattern, mul $imm4",
[]>, Sched<[]> {
bits<5> Rd;
bits<4> imm4;
bits<5> pattern;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{2-1};
let Inst{21-20} = 0b10;
let Inst{19-16} = imm4;
let Inst{15-11} = 0b11100;
let Inst{10} = opc{0};
let Inst{9-5} = pattern;
let Inst{4-0} = Rd;
// Defines the instruction under the defm's own NAME plus shorthand aliases and
// selection patterns.
multiclass sve_int_count<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_count<opc, asm>;
// Aliases: omitting the multiplier means 1; omitting the pattern as well
// means pattern 0b11111 with multiplier 1.
def : InstAlias<asm # "\t$Rd, $pattern",
(!cast<Instruction>(NAME) GPR64:$Rd, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rd",
(!cast<Instruction>(NAME) GPR64:$Rd, 0b11111, 1), 2>;
// (mul (op pattern), imm) and (shl (op pattern), imm) fold the scale into the
// instruction's multiplier operand via the sve_cnt_*_imm complex patterns.
def : Pat<(i64 (mul (op sve_pred_enum:$pattern), (sve_cnt_mul_imm i32:$imm))),
(!cast<Instruction>(NAME) sve_pred_enum:$pattern, sve_incdec_imm:$imm)>;
def : Pat<(i64 (shl (op sve_pred_enum:$pattern), (i64 (sve_cnt_shl_imm i32:$imm)))),
(!cast<Instruction>(NAME) sve_pred_enum:$pattern, sve_incdec_imm:$imm)>;
// Plain (op pattern) uses multiplier 1.
def : Pat<(i64 (op sve_pred_enum:$pattern)),
(!cast<Instruction>(NAME) sve_pred_enum:$pattern, 1)>;
// Destructive vector format: $Zdn (tied to $_Zdn) combined with a
// predicate-pattern immediate and a multiplier immediate. No selection
// pattern here.
//   opc - 5-bit opcode, split across Inst{23-22}, Inst{20} and Inst{11-10}.
class sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
asm, "\t$Zdn, $pattern, mul $imm4",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> pattern;
bits<4> imm4;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{4-3};
let Inst{21} = 0b1;
let Inst{20} = opc{2};
let Inst{19-16} = imm4;
let Inst{15-12} = 0b1100;
let Inst{11-10} = opc{1-0};
let Inst{9-5} = pattern;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Defines the instruction under the defm's own NAME, the same two shorthand
// aliases as sve_int_count (multiplier defaults to 1, pattern to 0b11111), and
// a three-operand selection pattern. `op` and `vt` default to null_frag /
// OtherVT for callers that do not supply a pattern.
multiclass sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty,
SDPatternOperator op = null_frag,
ValueType vt = OtherVT> {
def NAME : sve_int_countvlv<opc, asm, zprty>;
def : InstAlias<asm # "\t$Zdn, $pattern",
(!cast<Instruction>(NAME) zprty:$Zdn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Zdn",
(!cast<Instruction>(NAME) zprty:$Zdn, 0b11111, 1), 2>;
def : Pat<(vt (op (vt zprty:$Zn), (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))),
(!cast<Instruction>(NAME) $Zn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4)>;
// Destructive 64-bit scalar format: $Rdn (tied to $_Rdn) combined with a
// predicate-pattern immediate and a multiplier immediate. No selection
// pattern here.
//   opc - 3-bit opcode, split across Inst{23-22} and Inst{10}.
class sve_int_pred_pattern_a<bits<3> opc, string asm>
: I<(outs GPR64:$Rdn), (ins GPR64:$_Rdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
asm, "\t$Rdn, $pattern, mul $imm4",
[]>, Sched<[]> {
bits<5> Rdn;
bits<5> pattern;
bits<4> imm4;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{2-1};
let Inst{21-20} = 0b11;
let Inst{19-16} = imm4;
let Inst{15-11} = 0b11100;
let Inst{10} = opc{0};
let Inst{9-5} = pattern;
let Inst{4-0} = Rdn;
let Constraints = "$Rdn = $_Rdn";
// Instantiates sve_int_pred_pattern_a as NAME plus two shorthand aliases:
// one without the multiplier (immediate fixed to 1) and one that also drops
// the pattern (pattern fixed to 0b11111).
multiclass sve_int_pred_pattern_a<bits<3> opc, string asm> {
def NAME : sve_int_pred_pattern_a<opc, asm>;
def : InstAlias<asm # "\t$Rdn, $pattern",
(!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rdn",
(!cast<Instruction>(NAME) GPR64:$Rdn, 0b11111, 1), 2>;
// Instruction format with a tied scalar operand whose destination (`dt`) and
// source (`st`) register operand classes may differ, a predicate-pattern
// operand and a 4-bit multiplier immediate.
class sve_int_pred_pattern_b<bits<5> opc, string asm, RegisterOperand dt,
RegisterOperand st>
: I<(outs dt:$Rdn), (ins st:$_Rdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
asm, "\t$Rdn, $pattern, mul $imm4",
[]>, Sched<[]> {
bits<5> Rdn;
bits<5> pattern;
bits<4> imm4;
// opc{4-3} -> Inst{23-22}, opc{2} -> Inst{20}, opc{1-0} -> Inst{11-10}.
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{4-3};
let Inst{21} = 0b1;
let Inst{20} = opc{2};
let Inst{19-16} = imm4;
let Inst{15-12} = 0b1111;
let Inst{11-10} = opc{1-0};
let Inst{9-5} = pattern;
let Inst{4-0} = Rdn;
// Signed 32bit forms require their GPR operand printed.
// opc{2,0} picks bits 2 and 0 of opc; when both are zero the tied source
// $_Rdn is printed as a second operand, otherwise the default form is used.
let AsmString = !if(!eq(opc{2,0}, 0b00),
!strconcat(asm, "\t$Rdn, $_Rdn, $pattern, mul $imm4"),
!strconcat(asm, "\t$Rdn, $pattern, mul $imm4"));
// The result register $Rdn is tied to the source operand $_Rdn.
let Constraints = "$Rdn = $_Rdn";
// Variant of sve_int_pred_pattern_b with a GPR64z destination and a GPR64as32
// source. Adds two-register shorthand aliases (multiplier fixed to 1, and
// additionally pattern fixed to 0b11111) and two selection patterns for `op`:
// one yielding the i32 result and one matching the sign-extended i64 result.
multiclass sve_int_pred_pattern_b_s32<bits<5> opc, string asm,
SDPatternOperator op> {
def NAME : sve_int_pred_pattern_b<opc, asm, GPR64z, GPR64as32>;
def : InstAlias<asm # "\t$Rd, $Rn, $pattern",
(!cast<Instruction>(NAME) GPR64z:$Rd, GPR64as32:$Rn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rd, $Rn",
(!cast<Instruction>(NAME) GPR64z:$Rd, GPR64as32:$Rn, 0b11111, 1), 2>;
// NOTE: Register allocation doesn't like tied operands of differing register
// class, hence the extra INSERT_SUBREG complication.
// i32 result: widen $Rn into an undefined 64-bit register, run the
// instruction, then take the low 32 bits (sub_32) of the result.
def : Pat<(i32 (op GPR32:$Rn, (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))),
(EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32), sve_pred_enum:$pattern, sve_incdec_imm:$imm4), sub_32)>;
// sext-to-i64 of the i32 result: the 64-bit instruction output is used as is.
def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))))),
(!cast<Instruction>(NAME) (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32), sve_pred_enum:$pattern, sve_incdec_imm:$imm4)>;
// Variant of sve_int_pred_pattern_b where both destination and source use
// GPR32z. Adds the two shorthand aliases (multiplier fixed to 1; pattern fixed
// to 0b11111) and an i32 selection pattern for `op`.
multiclass sve_int_pred_pattern_b_u32<bits<5> opc, string asm,
SDPatternOperator op> {
def NAME : sve_int_pred_pattern_b<opc, asm, GPR32z, GPR32z>;
def : InstAlias<asm # "\t$Rdn, $pattern",
(!cast<Instruction>(NAME) GPR32z:$Rdn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rdn",
(!cast<Instruction>(NAME) GPR32z:$Rdn, 0b11111, 1), 2>;
def : Pat<(i32 (op GPR32:$Rn, (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))),
(!cast<Instruction>(NAME) $Rn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4)>;
// Variant of sve_int_pred_pattern_b where both destination and source use
// GPR64z. Adds the two shorthand aliases (multiplier fixed to 1; pattern fixed
// to 0b11111) and an i64 selection pattern for `op`.
multiclass sve_int_pred_pattern_b_x64<bits<5> opc, string asm,
SDPatternOperator op> {
def NAME : sve_int_pred_pattern_b<opc, asm, GPR64z, GPR64z>;
def : InstAlias<asm # "\t$Rdn, $pattern",
(!cast<Instruction>(NAME) GPR64z:$Rdn, sve_pred_enum:$pattern, 1), 1>;
def : InstAlias<asm # "\t$Rdn",
(!cast<Instruction>(NAME) GPR64z:$Rdn, 0b11111, 1), 2>;
def : Pat<(i64 (op GPR64:$Rn, (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))),
(!cast<Instruction>(NAME) $Rn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4)>;
// SVE Permute - Cross Lane Group
// Instruction format taking one scalar register ($Rn, class srcRegType) and
// producing a vector register $Zd. The selection pattern is attached inline:
// (op $Rn) of type `vt` selects this instruction.
class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
ValueType vt, RegisterClass srcRegType,
SDPatternOperator op>
: I<(outs zprty:$Zd), (ins srcRegType:$Rn),
asm, "\t$Zd, $Rn",
[(set (vt zprty:$Zd), (op srcRegType:$Rn))]>, Sched<[]> {
bits<5> Rn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
// Element-size selector supplied by the instantiating multiclass.
let Inst{23-22} = sz8_64;
let Inst{21-10} = 0b100000001110;
let Inst{9-5} = Rn;
let Inst{4-0} = Zd;
// Instantiates the _B/_H/_S/_D variants (sz8_64 = 0b00..0b11). The B, H and S
// forms read a GPR32sp source, the D form a GPR64sp source. Each variant also
// gets a "mov $Zd, $Rn" alias.
multiclass sve_int_perm_dup_r<string asm, SDPatternOperator op> {
def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, nxv16i8, GPR32sp, op>;
def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, nxv8i16, GPR32sp, op>;
def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, nxv4i32, GPR32sp, op>;
def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, nxv2i64, GPR64sp, op>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Rn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, GPR64sp:$Rn), 1>;
// Instruction format taking a vector register and an element index, printed
// as "$Zd, $Zn$idx". Inst{23-22} and the unset ('?') bits of `tsz` are left
// open here; the instantiating defs fill them from `idx`.
class sve_int_perm_dup_i<bits<5> tsz, Operand immtype, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$idx),
asm, "\t$Zd, $Zn$idx",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<7> idx;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = {?,?}; // imm3h
let Inst{21} = 0b1;
let Inst{20-16} = tsz;
let Inst{15-10} = 0b001000;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Instantiates the _B/_H/_S/_D/_Q variants. In each `tsz` argument the position
// of the lowest set bit identifies the variant; the '?' bits above it, together
// with Inst{23-22}, are overridden with bits of `idx`, so wider element types
// carry fewer index bits (6 for _B down to 2 for _Q).
multiclass sve_int_perm_dup_i<string asm> {
def _B : sve_int_perm_dup_i<{?,?,?,?,1}, sve_elm_idx_extdup_b, asm, ZPR8> {
let Inst{23-22} = idx{5-4};
let Inst{20-17} = idx{3-0};
def _H : sve_int_perm_dup_i<{?,?,?,1,0}, sve_elm_idx_extdup_h, asm, ZPR16> {
let Inst{23-22} = idx{4-3};
let Inst{20-18} = idx{2-0};
def _S : sve_int_perm_dup_i<{?,?,1,0,0}, sve_elm_idx_extdup_s, asm, ZPR32> {
let Inst{23-22} = idx{3-2};
let Inst{20-19} = idx{1-0};
def _D : sve_int_perm_dup_i<{?,1,0,0,0}, sve_elm_idx_extdup_d, asm, ZPR64> {
let Inst{23-22} = idx{2-1};
let Inst{20} = idx{0};
def _Q : sve_int_perm_dup_i<{1,0,0,0,0}, sve_elm_idx_extdup_q, asm, ZPR128> {
let Inst{23-22} = idx{1-0};
// "mov $Zd, $Zn$idx" aliases: same operands as the instruction itself.
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, sve_elm_idx_extdup_b:$idx), 1>;
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, sve_elm_idx_extdup_h:$idx), 1>;
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, sve_elm_idx_extdup_s:$idx), 1>;
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, sve_elm_idx_extdup_d:$idx), 1>;
def : InstAlias<"mov $Zd, $Zn$idx",
(!cast<Instruction>(NAME # _Q) ZPR128:$Zd, ZPR128:$Zn, sve_elm_idx_extdup_q:$idx), 1>;
// Scalar-register spellings: the source is written as a B/H/S/D/Q register
// (FPR*asZPR operand) and the index is fixed to 0.
def : InstAlias<"mov $Zd, $Bn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, FPR8asZPR:$Bn, 0), 2>;
def : InstAlias<"mov $Zd, $Hn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, FPR16asZPR:$Hn, 0), 2>;
def : InstAlias<"mov $Zd, $Sn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, FPR32asZPR:$Sn, 0), 2>;
def : InstAlias<"mov $Zd, $Dn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, FPR64asZPR:$Dn, 0), 2>;
def : InstAlias<"mov $Zd, $Qn",
(!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>;
// Instruction format with a vector-list operand $Zn (operand class VecList)
// and a vector operand $Zm, producing $Zd. `opc` lands in Inst{12-11} and
// distinguishes the users of this format.
class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string asm,
ZPRRegOp zprty, RegisterOperand VecList>
: I<(outs zprty:$Zd), (ins VecList:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b001;
let Inst{12-11} = opc;
let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Single-register-list form (opc = 0b10, list operands Z_b/Z_h/Z_s/Z_d).
// Besides the four instructions it defines aliases that accept a plain vector
// register in place of the list (InstAlias last argument 0), and selection
// patterns for integer and floating-point data with an integer second operand.
multiclass sve_int_perm_tbl<string asm, SDPatternOperator op> {
def _B : sve_int_perm_tbl<0b00, 0b10, asm, ZPR8, Z_b>;
def _H : sve_int_perm_tbl<0b01, 0b10, asm, ZPR16, Z_h>;
def _S : sve_int_perm_tbl<0b10, 0b10, asm, ZPR32, Z_s>;
def _D : sve_int_perm_tbl<0b11, 0b10, asm, ZPR64, Z_d>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 0>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, ZPR16:$Zm), 0>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 0>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zm), 0>;
// Integer element types.
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Floating-point data; the second operand stays an integer vector of the
// same element width.
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Two-register-list form (opc = 0b01, list operands ZZ_b/ZZ_h/ZZ_s/ZZ_d).
// Each selection pattern takes three operands and packs $Op1 and $Op2 into a
// ZPR2 register tuple with REG_SEQUENCE (sub-registers zsub0 and zsub1) to
// form the instruction's list operand.
// NOTE(review): the closing line of every Pat below is not present in this
// listing; presumably it passes $Op3 as the final operand - confirm against
// the complete file.
multiclass sve2_int_perm_tbl<string asm, SDPatternOperator op> {
def _B : sve_int_perm_tbl<0b00, 0b01, asm, ZPR8, ZZ_b>;
def _H : sve_int_perm_tbl<0b01, 0b01, asm, ZPR16, ZZ_h>;
def _S : sve_int_perm_tbl<0b10, 0b01, asm, ZPR32, ZZ_s>;
def _D : sve_int_perm_tbl<0b11, 0b01, asm, ZPR64, ZZ_d>;
def : Pat<(nxv16i8 (op nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)),
(nxv16i8 (!cast<Instruction>(NAME # _B) (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0,
nxv16i8:$Op2, zsub1),
def : Pat<(nxv8i16 (op nxv8i16:$Op1, nxv8i16:$Op2, nxv8i16:$Op3)),
(nxv8i16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0,
nxv8i16:$Op2, zsub1),
def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv4i32:$Op2, nxv4i32:$Op3)),
(nxv4i32 (!cast<Instruction>(NAME # _S) (REG_SEQUENCE ZPR2, nxv4i32:$Op1, zsub0,
nxv4i32:$Op2, zsub1),
def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv2i64:$Op2, nxv2i64:$Op3)),
(nxv2i64 (!cast<Instruction>(NAME # _D) (REG_SEQUENCE ZPR2, nxv2i64:$Op1, zsub0,
nxv2i64:$Op2, zsub1),
def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8i16:$Op3)),
(nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
nxv8f16:$Op2, zsub1),
def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4i32:$Op3)),
(nxv4f32 (!cast<Instruction>(NAME # _S) (REG_SEQUENCE ZPR2, nxv4f32:$Op1, zsub0,
nxv4f32:$Op2, zsub1),
def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2i64:$Op3)),
(nxv2f64 (!cast<Instruction>(NAME # _D) (REG_SEQUENCE ZPR2, nxv2f64:$Op1, zsub0,
nxv2f64:$Op2, zsub1),
// Instruction format with two vector sources ($Zn, $Zm) and a destination $Zd
// that is also read: $Zd is tied to the extra input $_Zd, which is not printed.
class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b001011;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
// Instantiates the _B/_H/_S/_D variants and three-operand selection patterns
// for `op`. The floating-point patterns keep the last operand as an integer
// vector of the same element width.
multiclass sve2_int_perm_tbx<string asm, SDPatternOperator op> {
def _B : sve2_int_perm_tbx<0b00, asm, ZPR8>;
def _H : sve2_int_perm_tbx<0b01, asm, ZPR16>;
def _S : sve2_int_perm_tbx<0b10, asm, ZPR32>;
def _D : sve2_int_perm_tbx<0b11, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Unary vector-to-vector instruction format: one source $Zn, one result $Zd,
// both of register operand class zprty.
class sve_int_perm_reverse_z<bits<2> sz8_64, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn),
asm, "\t$Zd, $Zn",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-10} = 0b111000001110;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Instantiates the _B/_H/_S/_D variants and one-operand selection patterns for
// `op`; the f16/f32/f64 vector types reuse the _H/_S/_D instructions.
multiclass sve_int_perm_reverse_z<string asm, SDPatternOperator op> {
def _B : sve_int_perm_reverse_z<0b00, asm, ZPR8>;
def _H : sve_int_perm_reverse_z<0b01, asm, ZPR16>;
def _S : sve_int_perm_reverse_z<0b10, asm, ZPR32>;
def _D : sve_int_perm_reverse_z<0b11, asm, ZPR64>;
def : SVE_1_Op_Pat<nxv16i8, op, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Pat<nxv8i16, op, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4i32, op, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2i64, op, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_1_Op_Pat<nxv8f16, op, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4f32, op, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2f64, op, nxv2f64, !cast<Instruction>(NAME # _D)>;
// Unary predicate-to-predicate instruction format: one source $Pn and one
// result $Pd. Predicate register numbers are 4 bits wide, so Inst{4} is a
// fixed zero next to the Pd field.
class sve_int_perm_reverse_p<bits<2> sz8_64, string asm, PPRRegOp pprty>
: I<(outs pprty:$Pd), (ins pprty:$Pn),
asm, "\t$Pd, $Pn",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-9} = 0b1101000100000;
let Inst{8-5} = Pn;
let Inst{4} = 0b0;
let Inst{3-0} = Pd;
// Instantiates the _B/_H/_S/_D predicate variants and one-operand selection
// patterns for `op` on the nxv16i1/nxv8i1/nxv4i1/nxv2i1 predicate types.
multiclass sve_int_perm_reverse_p<string asm, SDPatternOperator op> {
def _B : sve_int_perm_reverse_p<0b00, asm, PPR8>;
def _H : sve_int_perm_reverse_p<0b01, asm, PPR16>;
def _S : sve_int_perm_reverse_p<0b10, asm, PPR32>;
def _D : sve_int_perm_reverse_p<0b11, asm, PPR64>;
def : SVE_1_Op_Pat<nxv16i1, op, nxv16i1, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Pat<nxv8i1, op, nxv8i1, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4i1, op, nxv4i1, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2i1, op, nxv2i1, !cast<Instruction>(NAME # _D)>;
// Unary vector instruction format whose result (zprty1) and source (zprty2)
// use different register operand classes. `opc` lands in Inst{17-16}.
class sve_int_perm_unpk<bits<2> sz16_64, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
asm, "\t$Zd, $Zn",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz16_64;
let Inst{21-18} = 0b1100;
let Inst{17-16} = opc;
let Inst{15-10} = 0b001110;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Instantiates the _H/_S/_D variants; there is no _B form. In each variant the
// result element type is twice as wide as the source element type
// (ZPR16 <- ZPR8, ZPR32 <- ZPR16, ZPR64 <- ZPR32), and the selection patterns
// follow the same pairing.
multiclass sve_int_perm_unpk<bits<2> opc, string asm, SDPatternOperator op> {
def _H : sve_int_perm_unpk<0b01, opc, asm, ZPR16, ZPR8>;
def _S : sve_int_perm_unpk<0b10, opc, asm, ZPR32, ZPR16>;
def _D : sve_int_perm_unpk<0b11, opc, asm, ZPR64, ZPR32>;
def : SVE_1_Op_Pat<nxv8i16, op, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4i32, op, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2i64, op, nxv4i32, !cast<Instruction>(NAME # _D)>;
// Instruction format combining a tied vector operand with a scalar register
// $Rm of class srcRegType; printed as "$Zdn, $Rm".
class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Rm),
asm, "\t$Zdn, $Rm",
[]>, Sched<[]> {
bits<5> Rm;
bits<5> Zdn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-10} = 0b100100001110;
let Inst{9-5} = Rm;
let Inst{4-0} = Zdn;
// The result register $Zdn is tied to the source operand $_Zdn.
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
// Instantiates the _B/_H/_S/_D variants. The B, H and S forms take a GPR32
// scalar (i32 in the patterns); the D form takes a GPR64 scalar (i64).
multiclass sve_int_perm_insrs<string asm, SDPatternOperator op> {
def _B : sve_int_perm_insrs<0b00, asm, ZPR8, GPR32>;
def _H : sve_int_perm_insrs<0b01, asm, ZPR16, GPR32>;
def _S : sve_int_perm_insrs<0b10, asm, ZPR32, GPR32>;
def _D : sve_int_perm_insrs<0b11, asm, ZPR64, GPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, i32, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, i32, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, i64, !cast<Instruction>(NAME # _D)>;
// Instruction format combining a tied vector operand with a scalar register
// $Vm of class srcRegType; printed as "$Zdn, $Vm".
class sve_int_perm_insrv<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Vm),
asm, "\t$Zdn, $Vm",
[]>, Sched<[]> {
bits<5> Vm;
bits<5> Zdn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-10} = 0b110100001110;
let Inst{9-5} = Vm;
let Inst{4-0} = Zdn;
// The result register $Zdn is tied to the source operand $_Zdn.
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
// Instantiates the _B/_H/_S/_D variants with FPR8/FPR16/FPR32/FPR64 scalar
// sources. Selection patterns are provided for the f16/f32/f64 forms only; the
// _B instruction has no pattern here.
multiclass sve_int_perm_insrv<string asm, SDPatternOperator op> {
def _B : sve_int_perm_insrv<0b00, asm, ZPR8, FPR8>;
def _H : sve_int_perm_insrv<0b01, asm, ZPR16, FPR16>;
def _S : sve_int_perm_insrv<0b10, asm, ZPR32, FPR32>;
def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, f16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, f64, !cast<Instruction>(NAME # _D)>;
// SVE Permute - Extract Group
// Byte-vector (ZPR8) instruction format with a tied operand, a second vector
// $Zm and an 8-bit immediate. The tied source is printed explicitly:
// "$Zdn, $_Zdn, $Zm, $imm8".
class sve_int_perm_extract_i<string asm>
: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn, ZPR8:$Zm, imm0_255:$imm8),
asm, "\t$Zdn, $_Zdn, $Zm, $imm8",
"", []>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bits<8> imm8;
// imm8 is split: high five bits in Inst{20-16}, low three in Inst{12-10}.
let Inst{31-21} = 0b00000101001;
let Inst{20-16} = imm8{7-3};
let Inst{15-13} = 0b000;
let Inst{12-10} = imm8{2-0};
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
// The result register $Zdn is tied to the source operand $_Zdn.
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Instantiates sve_int_perm_extract_i as NAME and an immediate-operand
// selection pattern for `op` on nxv16i8 with an imm0_255 immediate.
// NOTE(review): the continuation of the pattern line is not present in this
// listing - confirm against the complete file.
multiclass sve_int_perm_extract_i<string asm, SDPatternOperator op> {
def NAME : sve_int_perm_extract_i<asm>;
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, imm0_255,
// Non-tied counterpart of the format above: the source is a ZZ_b vector-list
// operand and the result $Zd is a separate register, so no Constraints are
// set. The imm8 field split is the same (Inst{20-16} and Inst{12-10}).
class sve2_int_perm_extract_i_cons<string asm>
: I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, imm0_255:$imm8),
asm, "\t$Zd, $Zn, $imm8",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<8> imm8;
let Inst{31-21} = 0b00000101011;
let Inst{20-16} = imm8{7-3};
let Inst{15-13} = 0b000;
let Inst{12-10} = imm8{2-0};
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// SVE Vector Select Group
// Instruction format with a governing predicate $Pg (PPRAny, 4-bit encoding)
// and two vector sources, producing $Zd; printed as "$Zd, $Pg, $Zn, $Zm".
class sve_int_sel_vvv<bits<2> sz8_64, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins PPRAny:$Pg, zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Pg, $Zn, $Zm",
[]>, Sched<[]> {
bits<4> Pg;
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b11;
let Inst{13-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Instantiates the _B/_H/_S/_D variants, selection patterns for integer and
// floating-point vector types (nxv2f32 and nxv2f64 both map to _D), and a
// merging "mov" alias per element size.
multiclass sve_int_sel_vvv<string asm, SDPatternOperator op> {
def _B : sve_int_sel_vvv<0b00, asm, ZPR8>;
def _H : sve_int_sel_vvv<0b01, asm, ZPR16>;
def _S : sve_int_sel_vvv<0b10, asm, ZPR32>;
def _D : sve_int_sel_vvv<0b11, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, nxv2f32, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
// "mov $Zd, $Pg/m, $Zn": the destination register is reused as the last
// source operand ($Zm position), so only $Zd, $Pg and $Zn are written.
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, ZPR8:$Zn, ZPR8:$Zd), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, ZPR16:$Zn, ZPR16:$Zd), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, ZPR32:$Zn, ZPR32:$Zd), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Zn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, ZPR64:$Zn, ZPR64:$Zd), 1>;
// SVE Predicate Logical Operations Group
// Predicate instruction format with a governing predicate $Pg and two
// predicate sources, all encoded in 4-bit fields. The 4-bit opc is scattered:
// opc{3-2} -> Inst{23-22}, opc{1} -> Inst{9}, opc{0} -> Inst{4}.
class sve_int_pred_log<bits<4> opc, string asm>
: I<(outs PPR8:$Pd), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$Pm),
asm, "\t$Pd, $Pg/z, $Pn, $Pm",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pg;
bits<4> Pm;
bits<4> Pn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = opc{3-2};
let Inst{21-20} = 0b00;
let Inst{19-16} = Pm;
let Inst{15-14} = 0b01;
let Inst{13-10} = Pg;
let Inst{9} = opc{1};
let Inst{8-5} = Pn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
// SEL has no predication qualifier.
// For opc == 0b0011 the "/z" suffix on $Pg is dropped from the asm string.
let AsmString = !if(!eq(opc, 0b0011),
!strconcat(asm, "\t$Pd, $Pg, $Pn, $Pm"),
!strconcat(asm, "\t$Pd, $Pg/z, $Pn, $Pm"));
// Encodings with opc{2} set are marked as defining NZCV.
let Defs = !if(!eq (opc{2}, 1), [NZCV], []);
// Instantiates sve_int_pred_log as NAME. The single instruction serves every
// predicate type: `op` takes three predicate operands, while the optional
// `op_nopred` (default null_frag) takes two and is paired with the PTRUE
// instruction of the matching element size through SVE_2_Op_AllActive_Pat.
multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op,
SDPatternOperator op_nopred = null_frag> {
def NAME : sve_int_pred_log<opc, asm>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i1, nxv8i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i1, nxv4i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2i1, nxv2i1, !cast<Instruction>(NAME)>;
def : SVE_2_Op_AllActive_Pat<nxv16i1, op_nopred, nxv16i1, nxv16i1,
!cast<Instruction>(NAME), PTRUE_B>;
def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8i1, nxv8i1,
!cast<Instruction>(NAME), PTRUE_H>;
def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4i1, nxv4i1,
!cast<Instruction>(NAME), PTRUE_S>;
def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2i1, nxv2i1,
!cast<Instruction>(NAME), PTRUE_D>;
// SVE Logical Mask Immediate Group
// Instruction format with a tied ZPR64 operand and a 13-bit encoded logical
// immediate; the tied source is printed explicitly ("$Zdn, $_Zdn, $imms13").
// Decoding is delegated to DecodeSVELogicalImmInstruction.
class sve_int_log_imm<bits<2> opc, string asm>
: I<(outs ZPR64:$Zdn), (ins ZPR64:$_Zdn, logical_imm64:$imms13),
asm, "\t$Zdn, $_Zdn, $imms13",
"", []>, Sched<[]> {
bits<5> Zdn;
bits<13> imms13;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = opc;
let Inst{21-18} = 0b0000;
let Inst{17-5} = imms13;
let Inst{4-0} = Zdn;
// The result register $Zdn is tied to the source operand $_Zdn.
let Constraints = "$Zdn = $_Zdn";
let DecoderMethod = "DecodeSVELogicalImmInstruction";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Instantiates sve_int_log_imm as NAME. All element sizes share that one
// instruction: the selection patterns and aliases differ only in register
// operand class and immediate operand type.
//  * `asm` aliases accept 8/16/32-bit element spellings (emit values 4/3/2).
//  * `alias` is a second mnemonic that takes the *_not immediate operand
//    types; those aliases use emit value 0.
multiclass sve_int_log_imm<bits<2> opc, string asm, string alias, SDPatternOperator op> {
def NAME : sve_int_log_imm<opc, asm>;
def : SVE_1_Op_Imm_Log_Pat<nxv16i8, op, ZPR8, i32, SVELogicalImm8Pat, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Imm_Log_Pat<nxv8i16, op, ZPR16, i32, SVELogicalImm16Pat, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Imm_Log_Pat<nxv4i32, op, ZPR32, i32, SVELogicalImm32Pat, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Imm_Log_Pat<nxv2i64, op, ZPR64, i64, SVELogicalImm64Pat, !cast<Instruction>(NAME)>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR8:$Zdn, sve_logical_imm8:$imm), 4>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR16:$Zdn, sve_logical_imm16:$imm), 3>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR32:$Zdn, sve_logical_imm32:$imm), 2>;
def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR8:$Zdn, sve_logical_imm8_not:$imm), 0>;
def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR16:$Zdn, sve_logical_imm16_not:$imm), 0>;
def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR32:$Zdn, sve_logical_imm32_not:$imm), 0>;
def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
(!cast<Instruction>(NAME) ZPR64:$Zdn, logical_imm64_not:$imm), 0>;
// Instruction format producing a ZPR64 vector from a 13-bit encoded logical
// immediate; it has no register inputs. Marked rematerializable and decoded by
// DecodeSVELogicalImmInstruction.
class sve_int_dup_mask_imm<string asm>
: I<(outs ZPR64:$Zd), (ins logical_imm64:$imms),
asm, "\t$Zd, $imms",
[]>, Sched<[]> {
bits<5> Zd;
bits<13> imms;
let Inst{31-18} = 0b00000101110000;
let Inst{17-5} = imms;
let Inst{4-0} = Zd;
let isReMaterializable = 1;
let DecoderMethod = "DecodeSVELogicalImmInstruction";
// Instantiates sve_int_dup_mask_imm as NAME plus two alias families:
//  * "dupm" spellings for 8/16/32-bit elements (emit values 4/3/2),
//  * "mov" spellings restricted to the sve_preferred_logical_imm* operand
//    types for 16/32/64-bit elements (emit values 7/6/5, i.e. higher than
//    every "dupm" alias).
multiclass sve_int_dup_mask_imm<string asm> {
def NAME : sve_int_dup_mask_imm<asm>;
def : InstAlias<"dupm $Zd, $imm",
(!cast<Instruction>(NAME) ZPR8:$Zd, sve_logical_imm8:$imm), 4>;
def : InstAlias<"dupm $Zd, $imm",
(!cast<Instruction>(NAME) ZPR16:$Zd, sve_logical_imm16:$imm), 3>;
def : InstAlias<"dupm $Zd, $imm",
(!cast<Instruction>(NAME) ZPR32:$Zd, sve_logical_imm32:$imm), 2>;
// All Zd.b forms have a CPY/DUP equivalent, hence no byte alias here.
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME) ZPR16:$Zd, sve_preferred_logical_imm16:$imm), 7>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME) ZPR32:$Zd, sve_preferred_logical_imm32:$imm), 6>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME) ZPR64:$Zd, sve_preferred_logical_imm64:$imm), 5>;
// SVE Integer Arithmetic - Unpredicated Group.
// Unpredicated three-register vector format: $Zd is computed from $Zn and $Zm
// and is not tied to either source. `opc` lands in Inst{12-10}.
class sve_int_bin_cons_arit_0<bits<2> sz8_64, bits<3> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b000;
let Inst{12-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Instantiates the _B/_H/_S/_D variants and two parallel sets of two-operand
// selection patterns that select the same instructions: one for `op` and one
// for `int_op`.
multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm,
SDPatternOperator op, SDPatternOperator int_op> {
def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>;
def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>;
def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>;
def _D : sve_int_bin_cons_arit_0<0b11, opc, asm, ZPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Intrinsic version
def : SVE_2_Op_Pat<nxv16i8, int_op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, int_op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, int_op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, int_op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// SVE Floating Point Arithmetic - Predicated Group
// Predicated format with a tied vector operand and a one-bit immediate `i1`
// (operand type imm_ty). The governing predicate uses PPR3bAny and a 3-bit
// field; the asm string marks it as merging ("$Pg/m").
class sve_fp_2op_i_p_zds<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty,
Operand imm_ty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, imm_ty:$i1),
asm, "\t$Zdn, $Pg/m, $_Zdn, $i1",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bit i1;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b011;
let Inst{18-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-6} = 0b0000;
let Inst{5} = i1;
let Inst{4-0} = Zdn;
// The result register $Zdn is tied to the source operand $_Zdn.
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
// Element size is taken from the register operand class.
let ElementSize = zprty.ElementSize;
// Instantiates the _H/_S/_D variants (sz = 0b01/0b10/0b11); no _B form and no
// selection patterns are defined here.
multiclass sve_fp_2op_i_p_zds<bits<3> opc, string asm, Operand imm_ty> {
def _H : sve_fp_2op_i_p_zds<0b01, opc, asm, ZPR16, imm_ty>;
def _S : sve_fp_2op_i_p_zds<0b10, opc, asm, ZPR32, imm_ty>;
def _D : sve_fp_2op_i_p_zds<0b11, opc, asm, ZPR64, imm_ty>;
// Predicated format with a tied vector operand and a second vector $Zm; the
// governing predicate uses PPR3bAny and a 3-bit field, printed as "$Pg/m".
class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-20} = 0b00;
let Inst{19-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
// The result register $Zdn is tied to the source operand $_Zdn.
let Constraints = "$Zdn = $_Zdn";
// Default destructive type; the multiclass of the same name overrides it.
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
// Instantiates the _H/_S/_D variants with DestructiveInstType overridden to
// `flags`. Each def also mixes in SVEPseudo2Instr (keyed by `Ps` plus the size
// suffix) and SVEInstr2Rev (pairing NAME with `revname`; `isReverseInstr`
// states which side of the pair this is). Three-operand selection patterns for
// `op` follow.
// NOTE(review): the brace closing the `let ... in {` scope is not present in
// this listing.
multiclass sve_fp_2op_p_zds<bits<4> opc, string asm, string Ps,
SDPatternOperator op, DestructiveInstTypeEnum flags,
string revname="", bit isReverseInstr=0> {
let DestructiveInstType = flags in {
def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>,
SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>,
SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>,
SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
// Same instruction format as sve_fp_2op_p_zds, without the pseudo and reverse
// mixins. The selection patterns differ: the last operand is an integer vector
// of the same element width as the floating-point data.
multiclass sve_fp_2op_p_zds_fscale<bits<4> opc, string asm,
SDPatternOperator op> {
def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>;
def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>;
def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Defines the _ZERO_H/_ZERO_S/_ZERO_D pseudo instructions (PredTwoOpPseudo with
// FalseLanesZero) and routes `op` to them through SVE_3_Op_Pat_SelZero.
// No real instruction is defined by this multiclass.
multiclass sve_fp_2op_p_zds_zeroing_hsd<SDPatternOperator op> {
def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;
def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;
def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;
def : SVE_3_Op_Pat_SelZero<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _ZERO_H)>;
def : SVE_3_Op_Pat_SelZero<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _ZERO_S)>;
def : SVE_3_Op_Pat_SelZero<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _ZERO_D)>;
// Unpredicated format with a tied vector operand, a second vector $Zm and a
// 3-bit immediate (imm32_0_7); the tied source is printed explicitly.
class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm32_0_7:$imm3),
asm, "\t$Zdn, $_Zdn, $Zm, $imm3",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bits<3> imm3;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b010;
let Inst{18-16} = imm3;
let Inst{15-10} = 0b100000;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
// The result register $Zdn is tied to the source operand $_Zdn.
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Instantiates the _H/_S/_D variants and explicit selection patterns mapping
// (op $Zn, $Zm, imm) to the instruction, with the immediate as an i32
// imm32_0_7.
multiclass sve_fp_ftmad<string asm, SDPatternOperator op> {
def _H : sve_fp_ftmad<0b01, asm, ZPR16>;
def _S : sve_fp_ftmad<0b10, asm, ZPR32>;
def _D : sve_fp_ftmad<0b11, asm, ZPR64>;
def : Pat<(nxv8f16 (op (nxv8f16 ZPR16:$Zn), (nxv8f16 ZPR16:$Zm), (i32 imm32_0_7:$imm))),
(!cast<Instruction>(NAME # _H) ZPR16:$Zn, ZPR16:$Zm, imm32_0_7:$imm)>;
def : Pat<(nxv4f32 (op (nxv4f32 ZPR32:$Zn), (nxv4f32 ZPR32:$Zm), (i32 imm32_0_7:$imm))),
(!cast<Instruction>(NAME # _S) ZPR32:$Zn, ZPR32:$Zm, imm32_0_7:$imm)>;
def : Pat<(nxv2f64 (op (nxv2f64 ZPR64:$Zn), (nxv2f64 ZPR64:$Zm), (i32 imm32_0_7:$imm))),
(!cast<Instruction>(NAME # _D) ZPR64:$Zn, ZPR64:$Zm, imm32_0_7:$imm)>;
// SVE Floating Point Arithmetic - Unpredicated Group
// Unpredicated three-register vector format: $Zd is computed from $Zn and $Zm
// and is not tied to either source. `opc` lands in Inst{12-10}.
class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b000;
let Inst{12-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Instantiates the _H/_S/_D variants and two-operand selection patterns in
// which both operands and the result share one floating-point vector type.
multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
// Same instructions as sve_fp_3op_u_zd; only the selection patterns differ:
// the second operand is an integer vector of the same element width as the
// floating-point first operand and result.
multiclass sve_fp_3op_u_zd_ftsmul<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// SVE Floating Point Fused Multiply-Add Group
// Predicated four-operand format in which the tied register is $Zda: it is
// the destination and is also read through the unprinted input $_Zda. $Zn and
// $Zm are the other two vector sources. Inst{15} is 0 for this layout.
class sve_fp_3op_p_zds_a<bits<2> sz, bits<2> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm),
asm, "\t$Zda, $Pg/m, $Zn, $Zm",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zda;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-13} = opc;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
// Instantiates the _H/_S/_D variants and four-operand selection patterns
// (predicate plus three floating-point vectors) for `op`.
multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>;
def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>;
def _D : sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>;
def : SVE_4_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_4_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_4_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
// Predicated four-operand format in which the tied register is $Zdn; $Zm and
// $Za are the other two vector sources. Compared with sve_fp_3op_p_zds_a,
// Inst{15} is 1, Inst{20-16} holds Za and Inst{9-5} holds Zm.
class sve_fp_3op_p_zds_b<bits<2> sz, bits<2> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
asm, "\t$Zdn, $Pg/m, $Zm, $Za",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Za;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Za;
let Inst{15} = 0b1;
let Inst{14-13} = opc;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
// The result register $Zdn is tied to the source operand $_Zdn.
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
// Instantiates the _H/_S/_D forms of class sve_fp_3op_p_zds_b and maps `op`
// (predicate + three FP vectors) onto each of them.
multiclass sve_fp_3op_p_zds_b<bits<2> opc, string asm, SDPatternOperator op> {
  def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>;
  def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>;
  def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>;

  def : SVE_4_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
  def : SVE_4_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
  def : SVE_4_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
// SVE Floating Point Multiply-Add - Indexed Group
// Common encoding of an unpredicated, destructive FP operation taking an
// indexed element of $Zm. Inst{20-16} (Zm and the index) is left unset here
// and is filled in by each size-specific def.
class sve_fp_fma_by_indexed_elem<bits<2> sz, bit opc, string asm,
                                 ZPRRegOp zprty1,
                                 ZPRRegOp zprty2, Operand itype>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty1:$Zn, zprty2:$Zm, itype:$iop),
  asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
  bits<5> Zda;
  bits<5> Zn;
  let Inst{31-24} = 0b01100100;
  let Inst{23-22} = sz;
  let Inst{21}    = 0b1;
  let Inst{15-11} = 0;
  let Inst{10}    = opc;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zda;

  let Constraints = "$Zda = $_Zda";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = ElementSizeNone;
}
// Instantiates the _H/_S/_D indexed forms and their selection patterns.
// Each def supplies the Zm/index layout of Inst{20-16} for its element size.
multiclass sve_fp_fma_by_indexed_elem<bit opc, string asm,
                                      SDPatternOperator op> {
  // sz is passed as {0, ?}: the unset low bit (Inst{22}) carries iop{2}.
  def _H : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16, VectorIndexH32b> {
    bits<3> Zm;
    bits<3> iop;
    let Inst{22} = iop{2};
    let Inst{20-19} = iop{1-0};
    let Inst{18-16} = Zm;
  }
  def _S : sve_fp_fma_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR3b32, VectorIndexS32b> {
    bits<3> Zm;
    bits<2> iop;
    let Inst{20-19} = iop;
    let Inst{18-16} = Zm;
  }
  def _D : sve_fp_fma_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR4b64, VectorIndexD32b> {
    bits<4> Zm;
    bit iop;
    let Inst{20} = iop;
    let Inst{19-16} = Zm;
  }

  def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexH32b_timm:$idx))),
            (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexH32b_timm:$idx)>;
  def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexS32b_timm:$idx))),
            (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>;
  def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 VectorIndexD32b_timm:$idx))),
            (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>;
}
// SVE Floating Point Multiply - Indexed Group
// Common encoding of an unpredicated, non-destructive FP operation of $Zn
// with an indexed element of $Zm. Inst{20-16} (Zm and the index) is filled
// in by each size-specific def.
class sve_fp_fmul_by_indexed_elem<bits<2> sz, string asm, ZPRRegOp zprty,
                                  ZPRRegOp zprty2, Operand itype>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty2:$Zm, itype:$iop),
  asm, "\t$Zd, $Zn, $Zm$iop", "", []>, Sched<[]> {
  bits<5> Zd;
  bits<5> Zn;
  let Inst{31-24} = 0b01100100;
  let Inst{23-22} = sz;
  let Inst{21}    = 0b1;
  let Inst{15-10} = 0b001000;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zd;
}
// Instantiates the _H/_S/_D indexed forms and their selection patterns.
// Each def supplies the Zm/index layout of Inst{20-16} for its element size.
multiclass sve_fp_fmul_by_indexed_elem<string asm, SDPatternOperator op> {
  // sz is passed as {0, ?}: the unset low bit (Inst{22}) carries iop{2}.
  def _H : sve_fp_fmul_by_indexed_elem<{0, ?}, asm, ZPR16, ZPR3b16, VectorIndexH32b> {
    bits<3> Zm;
    bits<3> iop;
    let Inst{22} = iop{2};
    let Inst{20-19} = iop{1-0};
    let Inst{18-16} = Zm;
  }
  def _S : sve_fp_fmul_by_indexed_elem<0b10, asm, ZPR32, ZPR3b32, VectorIndexS32b> {
    bits<3> Zm;
    bits<2> iop;
    let Inst{20-19} = iop;
    let Inst{18-16} = Zm;
  }
  def _D : sve_fp_fmul_by_indexed_elem<0b11, asm, ZPR64, ZPR4b64, VectorIndexD32b> {
    bits<4> Zm;
    bit iop;
    let Inst{20} = iop;
    let Inst{19-16} = Zm;
  }

  def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, (i32 VectorIndexH32b_timm:$idx))),
            (!cast<Instruction>(NAME # _H) $Op1, $Op2, VectorIndexH32b_timm:$idx)>;
  def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, (i32 VectorIndexS32b_timm:$idx))),
            (!cast<Instruction>(NAME # _S) $Op1, $Op2, VectorIndexS32b_timm:$idx)>;
  def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, (i32 VectorIndexD32b_timm:$idx))),
            (!cast<Instruction>(NAME # _D) $Op1, $Op2, VectorIndexD32b_timm:$idx)>;
}
// SVE Floating Point Complex Multiply-Add Group
// Encoding of a predicated ($Pg/m), destructive FP instruction with a 2-bit
// immediate in Inst{14-13}. $Zda is tied to $_Zda.
// NOTE(review): the trailing `$imm` operand of the `ins` list was missing in
// this copy; it is restored as complexrotateop, which is the operand class
// the selection patterns of the same-named multiclass pass for $imm.
// Confirm against upstream.
class sve_fp_fcmla<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm,
                        complexrotateop:$imm),
  asm, "\t$Zda, $Pg/m, $Zn, $Zm, $imm",
  "", []>, Sched<[]> {
  bits<5> Zda;
  bits<3> Pg;
  bits<5> Zn;
  bits<5> Zm;
  bits<2> imm;
  let Inst{31-24} = 0b01100100;
  let Inst{23-22} = sz;
  let Inst{21}    = 0;
  let Inst{20-16} = Zm;
  let Inst{15}    = 0;
  let Inst{14-13} = imm;
  let Inst{12-10} = Pg;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zda;

  let Constraints = "$Zda = $_Zda";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = zprty.ElementSize;
}
// Instantiates the _H/_S/_D forms of class sve_fp_fcmla and selects `op`
// (predicate, three FP vectors, i32 complexrotateop immediate) onto them.
multiclass sve_fp_fcmla<string asm, SDPatternOperator op> {
  def _H : sve_fp_fcmla<0b01, asm, ZPR16>;
  def _S : sve_fp_fcmla<0b10, asm, ZPR32>;
  def _D : sve_fp_fcmla<0b11, asm, ZPR64>;

  def : Pat<(nxv8f16 (op nxv8i1:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, nxv8f16:$Op4, (i32 complexrotateop:$imm))),
            (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, $Op4, complexrotateop:$imm)>;
  def : Pat<(nxv4f32 (op nxv4i1:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, nxv4f32:$Op4, (i32 complexrotateop:$imm))),
            (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, $Op4, complexrotateop:$imm)>;
  def : Pat<(nxv2f64 (op nxv2i1:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, nxv2f64:$Op4, (i32 complexrotateop:$imm))),
            (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, $Op4, complexrotateop:$imm)>;
}
// SVE Floating Point Complex Multiply-Add - Indexed Group
// Common encoding of the unpredicated, destructive indexed form with a 2-bit
// immediate in Inst{11-10}. Inst{20-16} (Zm and the index) is filled in by
// each size-specific def.
// NOTE(review): the trailing `$imm` operand of the `ins` list was missing in
// this copy; it is restored as complexrotateop, which is the operand class
// the selection patterns of the same-named multiclass pass for $imm.
// Confirm against upstream.
class sve_fp_fcmla_by_indexed_elem<bits<2> sz, string asm,
                                   ZPRRegOp zprty,
                                   ZPRRegOp zprty2, Operand itype>
: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty2:$Zm, itype:$iop,
                        complexrotateop:$imm),
  asm, "\t$Zda, $Zn, $Zm$iop, $imm",
  "", []>, Sched<[]> {
  bits<5> Zda;
  bits<5> Zn;
  bits<2> imm;
  let Inst{31-24} = 0b01100100;
  let Inst{23-22} = sz;
  let Inst{21}    = 0b1;
  let Inst{15-12} = 0b0001;
  let Inst{11-10} = imm;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zda;

  let Constraints = "$Zda = $_Zda";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = ElementSizeNone;
}
// Instantiates the _H and _S indexed forms and their selection patterns.
// _H uses a 2-bit index with a 3-bit Zm; _S uses a 1-bit index with a 4-bit Zm.
multiclass sve_fp_fcmla_by_indexed_elem<string asm, SDPatternOperator op> {
  def _H : sve_fp_fcmla_by_indexed_elem<0b10, asm, ZPR16, ZPR3b16, VectorIndexS32b> {
    bits<3> Zm;
    bits<2> iop;
    let Inst{20-19} = iop;
    let Inst{18-16} = Zm;
  }
  def _S : sve_fp_fcmla_by_indexed_elem<0b11, asm, ZPR32, ZPR4b32, VectorIndexD32b> {
    bits<4> Zm;
    bits<1> iop;
    let Inst{20} = iop;
    let Inst{19-16} = Zm;
  }

  def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))),
            (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>;
  def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))),
            (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>;
}
// SVE Floating Point Complex Addition Group
// Encoding of a predicated ($Pg/m), destructive FP instruction with a 1-bit
// immediate in Inst{16}. $Zdn is tied to $_Zdn.
// NOTE(review): the trailing `$imm` operand of the `ins` list was missing in
// this copy; it is restored as complexrotateopodd, which is the operand class
// the selection patterns of the same-named multiclass pass for $imm.
// Confirm against upstream.
class sve_fp_fcadd<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm,
                        complexrotateopodd:$imm),
  asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm, $imm",
  "",
  []>, Sched<[]> {
  bits<5> Zdn;
  bits<5> Zm;
  bits<3> Pg;
  bit imm;
  let Inst{31-24} = 0b01100100;
  let Inst{23-22} = sz;
  let Inst{21-17} = 0;
  let Inst{16}    = imm;
  let Inst{15-13} = 0b100;
  let Inst{12-10} = Pg;
  let Inst{9-5}   = Zm;
  let Inst{4-0}   = Zdn;

  let Constraints = "$Zdn = $_Zdn";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = zprty.ElementSize;
}
// Instantiates the _H/_S/_D forms of class sve_fp_fcadd and selects `op`
// (predicate, two FP vectors, i32 complexrotateopodd immediate) onto them.
multiclass sve_fp_fcadd<string asm, SDPatternOperator op> {
  def _H : sve_fp_fcadd<0b01, asm, ZPR16>;
  def _S : sve_fp_fcadd<0b10, asm, ZPR32>;
  def _D : sve_fp_fcadd<0b11, asm, ZPR64>;

  def : Pat<(nxv8f16 (op nxv8i1:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 complexrotateopodd:$imm))),
            (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, complexrotateopodd:$imm)>;
  def : Pat<(nxv4f32 (op nxv4i1:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 complexrotateopodd:$imm))),
            (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, complexrotateopodd:$imm)>;
  def : Pat<(nxv2f64 (op nxv2i1:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 complexrotateopodd:$imm))),
            (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, complexrotateopodd:$imm)>;
}
// SVE2 Floating Point Convert Group
// Encoding of a predicated ($Pg/m) conversion between two vector register
// classes. The 4-bit `opc` is split: opc{3-2} goes to Inst{23-22} and
// opc{1-0} to Inst{17-16}. $Zd is tied to $_Zd.
class sve2_fp_convert_precision<bits<4> opc, string asm,
                                ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, PPR3bAny:$Pg, zprty2:$Zn),
  asm, "\t$Zd, $Pg/m, $Zn",
  "",
  []>, Sched<[]> {
  bits<5> Zd;
  bits<5> Zn;
  bits<3> Pg;
  let Inst{31-24} = 0b01100100;
  let Inst{23-22} = opc{3-2};
  let Inst{21-18} = 0b0010;
  let Inst{17-16} = opc{1-0};
  let Inst{15-13} = 0b101;
  let Inst{12-10} = Pg;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zd;

  let Constraints = "$Zd = $_Zd";
}
// Defines the _StoH and _DtoS conversions. `op` is a name prefix: the pattern
// operators are looked up as `op # _f16f32` and `op # _f32f64`.
multiclass sve2_fp_convert_down_narrow<string asm, string op> {
  def _StoH : sve2_fp_convert_precision<0b1000, asm, ZPR16, ZPR32>;
  def _DtoS : sve2_fp_convert_precision<0b1110, asm, ZPR32, ZPR64>;

  def : SVE_3_Op_Pat<nxv8f16, !cast<SDPatternOperator>(op # _f16f32), nxv8f16, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _StoH)>;
  def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
}
// Defines the _HtoS and _StoD conversions. `op` is a name prefix: the pattern
// operators are looked up as `op # _f32f16` and `op # _f64f32`.
multiclass sve2_fp_convert_up_long<string asm, string op> {
  def _HtoS : sve2_fp_convert_precision<0b1001, asm, ZPR32, ZPR16>;
  def _StoD : sve2_fp_convert_precision<0b1111, asm, ZPR64, ZPR32>;

  def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>;
  def : SVE_3_Op_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>;
}
// Defines the single _DtoS conversion (opc 0b0010) and its pattern; the
// pattern operator is looked up as `op # _f32f64`.
multiclass sve2_fp_convert_down_odd_rounding_top<string asm, string op> {
  def _DtoS : sve2_fp_convert_precision<0b0010, asm, ZPR32, ZPR64>;

  def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
}
// SVE2 Floating Point Pairwise Group
// Encoding of a predicated ($Pg/m), destructive two-source FP instruction.
// `sz` fills Inst{23-22} and `opc` fills Inst{18-16}; $Zdn is tied to $_Zdn.
class sve2_fp_pairwise_pred<bits<2> sz, bits<3> opc, string asm,
                            ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
  asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
  "",
  []>, Sched<[]> {
  bits<3> Pg;
  bits<5> Zm;
  bits<5> Zdn;
  let Inst{31-24} = 0b01100100;
  let Inst{23-22} = sz;
  let Inst{21-19} = 0b010;
  let Inst{18-16} = opc;
  let Inst{15-13} = 0b100;
  let Inst{12-10} = Pg;
  let Inst{9-5}   = Zm;
  let Inst{4-0}   = Zdn;

  let Constraints = "$Zdn = $_Zdn";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = zprty.ElementSize;
}
// Instantiates the _H/_S/_D forms of class sve2_fp_pairwise_pred and maps
// `op` (predicate + two FP vectors) onto each of them.
multiclass sve2_fp_pairwise_pred<bits<3> opc, string asm, SDPatternOperator op> {
  def _H : sve2_fp_pairwise_pred<0b01, opc, asm, ZPR16>;
  def _S : sve2_fp_pairwise_pred<0b10, opc, asm, ZPR32>;
  def _D : sve2_fp_pairwise_pred<0b11, opc, asm, ZPR64>;

  def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
  def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
  def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
// SVE2 Floating Point Widening Multiply-Add - Indexed Group
// Encoding of a destructive instruction with a ZPR32 accumulator, a ZPR16
// source and an indexed ZPR3b16 element. The 3-bit index is split across
// Inst{20-19} (iop{2-1}) and Inst{11} (iop{0}); `opc` is split across
// Inst{13} and Inst{10}.
// NOTE(review): the trailing `$iop` operand of the `ins` list was missing in
// this copy; it is restored as VectorIndexH32b to match the 3-bit index and
// the VectorIndexH32b_timm pattern of the same-named multiclass. Confirm
// against upstream.
class sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm,
                        VectorIndexH32b:$iop),
  asm, "\t$Zda, $Zn, $Zm$iop",
  "",
  []>, Sched<[]> {
  bits<5> Zda;
  bits<5> Zn;
  bits<3> Zm;
  bits<3> iop;
  let Inst{31-21} = 0b01100100101;
  let Inst{20-19} = iop{2-1};
  let Inst{18-16} = Zm;
  let Inst{15-14} = 0b01;
  let Inst{13}    = opc{1};
  let Inst{12}    = 0b0;
  let Inst{11}    = iop{0};
  let Inst{10}    = opc{0};
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zda;

  let Constraints = "$Zda = $_Zda";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = ElementSizeNone;
}
// Defines the single (unsuffixed) instruction and selects `op` (nxv4f32
// accumulator, two nxv8f16 sources, i32 VectorIndexH32b_timm index) onto it.
multiclass sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm,
                                            SDPatternOperator op> {
  def NAME : sve2_fp_mla_long_by_indexed_elem<opc, asm>;
  def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8f16, nxv8f16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME)>;
}
// SVE2 Floating Point Widening Multiply-Add Group
// Encoding of a destructive instruction with a ZPR32 accumulator and two
// ZPR16 sources. `opc` is split across Inst{13} (opc{1}) and Inst{10}
// (opc{0}); $Zda is tied to $_Zda.
class sve2_fp_mla_long<bits<2> opc, string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
  asm, "\t$Zda, $Zn, $Zm",
  "",
  []>, Sched<[]> {
  bits<5> Zda;
  bits<5> Zn;
  bits<5> Zm;
  let Inst{31-21} = 0b01100100101;
  let Inst{20-16} = Zm;
  let Inst{15-14} = 0b10;
  let Inst{13}    = opc{1};
  let Inst{12-11} = 0b00;
  let Inst{10}    = opc{0};
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zda;

  let Constraints = "$Zda = $_Zda";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = ElementSizeNone;
}
// Defines the single (unsuffixed) instruction and selects `op` (nxv4f32
// accumulator, two nxv8f16 sources) onto it.
multiclass sve2_fp_mla_long<bits<2> opc, string asm, SDPatternOperator op> {
  def NAME : sve2_fp_mla_long<opc, asm>;
  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8f16, nxv8f16, !cast<Instruction>(NAME)>;
}
// SVE Stack Allocation Group
// Encoding of an instruction taking a GPR64sp source $Rn and a 6-bit
// simm6_32b immediate and writing a GPR64sp destination $Rd. `opc` selects
// the variant via Inst{22}.
class sve_int_arith_vl<bit opc, string asm>
: I<(outs GPR64sp:$Rd), (ins GPR64sp:$Rn, simm6_32b:$imm6),
  asm, "\t$Rd, $Rn, $imm6",
  "",
  []>, Sched<[]> {
  bits<5> Rd;
  bits<5> Rn;
  bits<6> imm6;
  let Inst{31-23} = 0b000001000;
  let Inst{22}    = opc;
  let Inst{21}    = 0b1;
  let Inst{20-16} = Rn;
  let Inst{15-11} = 0b01010;
  let Inst{10-5}  = imm6;
  let Inst{4-0}   = Rd;
}
// Encoding of an instruction taking only a 6-bit simm6_32b immediate and
// writing a GPR64 destination $Rd. `op` fills Inst{22} and `opc2` fills
// Inst{20-16}.
class sve_int_read_vl_a<bit op, bits<5> opc2, string asm>
: I<(outs GPR64:$Rd), (ins simm6_32b:$imm6),
  asm, "\t$Rd, $imm6",
  "",
  []>, Sched<[]> {
  bits<5> Rd;
  bits<6> imm6;
  let Inst{31-23} = 0b000001001;
  let Inst{22}    = op;
  let Inst{21}    = 0b1;
  let Inst{20-16} = opc2{4-0};
  let Inst{15-11} = 0b01010;
  let Inst{10-5}  = imm6;
  let Inst{4-0}   = Rd;
}
// SVE Permute - In Lane Group
// Encoding of an unpredicated, non-destructive two-source vector instruction.
// `sz8_64` fills Inst{23-22} and `opc` fills Inst{12-10}.
class sve_int_perm_bin_perm_zz<bits<3> opc, bits<2> sz8_64, string asm,
                               ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
  asm, "\t$Zd, $Zn, $Zm",
  "",
  []>, Sched<[]> {
  bits<5> Zd;
  bits<5> Zm;
  bits<5> Zn;
  let Inst{31-24} = 0b00000101;
  let Inst{23-22} = sz8_64;
  let Inst{21}    = 0b1;
  let Inst{20-16} = Zm;
  let Inst{15-13} = 0b011;
  let Inst{12-10} = opc;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zd;
}
// Instantiates the _B/_H/_S/_D forms and maps `op` onto them for both integer
// and FP vector types. FP types reuse the instruction of the matching
// register class; nxv4f16 is mapped to the _S instruction.
multiclass sve_int_perm_bin_perm_zz<bits<3> opc, string asm,
                                    SDPatternOperator op> {
  def _B : sve_int_perm_bin_perm_zz<opc, 0b00, asm, ZPR8>;
  def _H : sve_int_perm_bin_perm_zz<opc, 0b01, asm, ZPR16>;
  def _S : sve_int_perm_bin_perm_zz<opc, 0b10, asm, ZPR32>;
  def _D : sve_int_perm_bin_perm_zz<opc, 0b11, asm, ZPR64>;

  def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
  def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
  def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
  def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;

  def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
  def : SVE_2_Op_Pat<nxv4f16, op, nxv4f16, nxv4f16, !cast<Instruction>(NAME # _S)>;
  def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
  def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
// SVE Floating Point Unary Operations Group
// Encoding of a predicated ($Pg/m) unary instruction with separate input and
// output register operand classes. The 7-bit `opc` is split: opc{6-5} goes to
// Inst{23-22} and opc{4-0} to Inst{20-16}. $Zd is tied to $_Zd, and the
// element size is supplied explicitly via `size`.
class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype,
                      RegisterOperand o_zprtype, ElementSizeEnum size>
: I<(outs o_zprtype:$Zd), (ins i_zprtype:$_Zd, PPR3bAny:$Pg, i_zprtype:$Zn),
  asm, "\t$Zd, $Pg/m, $Zn",
  "",
  []>, Sched<[]> {
  bits<3> Pg;
  bits<5> Zd;
  bits<5> Zn;
  let Inst{31-24} = 0b01100101;
  let Inst{23-22} = opc{6-5};
  let Inst{21}    = 0b0;
  let Inst{20-16} = opc{4-0};
  let Inst{15-13} = 0b101;
  let Inst{12-10} = Pg;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zd;

  let Constraints = "$Zd = $_Zd";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = size;
}
// Defines one (unsuffixed) sve_fp_2op_p_zd instruction together with a single
// selection pattern whose result type is vt1 and whose operand types are
// (vt1, vt2, vt3).
multiclass sve_fp_2op_p_zd<bits<7> opc, string asm,
                           RegisterOperand i_zprtype,
                           RegisterOperand o_zprtype,
                           SDPatternOperator op, ValueType vt1,
                           ValueType vt2, ValueType vt3, ElementSizeEnum Sz> {
  def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>;

  def : SVE_3_Op_Pat<vt1, op, vt1, vt2, vt3, !cast<Instruction>(NAME)>;
}
// Instantiates the _H/_S/_D forms of sve_fp_2op_p_zd. The 7-bit opcode is the
// 2-bit size prefix (0b01/0b10/0b11) concatenated with the 5-bit `opc`.
multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm, SDPatternOperator op> {
  def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>;
  def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>;
  def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>;

  def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
  def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
// Instantiates the _H/_S/_D forms with fixed 7-bit opcodes. The patterns
// produce integer vectors (passthru and result) from an FP source vector.
multiclass sve2_fp_flogb<string asm, SDPatternOperator op> {
  def _H : sve_fp_2op_p_zd<0b0011010, asm, ZPR16, ZPR16, ElementSizeH>;
  def _S : sve_fp_2op_p_zd<0b0011100, asm, ZPR32, ZPR32, ElementSizeS>;
  def _D : sve_fp_2op_p_zd<0b0011110, asm, ZPR64, ZPR64, ElementSizeD>;

  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
// Defines the single _DtoS instruction (ZPR64 input class, ZPR32 output
// class) and its pattern; the pattern operator is looked up as
// `op # _f32f64`.
multiclass sve2_fp_convert_down_odd_rounding<string asm, string op> {
  def _DtoS : sve_fp_2op_p_zd<0b0001010, asm, ZPR64, ZPR32, ElementSizeD>;

  def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
}
// SVE Floating Point Unary Operations - Unpredicated Group
// Encoding of an unpredicated, non-destructive unary vector instruction.
// `sz` fills Inst{23-22} and `opc` fills Inst{18-16}.
class sve_fp_2op_u_zd<bits<2> sz, bits<3> opc, string asm,
                      ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn),
  asm, "\t$Zd, $Zn",
  "",
  []>, Sched<[]> {
  bits<5> Zd;
  bits<5> Zn;
  let Inst{31-24} = 0b01100101;
  let Inst{23-22} = sz;
  let Inst{21-19} = 0b001;
  let Inst{18-16} = opc;
  let Inst{15-10} = 0b001100;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zd;
}
// Instantiates the _H/_S/_D forms of class sve_fp_2op_u_zd and maps the
// single-operand `op` onto each of them.
multiclass sve_fp_2op_u_zd<bits<3> opc, string asm, SDPatternOperator op> {
  def _H : sve_fp_2op_u_zd<0b01, opc, asm, ZPR16>;
  def _S : sve_fp_2op_u_zd<0b10, opc, asm, ZPR32>;
  def _D : sve_fp_2op_u_zd<0b11, opc, asm, ZPR64>;

  def : SVE_1_Op_Pat<nxv8f16, op, nxv8f16, !cast<Instruction>(NAME # _H)>;
  def : SVE_1_Op_Pat<nxv4f32, op, nxv4f32, !cast<Instruction>(NAME # _S)>;
  def : SVE_1_Op_Pat<nxv2f64, op, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
// SVE Integer Arithmetic - Binary Predicated Group
// Shared encoding of the predicated ($Pg/m), destructive integer binary
// instructions. `fmt` (Inst{20-19}) selects the sub-group and `opc`
// (Inst{18-16}) the operation within it; $Zdn is tied to $_Zdn.
class sve_int_bin_pred_arit_log<bits<2> sz8_64, bits<2> fmt, bits<3> opc,
                                string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
  asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", "", []>, Sched<[]> {
  bits<3> Pg;
  bits<5> Zdn;
  bits<5> Zm;
  let Inst{31-24} = 0b00000100;
  let Inst{23-22} = sz8_64;
  let Inst{21}    = 0b0;
  let Inst{20-19} = fmt;
  let Inst{18-16} = opc;
  let Inst{15-13} = 0b000;
  let Inst{12-10} = Pg;
  let Inst{9-5}   = Zm;
  let Inst{4-0}   = Zdn;

  let Constraints = "$Zdn = $_Zdn";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = zprty.ElementSize;
}
// Instantiates the _B/_H/_S/_D forms with fmt = 0b11 and maps `op`
// (predicate + two integer vectors) onto each of them.
multiclass sve_int_bin_pred_log<bits<3> opc, string asm, SDPatternOperator op> {
  def _B : sve_int_bin_pred_arit_log<0b00, 0b11, opc, asm, ZPR8>;
  def _H : sve_int_bin_pred_arit_log<0b01, 0b11, opc, asm, ZPR16>;
  def _S : sve_int_bin_pred_arit_log<0b10, 0b11, opc, asm, ZPR32>;
  def _D : sve_int_bin_pred_arit_log<0b11, 0b11, opc, asm, ZPR64>;

  def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
// Instantiates the _B/_H/_S/_D forms with fmt = 0b00. Each def overrides
// DestructiveInstType with `flags` and is additionally tagged with
// SVEPseudo2Instr (pseudo name prefix `Ps`) and SVEInstr2Rev (`revname`,
// `isReverseInstr`).
multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm, string Ps,
                                   SDPatternOperator op,
                                   DestructiveInstTypeEnum flags,
                                   string revname="", bit isReverseInstr=0> {
  let DestructiveInstType = flags in {
  def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>,
           SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>;
  def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>,
           SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
  def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>,
           SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
  def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>,
           SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
  }

  def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
// Instantiates the _B/_H/_S/_D forms with fmt = 0b01 and maps `op`
// (predicate + two integer vectors) onto each of them.
multiclass sve_int_bin_pred_arit_1<bits<3> opc, string asm, SDPatternOperator op> {
  def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>;
  def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>;
  def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>;
  def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>;

  def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
// Instantiates the _B/_H/_S/_D forms with fmt = 0b10 and maps `op`
// (predicate + two integer vectors) onto each of them.
multiclass sve_int_bin_pred_arit_2<bits<3> opc, string asm, SDPatternOperator op> {
  def _B : sve_int_bin_pred_arit_log<0b00, 0b10, opc, asm, ZPR8>;
  def _H : sve_int_bin_pred_arit_log<0b01, 0b10, opc, asm, ZPR16>;
  def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>;
  def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>;

  def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
// Special case for divides which are not defined for 8b/16b elements.
// Only the _S and _D forms (fmt = 0b10) are instantiated. Each def overrides
// DestructiveInstType with `flags` and is tagged with SVEPseudo2Instr and
// SVEInstr2Rev, mirroring sve_int_bin_pred_arit_0.
multiclass sve_int_bin_pred_arit_2_div<bits<3> opc, string asm, string Ps,
                                       SDPatternOperator op,
                                       DestructiveInstTypeEnum flags,
                                       string revname="", bit isReverseInstr=0> {
  let DestructiveInstType = flags in {
  def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>,
           SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
  def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>,
           SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
  }

  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
// SVE Integer Multiply-Add Group
// Encoding of a predicated ($Pg/m), destructive three-source integer
// instruction whose destination $Zdn is tied to $_Zdn. Zm sits in
// Inst{20-16}, Za in Inst{9-5}, and Inst{15-14} is fixed to 0b11.
class sve_int_mladdsub_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
                                ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
  asm, "\t$Zdn, $Pg/m, $Zm, $Za",
  "",
  []>, Sched<[]> {
  bits<3> Pg;
  bits<5> Zdn;
  bits<5> Za;
  bits<5> Zm;
  let Inst{31-24} = 0b00000100;
  let Inst{23-22} = sz8_64;
  let Inst{21}    = 0b0;
  let Inst{20-16} = Zm;
  let Inst{15-14} = 0b11;
  let Inst{13}    = opc;
  let Inst{12-10} = Pg;
  let Inst{9-5}   = Za;
  let Inst{4-0}   = Zdn;

  let Constraints = "$Zdn = $_Zdn";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = zprty.ElementSize;
}
// Instantiates the _B/_H/_S/_D forms of class sve_int_mladdsub_vvv_pred and
// maps `op` (predicate + three integer vectors) onto each of them.
multiclass sve_int_mladdsub_vvv_pred<bits<1> opc, string asm, SDPatternOperator op> {
  def _B : sve_int_mladdsub_vvv_pred<0b00, opc, asm, ZPR8>;
  def _H : sve_int_mladdsub_vvv_pred<0b01, opc, asm, ZPR16>;
  def _S : sve_int_mladdsub_vvv_pred<0b10, opc, asm, ZPR32>;
  def _D : sve_int_mladdsub_vvv_pred<0b11, opc, asm, ZPR64>;

  def : SVE_4_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
  def : SVE_4_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
  def : SVE_4_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
  def : SVE_4_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
// Encoding of a predicated ($Pg/m), destructive three-source integer
// instruction whose destination $Zda is tied to $_Zda. Zm sits in
// Inst{20-16}, Zn in Inst{9-5}, and Inst{15-14} is fixed to 0b01.
class sve_int_mlas_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
                            ZPRRegOp zprty>
: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm),
  asm, "\t$Zda, $Pg/m, $Zn, $Zm",
  "",
  []>, Sched<[]> {
  bits<3> Pg;
  bits<5> Zda;
  bits<5> Zm;
  bits<5> Zn;
  let Inst{31-24} = 0b00000100;
  let Inst{23-22} = sz8_64;
  let Inst{21}    = 0b0;
  let Inst{20-16} = Zm;
  let Inst{15-14} = 0b01;
  let Inst{13}    = opc;
  let Inst{12-10} = Pg;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zda;

  let Constraints = "$Zda = $_Zda";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = zprty.ElementSize;
}
// Instantiates the _B/_H/_S/_D forms of class sve_int_mlas_vvv_pred and maps
// `op` (predicate + three integer vectors) onto each of them.
multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm, SDPatternOperator op> {
  def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>;
  def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>;
  def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>;
  def _D : sve_int_mlas_vvv_pred<0b11, opc, asm, ZPR64>;

  def : SVE_4_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
  def : SVE_4_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
  def : SVE_4_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
  def : SVE_4_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
// SVE2 Integer Multiply-Add - Unpredicated Group
// Encoding of an unpredicated, destructive three-source integer instruction.
// The accumulator uses zprty1 and both sources use zprty2, so the same class
// serves same-width and widening forms. `opc` fills Inst{14-10}.
class sve2_int_mla<bits<2> sz, bits<5> opc, string asm,
                   ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm),
  asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
  bits<5> Zda;
  bits<5> Zn;
  bits<5> Zm;
  let Inst{31-24} = 0b01000100;
  let Inst{23-22} = sz;
  let Inst{21}    = 0b0;
  let Inst{20-16} = Zm;
  let Inst{15}    = 0b0;
  let Inst{14-10} = opc;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zda;

  let Constraints = "$Zda = $_Zda";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = ElementSizeNone;
}
// Instantiates the same-width _B/_H/_S/_D forms. The 5-bit opcode is 0b1110
// concatenated with the `S` bit.
multiclass sve2_int_mla<bit S, string asm, SDPatternOperator op> {
  def _B : sve2_int_mla<0b00, { 0b1110, S }, asm, ZPR8, ZPR8>;
  def _H : sve2_int_mla<0b01, { 0b1110, S }, asm, ZPR16, ZPR16>;
  def _S : sve2_int_mla<0b10, { 0b1110, S }, asm, ZPR32, ZPR32>;
  def _D : sve2_int_mla<0b11, { 0b1110, S }, asm, ZPR64, ZPR64>;

  def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
// Instantiates the widening _H/_S/_D forms: the accumulator register class is
// twice the element width of the two source register classes.
multiclass sve2_int_mla_long<bits<5> opc, string asm, SDPatternOperator op> {
  def _H : sve2_int_mla<0b01, opc, asm, ZPR16, ZPR8>;
  def _S : sve2_int_mla<0b10, opc, asm, ZPR32, ZPR16>;
  def _D : sve2_int_mla<0b11, opc, asm, ZPR64, ZPR32>;

  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
// SVE2 Integer Multiply-Add - Indexed Group
// Common encoding of an unpredicated, destructive integer operation taking an
// indexed element of $Zm. `opc` fills Inst{15-10}; Inst{20-16} (Zm and the
// index) is filled in by each size-specific def.
class sve2_int_mla_by_indexed_elem<bits<2> sz, bits<6> opc, string asm,
                                   ZPRRegOp zprty1, ZPRRegOp zprty2,
                                   ZPRRegOp zprty3, Operand itype>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop),
  asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
  bits<5> Zda;
  bits<5> Zn;
  let Inst{31-24} = 0b01000100;
  let Inst{23-22} = sz;
  let Inst{21}    = 0b1;
  let Inst{15-10} = opc;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zda;

  let Constraints = "$Zda = $_Zda";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = ElementSizeNone;
}
// Instantiates the same-width _H/_S/_D indexed forms and their patterns.
// The 6-bit opcode is 0b000 concatenated with `opc` and `S`.
multiclass sve2_int_mla_by_indexed_elem<bits<2> opc, bit S, string asm,
                                        SDPatternOperator op> {
  // sz is passed as {0, ?}: the unset low bit (Inst{22}) carries iop{2}.
  def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> {
    bits<3> Zm;
    bits<3> iop;
    let Inst{22} = iop{2};
    let Inst{20-19} = iop{1-0};
    let Inst{18-16} = Zm;
  }
  def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> {
    bits<3> Zm;
    bits<2> iop;
    let Inst{20-19} = iop;
    let Inst{18-16} = Zm;
  }
  def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> {
    bits<4> Zm;
    bit iop;
    let Inst{20} = iop;
    let Inst{19-16} = Zm;
  }

  def : SVE_4_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _H)>;
  def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>;
  def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
}
// SVE2 Integer Multiply-Add Long - Indexed Group
// Instantiates the widening _S/_D indexed forms and their patterns.
// The 6-bit opcode is built as { opc{3}, 0, opc{2-1}, ?, opc{0} }: the unset
// bit lands on Inst{11}, which each def then assigns to iop{0}.
multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm, SDPatternOperator op> {
  def _S : sve2_int_mla_by_indexed_elem<0b10, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
                                        asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> {
    bits<3> Zm;
    bits<3> iop;
    let Inst{20-19} = iop{2-1};
    let Inst{18-16} = Zm;
    let Inst{11} = iop{0};
  }
  def _D : sve2_int_mla_by_indexed_elem<0b11, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
                                        asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> {
    bits<4> Zm;
    bits<2> iop;
    let Inst{20} = iop{1};
    let Inst{19-16} = Zm;
    let Inst{11} = iop{0};
  }

  def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>;
  def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>;
}
// SVE Integer Dot Product Group
// Encoding of an unpredicated, destructive instruction with a zprty1
// accumulator and two zprty2 sources. `sz` fills Inst{22} and `U` fills
// Inst{10}; $Zda is tied to $_Zda.
class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp zprty1,
                   ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm), asm,
  "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
  bits<5> Zda;
  bits<5> Zn;
  bits<5> Zm;
  let Inst{31-23} = 0b010001001;
  let Inst{22}    = sz;
  let Inst{21}    = 0;
  let Inst{20-16} = Zm;
  let Inst{15-11} = 0;
  let Inst{10}    = U;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zda;

  let Constraints = "$Zda = $_Zda";
  let DestructiveInstType = DestructiveOther;
}
// Instantiates the _S (ZPR32 accumulator, ZPR8 sources) and _D (ZPR64
// accumulator, ZPR16 sources) forms and maps `op` onto them.
multiclass sve_intx_dot<bit opc, string asm, SDPatternOperator op> {
  def _S : sve_intx_dot<0b0, opc, asm, ZPR32, ZPR8>;
  def _D : sve_intx_dot<0b1, opc, asm, ZPR64, ZPR16>;

  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _S)>;
  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _D)>;
}
// SVE Integer Dot Product Group - Indexed Group
// Common encoding of the indexed variant: `sz` fills Inst{22}, `U` fills
// Inst{10}, and Inst{20-16} (Zm and the index) is filled in by each
// size-specific def. $Zda is tied to $_Zda.
class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
                                   ZPRRegOp zprty1, ZPRRegOp zprty2,
                                   ZPRRegOp zprty3, Operand itype>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop),
  asm, "\t$Zda, $Zn, $Zm$iop",
  "", []>, Sched<[]> {
  bits<5> Zda;
  bits<5> Zn;
  let Inst{31-23} = 0b010001001;
  let Inst{22}    = sz;
  let Inst{21}    = 0b1;
  let Inst{15-11} = 0;
  let Inst{10}    = U;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zda;

  let Constraints = "$Zda = $_Zda";
  let DestructiveInstType = DestructiveOther;
}
// Instantiates the _S and _D indexed forms and their selection patterns.
// _S uses a 2-bit index with a 3-bit Zm; _D uses a 1-bit index with a 4-bit
// Zm. The `_timm` operand classes are used for the instruction operand as
// well as in the patterns.
multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm,
                                        SDPatternOperator op> {
  def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> {
    bits<2> iop;
    bits<3> Zm;
    let Inst{20-19} = iop;
    let Inst{18-16} = Zm;
  }
  def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> {
    bits<1> iop;
    bits<4> Zm;
    let Inst{20} = iop;
    let Inst{19-16} = Zm;
  }

  def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b_timm:$idx))),
            (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>;
  def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b_timm:$idx))),
            (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>;
}
// SVE2 Complex Integer Dot Product Group
// Encoding of an unpredicated, destructive integer instruction with a 2-bit
// `rot` immediate in Inst{11-10}. `opc` fills Inst{15-12}; $Zda is tied to
// $_Zda.
// NOTE(review): the trailing `$rot` operand of the `ins` list was missing in
// this copy; it is restored as complexrotateop, which is the operand class
// the patterns of sve2_cintx_dot and sve2_int_cmla pass in that position.
// Confirm against upstream.
class sve2_complex_int_arith<bits<2> sz, bits<4> opc, string asm,
                             ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm,
                         complexrotateop:$rot),
  asm, "\t$Zda, $Zn, $Zm, $rot", "", []>, Sched<[]> {
  bits<5> Zda;
  bits<5> Zn;
  bits<5> Zm;
  bits<2> rot;
  let Inst{31-24} = 0b01000100;
  let Inst{23-22} = sz;
  let Inst{21}    = 0b0;
  let Inst{20-16} = Zm;
  let Inst{15-12} = opc;
  let Inst{11-10} = rot;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zda;

  let Constraints = "$Zda = $_Zda";
  let DestructiveInstType = DestructiveOther;
  let ElementSize = ElementSizeNone;
}
// Complex integer dot product: instantiates sve2_complex_int_arith with the
// fixed opcode 0b0001 for two widths and adds selection patterns for `op`.
//   _S - nxv4i32 accumulator, nxv16i8 sources (sz = 0b10).
//   _D - nxv2i64 accumulator, nxv8i16 sources (sz = 0b11).
// The patterns take the rotation as an i32 complexrotateop immediate and pass
// it through as the instruction's last operand.
multiclass sve2_cintx_dot<string asm, SDPatternOperator op> {
def _S : sve2_complex_int_arith<0b10, 0b0001, asm, ZPR32, ZPR8>;
def _D : sve2_complex_int_arith<0b11, 0b0001, asm, ZPR64, ZPR16>;
def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3),
(i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, complexrotateop:$imm)>;
def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3),
(i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, complexrotateop:$imm)>;
// SVE2 Complex Multiply-Add Group
// Complex multiply-add: instantiates sve2_complex_int_arith for all four
// element sizes (_B/_H/_S/_D, sz = 0b00..0b11) with same-width accumulator and
// sources. The 4-bit format opcode is { 0b001, opc }, so `opc` selects the
// low opcode bit.
// Each width also gets an SVE_4_Op_Imm_Pat (defined elsewhere) mapping `op`,
// with an i32 complexrotateop immediate, onto the instruction.
multiclass sve2_int_cmla<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_complex_int_arith<0b00, { 0b001, opc }, asm, ZPR8, ZPR8>;
def _H : sve2_complex_int_arith<0b01, { 0b001, opc }, asm, ZPR16, ZPR16>;
def _S : sve2_complex_int_arith<0b10, { 0b001, opc }, asm, ZPR32, ZPR32>;
def _D : sve2_complex_int_arith<0b11, { 0b001, opc }, asm, ZPR64, ZPR64>;
def : SVE_4_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, i32, complexrotateop, !cast<Instruction>(NAME # _B)>;
def : SVE_4_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, i32, complexrotateop, !cast<Instruction>(NAME # _H)>;
def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, i32, complexrotateop, !cast<Instruction>(NAME # _S)>;
def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, i32, complexrotateop, !cast<Instruction>(NAME # _D)>;
// SVE2 Complex Integer Dot Product - Indexed Group
// Shared encoding format for the indexed SVE2 complex integer dot-product and
// complex multiply-add instructions. Destructive form ($Zda tied to $_Zda)
// with an element index printed directly after $Zm and a 2-bit rotation in
// Inst{11-10}.
//   sz     - written to Inst{23-22}.
//   opc    - written to Inst{15-12}.
//   zprty1 - register operand class of the accumulator/result.
//   zprty2 - register operand class of $Zn.
//   zprty3 - register operand class of $Zm.
//   itype  - operand class of the element index.
// Inst{20-16} is left to the instantiating defs, which declare Zm and iop.
// NOTE(review): as it appears here the `ins` list is never closed and no $rot
// operand is declared, although the asm string and Inst{11-10} both use rot.
// A line seems to have been lost from this copy - confirm against the
// original file before relying on it.
class sve2_complex_int_arith_indexed<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
ZPRRegOp zprty3, Operand itype>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop,
asm, "\t$Zda, $Zn, $Zm$iop, $rot", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<2> rot;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-12} = opc;
let Inst{11-10} = rot;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Indexed complex integer dot product: instantiates
// sve2_complex_int_arith_indexed with the fixed opcode 0b0100.
//   _S - nxv4i32 accumulator, nxv16i8 sources; 2-bit index in Inst{20-19},
//        3-bit Zm in Inst{18-16}.
//   _D - nxv2i64 accumulator, nxv8i16 sources; 1-bit index in Inst{20},
//        4-bit Zm in Inst{19-16}.
// The patterns match `op` with two i32 immediates (element index, then
// rotation) and forward both to the instruction in that order.
multiclass sve2_cintx_dot_by_indexed_elem<string asm, SDPatternOperator op> {
def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> {
bit iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3),
(i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>;
def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3),
(i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>;
// SVE2 Complex Multiply-Add - Indexed Group
// Indexed complex multiply-add: instantiates sve2_complex_int_arith_indexed
// with the 4-bit opcode { 0b011, opc }, so `opc` selects the low opcode bit.
//   _H - nxv8i16 operands, sz = 0b10; 2-bit index in Inst{20-19}, 3-bit Zm in
//        Inst{18-16}.
//   _S - nxv4i32 operands, sz = 0b11; 1-bit index in Inst{20}, 4-bit Zm in
//        Inst{19-16}.
// The patterns match `op` with two i32 immediates (element index, then
// rotation) and forward both to the instruction in that order.
multiclass sve2_cmla_by_indexed_elem<bit opc, string asm,
SDPatternOperator op> {
def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS32b> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD32b> {
bit iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
def : Pat<(nxv8i16 (op (nxv8i16 ZPR16:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3),
(i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # "_H") ZPR16:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>;
def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv4i32 ZPR32:$Op2), (nxv4i32 ZPR32:$Op3),
(i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))),
(!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR32:$Op2, ZPR32:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>;
// SVE2 Integer Multiply - Unpredicated Group
// Encoding format for the SVE2 unpredicated integer multiply group: a
// non-destructive three-register form ($Zd, $Zn, $Zm) where all operands use
// the same register operand class.
//   sz    - written to Inst{23-22}.
//   opc   - written to Inst{12-10}.
//   zprty - register operand class of all three operands.
class sve2_int_mul<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b011;
let Inst{12-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Instantiates the unpredicated multiply format for all four element sizes
// (_B/_H/_S/_D, sz = 0b00..0b11) and adds one SVE_2_Op_Pat per size mapping
// `op` on same-typed vector operands to the matching instruction.
multiclass sve2_int_mul<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_mul<0b00, opc, asm, ZPR8>;
def _H : sve2_int_mul<0b01, opc, asm, ZPR16>;
def _S : sve2_int_mul<0b10, opc, asm, ZPR32>;
def _D : sve2_int_mul<0b11, opc, asm, ZPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Variant of the unpredicated multiply multiclass that defines only the
// byte-element (_B, sz = 0b00) instruction and its nxv16i8 pattern.
multiclass sve2_int_mul_single<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_mul<0b00, opc, asm, ZPR8>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
// SVE2 Integer Multiply - Indexed Group
// Encoding format for the SVE2 indexed integer multiply group: a
// non-destructive form ($Zd, $Zn, $Zm[index]) with the element index printed
// directly after $Zm.
//   sz     - written to Inst{23-22} (instantiations may leave a bit as `?`
//            and fill it from the index).
//   opc    - written to Inst{13-10} (likewise may contain a `?` bit).
//   zprty1 - register operand class of the result.
//   zprty2 - register operand class of $Zn.
//   zprty3 - register operand class of $Zm.
//   itype  - operand class of the element index.
// Inst{20-16} is left to the instantiating defs, which declare Zm and iop.
class sve2_int_mul_by_indexed_elem<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
ZPRRegOp zprty3, Operand itype>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm, itype:$iop),
asm, "\t$Zd, $Zn, $Zm$iop", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{15-14} = 0b11;
let Inst{13-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Same-width indexed multiply: instantiates the indexed multiply format for
// three element sizes and adds SVE_3_Op_Imm_Pat patterns for `op`.
//   _H - nxv8i16; 3-bit index split across Inst{22} (top bit) and
//        Inst{20-19}; 3-bit Zm in Inst{18-16}. sz is passed as {0, ?} so that
//        Inst{22} can be supplied by the index.
//   _S - nxv4i32; 2-bit index in Inst{20-19}; 3-bit Zm in Inst{18-16}.
//   _D - nxv2i64; 1-bit index in Inst{20}; 4-bit Zm in Inst{19-16}.
multiclass sve2_int_mul_by_indexed_elem<bits<4> opc, string asm,
SDPatternOperator op> {
def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> {
bits<4> Zm;
bit iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
// Widening indexed multiply: the result elements are twice the width of the
// source elements. The 3-bit `opc` is spread into the format's 4-bit opcode as
// { opc{2-1}, ?, opc{0} }; the `?` position (Inst{11}) is filled with the low
// bit of the element index.
//   _S - nxv4i32 result from nxv8i16 sources; 3-bit index in Inst{20-19} and
//        Inst{11}; 3-bit Zm in Inst{18-16}.
//   _D - nxv2i64 result from nxv4i32 sources; 2-bit index in Inst{20} and
//        Inst{11}; 4-bit Zm in Inst{19-16}.
multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm,
SDPatternOperator op> {
def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm,
ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{20-19} = iop{2-1};
let Inst{18-16} = Zm;
let Inst{11} = iop{0};
def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm,
ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> {
bits<4> Zm;
bits<2> iop;
let Inst{20} = iop{1};
let Inst{19-16} = Zm;
let Inst{11} = iop{0};
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>;
// SVE2 Integer - Predicated Group
// Encoding format for the SVE2 predicated integer arithmetic group.
// Destructive, merging-predicated form: $Zdn is tied to $_Zdn, $Pg is the
// governing predicate (printed with the /m suffix) and $Zm is the second
// source.
//   sz    - written to Inst{23-22}.
//   opc   - 6-bit opcode; opc{5-1} goes to Inst{20-16}, opc{0} to Inst{13}.
//   zprty - register operand class of all vector operands; also supplies
//           ElementSize.
class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", "", []>, Sched<[]> {
bits<3> Pg;
bits<5> Zm;
bits<5> Zdn;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
// Only bit 21 is a fixed zero. This used to be written as
// `Inst{21-20} = 0b01`, whose bit 20 was immediately overwritten by the
// opc{5-1} assignment below; assigning bit 21 alone avoids the overlap.
let Inst{21} = 0b0;
let Inst{20-16} = opc{5-1};
let Inst{15-14} = 0b10;
let Inst{13} = opc{0};
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
// Instantiates the predicated arithmetic format for all four element sizes
// (_B/_H/_S/_D, sz = 0b00..0b11). Each size gets an SVE_3_Op_Pat mapping `op`
// to the instruction, with operand types in the order predicate, vector,
// vector.
multiclass sve2_int_arith_pred<bits<6> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>;
def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>;
def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>;
def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Encoding format for a predicated, accumulating two-operand instruction
// whose accumulator and source use different register operand classes.
// Destructive form: $Zda is tied to $_Zda; $Pg is printed with the /m suffix.
//   sz     - written to Inst{23-22}.
//   U      - written to Inst{16}.
//   zprty1 - register operand class of the accumulator/result; also supplies
//            ElementSize.
//   zprty2 - register operand class of the source $Zn.
class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins PPR3bAny:$Pg, zprty1:$_Zda, zprty2:$Zn),
asm, "\t$Zda, $Pg/m, $Zn", "", []>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> Zda;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21-17} = 0b00010;
let Inst{16} = U;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty1.ElementSize;
// Instantiates the format above for three accumulator sizes, each paired with
// a source of half the element width:
//   _H - nxv8i16 accumulator, nxv16i8 source.
//   _S - nxv4i32 accumulator, nxv8i16 source.
//   _D - nxv2i64 accumulator, nxv4i32 source.
// The patterns list operand types in the order predicate, accumulator,
// source.
multiclass sve2_int_sadd_long_accum_pairwise<bit U, string asm, SDPatternOperator op> {
def _H : sve2_int_sadd_long_accum_pairwise<0b01, U, asm, ZPR16, ZPR8>;
def _S : sve2_int_sadd_long_accum_pairwise<0b10, U, asm, ZPR32, ZPR16>;
def _D : sve2_int_sadd_long_accum_pairwise<0b11, U, asm, ZPR64, ZPR32>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>;
// Encoding format for SVE2 predicated unary integer instructions.
// Destructive, merging form: $Zd is tied to the pass-through input $_Zd, $Pg
// is the governing predicate (printed with /m) and $Zn is the source.
//   sz    - written to Inst{23-22}.
//   Q     - written to Inst{19}.
//   opc   - written to Inst{17-16}.
//   zprty - register operand class of the vector operands; also supplies
//           ElementSize.
// NOTE(review): the other formats in this section pass a `""` argument
// between the asm operand string and the `[]` pattern list; it is absent
// here. It may have been dropped from this copy - confirm against the
// original file.
class sve2_int_un_pred_arit<bits<2> sz, bit Q, bits<2> opc,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
let Inst{21-20} = 0b00;
let Inst{19} = Q;
let Inst{18} = 0b0;
let Inst{17-16} = opc;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
// Word-only variant: defines just the _S (nxv4i32, sz = 0b10) instruction of
// the predicated unary format and its pattern. The 3-bit `opc` is split into
// the format's Q bit (opc{2}) and 2-bit opcode (opc{1-0}).
multiclass sve2_int_un_pred_arit_s<bits<3> opc, string asm,
SDPatternOperator op> {
def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
// Instantiates the predicated unary format for all four element sizes
// (_B/_H/_S/_D, sz = 0b00..0b11). The 3-bit `opc` is split into the format's
// Q bit (opc{2}) and 2-bit opcode (opc{1-0}). The patterns list operand types
// in the order pass-through vector, predicate, source vector.
multiclass sve2_int_un_pred_arit<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_un_pred_arit<0b00, opc{2}, opc{1-0}, asm, ZPR8>;
def _H : sve2_int_un_pred_arit<0b01, opc{2}, opc{1-0}, asm, ZPR16>;
def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
def _D : sve2_int_un_pred_arit<0b11, opc{2}, opc{1-0}, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
// SVE2 Widening Integer Arithmetic Group
// Encoding format for the SVE2 widening integer arithmetic group: a
// non-destructive three-register form in which the result and each source
// may use a different register operand class.
//   sz     - written to Inst{23-22}.
//   opc    - written to Inst{14-10}.
//   zprty1 - register operand class of the result $Zd.
//   zprty2 - register operand class of $Zn.
//   zprty3 - register operand class of $Zm.
class sve2_wide_int_arith<bits<2> sz, bits<5> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2, ZPRRegOp zprty3>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// "Long" instantiations: both sources have half the element width of the
// result.
//   _H - nxv8i16 result from two nxv16i8 sources.
//   _S - nxv4i32 result from two nxv8i16 sources.
//   _D - nxv2i64 result from two nxv4i32 sources.
multiclass sve2_wide_int_arith_long<bits<5> opc, string asm,
SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, opc, asm, ZPR16, ZPR8, ZPR8>;
def _S : sve2_wide_int_arith<0b10, opc, asm, ZPR32, ZPR16, ZPR16>;
def _D : sve2_wide_int_arith<0b11, opc, asm, ZPR64, ZPR32, ZPR32>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
// "Wide" instantiations: the first source has the result's element width and
// the second source has half of it. The 5-bit format opcode is
// { 0b10, opc }.
//   _H - nxv8i16 result from nxv8i16 and nxv16i8 sources.
//   _S - nxv4i32 result from nxv4i32 and nxv8i16 sources.
//   _D - nxv2i64 result from nxv2i64 and nxv4i32 sources.
multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm,
SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>;
def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>;
def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>;
// Defines a single instruction (named NAME, no size suffix) using the
// widening format with a ZPR128 result and two ZPR64 sources. Unlike the
// sibling multiclasses, `sz` is a parameter rather than fixed per suffix.
multiclass sve2_wide_int_arith_pmul<bits<2> sz, bits<5> opc, string asm,
SDPatternOperator op> {
def NAME : sve2_wide_int_arith<sz, opc, asm, ZPR128, ZPR64, ZPR64>;
// To avoid using 128 bit elements in the IR, the pattern below works with
// llvm intrinsics with the _pair suffix, to reflect that
// _Q is implemented as a pair of _D.
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
// Defines the _H and _D instructions of the widening format with the 5-bit
// opcode { 0b1101, opc }. Only these two sizes are instantiated here.
// Note that the patterns use the source vector type for the result as well
// (nxv16i8 for _H, nxv4i32 for _D), as explained in the comment below.
multiclass sve2_pmul_long<bits<1> opc, string asm, SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, {0b1101, opc}, asm, ZPR16, ZPR8, ZPR8>;
def _D : sve2_wide_int_arith<0b11, {0b1101, opc}, asm, ZPR64, ZPR32, ZPR32>;
// To avoid using 128 bit elements in the IR, the patterns below work with
// llvm intrinsics with the _pair suffix, to reflect that
// _H is implemented as a pair of _B and _D is implemented as a pair of _S.
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
// SVE2 Misc Group
// Encoding format for the SVE2 "Misc" group: a non-destructive
// three-register form in which both sources share one register operand class
// and the result may use another.
//   sz     - written to Inst{23-22}.
//   opc    - written to Inst{13-10}.
//   zprty1 - register operand class of the result $Zd.
//   zprty2 - register operand class of $Zn and $Zm.
class sve2_misc<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b10;
let Inst{13-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Same-width instantiations of the Misc format for all four element sizes
// (_B/_H/_S/_D, sz = 0b00..0b11), each with an SVE_2_Op_Pat mapping `op` on
// same-typed vector operands to the instruction.
multiclass sve2_misc_bitwise<bits<4> opc, string asm, SDPatternOperator op> {
def _B : sve2_misc<0b00, opc, asm, ZPR8, ZPR8>;
def _H : sve2_misc<0b01, opc, asm, ZPR16, ZPR16>;
def _S : sve2_misc<0b10, opc, asm, ZPR32, ZPR32>;
def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Widening instantiations of the Misc format: the result elements are twice
// the width of the source elements. The 4-bit format opcode is
// { 0b00, opc }.
//   _H - nxv8i16 result from two nxv16i8 sources.
//   _S - nxv4i32 result from two nxv8i16 sources.
//   _D - nxv2i64 result from two nxv4i32 sources.
multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm,
SDPatternOperator op> {
def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
// Encoding format for the interleaved bitwise XOR instructions. Destructive
// form: $Zd is tied to the extra input $_Zd, which is not printed in the
// assembly string; the printed sources are $Zn and $Zm.
//   sz     - written to Inst{23-22}.
//   opc    - single bit written to Inst{10}.
//   zprty1 - register operand class of the result / tied input.
//   zprty2 - register operand class of $Zn and $Zm.
class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-11} = 0b10010;
let Inst{10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Instantiates the interleaved XOR format for all four element sizes
// (_B/_H/_S/_D, sz = 0b00..0b11) with same-width operands. Each size gets an
// SVE_3_Op_Pat mapping a three-vector-operand `op` to the instruction.
multiclass sve2_bitwise_xor_interleaved<bit opc, string asm,
SDPatternOperator op> {
def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>;
def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>;
def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>;
def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Encoding format for the widening shift-left-by-immediate instructions:
// non-destructive, with result and source in different register operand
// classes.
//   tsz8_64 - 3-bit size selector; bit 2 goes to Inst{22}, bits 1-0 to
//             Inst{20-19}. Instantiations may leave low bits as `?` and
//             overwrite them with high bits of the immediate.
//   opc     - written to Inst{11-10}.
//   zprty1  - register operand class of the result.
//   zprty2  - register operand class of the source.
//   immtype - operand class of the shift amount.
// Only the low three immediate bits are encoded here (Inst{18-16}).
class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2,
Operand immtype>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> imm;
let Inst{31-23} = 0b010001010;
let Inst{22} = tsz8_64{2};
let Inst{21} = 0b0;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-12} = 0b1010;
let Inst{11-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Instantiates the widening shift-left format for three result sizes. The
// `?` bits of tsz8_64 are the positions that the wider variants overwrite
// with the upper immediate bits.
//   _H - nxv8i16 result from nxv16i8; tsz = {0,0,1}; 3-bit immediate only.
//   _S - nxv4i32 result from nxv8i16; tsz = {0,1,?}; imm{3} in Inst{19}.
//   _D - nxv2i64 result from nxv4i32; tsz = {1,?,?}; imm{4-3} in Inst{20-19}.
multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm,
SDPatternOperator op> {
def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm,
ZPR16, ZPR8, vecshiftL8>;
def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm,
ZPR32, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
def _D : sve2_bitwise_shift_left_long<{1,?,?}, opc, asm,
ZPR64, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
def : SVE_2_Op_Imm_Pat<nxv8i16, op, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Imm_Pat<nxv4i32, op, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Imm_Pat<nxv2i64, op, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _D)>;
// SVE2 Accumulate Group
// Encoding format for same-width shift-by-immediate instructions whose
// destination is also an input ($Zd tied to $_Zd, which is not printed).
//   tsz8_64 - 4-bit size selector; bits 3-2 go to Inst{23-22}, bits 1-0 to
//             Inst{20-19}. Instantiations may leave low bits as `?` and
//             overwrite them with high bits of the immediate.
//   opc     - single bit written to Inst{10}.
//   zprty   - register operand class of all vector operands.
//   immtype - operand class of the shift amount.
// Only the low three immediate bits are encoded here (Inst{18-16}).
// Unlike most other tied formats nearby, this class sets only Constraints
// and does not assign DestructiveInstType or ElementSize.
class sve2_int_bin_shift_imm<bits<4> tsz8_64, bit opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<6> imm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21} = 0b0;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-11} = 0b11110;
let Inst{10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
// Left-shift instantiations of sve2_int_bin_shift_imm, using the vecshiftL*
// immediate operand classes. The `?` bits of tsz8_64 are overwritten by the
// upper immediate bits in the wider variants.
//   _B - tsz = {0,0,0,1}; 3-bit immediate only.
//   _H - tsz = {0,0,1,?}; imm{3} in Inst{19}.
//   _S - tsz = {0,1,?,?}; imm{4-3} in Inst{20-19}.
//   _D - tsz = {1,?,?,?}; imm{5} in Inst{22}, imm{4-3} in Inst{20-19}.
multiclass sve2_int_bin_shift_imm_left<bit opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftL64, !cast<Instruction>(NAME # _D)>;
// Right-shift instantiations of sve2_int_bin_shift_imm, using the vecshiftR*
// immediate operand classes. Bit placement is the same as in the left-shift
// multiclass:
//   _B - tsz = {0,0,0,1}; 3-bit immediate only.
//   _H - tsz = {0,0,1,?}; imm{3} in Inst{19}.
//   _S - tsz = {0,1,?,?}; imm{4-3} in Inst{20-19}.
//   _D - tsz = {1,?,?,?}; imm{5} in Inst{22}, imm{4-3} in Inst{20-19}.
multiclass sve2_int_bin_shift_imm_right<bit opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
// Encoding format for shift-by-immediate instructions that accumulate into
// their destination. Destructive form: $Zda is tied to $_Zda (not printed);
// the printed operands are $Zda, $Zn and the immediate.
//   tsz8_64 - 4-bit size selector; bits 3-2 go to Inst{23-22}, bits 1-0 to
//             Inst{20-19}. Instantiations may leave low bits as `?` and
//             overwrite them with high bits of the immediate.
//   opc     - written to Inst{11-10}.
//   zprty   - register operand class of all vector operands.
//   immtype - operand class of the shift amount.
// Only the low three immediate bits are encoded here (Inst{18-16}).
class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, immtype:$imm),
asm, "\t$Zda, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<6> imm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21} = 0b0;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-12} = 0b1110;
let Inst{11-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Right-shift instantiations of the accumulating shift format for all four
// element sizes, using the vecshiftR* immediate operand classes.
//   _B - tsz = {0,0,0,1}; 3-bit immediate only.
//   _H - tsz = {0,0,1,?}; imm{3} in Inst{19}.
//   _S - tsz = {0,1,?,?}; imm{4-3} in Inst{20-19}.
//   _D - tsz = {1,?,?,?}; imm{5} in Inst{22}, imm{4-3} in Inst{20-19}.
// Patterns list operand types as accumulator vector, source vector, i32
// immediate.
multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
def _S : sve2_int_bin_accum_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
def _D : sve2_int_bin_accum_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
// Encoding format for the complex integer add instructions. Destructive
// form: $Zdn is tied to $_Zdn, and the assembly string prints both ($Zdn,
// $_Zdn, $Zm, $rot). The rotation uses the complexrotateopodd operand class
// and is encoded as a single bit.
//   sz    - written to Inst{23-22}.
//   opc   - written to Inst{16}.
//   zprty - register operand class of all vector operands.
class sve2_int_cadd<bits<2> sz, bit opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, complexrotateopodd:$rot),
asm, "\t$Zdn, $_Zdn, $Zm, $rot", "", []>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bit rot;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21-17} = 0b00000;
let Inst{16} = opc;
let Inst{15-11} = 0b11011;
let Inst{10} = rot;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Instantiates the complex add format for all four element sizes
// (_B/_H/_S/_D, sz = 0b00..0b11). Each size gets an SVE_3_Op_Imm_Pat mapping
// `op` (two same-typed vectors plus an i32 complexrotateopodd immediate) to
// the instruction.
multiclass sve2_int_cadd<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_int_cadd<0b00, opc, asm, ZPR8>;
def _H : sve2_int_cadd<0b01, opc, asm, ZPR16>;
def _S : sve2_int_cadd<0b10, opc, asm, ZPR32>;
def _D : sve2_int_cadd<0b11, opc, asm, ZPR64>;
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, complexrotateopodd, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, complexrotateopodd, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, complexrotateopodd, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, complexrotateopodd, !cast<Instruction>(NAME # _D)>;
// Encoding format for three-register accumulating instructions. Destructive
// form: $Zda is tied to $_Zda (not printed); the two sources share one
// register operand class, which may differ from the accumulator's. It is
// instantiated by the absolute-difference-accumulate multiclasses and by
// sve2_int_addsub_long_carry below.
//   sz     - written to Inst{23-22}.
//   opc    - written to Inst{13-10}.
//   zprty1 - register operand class of the accumulator/result.
//   zprty2 - register operand class of $Zn and $Zm.
class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15-14} = 0b11;
let Inst{13-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Same-width instantiations of the accumulate format for all four element
// sizes (_B/_H/_S/_D, sz = 0b00..0b11). The 4-bit format opcode is
// { 0b111, opc }, so `opc` selects the low opcode bit.
multiclass sve2_int_absdiff_accum<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>;
def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>;
def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>;
def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Widening instantiations of the accumulate format: the accumulator elements
// are twice the width of the source elements. The 4-bit format opcode is
// { 0b00, opc }.
//   _H - nxv8i16 accumulator, nxv16i8 sources.
//   _S - nxv4i32 accumulator, nxv8i16 sources.
//   _D - nxv2i64 accumulator, nxv4i32 sources.
multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm,
SDPatternOperator op> {
def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
// Reuses the sve2_int_absdiff_accum format for the add/subtract-with-carry
// style instructions, defined only for _S and _D with same-width operands.
// The 2-bit `opc` is split across two encoding fields:
//   opc{1} - becomes the high bit of the format's sz field (Inst{23}); the
//            low sz bit is 0 for _S and 1 for _D.
//   opc{0} - becomes the low bit of the format opcode { 0b010, opc{0} }.
multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm, SDPatternOperator op> {
def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm,
ZPR32, ZPR32>;
def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm,
ZPR64, ZPR64>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// SVE2 Narrowing Group
// Encoding format for the narrowing shift-by-immediate instructions,
// "bottom" variant (Inst{10} = 0). Non-destructive: the result and source use
// different register operand classes.
//   tsz8_64 - 3-bit size selector; bit 2 goes to Inst{22}, bits 1-0 to
//             Inst{20-19}. Instantiations may leave low bits as `?` and
//             overwrite them with high bits of the immediate.
//   opc     - written to Inst{13-11}.
//   zprty1  - register operand class of the result.
//   zprty2  - register operand class of the source.
//   immtype - operand class of the shift amount.
// Only the low three immediate bits are encoded here (Inst{18-16}).
class sve2_int_bin_shift_imm_narrow_bottom<bits<3> tsz8_64, bits<3> opc,
string asm, ZPRRegOp zprty1,
ZPRRegOp zprty2, Operand immtype>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> imm;
let Inst{31-23} = 0b010001010;
let Inst{22} = tsz8_64{2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-14} = 0b00;
let Inst{13-11} = opc;
let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Right-shift instantiations of the "bottom" narrowing shift format. Each
// result has half the element width of its source.
//   _B - nxv16i8 result from nxv8i16; tsz = {0,0,1}.
//   _H - nxv8i16 result from nxv4i32; tsz = {0,1,?}; imm{3} in Inst{19}.
//   _S - nxv4i32 result from nxv2i64; tsz = {1,?,?}; imm{4-3} in Inst{20-19}.
// NOTE(review): the _B def below ends on a trailing comma with no immediate
// operand class or terminator - a line appears to be missing from this copy.
// Also, _H passes tvecshiftR16 while _S passes vecshiftR32; confirm against
// the original file whether that difference is intended.
multiclass sve2_int_bin_shift_imm_right_narrow_bottom<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_bin_shift_imm_narrow_bottom<{0,0,1}, opc, asm, ZPR8, ZPR16,
def _H : sve2_int_bin_shift_imm_narrow_bottom<{0,1,?}, opc, asm, ZPR16, ZPR32,
tvecshiftR16> {
let Inst{19} = imm{3};
def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64,
vecshiftR32> {
let Inst{20-19} = imm{4-3};
def : SVE_2_Op_Imm_Pat<nxv16i8, op, nxv8i16, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Imm_Pat<nxv8i16, op, nxv4i32, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Imm_Pat<nxv4i32, op, nxv2i64, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
// Encoding format for the narrowing shift-by-immediate instructions, "top"
// variant (Inst{10} = 1). Same bit layout as the "bottom" format, but the
// destination is also an input: $Zd is tied to $_Zd, which is not printed.
//   tsz8_64 - 3-bit size selector; bit 2 goes to Inst{22}, bits 1-0 to
//             Inst{20-19}. Instantiations may leave low bits as `?` and
//             overwrite them with high bits of the immediate.
//   opc     - written to Inst{13-11}.
//   zprty1  - register operand class of the result / tied input.
//   zprty2  - register operand class of the source.
//   immtype - operand class of the shift amount.
// Only the low three immediate bits are encoded here (Inst{18-16}).
class sve2_int_bin_shift_imm_narrow_top<bits<3> tsz8_64, bits<3> opc,
string asm, ZPRRegOp zprty1,
ZPRRegOp zprty2, Operand immtype>
: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> imm;
let Inst{31-23} = 0b010001010;
let Inst{22} = tsz8_64{2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-14} = 0b00;
let Inst{13-11} = opc;
let Inst{10} = 0b1;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
// Right-shift instantiations of the "top" narrowing shift format. Patterns
// take the tied destination vector first, then the wider source vector, then
// the i32 shift immediate.
//   _B - nxv16i8 result from nxv8i16; tsz = {0,0,1}.
//   _H - nxv8i16 result from nxv4i32; tsz = {0,1,?}; imm{3} in Inst{19}.
//   _S - nxv4i32 result from nxv2i64; tsz = {1,?,?}; imm{4-3} in Inst{20-19}.
// NOTE(review): the _B def below ends on a trailing comma with no immediate
// operand class or terminator - a line appears to be missing from this copy.
// Also, _H passes tvecshiftR16 while _S passes vecshiftR32; confirm against
// the original file whether that difference is intended.
multiclass sve2_int_bin_shift_imm_right_narrow_top<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_bin_shift_imm_narrow_top<{0,0,1}, opc, asm, ZPR8, ZPR16,
def _H : sve2_int_bin_shift_imm_narrow_top<{0,1,?}, opc, asm, ZPR16, ZPR32,
tvecshiftR16> {
let Inst{19} = imm{3};
def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64,
vecshiftR32> {
let Inst{20-19} = imm{4-3};
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv8i16, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv4i32, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv2i64, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
// Encoding format for the narrowing add/subtract "high" instructions,
// "bottom" variant (Inst{10} = 0). Non-destructive: two sources of one
// register operand class produce a result of another.
//   sz     - written to Inst{23-22}.
//   opc    - written to Inst{12-11} (annotated "S, R" below).
//   zprty1 - register operand class of the result.
//   zprty2 - register operand class of $Zn and $Zm.
class sve2_int_addsub_narrow_high_bottom<bits<2> sz, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b011;
let Inst{12-11} = opc; // S, R
let Inst{10} = 0b0; // Top
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Instantiates the "bottom" narrowing add/subtract format for three result
// sizes. Note that sz is numbered by the source element size here: _B uses
// 0b01, _H uses 0b10 and _S uses 0b11.
//   _B - nxv16i8 result from two nxv8i16 sources.
//   _H - nxv8i16 result from two nxv4i32 sources.
//   _S - nxv4i32 result from two nxv2i64 sources.
multiclass sve2_int_addsub_narrow_high_bottom<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_addsub_narrow_high_bottom<0b01, opc, asm, ZPR8, ZPR16>;
def _H : sve2_int_addsub_narrow_high_bottom<0b10, opc, asm, ZPR16, ZPR32>;
def _S : sve2_int_addsub_narrow_high_bottom<0b11, opc, asm, ZPR32, ZPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _S)>;
// Encoding format for the narrowing add/subtract "high" instructions, "top"
// variant (Inst{10} = 1). Same bit layout as the "bottom" format, but the
// destination is also an input: $Zd is tied to $_Zd, which is not printed.
//   sz     - written to Inst{23-22}.
//   opc    - written to Inst{12-11} (annotated "S, R" below).
//   zprty1 - register operand class of the result / tied input.
//   zprty2 - register operand class of $Zn and $Zm.
class sve2_int_addsub_narrow_high_top<bits<2> sz, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b011;
let Inst{12-11} = opc; // S, R
let Inst{10} = 0b1; // Top
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
// Instantiates the "top" narrowing add/subtract format for three result
// sizes (sz = 0b01/0b10/0b11 for _B/_H/_S). Patterns take the tied
// destination vector first, followed by the two wider source vectors.
//   _B - nxv16i8 result, nxv8i16 sources.
//   _H - nxv8i16 result, nxv4i32 sources.
//   _S - nxv4i32 result, nxv2i64 sources.
multiclass sve2_int_addsub_narrow_high_top<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_addsub_narrow_high_top<0b01, opc, asm, ZPR8, ZPR16>;
def _H : sve2_int_addsub_narrow_high_top<0b10, opc, asm, ZPR16, ZPR32>;
def _S : sve2_int_addsub_narrow_high_top<0b11, opc, asm, ZPR32, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _S)>;
// Encoding format for the saturating extract-narrow instructions, "bottom"
// variant (Inst{10} = 0). Non-destructive, single source.
//   tsz8_64 - 3-bit size selector; bit 2 goes to Inst{22}, bits 1-0 to
//             Inst{20-19}.
//   opc     - written to Inst{12-11}.
//   zprty1  - register operand class of the result.
//   zprty2  - register operand class of the source.
class sve2_int_sat_extract_narrow_bottom<bits<3> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-23} = 0b010001010;
let Inst{22} = tsz8_64{2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-13} = 0b000010;
let Inst{12-11} = opc;
let Inst{10} = 0b0;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Instantiates the "bottom" extract-narrow format for three result sizes,
// selected by a one-hot tsz8_64 value:
//   _B - tsz = 0b001; nxv16i8 result from nxv8i16.
//   _H - tsz = 0b010; nxv8i16 result from nxv4i32.
//   _S - tsz = 0b100; nxv4i32 result from nxv2i64.
multiclass sve2_int_sat_extract_narrow_bottom<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_sat_extract_narrow_bottom<0b001, opc, asm, ZPR8, ZPR16>;
def _H : sve2_int_sat_extract_narrow_bottom<0b010, opc, asm, ZPR16, ZPR32>;
def _S : sve2_int_sat_extract_narrow_bottom<0b100, opc, asm, ZPR32, ZPR64>;
def : SVE_1_Op_Pat<nxv16i8, op, nxv8i16, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Pat<nxv8i16, op, nxv4i32, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4i32, op, nxv2i64, !cast<Instruction>(NAME # _S)>;
// Encoding format for the saturating extract-narrow instructions, "top"
// variant (Inst{10} = 1). Same bit layout as the "bottom" format, but the
// destination is also an input: $Zd is tied to $_Zd, which is not printed.
//   tsz8_64 - 3-bit size selector; bit 2 goes to Inst{22}, bits 1-0 to
//             Inst{20-19}.
//   opc     - written to Inst{12-11}.
//   zprty1  - register operand class of the result / tied input.
//   zprty2  - register operand class of the source.
class sve2_int_sat_extract_narrow_top<bits<3> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn),
asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-23} = 0b010001010;
let Inst{22} = tsz8_64{2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-13} = 0b000010;
let Inst{12-11} = opc;
let Inst{10} = 0b1;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
// Instantiates the "top" extract-narrow format for three result sizes,
// selected by a one-hot tsz8_64 value. Patterns take the tied destination
// vector first, then the wider source vector.
//   _B - tsz = 0b001; nxv16i8 result, nxv8i16 source.
//   _H - tsz = 0b010; nxv8i16 result, nxv4i32 source.
//   _S - tsz = 0b100; nxv4i32 result, nxv2i64 source.
multiclass sve2_int_sat_extract_narrow_top<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve2_int_sat_extract_narrow_top<0b001, opc, asm, ZPR8, ZPR16>;
def _H : sve2_int_sat_extract_narrow_top<0b010, opc, asm, ZPR16, ZPR32>;
def _S : sve2_int_sat_extract_narrow_top<0b100, opc, asm, ZPR32, ZPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv8i16, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv4i32, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv2i64, !cast<Instruction>(NAME # _S)>;
// SVE Integer Arithmetic - Unary Predicated Group
// Predicated unary instruction format: inputs are $_Zd (tied to the result
// $Zd), a governing predicate $Pg (PPR3bAny, encoded in 3 bits) and the
// source vector $Zn. The assembly operand string is "$Zd, $Pg/m, $Zn".
// Encoding: sz8_64 fills bits 23-22; the 4-bit opc is split so that its bit 0
// lands in bit 19 and its bits 3-1 land in bits 18-16.
// ElementSize is taken from the register operand class zprty.
class sve_int_un_pred_arit<bits<2> sz8_64, bits<4> opc,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21-20} = 0b01;
let Inst{19} = opc{0};
let Inst{18-16} = opc{3-1};
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
// Defines _B, _H, _S and _D instructions of sve_int_un_pred_arit. The 3-bit
// opc is widened to `{ opc, 0b0 }`, i.e. a 0 is appended as the
// least-significant opcode bit (the bit the class places in Inst{19}).
// Each instruction is associated with `op` through SVE_3_Op_Pat (defined
// outside this chunk) with operand types (vector, predicate, vector) of the
// matching element size.
multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>;
def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Variant of the `{ opc, 0b0 }` unary multiclass that defines only the _H, _S
// and _D instructions (no _B form) together with their SVE_3_Op_Pat patterns.
multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm,
SDPatternOperator op> {
def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Variant of the `{ opc, 0b0 }` unary multiclass that defines only the _S and
// _D instructions together with their SVE_3_Op_Pat patterns.
multiclass sve_int_un_pred_arit_0_w<bits<3> opc, string asm,
SDPatternOperator op> {
def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Variant of the `{ opc, 0b0 }` unary multiclass that defines only the _D
// instruction and its SVE_3_Op_Pat pattern on nxv2i64.
multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm,
SDPatternOperator op> {
def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Defines _B, _H, _S and _D instructions of sve_int_un_pred_arit with the
// opcode widened to `{ opc, 0b1 }`, i.e. a 1 is appended as the
// least-significant opcode bit (the bit the class places in Inst{19}).
// Two groups of SVE_3_Op_Pat patterns follow: one with integer source
// vectors for all four sizes, and one for _H/_S/_D where the last operand is
// a floating-point vector (nxv8f16/nxv4f32/nxv2f64) while the result and the
// first operand stay integer vectors.
multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>;
def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
// Variant of the `{ opc, 0b1 }` unary multiclass that defines only _H, _S and
// _D instructions and whose SVE_3_Op_Pat patterns use floating-point vector
// types (nxv8f16, nxv4f32, nxv2f64) for both the result and the sources.
multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm,
SDPatternOperator op> {
def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
// SVE Integer Wide Immediate - Unpredicated Group
// Unpredicated format that writes vector $Zd from a single immediate operand
// ($imm, operand class immtype); the assembly operand string is "$Zd, $imm".
// The immediate is carried as 9 bits: bit 8 goes to Inst{13} (marked "sh")
// and bits 7-0 go to Inst{12-5} (marked "imm8").
// The instruction is flagged isReMaterializable.
class sve_int_dup_imm<bits<2> sz8_64, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zd), (ins immtype:$imm),
asm, "\t$Zd, $imm",
[]>, Sched<[]> {
bits<5> Zd;
bits<9> imm;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-14} = 0b11100011;
let Inst{13} = imm{8}; // sh
let Inst{12-5} = imm{7-0}; // imm8
let Inst{4-0} = Zd;
let isReMaterializable = 1;
// Defines the _B/_H/_S/_D instructions of the same-named class, each with the
// cpy_imm8_opt_lsl_i{8,16,32,64} immediate operand of matching width.
// Also adds assembler aliases:
//  - "mov $Zd, $imm" for every element size;
//  - "fmov $Zd, #0.0" for _H/_S/_D, which supplies the two literal values
//    0, 0 for the immediate operand.
multiclass sve_int_dup_imm<string asm> {
def _B : sve_int_dup_imm<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8>;
def _H : sve_int_dup_imm<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16>;
def _S : sve_int_dup_imm<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32>;
def _D : sve_int_dup_imm<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, cpy_imm8_opt_lsl_i8:$imm), 1>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, cpy_imm8_opt_lsl_i16:$imm), 1>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, cpy_imm8_opt_lsl_i32:$imm), 1>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, cpy_imm8_opt_lsl_i64:$imm), 1>;
def : InstAlias<"fmov $Zd, #0.0",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, 0, 0), 1>;
def : InstAlias<"fmov $Zd, #0.0",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, 0, 0), 1>;
def : InstAlias<"fmov $Zd, #0.0",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, 0, 0), 1>;
// Unpredicated format that writes vector $Zd from an 8-bit encoded
// floating-point immediate ($imm8, operand class fpimmtype); the assembly
// operand string is "$Zd, $imm8".
// Encoding: sz8_64 in bits 23-22, bit 13 fixed to 0, imm8 in bits 12-5.
// The instruction is flagged isReMaterializable.
class sve_int_dup_fpimm<bits<2> sz8_64, Operand fpimmtype,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins fpimmtype:$imm8),
asm, "\t$Zd, $imm8",
[]>, Sched<[]> {
bits<5> Zd;
bits<8> imm8;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-14} = 0b11100111;
let Inst{13} = 0b0;
let Inst{12-5} = imm8;
let Inst{4-0} = Zd;
let isReMaterializable = 1;
// Defines the _H/_S/_D instructions of the same-named class with the
// fpimm16/fpimm32/fpimm64 operands, plus an "fmov $Zd, $imm8" assembler alias
// for each of them. There is no _B form.
multiclass sve_int_dup_fpimm<string asm> {
def _H : sve_int_dup_fpimm<0b01, fpimm16, asm, ZPR16>;
def _S : sve_int_dup_fpimm<0b10, fpimm32, asm, ZPR32>;
def _D : sve_int_dup_fpimm<0b11, fpimm64, asm, ZPR64>;
def : InstAlias<"fmov $Zd, $imm8",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, fpimm16:$imm8), 1>;
def : InstAlias<"fmov $Zd, $imm8",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, fpimm32:$imm8), 1>;
def : InstAlias<"fmov $Zd, $imm8",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, fpimm64:$imm8), 1>;
// Unpredicated vector-with-immediate format. The result $Zdn is tied to the
// input $_Zdn through Constraints; the second input is the immediate $imm.
// The assembly operand string is "$Zdn, $_Zdn, $imm".
// Encoding: sz8_64 in bits 23-22, opc in bits 18-16, and a 9-bit immediate
// whose bit 8 goes to Inst{13} ("sh") and bits 7-0 to Inst{12-5} ("imm8").
// ElementSize is set to ElementSizeNone.
class sve_int_arith_imm0<bits<2> sz8_64, bits<3> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm),
asm, "\t$Zdn, $_Zdn, $imm",
[]>, Sched<[]> {
bits<5> Zdn;
bits<9> imm;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-19} = 0b100;
let Inst{18-16} = opc;
let Inst{15-14} = 0b11;
let Inst{13} = imm{8}; // sh
let Inst{12-5} = imm{7-0}; // imm8
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Defines the _B/_H/_S/_D instructions of the same-named class with the
// addsub_imm8_opt_lsl_i{8,16,32,64} immediate operands.
// Two sets of SVE_1_Op_Imm_OptLsl_Pat patterns (helper defined outside this
// chunk) are registered for each size: one for `op` and one for `int_op`
// (labelled "Intrinsic version"). The scalar immediate type is i32 for
// _B/_H/_S and i64 for _D.
multiclass sve_int_arith_imm0<bits<3> opc, string asm,
SDPatternOperator op, SDPatternOperator int_op> {
def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>;
def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
def _D : sve_int_arith_imm0<0b11, opc, asm, ZPR64, addsub_imm8_opt_lsl_i64>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv16i8, op, ZPR8, i32, SVEAddSubImm8Pat, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
// Intrinsic version
def : SVE_1_Op_Imm_OptLsl_Pat<nxv16i8, int_op, ZPR8, i32, SVEAddSubImm8Pat, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, int_op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, int_op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, int_op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
// Same four sve_int_arith_imm0 instructions as the plain multiclass, but the
// selection patterns use SVE_1_Op_Imm_OptLsl_Reverse_Pat (defined outside
// this chunk; presumably matches `op` with its operands swapped — confirm
// against that helper) and only a single operator `op` is accepted.
multiclass sve_int_arith_imm0_subr<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>;
def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
def _D : sve_int_arith_imm0<0b11, opc, asm, ZPR64, addsub_imm8_opt_lsl_i64>;
def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv16i8, op, ZPR8, i32, SVEAddSubImm8Pat, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv8i16, op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv4i32, op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_OptLsl_Reverse_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
// Unpredicated vector-with-immediate format with a plain 8-bit immediate
// (no separate shift bit). The result $Zdn is tied to the input $_Zdn; the
// assembly operand string is "$Zdn, $_Zdn, $imm".
// Encoding: sz8_64 in bits 23-22, a 6-bit opc in bits 21-16, imm in bits
// 12-5. ElementSize is set to ElementSizeNone.
class sve_int_arith_imm<bits<2> sz8_64, bits<6> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm),
asm, "\t$Zdn, $_Zdn, $imm",
[]>, Sched<[]> {
bits<5> Zdn;
bits<8> imm;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21-16} = opc;
let Inst{15-13} = 0b110;
let Inst{12-5} = imm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Defines _B/_H/_S/_D instructions of sve_int_arith_imm with the simm8
// immediate operand. The 6-bit opcode is built as `{ 0b1010, opc }`: the
// fixed prefix 0b1010 followed by the 2-bit opc in the low bits.
// Patterns go through SVE_1_Op_Imm_Arith_Pred_Pat (defined outside this
// chunk) with SVEArithSImmPat; each line also names the predicate type of
// the matching element size.
multiclass sve_int_arith_imm1<bits<2> opc, string asm, SDPatternOperator op> {
def _B : sve_int_arith_imm<0b00, { 0b1010, opc }, asm, ZPR8, simm8>;
def _H : sve_int_arith_imm<0b01, { 0b1010, opc }, asm, ZPR16, simm8>;
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
// Counterpart of the `{ 0b1010, opc }` multiclass that uses the imm0_255
// immediate operand and SVEArithUImmPat in its
// SVE_1_Op_Imm_Arith_Pred_Pat patterns instead of simm8 / SVEArithSImmPat.
multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperator op> {
def _B : sve_int_arith_imm<0b00, { 0b1010, opc }, asm, ZPR8, imm0_255>;
def _H : sve_int_arith_imm<0b01, { 0b1010, opc }, asm, ZPR16, imm0_255>;
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImmPat, !cast<Instruction>(NAME # _D)>;
// Defines _B/_H/_S/_D instructions of sve_int_arith_imm with the fixed 6-bit
// opcode 0b110000 and the simm8 immediate operand. Patterns use
// SVE_1_Op_Imm_Arith_Pat (defined outside this chunk), which — unlike the
// _Pred_ helper used by the imm1 multiclasses — is not given a predicate type.
multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> {
def _B : sve_int_arith_imm<0b00, 0b110000, asm, ZPR8, simm8>;
def _H : sve_int_arith_imm<0b01, 0b110000, asm, ZPR16, simm8>;
def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>;
def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>;
def : SVE_1_Op_Imm_Arith_Pat<nxv16i8, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_Arith_Pat<nxv8i16, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_Arith_Pat<nxv4i32, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_Arith_Pat<nxv2i64, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
// SVE Bitwise Logical - Unpredicated Group
// Unpredicated three-register format: $Zd is produced from $Zn and $Zm, with
// no tied operand. All three operands are declared with the ZPR64 register
// operand class; the assembly operand string is "$Zd, $Zn, $Zm".
// Encoding: the 2-bit opc occupies bits 23-22 (there is no separate
// element-size field), Zm is in bits 20-16, Zn in bits 9-5, Zd in bits 4-0.
class sve_int_bin_cons_log<bits<2> opc, string asm>
: I<(outs ZPR64:$Zd), (ins ZPR64:$Zn, ZPR64:$Zm),
asm, "\t$Zd, $Zn, $Zm",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{1-0};
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b001100;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Defines a single instruction (named NAME, without a size suffix) of the
// same-named class and maps `op` onto it for all four integer vector types
// through SVE_2_Op_Pat (defined outside this chunk).
// The InstAlias entries let the same instruction be written in assembly with
// ZPR8, ZPR16 or ZPR32 operands in addition to the ZPR64 form declared by
// the class.
multiclass sve_int_bin_cons_log<bits<2> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_bin_cons_log<opc, asm>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 1>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME) ZPR16:$Zd, ZPR16:$Zn, ZPR16:$Zm), 1>;
def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
(!cast<Instruction>(NAME) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 1>;
// Unpredicated format with three vector inputs: $_Zdn (tied to the result
// $Zdn), $Zm and $Zk, all declared as ZPR64. The assembly operand string is
// "$Zdn, $_Zdn, $Zm, $Zk".
// Encoding: the 3-bit opc is split so that its bits 2-1 go to Inst{23-22} and
// its bit 0 goes to Inst{10}; Zm is in bits 20-16 and Zk in bits 9-5.
// ElementSize is set to ElementSizeNone.
class sve2_int_bitwise_ternary_op_d<bits<3> opc, string asm>
: I<(outs ZPR64:$Zdn), (ins ZPR64:$_Zdn, ZPR64:$Zm, ZPR64:$Zk),
asm, "\t$Zdn, $_Zdn, $Zm, $Zk",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zk;
bits<5> Zm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{2-1};
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-11} = 0b00111;
let Inst{10} = opc{0};
let Inst{9-5} = Zk;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Defines a single instruction (named NAME) of sve2_int_bitwise_ternary_op_d.
// The InstAlias entries accept the same instruction written with ZPR8, ZPR16
// or ZPR32 operands; the alias string repeats $Zdn for the first source.
// `op` is mapped onto the instruction for all four integer vector types
// through SVE_3_Op_Pat (defined outside this chunk).
multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve2_int_bitwise_ternary_op_d<opc, asm>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
(!cast<Instruction>(NAME) ZPR8:$Zdn, ZPR8:$Zm, ZPR8:$Zk), 1>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
(!cast<Instruction>(NAME) ZPR16:$Zdn, ZPR16:$Zm, ZPR16:$Zk), 1>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
(!cast<Instruction>(NAME) ZPR32:$Zdn, ZPR32:$Zm, ZPR32:$Zk), 1>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
// Unpredicated format with inputs $_Zdn (tied to the result $Zdn), $Zm and an
// immediate $imm; the assembly operand string is "$Zdn, $_Zdn, $Zm, $imm".
// Encoding: the 4-bit tsz8_64 is split across Inst{23-22} (its bits 3-2) and
// Inst{20-19} (its bits 1-0); the low three bits of the 6-bit immediate go to
// Inst{18-16} ("imm3"). Higher immediate bits are not placed by this class;
// the defs that instantiate it override individual tsz bits for that.
// ElementSize is set to ElementSizeNone.
class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, immtype:$imm),
asm, "\t$Zdn, $_Zdn, $Zm, $imm",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
bits<6> imm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-10} = 0b001101;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Defines the _B/_H/_S/_D instructions of the same-named class with the
// vecshiftR{8,16,32,64} immediate operands.
// tsz8_64 is given as a bit list whose leading set bit selects the element
// size; the positions written as `?` are left unset and are filled by the
// per-def `let Inst{...}` overrides with the upper immediate bits:
//   _H: Inst{19} = imm{3}
//   _S: Inst{20-19} = imm{4-3}
//   _D: Inst{22} = imm{5}, Inst{20-19} = imm{4-3}
// Patterns use SVE_3_Op_Imm_Pat (defined outside this chunk) with the
// tvecshiftR{8,16,32,64} immediate matchers and an i32 immediate type.
multiclass sve2_int_rotate_right_imm<string asm, SDPatternOperator op> {
def _B : sve2_int_rotate_right_imm<{0,0,0,1}, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_rotate_right_imm<{0,0,1,?}, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
def _S : sve2_int_rotate_right_imm<{0,1,?,?}, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
def _D : sve2_int_rotate_right_imm<{1,?,?,?}, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
// SVE Integer Wide Immediate - Predicated Group
// Predicated format that writes $Zd from an 8-bit encoded floating-point
// immediate under predicate $Pg. $_Zd is an input tied to $Zd; the assembly
// operand string is "$Zd, $Pg/m, $imm8".
// Encoding: sz in bits 23-22, a 4-bit Pg (PPRAny) in bits 19-16, imm8 in bits
// 12-5. ElementSize is taken from the register operand class zprty.
class sve_int_dup_fpimm_pred<bits<2> sz, Operand fpimmtype,
string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPRAny:$Pg, fpimmtype:$imm8),
asm, "\t$Zd, $Pg/m, $imm8",
[]>, Sched<[]> {
bits<4> Pg;
bits<5> Zd;
bits<8> imm8;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz;
let Inst{21-20} = 0b01;
let Inst{19-16} = Pg;
let Inst{15-13} = 0b110;
let Inst{12-5} = imm8;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
// Defines the _H/_S/_D instructions of the same-named class with the
// fpimm16/fpimm32/fpimm64 operands, plus an "fmov $Zd, $Pg/m, $imm8"
// assembler alias for each. There is no _B form.
multiclass sve_int_dup_fpimm_pred<string asm> {
def _H : sve_int_dup_fpimm_pred<0b01, fpimm16, asm, ZPR16>;
def _S : sve_int_dup_fpimm_pred<0b10, fpimm32, asm, ZPR32>;
def _D : sve_int_dup_fpimm_pred<0b11, fpimm64, asm, ZPR64>;
def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, fpimm16:$imm8), 1>;
def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, fpimm32:$imm8), 1>;
def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, fpimm64:$imm8), 1>;
// Predicated format that writes $Zd from an immediate under predicate $Pg.
// The input operand list is supplied by the caller as the `iops` dag, and
// `pred_qual` is spliced into the assembly operand string directly after
// "$Pg" (callers in this file pass "/m" or "/z").
// Encoding: sz8_64 in bits 23-22, a 4-bit Pg in bits 19-16, `m` in bit 14,
// and a 9-bit immediate whose bit 8 goes to Inst{13} ("sh") and bits 7-0 to
// Inst{12-5} ("imm8"). No Constraints are set here; a caller that needs a
// tied operand has to add them itself.
class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
ZPRRegOp zprty, string pred_qual, dag iops>
: I<(outs zprty:$Zd), iops,
asm, "\t$Zd, $Pg"#pred_qual#", $imm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<4> Pg;
bits<9> imm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-20} = 0b01;
let Inst{19-16} = Pg;
let Inst{15} = 0b0;
let Inst{14} = m;
let Inst{13} = imm{8}; // sh
let Inst{12-5} = imm{7-0}; // imm8
let Inst{4-0} = Zd;
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
// Defines one "/m" instruction of sve_int_dup_imm_pred (m = 1) for a single
// element size. The instruction takes $_Zd, $Pg and the immediate as inputs,
// and the enclosing `let Constraints` ties $Zd to $_Zd.
// Also adds a "mov $Zd, $Pg/m, $imm" assembler alias and a selection pattern
// for a vselect whose true value is an AArch64dup of an immediate matched by
// SVE8BitLslImm (yielding $imm and $shift).
// NOTE(review): in the Pat below the `(intty (vselect ...` DAG is never
// closed and the result DAG refers to $Zd, which is not bound anywhere in the
// visible source pattern. A line carrying the vselect false-value operand
// (binding $Zd) and the closing parentheses appears to be missing from this
// text — confirm against the upstream file before relying on it.
multiclass sve_int_dup_imm_pred_merge_inst<
bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty,
ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> {
let Constraints = "$Zd = $_Zd" in
def NAME : sve_int_dup_imm_pred<sz8_64, 1, asm, zprty, "/m",
(ins zprty:$_Zd, PPRAny:$Pg, cpyimm:$imm)>;
def : InstAlias<"mov $Zd, $Pg/m, $imm",
(!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>;
def : Pat<(intty
(vselect predty:$Pg,
(intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))),
(!cast<Instruction>(NAME) zprty:$Zd, $Pg, i32:$imm, i32:$shift)>;
// Instantiates sve_int_dup_imm_pred_merge_inst for the _B/_H/_S/_D element
// sizes, passing the matching vector, predicate and scalar types (the scalar
// type is i32 except for _D, which uses i64) and the
// cpy_imm8_opt_lsl_i{8,16,32,64} immediate operand.
// Also adds "fmov $Zd, $Pg/m, #0.0" assembler aliases for _H/_S/_D that
// supply the literal values 0, 0 for the immediate operand; unlike the other
// aliases in this section they pass 0 as the final InstAlias argument.
multiclass sve_int_dup_imm_pred_merge<string asm> {
defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1,
i32, cpy_imm8_opt_lsl_i8>;
defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1,
i32, cpy_imm8_opt_lsl_i16>;
defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1,
i32, cpy_imm8_opt_lsl_i32>;
defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1,
i64, cpy_imm8_opt_lsl_i64>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, 0, 0), 0>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>;
// Defines one "/z" instruction of sve_int_dup_imm_pred (m = 0) for a single
// element size; its inputs are only $Pg and the immediate (no tied operand).
// Adds a "mov $Zd, $Pg/z, $imm" assembler alias and these selection patterns:
//  - zext of a predicate   -> instruction with immediate operands 1, 0
//  - sext of a predicate   -> instruction with immediate operands -1, 0
//  - anyext of a predicate -> instruction with immediate operands 1, 0
//  - vselect between an AArch64dup of an SVE8BitLslImm immediate and an
//    AArch64dup of 0 -> instruction with the matched $imm and $shift.
multiclass sve_int_dup_imm_pred_zero_inst<
bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty,
ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> {
def NAME : sve_int_dup_imm_pred<sz8_64, 0, asm, zprty, "/z",
(ins PPRAny:$Pg, cpyimm:$imm)>;
def : InstAlias<"mov $Zd, $Pg/z, $imm",
(!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>;
def : Pat<(intty (zext (predty PPRAny:$Ps1))),
(!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>;
def : Pat<(intty (sext (predty PPRAny:$Ps1))),
(!cast<Instruction>(NAME) PPRAny:$Ps1, -1, 0)>;
def : Pat<(intty (anyext (predty PPRAny:$Ps1))),
(!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>;
def : Pat<(intty
(vselect predty:$Pg,
(intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))),
(intty (AArch64dup (scalarty 0))))),
(!cast<Instruction>(NAME) $Pg, i32:$imm, i32:$shift)>;
// Instantiates sve_int_dup_imm_pred_zero_inst for the _B/_H/_S/_D element
// sizes, passing the matching vector, predicate and scalar types (the scalar
// type is i32 except for _D, which uses i64) and the
// cpy_imm8_opt_lsl_i{8,16,32,64} immediate operand.
multiclass sve_int_dup_imm_pred_zero<string asm> {
defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1,
i32, cpy_imm8_opt_lsl_i8>;
defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1,
i32, cpy_imm8_opt_lsl_i16>;
defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1,
i32, cpy_imm8_opt_lsl_i32>;
defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1,
i64, cpy_imm8_opt_lsl_i64>;
// SVE Integer Compare - Vectors Group
// Predicated vector compare format: produces predicate $Pd from vectors $Zn
// and $Zm under governing predicate $Pg (PPR3bAny, 3 bits). The two vector
// operands may use different register operand classes (zprty1 / zprty2). The
// assembly operand string is "$Pd, $Pg/z, $Zn, $Zm".
// Encoding: sz8_64 in bits 23-22, Zm in bits 20-16, cmp_1 in bit 14, and the
// 3-bit opc scattered over bit 15 (opc{2}), bit 13 (opc{1}) and bit 4
// (opc{0}). The instruction declares NZCV in Defs.
// NOTE(review): unlike sve_int_scmp_vi in this file, no ElementSize is set
// here — confirm whether that is intentional.
class sve_int_cmp<bit cmp_1, bits<2> sz8_64, bits<3> opc, string asm,
PPRRegOp pprty, ZPRRegOp zprty1, ZPRRegOp zprty2>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty1:$Zn, zprty2:$Zm),
asm, "\t$Pd, $Pg/z, $Zn, $Zm",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00100100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15} = opc{2};
let Inst{14} = cmp_1;
let Inst{13} = opc{1};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Defs = [NZCV];
// Emits two selection patterns for AArch64setcc_z on (predvt, intvt, intvt):
//  - condition `cc` selects `cmp` with the operands in their original order;
//  - condition `invcc` selects the same `cmp` with $Op2 and $Op3 swapped.
// NOTE(review): `cmp` is typed sve_int_cmp here while the callers visible in
// this chunk pass !cast<Instruction>(...) — confirm the intended type.
multiclass SVE_SETCC_Pat<CondCode cc, CondCode invcc, ValueType predvt,
ValueType intvt, sve_int_cmp cmp> {
def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, cc)),
(cmp $Op1, $Op2, $Op3)>;
def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, invcc)),
(cmp $Op1, $Op3, $Op2)>;
// Defines the _B/_H/_S/_D compare instructions of sve_int_cmp with cmp_1 = 0
// and both vector operands of the same element size, then registers the
// SVE_SETCC_Pat pattern pair (for `cc` and `invcc`) for each size.
multiclass sve_int_cmp_0<bits<3> opc, string asm, CondCode cc, CondCode invcc> {
def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>;
def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR16>;
def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR32>;
def _D : sve_int_cmp<0b0, 0b11, opc, asm, PPR64, ZPR64, ZPR64>;
defm : SVE_SETCC_Pat<cc, invcc, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
defm : SVE_SETCC_Pat<cc, invcc, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
defm : SVE_SETCC_Pat<cc, invcc, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
defm : SVE_SETCC_Pat<cc, invcc, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Defines _B/_H/_S compare instructions of sve_int_cmp with cmp_1 = 0 whose
// second vector operand is always ZPR64, and maps `op` onto them through
// SVE_3_Op_Pat with an nxv2i64 last operand. There is no _D form.
multiclass sve_int_cmp_0_wide<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR64>;
def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR64>;
def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR64>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i8, nxv2i64, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i16, nxv2i64, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i32, nxv2i64, !cast<Instruction>(NAME # _S)>;
// Same shape as the cmp_1 = 0 wide multiclass, but instantiates sve_int_cmp
// with cmp_1 = 1: _B/_H/_S instructions whose second vector operand is ZPR64,
// each mapped to `op` through SVE_3_Op_Pat with an nxv2i64 last operand.
multiclass sve_int_cmp_1_wide<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_cmp<0b1, 0b00, opc, asm, PPR8, ZPR8, ZPR64>;
def _H : sve_int_cmp<0b1, 0b01, opc, asm, PPR16, ZPR16, ZPR64>;
def _S : sve_int_cmp<0b1, 0b10, opc, asm, PPR32, ZPR32, ZPR64>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i8, nxv2i64, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i16, nxv2i64, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i32, nxv2i64, !cast<Instruction>(NAME # _S)>;
// SVE Integer Compare - Signed Immediate Group
// Predicated compare of a vector against a 5-bit immediate: produces
// predicate $Pd from $Zn and $imm5 under governing predicate $Pg (PPR3bAny).
// The assembly operand string is "$Pd, $Pg/z, $Zn, $imm5".
// Encoding: sz8_64 in bits 23-22, imm5 in bits 20-16, bit 14 fixed to 0, and
// the 3-bit opc scattered over bit 15 (opc{2}), bit 13 (opc{1}) and bit 4
// (opc{0}). Declares NZCV in Defs and takes ElementSize from pprty.
class sve_int_scmp_vi<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty,
ZPRRegOp zprty,
Operand immtype>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, immtype:$imm5),
asm, "\t$Pd, $Pg/z, $Zn, $imm5",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zn;
bits<5> imm5;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b0;
let Inst{20-16} = imm5;
let Inst{15} = opc{2};
let Inst{14} = 0b0;
let Inst{13} = opc{1};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Defs = [NZCV];
let ElementSize = pprty.ElementSize;
// Emits two selection patterns for AArch64setcc_z where one vector operand is
// an AArch64dup of an immediate: the first has the immediate as the second
// vector operand, the second has it as the first. Both select
// (cmp $Pg, $Zs1, immtype:$imm), i.e. the vector register always precedes the
// immediate in the emitted instruction.
// NOTE(review): the parameters `cc` and `commuted_cc` are not referenced in
// the visible body, and neither AArch64setcc_z DAG below is closed before the
// result DAG starts. The condition-code operand lines (presumably `cc` for
// the first pattern and `commuted_cc` for the second) appear to be missing
// from this text — confirm against the upstream file.
multiclass SVE_SETCC_Imm_Pat<CondCode cc, CondCode commuted_cc,
ValueType predvt, ValueType intvt,
Operand immtype, Instruction cmp> {
def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg),
(intvt ZPR:$Zs1),
(intvt (AArch64dup (immtype:$imm))),
(cmp $Pg, $Zs1, immtype:$imm)>;
def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg),
(intvt (AArch64dup (immtype:$imm))),
(intvt ZPR:$Zs1),
(cmp $Pg, $Zs1, immtype:$imm)>;
// Defines the _B/_H/_S/_D instructions of the same-named class. The immediate
// operand is simm5_32b for _B/_H/_S and simm5_64b for _D.
// Each instruction then gets the SVE_SETCC_Imm_Pat pattern pair for `cc` and
// `commuted_cc` with the matching predicate and vector types.
multiclass sve_int_scmp_vi<bits<3> opc, string asm, CondCode cc, CondCode commuted_cc> {
def _B : sve_int_scmp_vi<0b00, opc, asm, PPR8, ZPR8, simm5_32b>;
def _H : sve_int_scmp_vi<0b01, opc, asm, PPR16, ZPR16, simm5_32b>;
def _S : sve_int_scmp_vi<0b10, opc, asm, PPR32, ZPR32, simm5_32b>;
def _D : sve_int_scmp_vi<0b11, opc, asm, PPR64, ZPR64, simm5_64b>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv16i1, nxv16i8, simm5_32b,
!cast<Instruction>(NAME # _B)>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv8i1, nxv8i16, simm5_32b,
!cast<Instruction>(NAME # _H)>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv4i1, nxv4i32, simm5_32b,
!cast<Instruction>(NAME # _S)>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv2i1, nxv2i64, simm5_64b,
!cast<Instruction>(NAME # _D)>;
// SVE Integer Compare - Unsigned Immediate Group
// Predicated compare of a vector against a 7-bit immediate: produces
// predicate $Pd from $Zn and $imm7 under governing predicate $Pg (PPR3bAny).
// The assembly operand string is "$Pd, $Pg/z, $Zn, $imm7".
// Encoding: sz8_64 in bits 23-22, bit 21 fixed to 1, imm7 in bits 20-14, and
// the 2-bit opc split over bit 13 (opc{1}) and bit 4 (opc{0}).
// Declares NZCV in Defs.
// NOTE(review): unlike sve_int_scmp_vi in this file, no ElementSize is set
// here — confirm whether that is intentional.
class sve_int_ucmp_vi<bits<2> sz8_64, bits<2> opc, string asm, PPRRegOp pprty,
ZPRRegOp zprty, Operand immtype>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, immtype:$imm7),
asm, "\t$Pd, $Pg/z, $Zn, $imm7",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zn;
bits<7> imm7;
let Inst{31-24} = 0b00100100;
let Inst{23-22} = sz8_64;
let Inst{21} = 1;
let Inst{20-14} = imm7;
let Inst{13} = opc{1};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Defs = [NZCV];
// Defines the _B/_H/_S/_D instructions of the same-named class. The immediate
// operand is imm0_127 for _B/_H/_S and imm0_127_64b for _D.
// Each instruction then gets the SVE_SETCC_Imm_Pat pattern pair for `cc` and
// `commuted_cc` with the matching predicate and vector types.
multiclass sve_int_ucmp_vi<bits<2> opc, string asm, CondCode cc,
CondCode commuted_cc> {
def _B : sve_int_ucmp_vi<0b00, opc, asm, PPR8, ZPR8, imm0_127>;
def _H : sve_int_ucmp_vi<0b01, opc, asm, PPR16, ZPR16, imm0_127>;
def _S : sve_int_ucmp_vi<0b10, opc, asm, PPR32, ZPR32, imm0_127>;
def _D : sve_int_ucmp_vi<0b11, opc, asm, PPR64, ZPR64, imm0_127_64b>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv16i1, nxv16i8, imm0_127,
!cast<Instruction>(NAME # _B)>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv8i1, nxv8i16, imm0_127,
!cast<Instruction>(NAME # _H)>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv4i1, nxv4i32, imm0_127,
!cast<Instruction>(NAME # _S)>;
defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv2i1, nxv2i64, imm0_127_64b,
!cast<Instruction>(NAME # _D)>;
// SVE Integer Compare - Scalars Group
// Scalar compare format with two general-purpose register inputs ($Rn, $Rm of
// register class rt) and no register outputs; its only declared effect is
// NZCV in Defs. The assembly operand string is "$Rn, $Rm".
// Encoding: sz in bit 22, Rm in bits 20-16, Rn in bits 9-5, opc in bit 4, and
// bits 3-0 fixed to 0.
class sve_int_cterm<bit sz, bit opc, string asm, RegisterClass rt>
: I<(outs), (ins rt:$Rn, rt:$Rm),
asm, "\t$Rn, $Rm",
[]>, Sched<[]> {
bits<5> Rm;
bits<5> Rn;
let Inst{31-23} = 0b001001011;
let Inst{22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b001000;
let Inst{9-5} = Rn;
let Inst{4} = opc;
let Inst{3-0} = 0b0000;
let Defs = [NZCV];
// Format that produces predicate $Pd from two general-purpose registers ($Rn,
// $Rm of register class gprty). The assembly operand string is
// "$Pd, $Rn, $Rm".
// Encoding: sz8_64 in bits 23-22, Rm in bits 20-16, Rn in bits 9-5, and the
// 4-bit opc split so that its bits 3-1 go to Inst{12-10} and its bit 0 goes
// to Inst{4}. Declares NZCV in Defs.
// NOTE(review): the template parameters `vt` and `op` are not referenced in
// the visible body (the pattern list passed to I<> is empty); the multiclasses
// that instantiate this class add their patterns separately.
class sve_int_while_rr<bits<2> sz8_64, bits<4> opc, string asm,
RegisterClass gprty, PPRRegOp pprty,
ValueType vt, SDPatternOperator op>
: I<(outs pprty:$Pd), (ins gprty:$Rn, gprty:$Rm),
asm, "\t$Pd, $Rn, $Rm",
"", []>, Sched<[]> {
bits<4> Pd;
bits<5> Rm;
bits<5> Rn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b000;
let Inst{12-10} = opc{3-1};
let Inst{9-5} = Rn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
let Defs = [NZCV];
// Defines the _B/_H/_S/_D instructions of sve_int_while_rr that take GPR32
// scalar operands. The 4-bit opcode is `{ 0, opc }`, i.e. a leading 0 bit
// followed by the 3-bit opc.
// `op` on (i32, i32) is mapped to each instruction through SVE_2_Op_Pat
// (defined outside this chunk), producing the predicate type of that size.
multiclass sve_int_while4_rr<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_while_rr<0b00, { 0, opc }, asm, GPR32, PPR8, nxv16i1, op>;
def _H : sve_int_while_rr<0b01, { 0, opc }, asm, GPR32, PPR16, nxv8i1, op>;
def _S : sve_int_while_rr<0b10, { 0, opc }, asm, GPR32, PPR32, nxv4i1, op>;
def _D : sve_int_while_rr<0b11, { 0, opc }, asm, GPR32, PPR64, nxv2i1, op>;
def : SVE_2_Op_Pat<nxv16i1, op, i32, i32, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i1, op, i32, i32, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i1, op, i32, i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i1, op, i32, i32, !cast<Instruction>(NAME # _D)>;
// Defines the _B/_H/_S/_D instructions of sve_int_while_rr that take GPR64
// scalar operands. The 4-bit opcode is `{ 1, opc }`, i.e. a leading 1 bit
// followed by the 3-bit opc.
// `op` on (i64, i64) is mapped to each instruction through SVE_2_Op_Pat
// (defined outside this chunk), producing the predicate type of that size.
multiclass sve_int_while8_rr<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_while_rr<0b00, { 1, opc }, asm, GPR64, PPR8, nxv16i1, op>;
def _H : sve_int_while_rr<0b01, { 1, opc }, asm, GPR64, PPR16, nxv8i1, op>;
def _S : sve_int_while_rr<0b10, { 1, opc }, asm, GPR64, PPR32, nxv4i1, op>;
def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64, nxv2i1, op>;
def : SVE_2_Op_Pat<nxv16i1, op, i64, i64, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i1, op, i64, i64, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i1, op, i64, i64, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i1, op, i64, i64, !cast<Instruction>(NAME # _D)>;
// Format that produces predicate $Pd from two GPR64 registers ($Rn, $Rm); the
// scalar register class is fixed rather than a template parameter. The
// assembly operand string is "$Pd, $Rn, $Rm".
// Encoding: sz8_64 in bits 23-22, Rm in bits 20-16, Rn in bits 9-5, and the
// single-bit `rw` selector in bit 4. Declares NZCV in Defs.
class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
PPRRegOp pprty>
: I<(outs pprty:$Pd), (ins GPR64:$Rn, GPR64:$Rm),
asm, "\t$Pd, $Rn, $Rm",
"", []>, Sched<[]> {
bits<4> Pd;
bits<5> Rm;
bits<5> Rn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b001100;
let Inst{9-5} = Rn;
let Inst{4} = rw;
let Inst{3-0} = Pd;
let Defs = [NZCV];
// Defines the _B/_H/_S/_D instructions of the same-named class.
// Here `op` is a string, not an operator: for each element size the suffix
// _b/_h/_s/_d is appended to it and the resulting name is looked up with
// !cast<SDPatternOperator>, so a separate operator record must exist for
// every size. Each one is mapped on (i64, i64) through SVE_2_Op_Pat (defined
// outside this chunk).
multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> {
def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>;
def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>;
def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>;
def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>;
def : SVE_2_Op_Pat<nxv16i1, !cast<SDPatternOperator>(op # _b), i64, i64, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i1, !cast<SDPatternOperator>(op # _h), i64, i64, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i1, !cast<SDPatternOperator>(op # _s), i64, i64, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i1, !cast<SDPatternOperator>(op # _d), i64, i64, !cast<Instruction>(NAME # _D)>;
// SVE Floating Point Fast Reduction Group
// Predicated format that reads vector $Zn under predicate $Pg (PPR3bAny) and
// writes $Vd, whose operand class dstOpType is an FPRasZPROperand. The
// assembly operand string is "$Vd, $Pg, $Zn" (no /m or /z qualifier).
// Encoding: sz in bits 23-22, opc in bits 18-16, Pg in bits 12-10, Zn in bits
// 9-5, Vd in bits 4-0.
class sve_fp_fast_red<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty, FPRasZPROperand dstOpType>
: I<(outs dstOpType:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Vd, $Pg, $Zn",
[]>, Sched<[]> {
bits<5> Zn;
bits<5> Vd;
bits<3> Pg;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b000;
let Inst{18-16} = opc;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Vd;
// Defines the _H/_S/_D instructions of the same-named class with
// FPR16asZPR/FPR32asZPR/FPR64asZPR destinations, and maps `op` on
// (predicate, FP vector) to each through SVE_2_Op_Pat (defined outside this
// chunk). The pattern result type is the full FP vector type of that size.
multiclass sve_fp_fast_red<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16asZPR>;
def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32asZPR>;
def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64asZPR>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
// SVE Floating Point Accumulating Reduction Group
// Predicated format whose result $Vdn (operand class dstOpType, an
// FPRasZPROperand) is also an input: $_Vdn is tied to $Vdn through
// Constraints. The other inputs are the predicate $Pg (PPR3bAny) and the
// vector $Zm. The assembly operand string is "$Vdn, $Pg, $_Vdn, $Zm".
// Encoding: sz in bits 23-22, opc in bits 18-16, Pg in bits 12-10, Zm in bits
// 9-5, Vdn in bits 4-0.
// NOTE(review): the I<...> argument list below is not closed before
// `Sched<[]>` in this text; the trailing arguments and the closing `>` appear
// to be missing — confirm against the upstream file.
class sve_fp_2op_p_vd<bits<2> sz, bits<3> opc, string asm,
ZPRRegOp zprty, FPRasZPROperand dstOpType>
: I<(outs dstOpType:$Vdn), (ins PPR3bAny:$Pg, dstOpType:$_Vdn, zprty:$Zm),
asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
Sched<[]> {
bits<3> Pg;
bits<5> Vdn;
bits<5> Zm;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-19} = 0b011;
let Inst{18-16} = opc;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Vdn;
let Constraints = "$Vdn = $_Vdn";
// Defines the _H/_S/_D instructions of the same-named class with
// FPR16asZPR/FPR32asZPR/FPR64asZPR destinations, and maps `op` on
// (predicate, FP vector, FP vector) to each through SVE_3_Op_Pat (defined
// outside this chunk).
multiclass sve_fp_2op_p_vd<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16asZPR>;
def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32asZPR>;
def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64asZPR>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
// SVE Floating Point Compare - Vectors Group
// Predicated vector compare format: produces predicate $Pd from vectors $Zn
// and $Zm (same register operand class) under governing predicate $Pg
// (PPR3bAny). The assembly operand string is "$Pd, $Pg/z, $Zn, $Zm".
// Encoding: sz in bits 23-22, Zm in bits 20-16, bit 14 fixed to 1, and the
// 3-bit opc scattered over bit 15 (opc{2}), bit 13 (opc{1}) and bit 4
// (opc{0}). No Defs are declared in the visible body.
class sve_fp_3op_p_pd<bits<2> sz, bits<3> opc, string asm, PPRRegOp pprty,
ZPRRegOp zprty>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
asm, "\t$Pd, $Pg/z, $Zn, $Zm",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21} = 0b0;
let Inst{20-16} = Zm;
let Inst{15} = opc{2};
let Inst{14} = 0b1;
let Inst{13} = opc{1};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
// Defines the _H/_S/_D compare instructions and, for each, an SVE_3_Op_Pat
// binding the predicated `op` (predicate result, predicate + two FP vector
// inputs) to the instruction.
multiclass sve_fp_3op_p_pd<bits<3> opc, string asm, SDPatternOperator op> {
def _H : sve_fp_3op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
def _S : sve_fp_3op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
def _D : sve_fp_3op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
// Extends sve_fp_3op_p_pd with SVE_2_Op_AllActive_Pat patterns for the
// unpredicated operator `op_nopred`. Each pattern names the PTRUE_* record
// matching the predicate type. The f16 types nxv4f16/nxv2f16 reuse the _H
// instruction with PTRUE_S/PTRUE_D, and nxv2f32 reuses _S with PTRUE_D.
multiclass sve_fp_3op_p_pd_cc<bits<3> opc, string asm, SDPatternOperator op,
SDPatternOperator op_nopred>
: sve_fp_3op_p_pd<opc, asm, op> {
def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8f16, nxv8f16,
!cast<Instruction>(NAME # _H), PTRUE_H>;
def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f16, nxv4f16,
!cast<Instruction>(NAME # _H), PTRUE_S>;
def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f16, nxv2f16,
!cast<Instruction>(NAME # _H), PTRUE_D>;
def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f32, nxv4f32,
!cast<Instruction>(NAME # _S), PTRUE_S>;
def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f32, nxv2f32,
!cast<Instruction>(NAME # _S), PTRUE_D>;
def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f64, nxv2f64,
!cast<Instruction>(NAME # _D), PTRUE_D>;
// SVE Floating Point Compare - with Zero Group
// Instruction format for SVE floating-point compares against zero.
// The assembly string hard-codes the "#0.0" operand, so the only inputs are
// predicate $Pg and vector $Zn; the result is predicate $Pd.
//   sz  - size field, encoded in Inst{23-22}.
//   opc - 3-bit opcode; opc{2-1} go to Inst{17-16}, opc{0} to Inst{4}.
class sve_fp_2op_p_pd<bits<2> sz, bits<3> opc, string asm, PPRRegOp pprty,
ZPRRegOp zprty>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Pd, $Pg/z, $Zn, #0.0",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zn;
let Inst{31-24} = 0b01100101;
let Inst{23-22} = sz;
let Inst{21-18} = 0b0100;
let Inst{17-16} = opc{2-1};
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
// Defines the _H/_S/_D compare-with-zero instructions (sz = 0b01/0b10/0b11).
// No selection patterns are attached here.
multiclass sve_fp_2op_p_pd<bits<3> opc, string asm> {
def _H : sve_fp_2op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
def _S : sve_fp_2op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
def _D : sve_fp_2op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
// SVE Index Generation Group
// Index-generation format taking two immediates (immediate, immediate).
// $imm5 is encoded in Inst{9-5} and $imm5b in Inst{20-16}; both use imm_ty.
//   sz8_64 - size field, encoded in Inst{23-22}.
class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty,
Operand imm_ty>
: I<(outs zprty:$Zd), (ins imm_ty:$imm5, imm_ty:$imm5b),
asm, "\t$Zd, $imm5, $imm5b",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> imm5;
bits<5> imm5b;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = imm5b;
let Inst{15-10} = 0b010000;
let Inst{9-5} = imm5;
let Inst{4-0} = Zd;
// Defines the _B/_H/_S/_D immediate/immediate index instructions, each with
// the simm5 operand type for its element width, plus a Pat per variant that
// maps `op` applied to two such immediates onto the instruction.
multiclass sve_int_index_ii<string asm, SDPatternOperator op> {
def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_8b>;
def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_16b>;
def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>;
def _D : sve_int_index_ii<0b11, asm, ZPR64, simm5_64b>;
def : Pat<(nxv16i8 (op simm5_8b:$imm5, simm5_8b:$imm5b)),
(!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, simm5_8b:$imm5b)>;
def : Pat<(nxv8i16 (op simm5_16b:$imm5, simm5_16b:$imm5b)),
(!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, simm5_16b:$imm5b)>;
def : Pat<(nxv4i32 (op simm5_32b:$imm5, simm5_32b:$imm5b)),
(!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, simm5_32b:$imm5b)>;
def : Pat<(nxv2i64 (op simm5_64b:$imm5, simm5_64b:$imm5b)),
(!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, simm5_64b:$imm5b)>;
// Index-generation format taking an immediate first operand and a scalar
// register second operand. $imm5 is encoded in Inst{9-5}, $Rm in Inst{20-16}.
//   sz8_64     - size field, encoded in Inst{23-22}.
//   srcRegType - register class of the scalar operand $Rm.
class sve_int_index_ir<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType, Operand imm_ty>
: I<(outs zprty:$Zd), (ins imm_ty:$imm5, srcRegType:$Rm),
asm, "\t$Zd, $imm5, $Rm",
"", []>, Sched<[]> {
bits<5> Rm;
bits<5> Zd;
bits<5> imm5;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b010010;
let Inst{9-5} = imm5;
let Inst{4-0} = Zd;
// Defines the _B/_H/_S/_D immediate/register index instructions and their
// selection patterns. The _B/_H/_S variants take a GPR32 scalar; _D takes
// a GPR64 scalar.
multiclass sve_int_index_ir<string asm, SDPatternOperator op> {
def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_8b>;
def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_16b>;
def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>;
def _D : sve_int_index_ir<0b11, asm, ZPR64, GPR64, simm5_64b>;
def : Pat<(nxv16i8 (op simm5_8b:$imm5, GPR32:$Rm)),
(!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>;
def : Pat<(nxv8i16 (op simm5_16b:$imm5, GPR32:$Rm)),
(!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>;
def : Pat<(nxv4i32 (op simm5_32b:$imm5, GPR32:$Rm)),
(!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>;
def : Pat<(nxv2i64 (op simm5_64b:$imm5, GPR64:$Rm)),
(!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>;
// Index-generation format taking a scalar register first operand and an
// immediate second operand. $Rn is encoded in Inst{9-5}, $imm5 in Inst{20-16}.
//   sz8_64     - size field, encoded in Inst{23-22}.
//   srcRegType - register class of the scalar operand $Rn.
class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType, Operand imm_ty>
: I<(outs zprty:$Zd), (ins srcRegType:$Rn, imm_ty:$imm5),
asm, "\t$Zd, $Rn, $imm5",
"", []>, Sched<[]> {
bits<5> Rn;
bits<5> Zd;
bits<5> imm5;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = imm5;
let Inst{15-10} = 0b010001;
let Inst{9-5} = Rn;
let Inst{4-0} = Zd;
// Defines the _B/_H/_S/_D register/immediate index instructions and their
// selection patterns. The patterns name the scalar operand $Rm while the
// instruction class names it $Rn; the Pat result lists its operands in the
// same order as the instruction's ins list.
multiclass sve_int_index_ri<string asm, SDPatternOperator op> {
def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_8b>;
def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_16b>;
def _S : sve_int_index_ri<0b10, asm, ZPR32, GPR32, simm5_32b>;
def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>;
def : Pat<(nxv16i8 (op GPR32:$Rm, simm5_8b:$imm5)),
(!cast<Instruction>(NAME # "_B") GPR32:$Rm, simm5_8b:$imm5)>;
def : Pat<(nxv8i16 (op GPR32:$Rm, simm5_16b:$imm5)),
(!cast<Instruction>(NAME # "_H") GPR32:$Rm, simm5_16b:$imm5)>;
def : Pat<(nxv4i32 (op GPR32:$Rm, simm5_32b:$imm5)),
(!cast<Instruction>(NAME # "_S") GPR32:$Rm, simm5_32b:$imm5)>;
def : Pat<(nxv2i64 (op GPR64:$Rm, simm5_64b:$imm5)),
(!cast<Instruction>(NAME # "_D") GPR64:$Rm, simm5_64b:$imm5)>;
// Index-generation format taking two scalar registers of the same class.
// $Rn is encoded in Inst{9-5} and $Rm in Inst{20-16}.
//   sz8_64     - size field, encoded in Inst{23-22}.
//   srcRegType - register class shared by both scalar operands.
class sve_int_index_rr<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType>
: I<(outs zprty:$Zd), (ins srcRegType:$Rn, srcRegType:$Rm),
asm, "\t$Zd, $Rn, $Rm",
"", []>, Sched<[]> {
bits<5> Zd;
bits<5> Rm;
bits<5> Rn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Rm;
let Inst{15-10} = 0b010011;
let Inst{9-5} = Rn;
let Inst{4-0} = Zd;
// Defines the _B/_H/_S/_D register/register index instructions and one
// SVE_2_Op_Pat per variant. The scalar inputs are i32 for _B/_H/_S and
// i64 for _D.
multiclass sve_int_index_rr<string asm, SDPatternOperator op> {
def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>;
def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>;
def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>;
def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>;
def : SVE_2_Op_Pat<nxv16i8, op, i32, i32, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i16, op, i32, i32, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, i32, i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, i64, i64, !cast<Instruction>(NAME # _D)>;
// SVE Bitwise Shift - Predicated Group
// Instruction format for predicated shift-by-immediate.
// $Zdn is tied to the input $_Zdn; $Pg is written with the "/m" qualifier.
//   tsz8_64 - 4-bit size field split across Inst{23-22} and Inst{9-8}.
//             Instantiations pass '?' for some bits and override them with
//             the high bits of `imm`.
//   opc     - opcode field, encoded in Inst{19-16}.
//   immtype - operand type of the shift amount; only imm{2-0} is encoded
//             here (Inst{7-5}).
class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $imm",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<6> imm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21-20} = 0b00;
let Inst{19-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-8} = tsz8_64{1-0};
let Inst{7-5} = imm{2-0}; // imm3
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveBinaryImm;
let ElementSize = zprty.ElementSize;
// Defines the _B/_H/_S/_D predicated left-shift-by-immediate instructions
// using the vecshiftL* operand types. Each def also derives from
// SVEPseudo2Instr<psName # suffix, 1>. For _H/_S/_D the '?' bits of tsz8_64
// are replaced by the upper bits of `imm` through the `let Inst{...}`
// overrides. No selection patterns are attached here.
multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm, string psName=""> {
def _B : SVEPseudo2Instr<psName # _B, 1>,
sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
def _H : SVEPseudo2Instr<psName # _H, 1>,
sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{8} = imm{3};
def _S : SVEPseudo2Instr<psName # _S, 1>,
sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{9-8} = imm{4-3};
def _D : SVEPseudo2Instr<psName # _D, 1>,
sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{9-8} = imm{4-3};
// Same instruction definitions as sve_int_bin_pred_shift_imm_left, but with
// a required psName and an SVE_3_Op_Imm_Pat per element size binding `op`
// (with an i32 shift amount matched through tvecshiftL*) to the instruction.
multiclass sve2_int_bin_pred_shift_imm_left<bits<4> opc, string asm,
string psName,
SDPatternOperator op> {
def _B : SVEPseudo2Instr<psName # _B, 1>, sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
def _H : SVEPseudo2Instr<psName # _H, 1>,
sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{8} = imm{3};
def _S : SVEPseudo2Instr<psName # _S, 1>,
sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{9-8} = imm{4-3};
def _D : SVEPseudo2Instr<psName # _D, 1>,
sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{9-8} = imm{4-3};
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i1, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i1, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i1, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftL64, !cast<Instruction>(NAME # _D)>;
// Defines the _ZERO_{B,H,S,D} pseudos (PredTwoOpImmPseudo with
// FalseLanesZero) for left shift by immediate, and an
// SVE_3_Op_Pat_Shift_Imm_SelZero pattern per element size selecting `op`
// onto the matching pseudo.
multiclass sve_int_bin_pred_shift_imm_left_zeroing_bhsd<SDPatternOperator op> {
def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, tvecshiftL8, FalseLanesZero>;
def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, tvecshiftL16, FalseLanesZero>;
def _ZERO_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, tvecshiftL32, FalseLanesZero>;
def _ZERO_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, tvecshiftL64, FalseLanesZero>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv16i8, op, nxv16i1, nxv16i8, tvecshiftL8, !cast<Pseudo>(NAME # _ZERO_B)>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv8i16, op, nxv8i1, nxv8i16, tvecshiftL16, !cast<Pseudo>(NAME # _ZERO_H)>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv4i32, op, nxv4i1, nxv4i32, tvecshiftL32, !cast<Pseudo>(NAME # _ZERO_S)>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv2i64, op, nxv2i1, nxv2i64, tvecshiftL64, !cast<Pseudo>(NAME # _ZERO_D)>;
// Defines the _B/_H/_S/_D predicated right-shift-by-immediate instructions
// using the vecshiftR* operand types, each also deriving from
// SVEPseudo2Instr<Ps # suffix, 1>, plus an SVE_3_Op_Imm_Pat per element size.
// `op` defaults to null_frag.
multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm, string Ps,
SDPatternOperator op = null_frag> {
def _B : SVEPseudo2Instr<Ps # _B, 1>,
sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : SVEPseudo2Instr<Ps # _H, 1>,
sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{8} = imm{3};
def _S : SVEPseudo2Instr<Ps # _S, 1>,
sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{9-8} = imm{4-3};
def _D : SVEPseudo2Instr<Ps # _D, 1>,
sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{9-8} = imm{4-3};
def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i1, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i1, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i1, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
// Defines the _ZERO_{B,H,S,D} pseudos (PredTwoOpImmPseudo with
// FalseLanesZero) for right shift by immediate, and the matching
// SVE_3_Op_Pat_Shift_Imm_SelZero patterns.
// NOTE(review): the pseudos use vecshiftR* while the patterns use
// tvecshiftR*; the left-shift counterpart uses tvecshiftL* for both.
// Confirm the difference is intentional.
multiclass sve_int_bin_pred_shift_imm_right_zeroing_bhsd<SDPatternOperator op = null_frag> {
def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, vecshiftR8, FalseLanesZero>;
def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, vecshiftR16, FalseLanesZero>;
def _ZERO_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, vecshiftR32, FalseLanesZero>;
def _ZERO_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, vecshiftR64, FalseLanesZero>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv16i8, op, nxv16i1, nxv16i8, tvecshiftR8, !cast<Pseudo>(NAME # _ZERO_B)>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv8i16, op, nxv8i1, nxv8i16, tvecshiftR16, !cast<Pseudo>(NAME # _ZERO_H)>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv4i32, op, nxv4i1, nxv4i32, tvecshiftR32, !cast<Pseudo>(NAME # _ZERO_S)>;
def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv2i64, op, nxv2i1, nxv2i64, tvecshiftR64, !cast<Pseudo>(NAME # _ZERO_D)>;
// Instruction format for predicated shift by a vector operand.
// $Zdn is tied to $_Zdn; the shift-amount vector $Zm may use a different
// register operand type (zprty2) from the data vector (zprty).
//   sz8_64 - size field, encoded in Inst{23-22}.
//   wide   - encoded in Inst{19}; instantiations pass 1 when zprty2 is ZPR64
//            regardless of the data element size, and 0 otherwise.
//   opc    - opcode field, encoded in Inst{18-16}.
class sve_int_bin_pred_shift<bits<2> sz8_64, bit wide, bits<3> opc,
string asm, ZPRRegOp zprty, ZPRRegOp zprty2>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty2:$Zm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21-20} = 0b01;
let Inst{19} = wide;
let Inst{18-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
// Defines the _B/_H/_S/_D predicated vector shifts (wide = 0, same element
// type for data and shift amount). Inside the `let`, DestructiveInstType is
// overridden to DestructiveBinaryCommWithRev. Each def also derives from
// SVEPseudo2Instr and from SVEInstr2Rev, which takes this instruction's name,
// the `revname`-based name and `isReverseInstr`. An SVE_3_Op_Pat per element
// size binds `op` to the instruction.
multiclass sve_int_bin_pred_shift<bits<3> opc, string asm, string Ps,
SDPatternOperator op, string revname, bit isReverseInstr = 0> {
let DestructiveInstType = DestructiveBinaryCommWithRev in {
def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>,
SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>;
def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>,
SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>,
SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>,
SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Defines the _ZERO_{B,H,S,D} two-operand pseudos (PredTwoOpPseudo with
// FalseLanesZero) and an SVE_3_Op_Pat_SelZero pattern per element size
// selecting `op` onto the matching pseudo.
multiclass sve_int_bin_pred_zeroing_bhsd<SDPatternOperator op> {
def _ZERO_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesZero>;
def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;
def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;
def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;
def : SVE_3_Op_Pat_SelZero<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _ZERO_B)>;
def : SVE_3_Op_Pat_SelZero<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _ZERO_H)>;
def : SVE_3_Op_Pat_SelZero<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _ZERO_S)>;
def : SVE_3_Op_Pat_SelZero<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _ZERO_D)>;
// Defines the "wide" predicated shifts (wide = 1): _B/_H/_S data vectors
// shifted by a ZPR64 vector of amounts (nxv2i64 in the patterns). There is
// no _D variant in this multiclass.
multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>;
def _H : sve_int_bin_pred_shift<0b01, 0b1, opc, asm, ZPR16, ZPR64>;
def _S : sve_int_bin_pred_shift<0b10, 0b1, opc, asm, ZPR32, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv2i64, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv2i64, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv2i64, !cast<Instruction>(NAME # _S)>;
// SVE Shift - Unpredicated Group
// Instruction format for unpredicated shift of $Zn by a ZPR64 vector $Zm,
// writing a separate destination $Zd (no tied operands).
//   sz8_64 - size field, encoded in Inst{23-22}.
//   opc    - opcode field, encoded in Inst{11-10}.
class sve_int_bin_cons_shift_wide<bits<2> sz8_64, bits<2> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, ZPR64:$Zm),
asm, "\t$Zd, $Zn, $Zm",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_64;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-12} = 0b1000;
let Inst{11-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Defines the _B/_H/_S unpredicated wide shifts. No _D variant and no
// selection patterns are defined here.
multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm> {
def _B : sve_int_bin_cons_shift_wide<0b00, opc, asm, ZPR8>;
def _H : sve_int_bin_cons_shift_wide<0b01, opc, asm, ZPR16>;
def _S : sve_int_bin_cons_shift_wide<0b10, opc, asm, ZPR32>;
// Instruction format for unpredicated shift by immediate ($Zd = $Zn shifted
// by $imm).
//   tsz8_64 - 4-bit size field split across Inst{23-22} and Inst{20-19}.
//             Instantiations pass '?' for some bits and override them with
//             the high bits of `imm`.
//   opc     - opcode field, encoded in Inst{11-10}.
//   immtype - operand type of the shift amount; only imm{2-0} is encoded
//             here (Inst{18-16}).
class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<6> imm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = tsz8_64{3-2};
let Inst{21} = 0b1;
let Inst{20-19} = tsz8_64{1-0};
let Inst{18-16} = imm{2-0}; // imm3
let Inst{15-12} = 0b1001;
let Inst{11-10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Defines the _B/_H/_S/_D unpredicated left-shift-by-immediate instructions.
// For _H/_S/_D the '?' bits of tsz8_64 are replaced by the upper bits of
// `imm`. The _B/_H/_S patterns use SVE_1_Op_Imm_Shift_Pred_Pat with the
// vecshiftL* operands; the _D pattern instead uses
// SVE_1_Op_Imm_Arith_Pred_Pat with an i64 immediate matched by SVEShiftImm64.
multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, vecshiftL8, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, vecshiftL16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, vecshiftL32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEShiftImm64, !cast<Instruction>(NAME # _D)>;
// Defines the _B/_H/_S/_D unpredicated right-shift-by-immediate instructions
// (vecshiftR* operands). Encoding overrides and pattern helpers mirror the
// left-shift multiclass: _B/_H/_S use SVE_1_Op_Imm_Shift_Pred_Pat, _D uses
// SVE_1_Op_Imm_Arith_Pred_Pat with SVEShiftImm64.
multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{20-19} = imm{4-3};
def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, vecshiftR8, !cast<Instruction>(NAME # _B)>;
def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, vecshiftR16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, vecshiftR32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEShiftImm64, !cast<Instruction>(NAME # _D)>;
// SVE Memory - Store Group
// Store format: scalar base $Rn plus a signed 4-bit immediate $imm4, written
// as "[$Rn, $imm4, mul vl]". There are no outputs; mayStore is set.
//   msz - encoded in Inst{24-23}.
//   esz - encoded in Inst{22-21}.
// Inst{20} is fixed to 0 in this format.
class sve_mem_cst_si<bits<2> msz, bits<2> esz, string asm,
RegisterOperand VecList>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<4> imm4;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = msz;
let Inst{22-21} = esz;
let Inst{20} = 0;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
// Defines the store instruction (operand type `listty`) plus three aliases:
//   - plain-register ($Zt as zprty) form with an explicit immediate,
//   - plain-register form with the immediate omitted (encoded as 0),
//   - list form with the immediate omitted (encoded as 0).
// The last InstAlias argument is the emit priority: 0 means the alias is
// accepted by the parser only, 1 means it is also used when printing.
multiclass sve_mem_cst_si<bits<2> msz, bits<2> esz, string asm,
RegisterOperand listty, ZPRRegOp zprty>
def NAME : sve_mem_cst_si<msz, esz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
// Store format with scalar base $Rn plus immediate $imm4 ("mul vl"), where
// Inst{22-21} carries `nregs` and Inst{20} is fixed to 1. The immediate
// operand type is supplied by the instantiation (immtype).
//   sz    - encoded in Inst{24-23}.
//   nregs - encoded in Inst{22-21}.
class sve_mem_est_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, Operand immtype>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm4),
asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<4> imm4;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = sz;
let Inst{22-21} = nregs;
let Inst{20} = 1;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
// Defines the store instruction and a printing alias (emit priority 1) that
// omits the immediate when it is 0.
multiclass sve_mem_est_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, Operand immtype> {
def NAME : sve_mem_est_si<sz, nregs, VecList, asm, immtype>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
// Store format with scalar base $Rn plus scalar offset register $Rm
// ("[$Rn, $Rm]"); Inst{22-21} carries `nregs`.
//   sz    - encoded in Inst{24-23}.
//   nregs - encoded in Inst{22-21}.
//   gprty - register operand type of the offset $Rm.
class sve_mem_est_ss<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, RegisterOperand gprty>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg, [$Rn, $Rm]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = sz;
let Inst{22-21} = nregs;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b011;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
// Store format with scalar base $Rn plus scalar offset register $Rm
// ("[$Rn, $Rm]"), selected by a single 4-bit `dtype` field in Inst{24-21}.
//   listty - register operand type of $Zt.
//   gprty  - register operand type of the offset $Rm.
class sve_mem_cst_ss_base<bits<4> dtype, string asm,
RegisterOperand listty, RegisterOperand gprty>
: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg, [$Rn, $Rm]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-21} = dtype;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b010;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
// Defines the scalar+scalar store (operand type `listty`) and a parse-only
// alias (emit priority 0) that accepts $Zt as a plain `zprty` register.
multiclass sve_mem_cst_ss<bits<4> dtype, string asm,
RegisterOperand listty, ZPRRegOp zprty,
RegisterOperand gprty> {
def NAME : sve_mem_cst_ss_base<dtype, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
// Store format with scalar base $Rn plus signed 4-bit immediate $imm4
// ("mul vl"), with Inst{22-20} fixed to 0b001.
// NOTE(review): the "nt" in the name presumably means non-temporal; the
// visible code does not state this.
//   msz - encoded in Inst{24-23}.
class sve_mem_cstnt_si<bits<2> msz, string asm, RegisterOperand VecList>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<4> imm4;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = msz;
let Inst{22-20} = 0b001;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
// Defines the store instruction plus three aliases: two parse-only forms
// (emit priority 0) accepting $Zt as a plain `zprty` register, with and
// without the immediate, and one printing form (emit priority 1) using
// `listty` with the zero immediate omitted.
multiclass sve_mem_cstnt_si<bits<2> msz, string asm, RegisterOperand listty,
ZPRRegOp zprty> {
def NAME : sve_mem_cstnt_si<msz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
// Store format with scalar base $Rn plus scalar offset register $Rm
// ("[$Rn, $Rm]"), with Inst{22-21} fixed to 0b00 and Inst{15-13} = 0b011.
//   msz   - encoded in Inst{24-23}.
//   gprty - register operand type of the offset $Rm.
class sve_mem_cstnt_ss_base<bits<2> msz, string asm, RegisterOperand listty,
RegisterOperand gprty>
: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg, [$Rn, $Rm]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = msz;
let Inst{22-21} = 0b00;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b011;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
// Defines the scalar+scalar store (operand type `listty`) and a parse-only
// alias (emit priority 0) that accepts $Zt as a plain `zprty` register.
multiclass sve_mem_cstnt_ss<bits<2> msz, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def NAME : sve_mem_cstnt_ss_base<msz, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
// Store format with a vector base $Zn plus a GPR64 scalar offset $Rm
// ("[$Zn, $Rm]").
//   opc    - encoded in Inst{24-22}.
//   listty - register operand type of the stored data $Zt.
//   zprty  - register operand type of the base vector $Zn.
class sve2_mem_sstnt_vs_base<bits<3> opc, string asm,
RegisterOperand listty, ZPRRegOp zprty>
: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
asm, "\t$Zt, $Pg, [$Zn, $Rm]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Zn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-22} = opc;
let Inst{21} = 0b0;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayStore = 1;
// 32-bit-element instantiation of sve2_mem_sstnt_vs_base (Z_s data, ZPR32
// base vector), defined as NAME # _REAL.
// Aliases: a parse-only plain-register form with $Rm, a parse-only
// plain-register form with $Rm omitted (XZR), and a printing form (emit
// priority 1) with the Z_s list and $Rm omitted (XZR).
// The Pat selects `op` (data, predicate, base vector, i64 offset, vt) onto
// the _REAL instruction.
multiclass sve2_mem_sstnt_vs_32_ptrs<bits<3> opc, string asm,
SDPatternOperator op,
ValueType vt> {
def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_s, ZPR32>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
def : Pat <(op (nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt),
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>;
// 64-bit-element instantiation of sve2_mem_sstnt_vs_base (Z_d data, ZPR64
// base vector), defined as NAME # _REAL.
// Aliases: a parse-only plain-register form with $Rm, a parse-only
// plain-register form with $Rm omitted (XZR), and a printing form (emit
// priority 1) with the Z_d list and $Rm omitted (XZR).
// The Pat selects `op` (data, predicate, base vector, i64 offset, vt) onto
// the _REAL instruction.
multiclass sve2_mem_sstnt_vs_64_ptrs<bits<3> opc, string asm,
SDPatternOperator op,
ValueType vt> {
def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_d, ZPR64>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
def : Pat <(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt),
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
// Store format with scalar base $Rn plus a vector of offsets $Zm
// ("[$Rn, $Zm]").
//   opc    - encoded in Inst{24-22}.
//   xs     - encoded in Inst{14}; instantiations pass 0 for the _UXTW
//            variants and 1 for the _SXTW variants.
//   scaled - encoded in Inst{21}; 1 for the *_SCALED variants.
//   zprext - register operand type of the offset vector $Zm.
class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm,
RegisterOperand VecList, RegisterOperand zprext>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$Zt, $Pg, [$Rn, $Zm]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-22} = opc;
let Inst{21} = scaled;
let Inst{20-16} = Zm;
let Inst{15} = 0b1;
let Inst{14} = xs;
let Inst{13} = 0;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
// 32-bit-element (Z_s data) stores with scaled 32-bit vector offsets.
// Defines _UXTW_SCALED (xs = 0) and _SXTW_SCALED (xs = 1), a parse-only
// plain-register alias for each, and a Pat for each of uxtw_op/sxtw_op
// selecting onto the corresponding instruction.
multiclass sve_mem_32b_sst_sv_32_scaled<bits<3> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt > {
def _UXTW_SCALED : sve_mem_sst_sv<opc, 0, 1, asm, Z_s, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_sst_sv<opc, 1, 1, asm, Z_s, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_SCALED) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
def : Pat<(uxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(sxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
// 64-bit-element (Z_d data) stores with scaled vector offsets given by
// uxtw_opnd/sxtw_opnd. Defines _UXTW_SCALED (xs = 0) and _SXTW_SCALED
// (xs = 1), a parse-only plain-register alias for each, and a Pat for each
// of uxtw_op/sxtw_op on nxv2i64 data and offsets.
multiclass sve_mem_64b_sst_sv_32_scaled<bits<3> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt > {
def _UXTW_SCALED : sve_mem_sst_sv<opc, 0, 1, asm, Z_d, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_sst_sv<opc, 1, 1, asm, Z_d, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
def : Pat<(uxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(sxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
// 64-bit-element (Z_d data) stores with unscaled vector offsets
// (scaled = 0). Defines _UXTW (xs = 0) and _SXTW (xs = 1), a parse-only
// plain-register alias for each, and a Pat for each of uxtw_op/sxtw_op on
// nxv2i64 data and offsets.
multiclass sve_mem_64b_sst_sv_32_unscaled<bits<3> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt> {
def _UXTW : sve_mem_sst_sv<opc, 0, 0, asm, Z_d, uxtw_opnd>;
def _SXTW : sve_mem_sst_sv<opc, 1, 0, asm, Z_d, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
def : Pat<(uxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _UXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(sxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _SXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
// 32-bit-element (Z_s data) stores with unscaled vector offsets
// (scaled = 0). Defines _UXTW (xs = 0) and _SXTW (xs = 1), a parse-only
// plain-register alias for each, and a Pat for each of uxtw_op/sxtw_op on
// nxv4i32 data and offsets.
multiclass sve_mem_32b_sst_sv_32_unscaled<bits<3> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt> {
def _UXTW : sve_mem_sst_sv<opc, 0, 0, asm, Z_s, uxtw_opnd>;
def _SXTW : sve_mem_sst_sv<opc, 1, 0, asm, Z_s, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
def : Pat<(uxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _UXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(sxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _SXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
// Store format for Z_d data with scalar base $Rn plus a vector of offsets
// $Zm ("[$Rn, $Zm]"), with Inst{15-13} fixed to 0b101.
//   msz    - encoded in Inst{24-23}.
//   scaled - encoded in Inst{21}.
//   zprext - register operand type of the offset vector $Zm.
class sve_mem_sst_sv2<bits<2> msz, bit scaled, string asm,
RegisterOperand zprext>
: I<(outs), (ins Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$Zt, $Pg, [$Rn, $Zm]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = msz;
let Inst{22} = 0b0;
let Inst{21} = scaled;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
// Scaled (scaled = 1) instantiation of sve_mem_sst_sv2, defined as
// NAME # _SCALED_REAL, with a parse-only plain-register alias and a Pat
// selecting `op` on nxv2i64 data and indices onto the instruction.
multiclass sve_mem_sst_sv_64_scaled<bits<2> msz, string asm,
SDPatternOperator op,
RegisterOperand zprext,
ValueType vt> {
def _SCALED_REAL : sve_mem_sst_sv2<msz, 1, asm, zprext>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt),
(!cast<Instruction>(NAME # _SCALED_REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
// Unscaled (scaled = 0) instantiation of sve_mem_sst_sv2 using the
// ZPR64ExtLSL8 offset operand, defined as NAME # _REAL, with a parse-only
// plain-register alias and a Pat selecting `op` on nxv2i64 data and offsets.
multiclass sve_mem_sst_sv_64_unscaled<bits<2> msz, string asm,
SDPatternOperator op,
ValueType vt> {
def _REAL : sve_mem_sst_sv2<msz, 0, asm, ZPR64ExtLSL8>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
(!cast<Instruction>(NAME # _REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
// Store format with a vector base $Zn plus a 5-bit immediate $imm5
// ("[$Zn, $imm5]").
//   opc     - 3-bit opcode; opc{2-1} go to Inst{24-23}, opc{0} to Inst{21}.
//   zprty   - register operand type of the base vector $Zn.
//   VecList - register operand type of the stored data $Zt.
//   imm_ty  - operand type of the immediate.
class sve_mem_sst_vi<bits<3> opc, string asm, ZPRRegOp zprty,
RegisterOperand VecList, Operand imm_ty>
: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5),
asm, "\t$Zt, $Pg, [$Zn, $imm5]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> imm5;
bits<5> Zn;
bits<5> Zt;
let Inst{31-25} = 0b1110010;
let Inst{24-23} = opc{2-1};
let Inst{22} = 0b1;
let Inst{21} = opc{0};
let Inst{20-16} = imm5;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayStore = 1;
// 32-bit-element instantiation of sve_mem_sst_vi (ZPR32 base, Z_s data),
// defined as NAME # _IMM.
// Aliases: parse-only plain-register forms with and without the immediate
// (emit priority 0), and a printing form (emit priority 1) using the Z_s
// list with the zero immediate omitted.
// The Pat selects `op` (data, predicate, pointer vector, immediate index, vt)
// onto the _IMM instruction.
multiclass sve_mem_32b_sst_vi_ptrs<bits<3> opc, string asm,
Operand imm_ty,
SDPatternOperator op,
ValueType vt> {
def _IMM : sve_mem_sst_vi<opc, asm, ZPR32, Z_s, imm_ty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _IMM) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $imm5]",
(!cast<Instruction>(NAME # _IMM) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
def : Pat<(op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt),
(!cast<Instruction>(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
// 64-bit-element instantiation of sve_mem_sst_vi (ZPR64 base, Z_d data),
// defined as NAME # _IMM.
// Aliases: parse-only plain-register forms with and without the immediate
// (emit priority 0), and a printing form (emit priority 1) with the zero
// immediate omitted.
// The Pat selects `op` (data, predicate, pointer vector, immediate index, vt)
// onto the _IMM instruction.
multiclass sve_mem_64b_sst_vi_ptrs<bits<3> opc, string asm,
Operand imm_ty,
SDPatternOperator op,
ValueType vt> {
def _IMM : sve_mem_sst_vi<opc, asm, ZPR64, Z_d, imm_ty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $imm5]",
(!cast<Instruction>(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>;
// The printing alias must use the same list operand type as the _IMM
// definition (Z_d), so that $Zt is printed as a .d-element list.
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
(!cast<Instruction>(NAME # _IMM) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt),
(!cast<Instruction>(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
// Instruction format: unpredicated store of a ZPRAny register to
// "[$Rn, $imm9, mul vl]". No outputs; marked mayStore.
// The 9-bit immediate is split in the encoding: imm9{8-3} -> Inst{21-16},
// imm9{2-0} -> Inst{12-10}.
class sve_mem_z_spill<string asm>
: I<(outs), (ins ZPRAny:$Zt, GPR64sp:$Rn, simm9:$imm9),
asm, "\t$Zt, [$Rn, $imm9, mul vl]",
[]>, Sched<[]> {
bits<5> Rn;
bits<5> Zt;
bits<9> imm9;
let Inst{31-22} = 0b1110010110;
let Inst{21-16} = imm9{8-3};
let Inst{15-13} = 0b010;
let Inst{12-10} = imm9{2-0};
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayStore = 1;
// Multiclass wrapper: defines NAME from class sve_mem_z_spill and an alias
// for the "[$Rn]" spelling, which encodes the immediate as 0.
multiclass sve_mem_z_spill<string asm> {
def NAME : sve_mem_z_spill<asm>;
def : InstAlias<asm # "\t$Zt, [$Rn]",
(!cast<Instruction>(NAME) ZPRAny:$Zt, GPR64sp:$Rn, 0), 1>;
// Instruction format: unpredicated store of a PPRAny predicate register to
// "[$Rn, $imm9, mul vl]". No outputs; marked mayStore.
// Same layout as sve_mem_z_spill except Inst{15-13} = 0b000 and the register
// field is 4 bits wide (Inst{3-0}) with Inst{4} fixed to 0.
class sve_mem_p_spill<string asm>
: I<(outs), (ins PPRAny:$Pt, GPR64sp:$Rn, simm9:$imm9),
asm, "\t$Pt, [$Rn, $imm9, mul vl]",
[]>, Sched<[]> {
bits<4> Pt;
bits<5> Rn;
bits<9> imm9;
let Inst{31-22} = 0b1110010110;
let Inst{21-16} = imm9{8-3};
let Inst{15-13} = 0b000;
let Inst{12-10} = imm9{2-0};
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = Pt;
let mayStore = 1;
// Multiclass wrapper: defines NAME from class sve_mem_p_spill and an alias
// for the "[$Rn]" spelling, which encodes the immediate as 0.
multiclass sve_mem_p_spill<string asm> {
def NAME : sve_mem_p_spill<asm>;
def : InstAlias<asm # "\t$Pt, [$Rn]",
(!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
// SVE Permute - Predicates Group
// Instruction format: two-source predicate permute, "$Pd, $Pn, $Pm".
// All three operands share the predicate operand type `pprty`.
//   opc    - 3-bit opcode placed in Inst{12-10}.
//   sz8_64 - 2-bit element-size field placed in Inst{23-22}.
class sve_int_perm_bin_perm_pp<bits<3> opc, bits<2> sz8_64, string asm,
PPRRegOp pprty>
: I<(outs pprty:$Pd), (ins pprty:$Pn, pprty:$Pm),
asm, "\t$Pd, $Pn, $Pm",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pm;
bits<4> Pn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-20} = 0b10;
let Inst{19-16} = Pm;
let Inst{15-13} = 0b010;
let Inst{12-10} = opc;
let Inst{9} = 0b0;
let Inst{8-5} = Pn;
let Inst{4} = 0b0;
let Inst{3-0} = Pd;
// Multiclass: instantiates the predicate permute for the four element sizes
// (_B/_H/_S/_D with sz8_64 = 0b00..0b11 and PPR8..PPR64) and adds one
// two-operand pattern per size mapping `op` on nxv16i1/nxv8i1/nxv4i1/nxv2i1.
multiclass sve_int_perm_bin_perm_pp<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_perm_bin_perm_pp<opc, 0b00, asm, PPR8>;
def _H : sve_int_perm_bin_perm_pp<opc, 0b01, asm, PPR16>;
def _S : sve_int_perm_bin_perm_pp<opc, 0b10, asm, PPR32>;
def _D : sve_int_perm_bin_perm_pp<opc, 0b11, asm, PPR64>;
def : SVE_2_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<nxv8i1, op, nxv8i1, nxv8i1, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i1, op, nxv4i1, nxv4i1, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i1, op, nxv2i1, nxv2i1, !cast<Instruction>(NAME # _D)>;
// Instruction format: single-source predicate operation "$Pd, $Pn" that
// reads a PPR8 predicate and writes a PPR16 predicate.
//   opc - single bit placed in Inst{16}.
class sve_int_perm_punpk<bit opc, string asm>
: I<(outs PPR16:$Pd), (ins PPR8:$Pn),
asm, "\t$Pd, $Pn",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pn;
let Inst{31-17} = 0b000001010011000;
let Inst{16} = opc;
let Inst{15-9} = 0b0100000;
let Inst{8-5} = Pn;
let Inst{4} = 0b0;
let Inst{3-0} = Pd;
// Multiclass: defines NAME from class sve_int_perm_punpk and three one-operand
// patterns, all selecting the same instruction, for the type pairs
// nxv16i1 -> nxv8i1, nxv8i1 -> nxv4i1 and nxv4i1 -> nxv2i1.
multiclass sve_int_perm_punpk<bit opc, string asm, SDPatternOperator op> {
def NAME : sve_int_perm_punpk<opc, asm>;
def : SVE_1_Op_Pat<nxv8i1, op, nxv16i1, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Pat<nxv4i1, op, nxv8i1, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Pat<nxv2i1, op, nxv4i1, !cast<Instruction>(NAME)>;
// Instruction format: predicated read of FFR, "$Pd, $Pg/z".
// Output is a PPR8 predicate; input is the governing predicate $Pg (4 bits).
//   s - placed in Inst{22}; when s == 1 the instruction also defines NZCV.
// FFR is always listed as an implicit use.
class sve_int_rdffr_pred<bit s, string asm>
: I<(outs PPR8:$Pd), (ins PPRAny:$Pg),
asm, "\t$Pd, $Pg/z",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pg;
let Inst{31-23} = 0b001001010;
let Inst{22} = s;
let Inst{21-9} = 0b0110001111000;
let Inst{8-5} = Pg;
let Inst{4} = 0;
let Inst{3-0} = Pd;
let Defs = !if(!eq (s, 1), [NZCV], []);
let Uses = [FFR];
// Multiclass: defines the encodable instruction NAME_REAL and a pseudo named
// NAME (def "") that carries the selection pattern for `op` and expands to
// NAME_REAL via PseudoInstExpansion. The pseudo is marked hasSideEffects and
// hasNoSchedulingInfo.
multiclass sve_int_rdffr_pred<bit s, string asm, SDPatternOperator op> {
def _REAL : sve_int_rdffr_pred<s, asm>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def "" : Pseudo<(outs PPR8:$Pd), (ins PPRAny:$Pg), [(set (nxv16i1 PPR8:$Pd), (op (nxv16i1 PPRAny:$Pg)))]>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) PPR8:$Pd, PPRAny:$Pg)>;
// Instruction format: unpredicated read of FFR, "$Pd".
// Output is a PPR8 predicate; there are no explicit inputs. Every bit except
// the 4-bit destination field is fixed. FFR is an implicit use.
class sve_int_rdffr_unpred<string asm> : I<
(outs PPR8:$Pd), (ins),
asm, "\t$Pd",
[]>, Sched<[]> {
bits<4> Pd;
let Inst{31-4} = 0b0010010100011001111100000000;
let Inst{3-0} = Pd;
let Uses = [FFR];
// Multiclass: defines the encodable instruction NAME_REAL and a pseudo named
// NAME (def "") that matches the zero-operand `op`, produces an nxv16i1
// result and expands to NAME_REAL via PseudoInstExpansion.
multiclass sve_int_rdffr_unpred<string asm, SDPatternOperator op> {
def _REAL : sve_int_rdffr_unpred<asm>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def "" : Pseudo<(outs PPR8:$Pd), (ins), [(set (nxv16i1 PPR8:$Pd), (op))]>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) PPR8:$Pd)>;
// Instruction format: write FFR from a predicate register, "$Pn".
// No explicit outputs; FFR is an implicit def and the instruction is marked
// hasSideEffects. The selection pattern (op on an nxv16i1 PPR8 value) is
// attached directly to the instruction.
class sve_int_wrffr<string asm, SDPatternOperator op>
: I<(outs), (ins PPR8:$Pn),
asm, "\t$Pn",
[(op (nxv16i1 PPR8:$Pn))]>, Sched<[]> {
bits<4> Pn;
let Inst{31-9} = 0b00100101001010001001000;
let Inst{8-5} = Pn;
let Inst{4-0} = 0b00000;
let hasSideEffects = 1;
let Defs = [FFR];
// Instruction format: operand-less instruction with a fully fixed 32-bit
// encoding. FFR is an implicit def and the instruction is marked
// hasSideEffects. The zero-operand pattern `(op)` is attached directly.
class sve_int_setffr<string asm, SDPatternOperator op>
: I<(outs), (ins),
asm, "",
[(op)]>, Sched<[]> {
let Inst{31-0} = 0b00100101001011001001000000000000;
let hasSideEffects = 1;
let Defs = [FFR];
// SVE Permute Vector - Predicated Group
// Instruction format: "$Rdn, $Pg, $_Rdn, $Zm" with a scalar register class
// `rt` as destination. The destination is tied to the input $_Rdn through the
// constraint below.
//   sz8_64 - 2-bit element size, Inst{23-22}.
//   ab     - single selector bit, Inst{16}.
//   zprty  - vector source operand type.
class sve_int_perm_clast_rz<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty, RegisterClass rt>
: I<(outs rt:$Rdn), (ins PPR3bAny:$Pg, rt:$_Rdn, zprty:$Zm),
asm, "\t$Rdn, $Pg, $_Rdn, $Zm",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rdn;
bits<5> Zm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b11000;
let Inst{16} = ab;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Rdn;
let Constraints = "$Rdn = $_Rdn";
// Multiclass: _B/_H/_S use GPR32 as the scalar class, _D uses GPR64.
// One three-operand pattern per size: (predicate, scalar, vector) -> scalar,
// with i32 scalars for B/H/S and i64 for D.
multiclass sve_int_perm_clast_rz<bit ab, string asm, SDPatternOperator op> {
def _B : sve_int_perm_clast_rz<0b00, ab, asm, ZPR8, GPR32>;
def _H : sve_int_perm_clast_rz<0b01, ab, asm, ZPR16, GPR32>;
def _S : sve_int_perm_clast_rz<0b10, ab, asm, ZPR32, GPR32>;
def _D : sve_int_perm_clast_rz<0b11, ab, asm, ZPR64, GPR64>;
def : SVE_3_Op_Pat<i32, op, nxv16i1, i32, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<i32, op, nxv8i1, i32, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<i32, op, nxv4i1, i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<i64, op, nxv2i1, i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Instruction format: "$Vdn, $Pg, $_Vdn, $Zm" - same operand shape as
// sve_int_perm_clast_rz, but the multiclass instantiates `rt` with FPR
// classes. Differs in the fixed fields Inst{21-17} = 0b10101 and
// Inst{15-13} = 0b100. Destination is tied to $_Vdn.
class sve_int_perm_clast_vz<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty, RegisterClass rt>
: I<(outs rt:$Vdn), (ins PPR3bAny:$Pg, rt:$_Vdn, zprty:$Zm),
asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Vdn;
bits<5> Zm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b10101;
let Inst{16} = ab;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Vdn;
let Constraints = "$Vdn = $_Vdn";
// Multiclass: _B/_H/_S/_D with FPR8/FPR16/FPR32/FPR64 scalar classes.
// Selection patterns exist only for the floating-point types f16/f32/f64
// (H/S/D); the _B instruction is defined without a pattern.
multiclass sve_int_perm_clast_vz<bit ab, string asm, SDPatternOperator op> {
def _B : sve_int_perm_clast_vz<0b00, ab, asm, ZPR8, FPR8>;
def _H : sve_int_perm_clast_vz<0b01, ab, asm, ZPR16, FPR16>;
def _S : sve_int_perm_clast_vz<0b10, ab, asm, ZPR32, FPR32>;
def _D : sve_int_perm_clast_vz<0b11, ab, asm, ZPR64, FPR64>;
def : SVE_3_Op_Pat<f16, op, nxv8i1, f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<f32, op, nxv4i1, f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<f64, op, nxv2i1, f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
// Instruction format: "$Zdn, $Pg, $_Zdn, $Zm" with a vector destination tied
// to the input $_Zdn. Marked DestructiveOther with ElementSizeNone.
//   sz8_64 - 2-bit element size, Inst{23-22}.
//   ab     - single selector bit, Inst{16}.
class sve_int_perm_clast_zz<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg, $_Zdn, $Zm",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b10100;
let Inst{16} = ab;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Multiclass: _B/_H/_S/_D instances plus seven three-operand patterns.
// Integer vector types map to all four sizes; the floating-point types
// nxv8f16/nxv4f32/nxv2f64 reuse the _H/_S/_D instructions.
multiclass sve_int_perm_clast_zz<bit ab, string asm, SDPatternOperator op> {
def _B : sve_int_perm_clast_zz<0b00, ab, asm, ZPR8>;
def _H : sve_int_perm_clast_zz<0b01, ab, asm, ZPR16>;
def _S : sve_int_perm_clast_zz<0b10, ab, asm, ZPR32>;
def _D : sve_int_perm_clast_zz<0b11, ab, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
// Instruction format: "$Rd, $Pg, $Zn" - predicated extraction from a vector
// into a scalar register of class `resultRegType`. Not destructive: the
// destination is a pure output.
//   sz8_64 - 2-bit element size, Inst{23-22}.
//   ab     - single selector bit, Inst{16}.
class sve_int_perm_last_r<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty, RegisterClass resultRegType>
: I<(outs resultRegType:$Rd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Rd, $Pg, $Zn",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rd;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b10000;
let Inst{16} = ab;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Rd;
// Multiclass: _B/_H/_S produce a GPR32 result, _D a GPR64 result.
// One two-operand pattern per size: (predicate, vector) -> i32 or i64.
multiclass sve_int_perm_last_r<bit ab, string asm, SDPatternOperator op> {
def _B : sve_int_perm_last_r<0b00, ab, asm, ZPR8, GPR32>;
def _H : sve_int_perm_last_r<0b01, ab, asm, ZPR16, GPR32>;
def _S : sve_int_perm_last_r<0b10, ab, asm, ZPR32, GPR32>;
def _D : sve_int_perm_last_r<0b11, ab, asm, ZPR64, GPR64>;
def : SVE_2_Op_Pat<i32, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<i32, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Instruction format: "$Vd, $Pg, $Zn" - same shape as sve_int_perm_last_r
// but with fixed fields Inst{21-17} = 0b10001 and Inst{15-13} = 0b100; the
// multiclass instantiates `dstRegtype` with FPR classes.
class sve_int_perm_last_v<bits<2> sz8_64, bit ab, string asm,
ZPRRegOp zprty, RegisterClass dstRegtype>
: I<(outs dstRegtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Vd, $Pg, $Zn",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Vd;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-17} = 0b10001;
let Inst{16} = ab;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Vd;
// Multiclass: _B/_H/_S/_D with FPR8/FPR16/FPR32/FPR64 destinations.
// Patterns cover f16, f32 and f64 only; _B has no pattern. Two f32 patterns
// select _S: one for nxv4f32 (nxv4i1 predicate) and one for nxv2f32
// (nxv2i1 predicate).
multiclass sve_int_perm_last_v<bit ab, string asm, SDPatternOperator op> {
def _B : sve_int_perm_last_v<0b00, ab, asm, ZPR8, FPR8>;
def _H : sve_int_perm_last_v<0b01, ab, asm, ZPR16, FPR16>;
def _S : sve_int_perm_last_v<0b10, ab, asm, ZPR32, FPR32>;
def _D : sve_int_perm_last_v<0b11, ab, asm, ZPR64, FPR64>;
def : SVE_2_Op_Pat<f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
// Instruction format: "$Zdn, $Pg, $_Zdn, $Zm" with the destination tied to
// $_Zdn. Marked DestructiveOther with ElementSizeNone.
//   sz8_64 - 2-bit element size, Inst{23-22}.
class sve_int_perm_splice<bits<2> sz8_64, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $Pg, $_Zdn, $Zm",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zdn;
bits<5> Zm;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-13} = 0b101100100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
// Multiclass: _B/_H/_S/_D instances plus seven three-operand patterns.
// Integer vector types map to all four sizes; nxv8f16/nxv4f32/nxv2f64 reuse
// the _H/_S/_D instructions.
multiclass sve_int_perm_splice<string asm, SDPatternOperator op> {
def _B : sve_int_perm_splice<0b00, asm, ZPR8>;
def _H : sve_int_perm_splice<0b01, asm, ZPR16>;
def _S : sve_int_perm_splice<0b10, asm, ZPR32>;
def _D : sve_int_perm_splice<0b11, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
// Instruction format: "$Zd, $Pg, $Zn" where $Zn is a register-list operand
// (VecList) and $Zd is a separate, untied destination.
// Differs from sve_int_perm_splice in Inst{21-13} (0b101101100) and in having
// no tied-operand constraint.
class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm,
ZPRRegOp zprty, RegisterOperand VecList>
: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, VecList:$Zn),
asm, "\t$Zd, $Pg, $Zn",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-13} = 0b101101100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Multiclass: _B/_H/_S/_D instances using the ZZ_b/ZZ_h/ZZ_s/ZZ_d list
// operands. No selection patterns are attached here.
multiclass sve2_int_perm_splice_cons<string asm> {
def _B : sve2_int_perm_splice_cons<0b00, asm, ZPR8, ZZ_b>;
def _H : sve2_int_perm_splice_cons<0b01, asm, ZPR16, ZZ_h>;
def _S : sve2_int_perm_splice_cons<0b10, asm, ZPR32, ZZ_s>;
def _D : sve2_int_perm_splice_cons<0b11, asm, ZPR64, ZZ_d>;
// Instruction format: merging predicated unary operation "$Zd, $Pg/m, $Zn".
// The first input $_Zd is tied to the destination, so it does not appear in
// the assembly string. Marked DestructiveOther; ElementSize is taken from the
// operand type.
//   sz8_64 - 2-bit element size, Inst{23-22}.
//   opc    - 2-bit opcode, Inst{17-16}.
class sve_int_perm_rev<bits<2> sz8_64, bits<2> opc, string asm,
ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Zd, $Pg/m, $Zn",
[]>, Sched<[]> {
bits<5> Zd;
bits<3> Pg;
bits<5> Zn;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-18} = 0b1001;
let Inst{17-16} = opc;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
// Multiclass: sve_int_perm_rev with opc = 0b11 for all four element sizes,
// plus one pattern per size with operand order (passthru, predicate, source).
multiclass sve_int_perm_rev_rbit<string asm, SDPatternOperator op> {
def _B : sve_int_perm_rev<0b00, 0b11, asm, ZPR8>;
def _H : sve_int_perm_rev<0b01, 0b11, asm, ZPR16>;
def _S : sve_int_perm_rev<0b10, 0b11, asm, ZPR32>;
def _D : sve_int_perm_rev<0b11, 0b11, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Multiclass: sve_int_perm_rev with opc = 0b00 for H/S/D only (no _B).
// Two operators are matched:
//   int_op - three-operand form (passthru, predicate, source).
//   ir_op  - one-operand form selected through SVE_1_Op_AllActive_Pat with the
//            size-matching PTRUE_H/PTRUE_S/PTRUE_D.
multiclass sve_int_perm_rev_revb<string asm,
SDPatternOperator int_op,
SDPatternOperator ir_op> {
def _H : sve_int_perm_rev<0b01, 0b00, asm, ZPR16>;
def _S : sve_int_perm_rev<0b10, 0b00, asm, ZPR32>;
def _D : sve_int_perm_rev<0b11, 0b00, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv8i16, int_op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, int_op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, int_op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_1_Op_AllActive_Pat<nxv8i16, ir_op, nxv8i16, !cast<Instruction>(NAME # _H), PTRUE_H>;
def : SVE_1_Op_AllActive_Pat<nxv4i32, ir_op, nxv4i32, !cast<Instruction>(NAME # _S), PTRUE_S>;
def : SVE_1_Op_AllActive_Pat<nxv2i64, ir_op, nxv2i64, !cast<Instruction>(NAME # _D), PTRUE_D>;
// Multiclass: sve_int_perm_rev with opc = 0b01 for S and D only, with one
// three-operand pattern per size.
multiclass sve_int_perm_rev_revh<string asm, SDPatternOperator op> {
def _S : sve_int_perm_rev<0b10, 0b01, asm, ZPR32>;
def _D : sve_int_perm_rev<0b11, 0b01, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Multiclass: sve_int_perm_rev with opc = 0b10 for D only, with a single
// three-operand nxv2i64 pattern.
multiclass sve_int_perm_rev_revw<string asm, SDPatternOperator op> {
def _D : sve_int_perm_rev<0b11, 0b10, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Instruction format: merging predicated copy of a scalar register into a
// vector, "$Zd, $Pg/m, $Rn". The first input $_Zd is tied to the destination.
// Marked DestructiveOther; ElementSize is taken from the operand type.
//   sz8_64     - 2-bit element size, Inst{23-22}.
//   srcRegType - register class of the scalar source $Rn.
class sve_int_perm_cpy_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegType>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, srcRegType:$Rn),
asm, "\t$Zd, $Pg/m, $Rn",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-13} = 0b101000101;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
// Multiclass: _B/_H/_S take a GPR32sp source, _D a GPR64sp source.
// Each size gets a "mov" alias (last InstAlias arg 1) and a pattern.
// The matched operator takes (predicate, scalar, passthru); the instruction
// takes (passthru, predicate, scalar), so the pattern reorders the operands.
multiclass sve_int_perm_cpy_r<string asm, SDPatternOperator op> {
def _B : sve_int_perm_cpy_r<0b00, asm, ZPR8, GPR32sp>;
def _H : sve_int_perm_cpy_r<0b01, asm, ZPR16, GPR32sp>;
def _S : sve_int_perm_cpy_r<0b10, asm, ZPR32, GPR32sp>;
def _D : sve_int_perm_cpy_r<0b11, asm, ZPR64, GPR64sp>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, GPR64sp:$Rn), 1>;
def : Pat<(nxv16i8 (op nxv16i1:$pg, i32:$splat, nxv16i8:$passthru)),
(!cast<Instruction>(NAME # _B) $passthru, $pg, $splat)>;
def : Pat<(nxv8i16 (op nxv8i1:$pg, i32:$splat, nxv8i16:$passthru)),
(!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>;
def : Pat<(nxv4i32 (op nxv4i1:$pg, i32:$splat, nxv4i32:$passthru)),
(!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
def : Pat<(nxv2i64 (op nxv2i1:$pg, i64:$splat, nxv2i64:$passthru)),
(!cast<Instruction>(NAME # _D) $passthru, $pg, $splat)>;
// Instruction format: merging predicated copy "$Zd, $Pg/m, $Vn" - same shape
// as sve_int_perm_cpy_r but with Inst{21-13} = 0b100000100; the multiclass
// instantiates `srcRegtype` with FPR classes. $_Zd is tied to the destination.
class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty,
RegisterClass srcRegtype>
: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, srcRegtype:$Vn),
asm, "\t$Zd, $Pg/m, $Vn",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Vn;
bits<5> Zd;
let Inst{31-24} = 0b00000101;
let Inst{23-22} = sz8_64;
let Inst{21-13} = 0b100000100;
let Inst{12-10} = Pg;
let Inst{9-5} = Vn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
// Multiclass: _B/_H/_S/_D with FPR8/FPR16/FPR32/FPR64 sources.
// Each size gets a "mov" alias. Patterns exist for floating-point types only:
// nxv8f16 -> _H, nxv2f32 and nxv4f32 -> _S, nxv2f64 -> _D (none for _B).
// As in sve_int_perm_cpy_r, the pattern reorders (pg, splat, passthru) into
// the instruction order (passthru, pg, splat).
multiclass sve_int_perm_cpy_v<string asm, SDPatternOperator op> {
def _B : sve_int_perm_cpy_v<0b00, asm, ZPR8, FPR8>;
def _H : sve_int_perm_cpy_v<0b01, asm, ZPR16, FPR16>;
def _S : sve_int_perm_cpy_v<0b10, asm, ZPR32, FPR32>;
def _D : sve_int_perm_cpy_v<0b11, asm, ZPR64, FPR64>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPR3bAny:$Pg, FPR8:$Vn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPR3bAny:$Pg, FPR16:$Vn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, FPR32:$Vn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>;
def : Pat<(nxv8f16 (op nxv8i1:$pg, f16:$splat, nxv8f16:$passthru)),
(!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>;
def : Pat<(nxv2f32 (op nxv2i1:$pg, f32:$splat, nxv2f32:$passthru)),
(!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
def : Pat<(nxv4f32 (op nxv4i1:$pg, f32:$splat, nxv4f32:$passthru)),
(!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
def : Pat<(nxv2f64 (op nxv2i1:$pg, f64:$splat, nxv2f64:$passthru)),
(!cast<Instruction>(NAME # _D) $passthru, $pg, $splat)>;
// Instruction format: "$Zd, $Pg, $Zn" with an untied vector destination.
//   sz - single size bit placed in Inst{22}; Inst{23} is fixed to 1 as part
//        of the Inst{31-23} constant.
class sve_int_perm_compact<bit sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Zd, $Pg, $Zn",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
let Inst{31-23} = 0b000001011;
let Inst{22} = sz;
let Inst{21-13} = 0b100001100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Multiclass: only _S (sz = 0) and _D (sz = 1) exist. Each instruction is
// selected for both the integer and floating-point vector type of its width.
multiclass sve_int_perm_compact<string asm, SDPatternOperator op> {
def _S : sve_int_perm_compact<0b0, asm, ZPR32>;
def _D : sve_int_perm_compact<0b1, asm, ZPR64>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
// SVE Memory - Contiguous Load Group
// Instruction format: zeroing predicated load addressed as
// "[$Rn, $imm4, mul vl]" (scalar base plus 4-bit immediate). Marked mayLoad.
//   dtype - 4-bit field placed in Inst{24-21}.
//   nf    - placed in Inst{20}; when nf == 1 the instruction both uses and
//           defines FFR, otherwise neither.
class sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
RegisterOperand VecList>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-21} = dtype;
let Inst{20} = nf;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b101;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
let Uses = !if(!eq(nf, 1), [FFR], []);
let Defs = !if(!eq(nf, 1), [FFR], []);
// Multiclass: defines NAME_REAL (the encodable load), three aliases and a
// pseudo named NAME that expands to NAME_REAL.
//   * "[$Rn]" with zprty $Zt, immediate 0 (last InstAlias arg 0).
//   * "[$Rn, $imm4, mul vl]" with zprty $Zt (last InstAlias arg 0).
//   * "[$Rn]" with listty $Zt, immediate 0 (last InstAlias arg 1).
// The pseudo is marked hasSideEffects, hasNoSchedulingInfo and mayLoad.
multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
RegisterOperand listty, ZPRRegOp zprty> {
def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in {
def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4)>;
// Convenience multiclass: sve_mem_cld_si_base with nf fixed to 0, so the
// resulting instruction neither uses nor defines FFR.
multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
ZPRRegOp zprty>
: sve_mem_cld_si_base<dtype, 0, asm, listty, zprty>;
// Instruction format: zeroing predicated load addressed as
// "[$Rn, $imm4, mul vl]". Marked mayLoad; no FFR interaction.
//   msz - 2-bit size field placed in Inst{24-23}.
// NOTE(review): the "cldnt" name suggests a non-temporal contiguous load -
// confirm against the instantiation site.
class sve_mem_cldnt_si_base<bits<2> msz, string asm, RegisterOperand VecList>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
[]>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rn;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = msz;
let Inst{22-20} = 0b000;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// Multiclass: defines NAME from sve_mem_cldnt_si_base plus three aliases:
//   * "[$Rn]" with zprty $Zt, immediate 0 (last InstAlias arg 0).
//   * "[$Rn, $imm4, mul vl]" with zprty $Zt (last InstAlias arg 0).
//   * "[$Rn]" with listty $Zt, immediate 0 (last InstAlias arg 1).
multiclass sve_mem_cldnt_si<bits<2> msz, string asm, RegisterOperand listty,
ZPRRegOp zprty> {
def NAME : sve_mem_cldnt_si_base<msz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
// Instruction format: zeroing predicated load addressed as "[$Rn, $Rm]"
// (scalar base plus scalar offset register of type `gprty`). Marked mayLoad.
//   msz - 2-bit size field placed in Inst{24-23}.
class sve_mem_cldnt_ss_base<bits<2> msz, string asm, RegisterOperand VecList,
RegisterOperand gprty>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = msz;
let Inst{22-21} = 0b00;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b110;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// Multiclass: defines NAME from sve_mem_cldnt_ss_base plus one alias that
// accepts zprty (instead of listty) for $Zt; last InstAlias arg is 0.
multiclass sve_mem_cldnt_ss<bits<2> msz, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def NAME : sve_mem_cldnt_ss_base<msz, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
// Instruction format: zeroing predicated load addressed as "[$Rn, $imm4]"
// with a simm4s16 immediate (no "mul vl" in the assembly string).
// Marked mayLoad.
//   sz - 2-bit size field placed in Inst{24-23}.
class sve_mem_ldqr_si<bits<2> sz, string asm, RegisterOperand VecList>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s16:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> {
bits<5> Zt;
bits<5> Rn;
bits<3> Pg;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-20} = 0;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// Multiclass: defines NAME from class sve_mem_ldqr_si plus three aliases:
//   * "[$Rn]" with listty $Zt, immediate 0 (last InstAlias arg 1).
//   * "[$Rn]" with zprty $Zt, immediate 0 (last InstAlias arg 0).
//   * "[$Rn, $imm4]" with zprty $Zt (last InstAlias arg 0).
multiclass sve_mem_ldqr_si<bits<2> sz, string asm, RegisterOperand listty,
ZPRRegOp zprty> {
def NAME : sve_mem_ldqr_si<sz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s16:$imm4), 0>;
// Instruction format: zeroing predicated load addressed as "[$Rn, $Rm]".
// Register-offset counterpart of sve_mem_ldqr_si: Inst{22-21} and
// Inst{15-13} are both fixed to 0 and Rm occupies Inst{20-16}. Marked mayLoad.
class sve_mem_ldqr_ss<bits<2> sz, string asm, RegisterOperand VecList,
RegisterOperand gprty>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rn;
bits<5> Rm;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-21} = 0;
let Inst{20-16} = Rm;
let Inst{15-13} = 0;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// Multiclass: defines NAME from class sve_mem_ldqr_ss plus one alias that
// accepts zprty (instead of listty) for $Zt; last InstAlias arg is 0.
multiclass sve_mem_ldqr_ss<bits<2> sz, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def NAME : sve_mem_ldqr_ss<sz, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
// Instruction format: zeroing predicated load addressed as "[$Rn, $imm6]"
// with a 6-bit immediate of operand type `immtype`. Marked mayLoad.
// The data-type selector is split in two:
//   dtypeh - 2 bits placed in Inst{24-23}.
//   dtypel - 2 bits placed in Inst{14-13}.
class sve_mem_ld_dup<bits<2> dtypeh, bits<2> dtypel, string asm,
RegisterOperand VecList, Operand immtype>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm6),
asm, "\t$Zt, $Pg/z, [$Rn, $imm6]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zt;
bits<6> imm6;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = dtypeh;
let Inst{22} = 1;
let Inst{21-16} = imm6;
let Inst{15} = 0b1;
let Inst{14-13} = dtypel;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// Multiclass: defines NAME from class sve_mem_ld_dup plus three aliases:
//   * "[$Rn]" with zprty $Zt, immediate 0 (last InstAlias arg 0).
//   * "[$Rn, $imm6]" with zprty $Zt (last InstAlias arg 0).
//   * "[$Rn]" with zlistty $Zt, immediate 0 (last InstAlias arg 1).
multiclass sve_mem_ld_dup<bits<2> dtypeh, bits<2> dtypel, string asm,
RegisterOperand zlistty, ZPRRegOp zprty, Operand immtype> {
def NAME : sve_mem_ld_dup<dtypeh, dtypel, asm, zlistty, immtype>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm6]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm6), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zlistty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
// Instruction format: zeroing predicated load addressed as "[$Rn, $Rm]".
// The input operand list is supplied by the caller through `iops`.
// Marked mayLoad.
//   dtype - 4-bit field placed in Inst{24-21}.
//   ff    - placed in Inst{13}; when ff == 1 the instruction both uses and
//           defines FFR, otherwise neither.
class sve_mem_cld_ss_base<bits<4> dtype, bit ff, dag iops, string asm,
RegisterOperand VecList>
: I<(outs VecList:$Zt), iops,
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
[]>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
let Inst{31-25} = 0b1010010;
let Inst{24-21} = dtype;
let Inst{20-16} = Rm;
let Inst{15-14} = 0b01;
let Inst{13} = ff;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
let Uses = !if(!eq(ff, 1), [FFR], []);
let Defs = !if(!eq(ff, 1), [FFR], []);
// Multiclass: sve_mem_cld_ss_base with ff = 0, defined directly as NAME
// (def ""), plus one alias accepting zprty for $Zt (last InstAlias arg 0).
multiclass sve_mem_cld_ss<bits<4> dtype, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def "" : sve_mem_cld_ss_base<dtype, 0, (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
// Multiclass: sve_mem_cld_ss_base with ff = 1 (the instruction uses and
// defines FFR). Defines NAME_REAL, three aliases and a pseudo named NAME that
// expands to NAME_REAL.
//   * "[$Rn, $Rm]" with zprty $Zt (last InstAlias arg 0).
//   * "[$Rn]" with listty $Zt; the offset register is encoded as XZR
//     (last InstAlias arg 1).
//   * "[$Rn]" with zprty $Zt; offset register XZR (last InstAlias arg 0).
// NOTE(review): unlike the pseudo in sve_mem_cld_si_base, the `let` below
// does not set mayLoad = 1 - confirm whether that difference is intended.
multiclass sve_mem_cldff_ss<bits<4> dtype, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty> {
def _REAL : sve_mem_cld_ss_base<dtype, 1, (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 1>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm)>;
// Convenience multiclass: sve_mem_cld_si_base with nf fixed to 1, so the
// resulting instruction both uses and defines FFR.
multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty,
ZPRRegOp zprty>
: sve_mem_cld_si_base<dtype, 1, asm, listty, zprty>;
// Instruction format: zeroing predicated load of a register list, addressed
// as "[$Rn, $imm4, mul vl]". Marked mayLoad.
//   sz      - 2-bit size field, Inst{24-23}.
//   nregs   - 2-bit field, Inst{22-21}.
//   immtype - operand type of the 4-bit immediate.
// NOTE(review): nregs presumably encodes the number of registers in the
// list - confirm against the instantiation site.
class sve_mem_eld_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, Operand immtype>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
[]>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rn;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-21} = nregs;
let Inst{20} = 0;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// Multiclass: defines NAME from class sve_mem_eld_si plus an alias for the
// "[$Rn]" spelling, which encodes the immediate as 0 (last InstAlias arg 1).
multiclass sve_mem_eld_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, Operand immtype> {
def NAME : sve_mem_eld_si<sz, nregs, VecList, asm, immtype>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
// Instruction format: register-offset counterpart of sve_mem_eld_si,
// addressed as "[$Rn, $Rm]". Rm occupies Inst{20-16} and Inst{15-13} is
// 0b110. Marked mayLoad.
class sve_mem_eld_ss<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
string asm, RegisterOperand gprty>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Rn;
bits<5> Zt;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-21} = nregs;
let Inst{20-16} = Rm;
let Inst{15-13} = 0b110;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// SVE Memory - 32-bit Gather and Unsized Contiguous Group
// bit xs is '1' if offsets are signed
// bit scaled is '1' if the offsets are scaled
// Instruction format: zeroing predicated load into a Z_s register, addressed
// as "[$Rn, $Zm]" (scalar base plus vector operand of type `zprext`).
// Marked mayLoad.
//   opc    - 4 bits, split: opc{3-2} -> Inst{24-23}, opc{1-0} -> Inst{14-13}.
//   xs     - placed in Inst{22}.
//   scaled - placed in Inst{21}.
// When opc{0} == 1 the instruction both defines and uses FFR.
class sve_mem_32b_gld_sv<bits<4> opc, bit xs, bit scaled, string asm,
RegisterOperand zprext>
: I<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$Zt, $Pg/z, [$Rn, $Zm]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<5> Zt;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = opc{3-2};
let Inst{22} = xs;
let Inst{21} = scaled;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
let Defs = !if(!eq(opc{0}, 1), [FFR], []);
let Uses = !if(!eq(opc{0}, 1), [FFR], []);
// Multiclass: scaled (scaled = 1) instances of sve_mem_32b_gld_sv.
//   _UXTW_SCALED_REAL uses xs = 0 with uxtw_opnd;
//   _SXTW_SCALED_REAL uses xs = 1 with sxtw_opnd.
// For each variant it also defines a parse alias accepting ZPR32 for $Zt, a
// pseudo (_UXTW_SCALED / _SXTW_SCALED) that expands to the matching _REAL
// instruction, and a pattern selecting uxtw_op / sxtw_op to that pseudo.
//   vt - value type passed as the final operand of the matched operator.
multiclass sve_mem_32b_gld_sv_32_scaled<bits<4> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt> {
def _UXTW_SCALED_REAL : sve_mem_32b_gld_sv<opc, 0, 1, asm, uxtw_opnd>;
def _SXTW_SCALED_REAL : sve_mem_32b_gld_sv<opc, 1, 1, asm, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _UXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
def _SXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)),
(!cast<Instruction>(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)),
(!cast<Instruction>(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
// Multiclass: unscaled (scaled = 0) instances of sve_mem_32b_gld_sv.
//   _UXTW_REAL uses xs = 0 with uxtw_opnd;
//   _SXTW_REAL uses xs = 1 with sxtw_opnd.
// For each variant it also defines a parse alias accepting ZPR32 for $Zt, a
// pseudo (_UXTW / _SXTW) that expands to the matching _REAL instruction, and
// a pattern selecting uxtw_op / sxtw_op to that pseudo.
//   vt - value type passed as the final operand of the matched operator.
multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt> {
def _UXTW_REAL : sve_mem_32b_gld_sv<opc, 0, 0, asm, uxtw_opnd>;
def _SXTW_REAL : sve_mem_32b_gld_sv<opc, 1, 0, asm, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _UXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
def _SXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)),
(!cast<Instruction>(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)),
(!cast<Instruction>(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
// Instruction class sve_mem_32b_gld_vi.
// Operands: one Z_s result ($Zt); inputs are a predicate ($Pg), a ZPR32
// vector ($Zn) and an immediate ($imm5) of operand type imm_ty.
// Assembly form: asm "\t$Zt, $Pg/z, [$Zn, $imm5]".
// opc is split across the encoding: opc{3-2} -> Inst{24-23} and
// opc{1-0} -> Inst{14-13}.
class sve_mem_32b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
: I<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5),
asm, "\t$Zt, $Pg/z, [$Zn, $imm5]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> Zt;
bits<5> imm5;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = opc{3-2};
let Inst{22-21} = 0b01;
let Inst{20-16} = imm5;
let Inst{15} = 0b1;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// FFR is listed as both defined and used when the low opcode bit is 1;
// otherwise both lists are empty.
let Defs = !if(!eq(opc{0}, 1), [FFR], []);
let Uses = !if(!eq(opc{0}, 1), [FFR], []);
// Multiclass sve_mem_32b_gld_vi_32_ptrs.
// Emits _IMM_REAL (sve_mem_32b_gld_vi), three InstAliases for it, an _IMM
// Pseudo that expands to _IMM_REAL, and an nxv4i32 selection pattern from
// `op` to the Pseudo.
// Parameters:
//   opc, asm, imm_ty - forwarded to sve_mem_32b_gld_vi.
//   op               - DAG operator matched by the pattern.
//   vt               - ValueType that appears as the last operand of the
//                      matched node.
multiclass sve_mem_32b_gld_vi_32_ptrs<bits<4> opc, string asm, Operand imm_ty,
SDPatternOperator op, ValueType vt> {
def _IMM_REAL : sve_mem_32b_gld_vi<opc, asm, imm_ty>;
// Aliases: "[$Zn]" with the immediate fixed to 0 (ZPR32 and Z_s spellings of
// $Zt) and "[$Zn, $imm5]" with a ZPR32 $Zt. Only the Z_s/"[$Zn]" alias has a
// last argument of 1.
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $imm5]",
(!cast<Instruction>(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _IMM : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5)>;
// The selection pattern targets the Pseudo, not _IMM_REAL.
def : Pat<(nxv4i32 (op (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt)),
(!cast<Instruction>(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
// Instruction class sve_mem_prfm_si.
// No outputs; inputs are a prefetch operation ($prfop), a predicate ($Pg),
// a GPR64sp base ($Rn) and a simm6s1 immediate ($imm6).
// Assembly form: asm "\t$prfop, $Pg, [$Rn, $imm6, mul vl]".
// msz is placed in Inst{14-13}.
class sve_mem_prfm_si<bits<2> msz, string asm>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, simm6s1:$imm6),
asm, "\t$prfop, $Pg, [$Rn, $imm6, mul vl]",
[]>, Sched<[]> {
bits<5> Rn;
bits<3> Pg;
bits<6> imm6;
bits<4> prfop;
let Inst{31-22} = 0b1000010111;
let Inst{21-16} = imm6;
let Inst{15} = 0b0;
let Inst{14-13} = msz;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
// Marked explicitly as side-effecting; the instruction has no outputs.
let hasSideEffects = 1;
// Multiclass sve_mem_prfm_si: defines the instruction under NAME and an
// alias that accepts "[$Rn]" with the immediate fixed to 0.
multiclass sve_mem_prfm_si<bits<2> msz, string asm> {
def NAME : sve_mem_prfm_si<msz, asm>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Rn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
// Instruction class sve_mem_prfm_ss.
// No outputs; inputs are a prefetch operation ($prfop), a predicate ($Pg),
// a GPR64sp base ($Rn) and a register operand of type gprty ($Rm).
// Assembly form: asm "\t$prfop, $Pg, [$Rn, $Rm]".
// opc is split across the encoding: opc{2-1} -> Inst{24-23} and
// opc{0} -> Inst{14}.
class sve_mem_prfm_ss<bits<3> opc, string asm, RegisterOperand gprty>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$prfop, $Pg, [$Rn, $Rm]",
[]>, Sched<[]> {
bits<5> Rm;
bits<5> Rn;
bits<3> Pg;
bits<4> prfop;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = opc{2-1};
let Inst{22-21} = 0b00;
let Inst{20-16} = Rm;
let Inst{15} = 0b1;
let Inst{14} = opc{0};
let Inst{13} = 0b0;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
// Marked explicitly as side-effecting; the instruction has no outputs.
let hasSideEffects = 1;
// Instruction class sve_mem_32b_prfm_sv.
// No outputs; inputs are a prefetch operation ($prfop), a predicate ($Pg),
// a GPR64sp base ($Rn) and a vector offset operand of type zprext ($Zm).
// Assembly form: asm "\t$prfop, $Pg, [$Rn, $Zm]".
// xs is placed in Inst{22} and msz in Inst{14-13}.
class sve_mem_32b_prfm_sv<bits<2> msz, bit xs, string asm,
RegisterOperand zprext>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$prfop, $Pg, [$Rn, $Zm]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<4> prfop;
let Inst{31-23} = 0b100001000;
let Inst{22} = xs;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15} = 0b0;
let Inst{14-13} = msz;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
// Marked explicitly as side-effecting; the instruction has no outputs.
let hasSideEffects = 1;
// Multiclass sve_mem_32b_prfm_sv_scaled.
// Emits _UXTW_SCALED (xs = 0, operand uxtw_opnd) and _SXTW_SCALED (xs = 1,
// operand sxtw_opnd) from sve_mem_32b_prfm_sv, plus one selection pattern
// per variant (op_uxtw / op_sxtw) over nxv4i1 / i64 / nxv4i32 / i32 operands.
// Note that the pattern operand order (Pg, Rn, Zm, prfop) differs from the
// instruction operand order (prfop, Pg, Rn, Zm).
multiclass sve_mem_32b_prfm_sv_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
PatFrag op_sxtw,
PatFrag op_uxtw> {
def _UXTW_SCALED : sve_mem_32b_prfm_sv<msz, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_32b_prfm_sv<msz, 1, asm, sxtw_opnd>;
def : Pat<(op_uxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
def : Pat<(op_sxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
// Instruction class sve_mem_32b_prfm_vi.
// No outputs; inputs are a prefetch operation ($prfop), a predicate ($Pg),
// a ZPR32 vector ($Zn) and an immediate ($imm5) of operand type imm_ty.
// Assembly form: asm "\t$prfop, $Pg, [$Zn, $imm5]".
// msz is placed in Inst{24-23}.
class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5),
asm, "\t$prfop, $Pg, [$Zn, $imm5]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> imm5;
bits<4> prfop;
let Inst{31-25} = 0b1000010;
let Inst{24-23} = msz;
let Inst{22-21} = 0b00;
let Inst{20-16} = imm5;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
// Set explicitly, as every other prefetch instruction class in this file
// does, rather than leaving the flag to be inferred for an instruction that
// has no outputs.
let hasSideEffects = 1;
// Multiclass sve_mem_32b_prfm_vi: defines the instruction under NAME, an
// alias accepting "[$Zn]" with the immediate fixed to 0, and a selection
// pattern from `op` (nxv4i1 predicate, nxv4i32 vector, i64 immediate, i32
// prefetch operation) to the instruction.
multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> {
def NAME : sve_mem_32b_prfm_vi<msz, asm, imm_ty>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
// The pattern uses PPR_3b for the predicate, whereas the instruction
// operand list uses PPR3bAny.
def : Pat<(op (nxv4i1 PPR_3b:$Pg), (nxv4i32 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>;
// Instruction class sve_mem_z_fill.
// One ZPRAny result ($Zt) loaded from a GPR64sp base ($Rn) plus a simm9
// immediate ($imm9). Assembly form: asm "\t$Zt, [$Rn, $imm9, mul vl]".
// The 9-bit immediate is split across the encoding: imm9{8-3} -> Inst{21-16}
// and imm9{2-0} -> Inst{12-10}.
class sve_mem_z_fill<string asm>
: I<(outs ZPRAny:$Zt), (ins GPR64sp:$Rn, simm9:$imm9),
asm, "\t$Zt, [$Rn, $imm9, mul vl]",
[]>, Sched<[]> {
bits<5> Rn;
bits<5> Zt;
bits<9> imm9;
let Inst{31-22} = 0b1000010110;
let Inst{21-16} = imm9{8-3};
let Inst{15-13} = 0b010;
let Inst{12-10} = imm9{2-0};
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// Multiclass sve_mem_z_fill: defines the instruction under NAME and an alias
// that accepts "[$Rn]" with the immediate fixed to 0.
multiclass sve_mem_z_fill<string asm> {
def NAME : sve_mem_z_fill<asm>;
def : InstAlias<asm # "\t$Zt, [$Rn]",
(!cast<Instruction>(NAME) ZPRAny:$Zt, GPR64sp:$Rn, 0), 1>;
// Instruction class sve_mem_p_fill.
// One PPRAny result ($Pt) loaded from a GPR64sp base ($Rn) plus a simm9
// immediate ($imm9). Assembly form: asm "\t$Pt, [$Rn, $imm9, mul vl]".
// The encoding matches sve_mem_z_fill except that Inst{15-13} is 0b000 and
// the destination is a 4-bit predicate number in Inst{3-0} with Inst{4} = 0.
class sve_mem_p_fill<string asm>
: I<(outs PPRAny:$Pt), (ins GPR64sp:$Rn, simm9:$imm9),
asm, "\t$Pt, [$Rn, $imm9, mul vl]",
[]>, Sched<[]> {
bits<4> Pt;
bits<5> Rn;
bits<9> imm9;
let Inst{31-22} = 0b1000010110;
let Inst{21-16} = imm9{8-3};
let Inst{15-13} = 0b000;
let Inst{12-10} = imm9{2-0};
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = Pt;
let mayLoad = 1;
// Multiclass sve_mem_p_fill: defines the instruction under NAME and an alias
// that accepts "[$Rn]" with the immediate fixed to 0.
multiclass sve_mem_p_fill<string asm> {
def NAME : sve_mem_p_fill<asm>;
def : InstAlias<asm # "\t$Pt, [$Rn]",
(!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
// Instruction class sve2_mem_gldnt_vs_base.
// One result ($Zt) of operand type VecList; the input operand list is
// supplied by the caller as `iops` and is expected to bind $Pg, $Zn and $Rm,
// which the encoding below refers to.
// Assembly form: asm "\t$Zt, $Pg/z, [$Zn, $Rm]".
// The 5-bit opc is split: opc{4} -> Inst{30}, opc{3-2} -> Inst{24-23} and
// opc{1-0} -> Inst{14-13}.
class sve2_mem_gldnt_vs_base<bits<5> opc, dag iops, string asm,
RegisterOperand VecList>
: I<(outs VecList:$Zt), iops,
asm, "\t$Zt, $Pg/z, [$Zn, $Rm]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rm;
bits<5> Zn;
bits<5> Zt;
let Inst{31} = 0b1;
let Inst{30} = opc{4};
let Inst{29-25} = 0b00010;
let Inst{24-23} = opc{3-2};
let Inst{22-21} = 0b00;
let Inst{20-16} = Rm;
let Inst{15} = 0b1;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// Multiclass sve2_mem_gldnt_vs_32_ptrs.
// Emits _REAL (sve2_mem_gldnt_vs_base with ZPR32 $Zn, GPR64 $Rm and a Z_s
// result), three InstAliases, and an nxv4i32 selection pattern from `op`.
// Unlike the gather multiclasses that wrap their instruction in a Pseudo,
// the pattern here selects _REAL directly.
multiclass sve2_mem_gldnt_vs_32_ptrs<bits<5> opc, string asm,
SDPatternOperator op,
ValueType vt> {
def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm),
asm, Z_s>;
// Aliases: "[$Zn, $Rm]" with a ZPR32 $Zt, and "[$Zn]" with $Rm fixed to XZR
// (ZPR32 and Z_s spellings of $Zt). Only the Z_s/"[$Zn]" alias has a last
// argument of 1.
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
// The vector operand is named $Zd in the pattern but is passed in the
// instruction's $Zn position.
def : Pat <(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)),
(!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>;
// Multiclass sve2_mem_gldnt_vs_64_ptrs.
// 64-bit counterpart of sve2_mem_gldnt_vs_32_ptrs: _REAL uses ZPR64 $Zn and
// a Z_d result, the aliases use ZPR64 / Z_d, and the selection pattern is
// over nxv2i64 / nxv2i1.
multiclass sve2_mem_gldnt_vs_64_ptrs<bits<5> opc, string asm,
SDPatternOperator op,
ValueType vt> {
def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm),
asm, Z_d>;
// Aliases: "[$Zn, $Rm]" with a ZPR64 $Zt, and "[$Zn]" with $Rm fixed to XZR
// (ZPR64 and Z_d spellings of $Zt). Only the Z_d/"[$Zn]" alias has a last
// argument of 1.
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
// The vector operand is named $Zd in the pattern but is passed in the
// instruction's $Zn position.
def : Pat <(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)),
(!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>;
// SVE Memory - 64-bit Gather Group
// bit xs is '1' if offsets are signed
// bit scaled is '1' if the offsets are scaled
// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
// Instruction class sve_mem_64b_gld_sv.
// One Z_d result ($Zt); inputs are a predicate ($Pg), a GPR64sp base ($Rn)
// and a vector offset operand of type zprext ($Zm).
// Assembly form: asm "\t$Zt, $Pg/z, [$Rn, $Zm]".
// Encoding of the template arguments: xs -> Inst{22}, scaled -> Inst{21},
// lsl -> Inst{15}, opc{3-2} -> Inst{24-23}, opc{1-0} -> Inst{14-13}.
class sve_mem_64b_gld_sv<bits<4> opc, bit xs, bit scaled, bit lsl, string asm,
RegisterOperand zprext>
: I<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$Zt, $Pg/z, [$Rn, $Zm]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<5> Zt;
let Inst{31-25} = 0b1100010;
let Inst{24-23} = opc{3-2};
let Inst{22} = xs;
let Inst{21} = scaled;
let Inst{20-16} = Zm;
let Inst{15} = lsl;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// FFR is listed as both defined and used when the low opcode bit is 1;
// otherwise both lists are empty.
let Defs = !if(!eq(opc{0}, 1), [FFR], []);
let Uses = !if(!eq(opc{0}, 1), [FFR], []);
// Multiclass sve_mem_64b_gld_sv_32_scaled.
// Emits _UXTW_SCALED_REAL (xs = 0) and _SXTW_SCALED_REAL (xs = 1) from
// sve_mem_64b_gld_sv with scaled = 1 and lsl = 0, a ZPR64-spelled InstAlias
// for each, a Pseudo (_UXTW_SCALED / _SXTW_SCALED) expanding to each, and
// nxv2i64 selection patterns from uxtw_op / sxtw_op to the Pseudos.
// Parameters:
//   opc, asm             - forwarded to sve_mem_64b_gld_sv.
//   sxtw_op, uxtw_op     - DAG operators matched by the two patterns.
//   sxtw_opnd, uxtw_opnd - register operands used for $Zm.
//   vt                   - ValueType that appears as the last operand of the
//                          matched node.
multiclass sve_mem_64b_gld_sv_32_scaled<bits<4> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt> {
def _UXTW_SCALED_REAL : sve_mem_64b_gld_sv<opc, 0, 1, 0, asm, uxtw_opnd>;
def _SXTW_SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 0, asm, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _UXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
def _SXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
// Selection patterns target the Pseudo records, not the *_REAL ones.
def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
(!cast<Instruction>(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
(!cast<Instruction>(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
// Multiclass sve_mem_64b_gld_vs_32_unscaled.
// Same shape as sve_mem_64b_gld_sv_32_scaled but with scaled = 0; records are
// named _UXTW/_SXTW (Pseudos) and _UXTW_REAL/_SXTW_REAL (instructions).
// Parameters:
//   opc, asm             - forwarded to sve_mem_64b_gld_sv.
//   sxtw_op, uxtw_op     - DAG operators matched by the two patterns.
//   sxtw_opnd, uxtw_opnd - register operands used for $Zm.
//   vt                   - ValueType that appears as the last operand of the
//                          matched node.
multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
SDPatternOperator sxtw_op,
SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
ValueType vt> {
def _UXTW_REAL : sve_mem_64b_gld_sv<opc, 0, 0, 0, asm, uxtw_opnd>;
def _SXTW_REAL : sve_mem_64b_gld_sv<opc, 1, 0, 0, asm, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _UXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _UXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
def _SXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
// Selection patterns target the Pseudo records, not the *_REAL ones.
def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
(!cast<Instruction>(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
(!cast<Instruction>(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
// Multiclass sve_mem_64b_gld_sv2_64_scaled.
// Emits _SCALED_REAL from sve_mem_64b_gld_sv with xs = 1, scaled = 1 and
// lsl = 1, using the caller-supplied zprext operand for $Zm; a ZPR64-spelled
// InstAlias; a _SCALED Pseudo expanding to _SCALED_REAL; and an nxv2i64
// selection pattern from `op` to the Pseudo.
multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
SDPatternOperator op,
RegisterOperand zprext, ValueType vt> {
def _SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 1, asm, zprext>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>;
// The selection pattern targets the Pseudo, not _SCALED_REAL.
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
(!cast<Instruction>(NAME # _SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
// Multiclass sve_mem_64b_gld_vs2_64_unscaled.
// Emits _REAL from sve_mem_64b_gld_sv with xs = 1, scaled = 0 and lsl = 1,
// with $Zm fixed to ZPR64ExtLSL8; a ZPR64-spelled InstAlias; a Pseudo whose
// suffix is the empty string (so its record name is NAME itself) expanding
// to _REAL; and an nxv2i64 selection pattern from `op` to that Pseudo.
multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm,
SDPatternOperator op, ValueType vt> {
def _REAL : sve_mem_64b_gld_sv<opc, 1, 0, 1, asm, ZPR64ExtLSL8>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def "" : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm)>;
// !cast<Instruction>(NAME) refers to the empty-suffix Pseudo defined above.
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
(!cast<Instruction>(NAME) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
// Instruction class sve_mem_64b_gld_vi.
// One Z_d result ($Zt); inputs are a predicate ($Pg), a ZPR64 vector ($Zn)
// and an immediate ($imm5) of operand type imm_ty.
// Assembly form: asm "\t$Zt, $Pg/z, [$Zn, $imm5]".
// opc is split across the encoding: opc{3-2} -> Inst{24-23} and
// opc{1-0} -> Inst{14-13}.
class sve_mem_64b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
: I<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5),
asm, "\t$Zt, $Pg/z, [$Zn, $imm5]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> Zt;
bits<5> imm5;
let Inst{31-25} = 0b1100010;
let Inst{24-23} = opc{3-2};
let Inst{22-21} = 0b01;
let Inst{20-16} = imm5;
let Inst{15} = 0b1;
let Inst{14-13} = opc{1-0};
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// FFR is listed as both defined and used when the low opcode bit is 1;
// otherwise both lists are empty.
let Defs = !if(!eq(opc{0}, 1), [FFR], []);
let Uses = !if(!eq(opc{0}, 1), [FFR], []);
// Multiclass sve_mem_64b_gld_vi_64_ptrs.
// Emits _IMM_REAL (sve_mem_64b_gld_vi), three InstAliases for it, an _IMM
// Pseudo that expands to _IMM_REAL, and an nxv2i64 selection pattern from
// `op` to the Pseudo.
// Parameters:
//   opc, asm, imm_ty - forwarded to sve_mem_64b_gld_vi.
//   op               - DAG operator matched by the pattern.
//   vt               - ValueType that appears as the last operand of the
//                      matched node.
multiclass sve_mem_64b_gld_vi_64_ptrs<bits<4> opc, string asm, Operand imm_ty,
SDPatternOperator op, ValueType vt> {
def _IMM_REAL : sve_mem_64b_gld_vi<opc, asm, imm_ty>;
// Aliases: "[$Zn]" with the immediate fixed to 0 (ZPR64 and Z_d spellings of
// $Zt) and "[$Zn, $imm5]" with a ZPR64 $Zt. Only the Z_d/"[$Zn]" alias has a
// last argument of 1.
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $imm5]",
(!cast<Instruction>(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def _IMM : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5)>;
// The selection pattern targets the Pseudo, not _IMM_REAL.
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt)),
(!cast<Instruction>(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
// Instruction class sve_mem_64b_prfm_sv.
// No outputs; inputs are a prefetch operation ($prfop), a predicate ($Pg),
// a GPR64sp base ($Rn) and a vector offset operand of type zprext ($Zm).
// Assembly form: asm "\t$prfop, $Pg, [$Rn, $Zm]".
// xs -> Inst{22}, lsl -> Inst{15}, msz -> Inst{14-13}.
class sve_mem_64b_prfm_sv<bits<2> msz, bit xs, bit lsl, string asm,
RegisterOperand zprext>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
asm, "\t$prfop, $Pg, [$Rn, $Zm]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Rn;
bits<5> Zm;
bits<4> prfop;
let Inst{31-23} = 0b110001000;
let Inst{22} = xs;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15} = lsl;
let Inst{14-13} = msz;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
// Marked explicitly as side-effecting; the instruction has no outputs.
let hasSideEffects = 1;
// Multiclass sve_mem_64b_prfm_sv_ext_scaled.
// Emits _UXTW_SCALED (xs = 0) and _SXTW_SCALED (xs = 1) from
// sve_mem_64b_prfm_sv with lsl = 0, plus one selection pattern per variant
// (op_uxtw / op_sxtw) over nxv2i1 / i64 / nxv2i64 / i32 operands.
// Note that the pattern operand order (Pg, Rn, Zm, prfop) differs from the
// instruction operand order (prfop, Pg, Rn, Zm).
multiclass sve_mem_64b_prfm_sv_ext_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
RegisterOperand uxtw_opnd,
PatFrag op_sxtw,
PatFrag op_uxtw> {
def _UXTW_SCALED : sve_mem_64b_prfm_sv<msz, 0, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_64b_prfm_sv<msz, 1, 0, asm, sxtw_opnd>;
def : Pat<(op_uxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
def : Pat<(op_sxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
// Multiclass sve_mem_64b_prfm_sv_lsl_scaled.
// Defines the instruction under NAME from sve_mem_64b_prfm_sv with xs = 1
// and lsl = 1, and a selection pattern from `frag` over nxv2i1 / i64 /
// nxv2i64 / i32 operands to that instruction.
multiclass sve_mem_64b_prfm_sv_lsl_scaled<bits<2> msz, string asm,
RegisterOperand zprext, PatFrag frag> {
def NAME : sve_mem_64b_prfm_sv<msz, 1, 1, asm, zprext>;
def : Pat<(frag (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 zprext:$Zm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>;
// Instruction class sve_mem_64b_prfm_vi.
// No outputs; inputs are a prefetch operation ($prfop), a predicate ($Pg),
// a ZPR64 vector ($Zn) and an immediate ($imm5) of operand type imm_ty.
// Assembly form: asm "\t$prfop, $Pg, [$Zn, $imm5]".
// msz is placed in Inst{24-23}.
class sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5),
asm, "\t$prfop, $Pg, [$Zn, $imm5]",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zn;
bits<5> imm5;
bits<4> prfop;
let Inst{31-25} = 0b1100010;
let Inst{24-23} = msz;
let Inst{22-21} = 0b00;
let Inst{20-16} = imm5;
let Inst{15-13} = 0b111;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = 0b0;
let Inst{3-0} = prfop;
// Marked explicitly as side-effecting; the instruction has no outputs.
let hasSideEffects = 1;
// Multiclass sve_mem_64b_prfm_vi: defines the instruction under NAME, an
// alias accepting "[$Zn]" with the immediate fixed to 0, and a selection
// pattern from `op` (nxv2i1 predicate, nxv2i64 vector, i64 immediate, i32
// prefetch operation) to the instruction.
multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> {
def NAME : sve_mem_64b_prfm_vi<msz, asm, imm_ty>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
// $Zn is an nxv2i64 value and the instruction declares it as ZPR64, so the
// pattern names ZPR64 as well (it previously said ZPR32).
def : Pat<(op (nxv2i1 PPR_3b:$Pg), (nxv2i64 ZPR64:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)),
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR64:$Zn, imm_ty:$imm)>;
// SVE Compute Vector Address Group
// Instruction class sve_int_bin_cons_misc_0_a.
// One result ($Zd) and one source ($Zn) of operand type zprty, plus a second
// source ($Zm) of operand type zprext.
// Assembly form: asm "\t$Zd, [$Zn, $Zm]".
// opc -> Inst{23-22}, msz -> Inst{11-10}.
class sve_int_bin_cons_misc_0_a<bits<2> opc, bits<2> msz, string asm,
ZPRRegOp zprty, RegisterOperand zprext>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprext:$Zm),
asm, "\t$Zd, [$Zn, $Zm]",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-12} = 0b1010;
let Inst{11-10} = msz;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Multiclass sve_int_bin_cons_misc_0_a_uxtw: four ZPR64 variants _0.._3 with
// msz = 0b00..0b11, using ZPR64ExtUXTW8/16/32/64 respectively for $Zm.
multiclass sve_int_bin_cons_misc_0_a_uxtw<bits<2> opc, string asm> {
def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtUXTW8>;
def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtUXTW16>;
def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtUXTW32>;
def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtUXTW64>;
// Multiclass sve_int_bin_cons_misc_0_a_sxtw: four ZPR64 variants _0.._3 with
// msz = 0b00..0b11, using ZPR64ExtSXTW8/16/32/64 respectively for $Zm.
multiclass sve_int_bin_cons_misc_0_a_sxtw<bits<2> opc, string asm> {
def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtSXTW8>;
def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtSXTW16>;
def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtSXTW32>;
def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtSXTW64>;
// Multiclass sve_int_bin_cons_misc_0_a_32_lsl: four ZPR32 variants _0.._3
// with msz = 0b00..0b11, using ZPR32ExtLSL8/16/32/64 respectively for $Zm.
multiclass sve_int_bin_cons_misc_0_a_32_lsl<bits<2> opc, string asm> {
def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR32, ZPR32ExtLSL8>;
def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR32, ZPR32ExtLSL16>;
def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR32, ZPR32ExtLSL32>;
def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR32, ZPR32ExtLSL64>;
// Multiclass sve_int_bin_cons_misc_0_a_64_lsl: four ZPR64 variants _0.._3
// with msz = 0b00..0b11, using ZPR64ExtLSL8/16/32/64 respectively for $Zm.
multiclass sve_int_bin_cons_misc_0_a_64_lsl<bits<2> opc, string asm> {
def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtLSL8>;
def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtLSL16>;
def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtLSL32>;
def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtLSL64>;
// SVE Integer Misc - Unpredicated Group
// Instruction class sve_int_bin_cons_misc_0_b.
// One result ($Zd) and two sources ($Zn, $Zm), all of operand type zprty.
// Assembly form: asm "\t$Zd, $Zn, $Zm". sz is placed in Inst{23-22}.
class sve_int_bin_cons_misc_0_b<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b101100;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Multiclass sve_int_bin_cons_misc_0_b: _H/_S/_D variants (sz = 0b01, 0b10,
// 0b11; no sz = 0b00 variant is defined) and one SVE_2_Op_Pat per variant.
// Each pattern has a floating-point result and first operand type and an
// integer second operand type of the same element count.
multiclass sve_int_bin_cons_misc_0_b<string asm, SDPatternOperator op> {
def _H : sve_int_bin_cons_misc_0_b<0b01, asm, ZPR16>;
def _S : sve_int_bin_cons_misc_0_b<0b10, asm, ZPR32>;
def _D : sve_int_bin_cons_misc_0_b<0b11, asm, ZPR64>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Instruction class sve_int_bin_cons_misc_0_c.
// One result ($Zd) and one source ($Zn), both of operand type zprty.
// Assembly form: asm "\t$Zd, $Zn".
// The 8-bit opc is split: opc{7-6} -> Inst{23-22}, opc{5-1} -> Inst{20-16}
// and opc{0} -> Inst{10}.
class sve_int_bin_cons_misc_0_c<bits<8> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn),
asm, "\t$Zd, $Zn",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = opc{7-6};
let Inst{21} = 0b1;
let Inst{20-16} = opc{5-1};
let Inst{15-11} = 0b10111;
let Inst{10} = opc{0};
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Multiclass sve_int_bin_cons_misc_0_c_fexpa: _H/_S/_D variants whose opc
// differs only in the top two bits (0b01, 0b10, 0b11; the low six bits are
// zero), and one SVE_1_Op_Pat per variant with a floating-point result type
// and an integer operand type of the same element count.
multiclass sve_int_bin_cons_misc_0_c_fexpa<string asm, SDPatternOperator op> {
def _H : sve_int_bin_cons_misc_0_c<0b01000000, asm, ZPR16>;
def _S : sve_int_bin_cons_misc_0_c<0b10000000, asm, ZPR32>;
def _D : sve_int_bin_cons_misc_0_c<0b11000000, asm, ZPR64>;
def : SVE_1_Op_Pat<nxv8f16, op, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Pat<nxv4f32, op, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Pat<nxv2f64, op, nxv2i64, !cast<Instruction>(NAME # _D)>;
// SVE Integer Reduction Group
// Instruction class sve_int_reduce.
// One result ($Vd) in register class regtype; inputs are a predicate ($Pg)
// and a vector ($Zn) of operand type zprty.
// Assembly form: asm "\t$Vd, $Pg, $Zn".
// sz8_32 -> Inst{23-22}, fmt -> Inst{20-19}, opc -> Inst{18-16}.
class sve_int_reduce<bits<2> sz8_32, bits<2> fmt, bits<3> opc, string asm,
ZPRRegOp zprty, RegisterClass regtype>
: I<(outs regtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Vd, $Pg, $Zn",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Vd;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_32;
let Inst{21} = 0b0;
let Inst{20-19} = fmt;
let Inst{18-16} = opc;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Vd;
// Multiclass sve_int_reduce_0_saddv: _B/_H/_S variants of sve_int_reduce
// with fmt = 0b00 and an FPR64 result (no _D variant is defined here), plus
// one i64-result SVE_2_Op_Pat per variant.
multiclass sve_int_reduce_0_saddv<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
def : SVE_2_Op_Pat<i64, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<i64, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<i64, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
// Multiclass sve_int_reduce_0_uaddv: _B/_H/_S/_D variants of sve_int_reduce
// with fmt = 0b00 and an FPR64 result, plus one i64-result SVE_2_Op_Pat per
// variant for `op`. The nxv2i64 form of `opSaddv` is additionally mapped to
// the _D variant.
multiclass sve_int_reduce_0_uaddv<bits<3> opc, string asm, SDPatternOperator op, SDPatternOperator opSaddv> {
def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
def _D : sve_int_reduce<0b11, 0b00, opc, asm, ZPR64, FPR64>;
def : SVE_2_Op_Pat<i64, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_2_Op_Pat<i64, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<i64, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
// opSaddv on 64-bit elements is selected to the same _D instruction.
def : SVE_2_Op_Pat<i64, opSaddv, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
// Multiclass sve_int_reduce_1: _B/_H/_S/_D variants of sve_int_reduce with
// fmt = 0b01 and a result register class matching the element width
// (FPR8/16/32/64), plus one SVE_2_Op_Pat_Reduce_To_Neon per variant using
// the bsub/hsub/ssub/dsub sub-register index respectively.
multiclass sve_int_reduce_1<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_reduce<0b00, 0b01, opc, asm, ZPR8, FPR8>;
def _H : sve_int_reduce<0b01, 0b01, opc, asm, ZPR16, FPR16>;
def _S : sve_int_reduce<0b10, 0b01, opc, asm, ZPR32, FPR32>;
def _D : sve_int_reduce<0b11, 0b01, opc, asm, ZPR64, FPR64>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B), bsub>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H), hsub>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S), ssub>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D), dsub>;
// Multiclass sve_int_reduce_2: identical in shape to sve_int_reduce_1 except
// that fmt = 0b11 is passed to sve_int_reduce.
multiclass sve_int_reduce_2<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve_int_reduce<0b00, 0b11, opc, asm, ZPR8, FPR8>;
def _H : sve_int_reduce<0b01, 0b11, opc, asm, ZPR16, FPR16>;
def _S : sve_int_reduce<0b10, 0b11, opc, asm, ZPR32, FPR32>;
def _D : sve_int_reduce<0b11, 0b11, opc, asm, ZPR64, FPR64>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B), bsub>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H), hsub>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S), ssub>;
def : SVE_2_Op_Pat_Reduce_To_Neon<v2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D), dsub>;
// Instruction class sve_int_movprfx_pred.
// One result ($Zd) of operand type zprty; the input operand list is supplied
// by the caller as `iops` and is expected to bind $Pg and $Zn, which the
// encoding below refers to.
// Assembly form: asm "\t$Zd, $Pg" # pg_suffix # ", $Zn", where pg_suffix is
// the predicate qualifier text supplied by the caller.
// sz8_32 -> Inst{23-22}, opc -> Inst{18-16}.
class sve_int_movprfx_pred<bits<2> sz8_32, bits<3> opc, string asm,
ZPRRegOp zprty, string pg_suffix, dag iops>
: I<(outs zprty:$Zd), iops,
asm, "\t$Zd, $Pg"#pg_suffix#", $Zn",
[]>, Sched<[]> {
bits<3> Pg;
bits<5> Zd;
bits<5> Zn;
let Inst{31-24} = 0b00000100;
let Inst{23-22} = sz8_32;
let Inst{21-19} = 0b010;
let Inst{18-16} = opc;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Element size is taken from the register operand type.
let ElementSize = zprty.ElementSize;
// Multiclass sve_int_movprfx_pred_merge: _B/_H/_S/_D variants with the "/m"
// predicate suffix. Each takes an extra $_Zd input that is tied to the $Zd
// result through the Constraints string.
multiclass sve_int_movprfx_pred_merge<bits<3> opc, string asm> {
let Constraints = "$Zd = $_Zd" in {
def _B : sve_int_movprfx_pred<0b00, opc, asm, ZPR8, "/m",
(ins ZPR8:$_Zd, PPR3bAny:$Pg, ZPR8:$Zn)>;
def _H : sve_int_movprfx_pred<0b01, opc, asm, ZPR16, "/m",
(ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR16:$Zn)>;
def _S : sve_int_movprfx_pred<0b10, opc, asm, ZPR32, "/m",
(ins ZPR32:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn)>;
def _D : sve_int_movprfx_pred<0b11, opc, asm, ZPR64, "/m",
(ins ZPR64:$_Zd, PPR3bAny:$Pg, ZPR64:$Zn)>;
// Multiclass sve_int_movprfx_pred_zero: _B/_H/_S/_D variants with the "/z"
// predicate suffix. Unlike the merge form there is no tied $_Zd input; the
// inputs are only $Pg and $Zn.
multiclass sve_int_movprfx_pred_zero<bits<3> opc, string asm> {
def _B : sve_int_movprfx_pred<0b00, opc, asm, ZPR8, "/z",
(ins PPR3bAny:$Pg, ZPR8:$Zn)>;
def _H : sve_int_movprfx_pred<0b01, opc, asm, ZPR16, "/z",
(ins PPR3bAny:$Pg, ZPR16:$Zn)>;
def _S : sve_int_movprfx_pred<0b10, opc, asm, ZPR32, "/z",
(ins PPR3bAny:$Pg, ZPR32:$Zn)>;
def _D : sve_int_movprfx_pred<0b11, opc, asm, ZPR64, "/z",
(ins PPR3bAny:$Pg, ZPR64:$Zn)>;
// SVE Propagate Break Group
// Instruction class sve_int_brkp.
// One PPR8 result ($Pd); inputs are a governing predicate ($Pg, PPRAny) and
// two PPR8 predicates ($Pn, $Pm).
// Assembly form: asm "\t$Pd, $Pg/z, $Pn, $Pm".
// opc{1} -> Inst{22} and opc{0} -> Inst{4}.
class sve_int_brkp<bits<2> opc, string asm>
: I<(outs PPR8:$Pd), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$Pm),
asm, "\t$Pd, $Pg/z, $Pn, $Pm",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pg;
bits<4> Pm;
bits<4> Pn;
let Inst{31-24} = 0b00100101;
let Inst{23} = 0b0;
let Inst{22} = opc{1};
let Inst{21-20} = 0b00;
let Inst{19-16} = Pm;
let Inst{15-14} = 0b11;
let Inst{13-10} = Pg;
let Inst{9} = 0b0;
let Inst{8-5} = Pn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
// NZCV is listed as defined only when opc{1} (encoded in Inst{22}) is 1.
let Defs = !if(!eq (opc{1}, 1), [NZCV], []);
// Multiclass sve_int_brkp: defines the instruction under NAME and a
// three-operand nxv16i1 selection pattern from `op` to it.
multiclass sve_int_brkp<bits<2> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_brkp<opc, asm>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
// SVE Partition Break Group
// Instruction class sve_int_brkn.
// One PPR8 result ($Pdm); inputs are a governing predicate ($Pg, PPRAny), a
// PPR8 predicate ($Pn) and a PPR8 input ($_Pdm) that is tied to the result.
// Assembly form: asm "\t$Pdm, $Pg/z, $Pn, $_Pdm".
// S is placed in Inst{22}.
class sve_int_brkn<bit S, string asm>
: I<(outs PPR8:$Pdm), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$_Pdm),
asm, "\t$Pdm, $Pg/z, $Pn, $_Pdm",
[]>, Sched<[]> {
bits<4> Pdm;
bits<4> Pg;
bits<4> Pn;
let Inst{31-23} = 0b001001010;
let Inst{22} = S;
let Inst{21-14} = 0b01100001;
let Inst{13-10} = Pg;
let Inst{9} = 0b0;
let Inst{8-5} = Pn;
let Inst{4} = 0b0;
let Inst{3-0} = Pdm;
// The last input and the output share a register, so only Pdm is encoded.
let Constraints = "$Pdm = $_Pdm";
// NZCV is listed as defined only when S is 1.
let Defs = !if(!eq (S, 0b1), [NZCV], []);
// Multiclass sve_int_brkn: defines the instruction under NAME (the 1-bit opc
// is passed as the class's S argument) and a three-operand nxv16i1 selection
// pattern from `op` to it.
multiclass sve_int_brkn<bits<1> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_brkn<opc, asm>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
// Instruction class sve_int_break.
// One PPR8 result ($Pd); the input operand list is supplied by the caller as
// `iops` and is expected to bind $Pg and $Pn (and $_Pd when opc{0} is 1, see
// the Constraints line below).
// Assembly form: asm "\t$Pd, $Pg" # suffix # ", $Pn".
// opc{2-1} -> Inst{23-22} and opc{0} -> Inst{4}.
class sve_int_break<bits<3> opc, string asm, string suffix, dag iops>
: I<(outs PPR8:$Pd), iops,
asm, "\t$Pd, $Pg"#suffix#", $Pn",
[]>, Sched<[]> {
bits<4> Pd;
bits<4> Pg;
bits<4> Pn;
let Inst{31-24} = 0b00100101;
let Inst{23-22} = opc{2-1};
let Inst{21-14} = 0b01000001;
let Inst{13-10} = Pg;
let Inst{9} = 0b0;
let Inst{8-5} = Pn;
let Inst{4} = opc{0};
let Inst{3-0} = Pd;
// The result is tied to a $_Pd input only when opc{0} is 1.
// NOTE(review): sve_int_break_m always passes a $_Pd operand and
// sve_int_break_z never does, so callers presumably pair opc{0} with the
// matching multiclass - confirm at the instantiation sites.
let Constraints = !if(!eq (opc{0}, 1), "$Pd = $_Pd", "");
// NZCV is listed as defined only when opc{1} (encoded in Inst{22}) is 1.
let Defs = !if(!eq (opc{1}, 1), [NZCV], []);
// Multiclass sve_int_break_m: defines the "/m" form under NAME with inputs
// ($_Pd, $Pg, $Pn) and a three-operand nxv16i1 selection pattern from `op`.
multiclass sve_int_break_m<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_break<opc, asm, "/m", (ins PPR8:$_Pd, PPRAny:$Pg, PPR8:$Pn)>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
// Multiclass sve_int_break_z: defines the "/z" form under NAME with inputs
// ($Pg, $Pn) and a two-operand nxv16i1 selection pattern from `op`.
multiclass sve_int_break_z<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_break<opc, asm, "/z", (ins PPRAny:$Pg, PPR8:$Pn)>;
def : SVE_2_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
// SVE2 String Processing Group
// Instruction class sve2_char_match.
// One predicate result ($Pd) of operand type pprty; inputs are a governing
// predicate ($Pg) and two vectors ($Zn, $Zm) of operand type zprty.
// Assembly form: asm "\t$Pd, $Pg/z, $Zn, $Zm".
// sz -> Inst{22} and opc -> Inst{4}.
class sve2_char_match<bit sz, bit opc, string asm,
PPRRegOp pprty, ZPRRegOp zprty>
: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
asm, "\t$Pd, $Pg/z, $Zn, $Zm",
[]>, Sched<[]> {
bits<4> Pd;
bits<3> Pg;
bits<5> Zm;
bits<5> Zn;
let Inst{31-23} = 0b010001010;
let Inst{22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b100;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4} = opc;
let Inst{3-0} = Pd;
// NZCV is listed as defined unconditionally.
let Defs = [NZCV];
// Multiclass sve2_char_match: _B (sz = 0, PPR8/ZPR8) and _H (sz = 1,
// PPR16/ZPR16) variants, each with a three-operand selection pattern from
// `op` (nxv16i1/nxv16i8 and nxv8i1/nxv8i16 respectively).
multiclass sve2_char_match<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_char_match<0b0, opc, asm, PPR8, ZPR8>;
def _H : sve2_char_match<0b1, opc, asm, PPR16, ZPR16>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
// SVE2 Histogram Computation - Segment Group
// Instruction class sve2_hist_gen_segment.
// One ZPR8 result ($Zd) from two ZPR8 sources ($Zn, $Zm).
// Assembly form: asm "\t$Zd, $Zn, $Zm".
// Unlike most classes in this file, the selection pattern is given inline:
// `op` applied to two nxv16i8 operands sets an nxv16i8 $Zd.
class sve2_hist_gen_segment<string asm, SDPatternOperator op>
: I<(outs ZPR8:$Zd), (ins ZPR8:$Zn, ZPR8:$Zm),
asm, "\t$Zd, $Zn, $Zm",
[(set nxv16i8:$Zd, (op nxv16i8:$Zn, nxv16i8:$Zm))]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-21} = 0b01000101001;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b101000;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// SVE2 Histogram Computation - Vector Group
// Instruction class sve2_hist_gen_vector.
// One result ($Zd) of operand type zprty; inputs are a governing predicate
// ($Pg) and two vectors ($Zn, $Zm) of operand type zprty.
// Assembly form: asm "\t$Zd, $Pg/z, $Zn, $Zm". sz is placed in Inst{22}.
class sve2_hist_gen_vector<bit sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Pg/z, $Zn, $Zm",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<3> Pg;
bits<5> Zm;
let Inst{31-23} = 0b010001011;
let Inst{22} = sz;
let Inst{21} = 0b1;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b110;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Multiclass sve2_hist_gen_vector: _S (sz = 0, ZPR32) and _D (sz = 1, ZPR64)
// variants, each with a three-operand selection pattern from `op`
// (nxv4i1/nxv4i32 and nxv2i1/nxv2i64 respectively).
multiclass sve2_hist_gen_vector<string asm, SDPatternOperator op> {
def _S : sve2_hist_gen_vector<0b0, asm, ZPR32>;
def _D : sve2_hist_gen_vector<0b1, asm, ZPR64>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
// SVE2 Crypto Extensions Group
// Instruction class sve2_crypto_cons_bin_op.
// One result ($Zd) from two sources ($Zn, $Zm), all of operand type zprty;
// the result is not tied to either source.
// Assembly form: asm "\t$Zd, $Zn, $Zm". opc is placed in Inst{10}.
class sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
asm, "\t$Zd, $Zn, $Zm",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
let Inst{31-21} = 0b01000101001;
let Inst{20-16} = Zm;
let Inst{15-11} = 0b11110;
let Inst{10} = opc;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Defines a single instruction (named NAME, no suffix) from the
// sve2_crypto_cons_bin_op class and a two-operand selection pattern mapping
// `op` on two values of type `vt` to it, producing a `vt` result.
multiclass sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty,
SDPatternOperator op, ValueType vt> {
def NAME : sve2_crypto_cons_bin_op<opc, asm, zprty>;
def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>;
// Encoding class for an unpredicated binary instruction whose destination is
// tied to its first source: the output $Zdn is constrained to the same
// register as the input $_Zdn, so only Zdn and Zm are encoded. The 2-bit
// `opc` is split across the encoding: opc{1} goes to bit 16 and opc{0} to
// bit 10.
class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $_Zdn, $Zm",
[]>, Sched<[]> {
bits<5> Zdn;
bits<5> Zm;
let Inst{31-17} = 0b010001010010001;
let Inst{16} = opc{1};
let Inst{15-11} = 0b11100;
let Inst{10} = opc{0};
let Inst{9-5} = Zm;
let Inst{4-0} = Zdn;
// Tie the result register to the first input operand.
let Constraints = "$Zdn = $_Zdn";
// Defines a single instruction (named NAME, no suffix) from the
// sve2_crypto_des_bin_op class and a two-operand selection pattern mapping
// `op` on two values of type `vt` to it, producing a `vt` result.
multiclass sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty,
SDPatternOperator op, ValueType vt> {
def NAME : sve2_crypto_des_bin_op<opc, asm, zprty>;
def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>;
// Encoding class for an unpredicated unary instruction that operates in
// place: the output $Zdn is tied to the sole input $_Zdn. Only the register
// number (bits 4-0) and the single `opc` bit (bit 10) vary; bits 9-5 are
// fixed to zero.
class sve2_crypto_unary_op<bit opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn),
asm, "\t$Zdn, $_Zdn",
[]>, Sched<[]> {
bits<5> Zdn;
let Inst{31-11} = 0b010001010010000011100;
let Inst{10} = opc;
let Inst{9-5} = 0b00000;
let Inst{4-0} = Zdn;
// Tie the result register to the input operand.
let Constraints = "$Zdn = $_Zdn";
// Defines a single byte-vector (ZPR8) instruction from the
// sve2_crypto_unary_op class and a one-operand selection pattern mapping
// `op` on nxv16i8 to it.
multiclass sve2_crypto_unary_op<bit opc, string asm, SDPatternOperator op> {
def NAME : sve2_crypto_unary_op<opc, asm, ZPR8>;
def : SVE_1_Op_Pat<nxv16i8, op, nxv16i8, !cast<Instruction>(NAME)>;
// SVE BFloat16 Group
// Common base for the bfloat dot-product style encodings below. The derived
// class supplies the input operand list (`iops`) and the assembly operand
// string (`ops`); this base fixes the 32-bit accumulator output ($Zda), the
// shared opcode bits, and the Zn/Zda fields. Bits 20-16 are not assigned
// here and are filled in by derived classes. The accumulator is tied to the
// $_Zda input that derived classes are expected to include in `iops`.
class sve_bfloat_dot_base<bits<2> opc, string asm, string ops, dag iops>
: I<(outs ZPR32:$Zda), iops, asm, ops, "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
let Inst{31-21} = 0b01100100011;
let Inst{15-14} = opc;
let Inst{13-10} = 0b0000;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeH;
// Vector-by-vector form built on sve_bfloat_dot_base with opc = 0b10:
// accumulates into a ZPR32 register from two ZPR16 sources. Adds the full
// 5-bit Zm register field in bits 20-16.
class sve_bfloat_dot<string asm>
: sve_bfloat_dot_base<0b10, asm, "\t$Zda, $Zn, $Zm",
(ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm)> {
bits<5> Zm;
let Inst{20-16} = Zm;
// Defines the sve_bfloat_dot instruction and a selection pattern for `op`
// taking an nxv4f32 accumulator and two nxv8bf16 vectors, producing nxv4f32.
multiclass sve_bfloat_dot<string asm, SDPatternOperator op> {
def NAME : sve_bfloat_dot<asm>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16 ,!cast<Instruction>(NAME)>;
// Indexed form built on sve_bfloat_dot_base with opc = 0b01. The second
// multiplicand uses the restricted ZPR3b16 operand class, so Zm is only a
// 3-bit field (bits 18-16); the freed bits 20-19 carry the 2-bit element
// index `iop`.
class sve_bfloat_dot_indexed<string asm>
: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
(ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS:$iop)> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
// Defines the sve_bfloat_dot_indexed instruction and a four-operand selection
// pattern for `op`: nxv4f32 accumulator, two nxv8bf16 vectors, and an i64
// immediate index matched with VectorIndexS_timm.
multiclass sve_bfloat_dot_indexed<string asm, SDPatternOperator op> {
def NAME : sve_bfloat_dot_indexed<asm>;
def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexS_timm, !cast<Instruction>(NAME)>;
// Encoding class for a bfloat instruction that accumulates into a ZPR32
// register from two ZPR16 sources. The accumulator output $Zda is tied to
// the $_Zda input. This class is also used as a base by
// sve_bfloat_matmul_longvecl, which overrides some of the bits set here.
class sve_bfloat_matmul<string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zm;
bits<5> Zda;
bits<5> Zn;
let Inst{31-21} = 0b01100100011;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b111001;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeH;
// Defines the sve_bfloat_matmul instruction and a selection pattern for `op`
// taking an nxv4f32 accumulator and two nxv8bf16 vectors, producing nxv4f32.
multiclass sve_bfloat_matmul<string asm, SDPatternOperator op> {
def NAME : sve_bfloat_matmul<asm>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16 ,!cast<Instruction>(NAME)>;
// Reuses the sve_bfloat_matmul operand layout but overrides three parts of
// its encoding: bit 23 becomes 1 (the base's Inst{31-21} value leaves it 0),
// bits 14-13 become 00 (the base's Inst{15-10} value sets them to 11), and
// bit 10 is taken from the `BT` template argument instead of being fixed.
class sve_bfloat_matmul_longvecl<bit BT, string asm>
: sve_bfloat_matmul<asm> {
let Inst{23} = 0b1;
let Inst{14-13} = 0b00;
let Inst{10} = BT;
// Defines the sve_bfloat_matmul_longvecl instruction for the given `BT` bit
// and a selection pattern for `op` taking an nxv4f32 accumulator and two
// nxv8bf16 vectors, producing nxv4f32.
multiclass sve_bfloat_matmul_longvecl<bit BT, string asm, SDPatternOperator op> {
def NAME : sve_bfloat_matmul_longvecl<BT, asm>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16 ,!cast<Instruction>(NAME)>;
// Indexed counterpart of sve_bfloat_matmul_longvecl, built on
// sve_bfloat_dot_base with opc = 0b01. Zm is a 3-bit field (ZPR3b16 operand
// class). The 3-bit index `iop` is split: its upper two bits go to bits 20-19
// and its low bit to bit 11. Bits 23, 11 and 10 override values set by the
// base class (which assigns Inst{31-21} and Inst{13-10}).
class sve_bfloat_matmul_longvecl_idx<bit BT, string asm>
: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
(ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH:$iop)> {
bits<3> iop;
bits<3> Zm;
let Inst{23} = 0b1;
let Inst{20-19} = iop{2-1};
let Inst{18-16} = Zm;
let Inst{11} = iop{0};
let Inst{10} = BT;
// Defines the sve_bfloat_matmul_longvecl_idx instruction for the given `BT`
// bit and a four-operand selection pattern for `op`: nxv4f32 accumulator, two
// nxv8bf16 vectors, and an i64 immediate index matched with
// VectorIndexH_timm.
multiclass sve_bfloat_matmul_longvecl_idx<bit BT, string asm, SDPatternOperator op> {
def NAME : sve_bfloat_matmul_longvecl_idx<BT, asm>;
def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexH_timm, !cast<Instruction>(NAME)>;
// Encoding class for a predicated conversion from a ZPR32 source to a ZPR16
// destination. The assembly form uses a "/m" predicate qualifier and the
// destination $Zd is tied to the extra $_Zd input. `N` is a single bit
// placed at Inst{24}. The instruction is flagged as having side effects
// (hasSideEffects = 1).
// NOTE(review): the reason for hasSideEffects = 1 is not visible in this
// chunk - confirm before relying on it.
class sve_bfloat_convert<bit N, string asm>
: I<(outs ZPR16:$Zd), (ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn),
asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> {
bits<5> Zd;
bits<3> Pg;
bits<5> Zn;
let Inst{31-25} = 0b0110010;
let Inst{24} = N;
let Inst{23-13} = 0b10001010101;
let Inst{12-10} = Pg;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
let DestructiveInstType = DestructiveOther;
let hasSideEffects = 1;
let ElementSize = ElementSizeS;
// Defines the sve_bfloat_convert instruction for the given `N` bit and a
// selection pattern for `op` taking (nxv8bf16 passthrough, nxv8i1 predicate,
// nxv4f32 source) and producing nxv8bf16.
multiclass sve_bfloat_convert<bit N, string asm, SDPatternOperator op> {
def NAME : sve_bfloat_convert<N, asm>;
def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i1, nxv4f32, !cast<Instruction>(NAME)>;
// SVE Integer Matrix Multiply Group
// Encoding class for an unpredicated instruction that accumulates into a
// ZPR32 register from two ZPR8 sources; the accumulator output $Zda is tied
// to the $_Zda input. The 2-bit `uns` argument is placed in bits 23-22.
// NOTE(review): `uns` presumably selects the signedness combination of the
// operands - confirm against the instruction definitions that use this class.
class sve_int_matmul<bits<2> uns, string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm,
"\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-24} = 0b01000101;
let Inst{23-22} = uns;
let Inst{21} = 0;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b100110;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ZPR32.ElementSize;
// Defines the sve_int_matmul instruction for the given `uns` value and a
// selection pattern for `op` taking an nxv4i32 accumulator and two nxv16i8
// vectors, producing nxv4i32.
multiclass sve_int_matmul<bits<2> uns, string asm, SDPatternOperator op> {
def NAME : sve_int_matmul<uns, asm>;
def : SVE_3_Op_Pat<nxv4i32, op , nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
// SVE Integer Dot Product Mixed Sign Group
// Encoding class for an unpredicated instruction that accumulates into a
// ZPR32 register from two ZPR8 sources; the accumulator output $Zda is tied
// to the $_Zda input. All opcode bits are fixed - the class takes no opcode
// template argument.
class sve_int_dot_mixed<string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm,
"\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-21} = 0b01000100100;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b011110;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ZPR32.ElementSize;
// Defines the sve_int_dot_mixed instruction and a selection pattern for `op`
// taking an nxv4i32 accumulator and two nxv16i8 vectors, producing nxv4i32.
multiclass sve_int_dot_mixed<string asm, SDPatternOperator op> {
def NAME : sve_int_dot_mixed<asm>;
def : SVE_3_Op_Pat<nxv4i32, op , nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
// SVE Integer Dot Product Mixed Sign - Indexed Group
// Indexed variant: the second multiplicand uses the restricted ZPR3b8
// operand class, so Zm is a 3-bit field (bits 18-16) and bits 20-19 carry the
// 2-bit element index `idx`. The single-bit `U` argument is placed at
// Inst{10}. The accumulator output $Zda is tied to the $_Zda input.
class sve_int_dot_mixed_indexed<bit U, string asm>
: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexS32b:$idx),
asm, "\t$Zda, $Zn, $Zm$idx", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<3> Zm;
bits<2> idx;
let Inst{31-21} = 0b01000100101;
let Inst{20-19} = idx;
let Inst{18-16} = Zm;
let Inst{15-11} = 0b00011;
let Inst{10} = U;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = ZPR32.ElementSize;
// Defines the sve_int_dot_mixed_indexed instruction for the given `U` bit and
// a four-operand selection pattern for `op`: nxv4i32 accumulator, two nxv16i8
// vectors, and an immediate index. Note the index here is typed i32 and
// matched with VectorIndexS32b_timm, unlike the bfloat indexed patterns in
// this file, which use i64.
multiclass sve_int_dot_mixed_indexed<bit U, string asm, SDPatternOperator op> {
def NAME : sve_int_dot_mixed_indexed<U, asm>;
def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME)>;
// SVE Floating Point Matrix Multiply Accumulate Group
// Encoding class for an unpredicated accumulate instruction in which the
// accumulator and both sources share one operand class (`zprty`). `sz` is
// placed in bit 22, and ElementSize is taken from `zprty`. The accumulator
// output $Zda is tied to the $_Zda input.
class sve_fp_matrix_mla<bit sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty:$Zm),
asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
bits<5> Zda;
bits<5> Zn;
bits<5> Zm;
let Inst{31-23} = 0b011001001;
let Inst{22} = sz;
let Inst{21} = 1;
let Inst{20-16} = Zm;
let Inst{15-10} = 0b111001;
let Inst{9-5} = Zn;
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
// Defines the sve_fp_matrix_mla instruction for the given size bit and
// register class, plus a selection pattern for `op` in which the accumulator,
// both sources and the result all have type `vt`.
multiclass sve_fp_matrix_mla<bit sz, string asm, ZPRRegOp zprty, SDPatternOperator op, ValueType vt> {
def NAME : sve_fp_matrix_mla<sz, asm, zprty>;
def : SVE_3_Op_Pat<vt, op , vt, vt, vt, !cast<Instruction>(NAME)>;
// SVE Memory - Contiguous Load And Replicate 256-bit Group
// Scalar-plus-immediate form: loads into $Zt under predicate $Pg (assembly
// qualifier "/z") from base register $Rn plus a 4-bit immediate encoded in
// bits 19-16. The immediate uses the simm4s32 operand class.
// NOTE(review): simm4s32 is presumably a signed 4-bit immediate scaled by 32
// - confirm against its operand definition.
// `sz` (2 bits) is placed in bits 24-23. Marked mayLoad.
class sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand VecList>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4),
asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> {
bits<5> Zt;
bits<5> Rn;
bits<3> Pg;
bits<4> imm4;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-20} = 0b010;
let Inst{19-16} = imm4;
let Inst{15-13} = 0b001;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// Defines the scalar-plus-immediate load instruction together with its
// assembler aliases and selection patterns:
//  * an alias that omits the immediate (immediate fixed to 0) using the
//    list operand form `listty`, declared with trailing argument 1;
//  * the same alias and the explicit-immediate form using the plain vector
//    register form `zprty`, both declared with trailing argument 0.
// NOTE(review): the trailing integer is presumably InstAlias' emit priority
// (0 = accepted by the parser but never printed) - confirm in Target.td.
// The lines prefixed with '-' / '+' below are diff markers from the enclosing
// patch; they are left untouched.
multiclass sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand listty,
ZPRRegOp zprty, ValueType Ty, ValueType PredTy, SDNode Ld1ro> {
def NAME : sve_mem_ldor_si<sz, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>;
// Base addressing mode
- def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)),
- (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>;
+ def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), GPR64sp:$base)),
+ (!cast<Instruction>(NAME) PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>;
+ let AddedComplexity = 2 in {
+ // Reg + Imm addressing mode
+ def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), (add GPR64:$base, (i64 simm4s32:$imm)))),
+ (!cast<Instruction>(NAME) $Pg, $base, simm4s32:$imm)>;
+ }
// Scalar-plus-scalar counterpart of sve_mem_ldor_si: loads into $Zt under
// predicate $Pg (assembly qualifier "/z") from base register $Rn plus offset
// register $Rm, whose operand class `gprty` is supplied by the caller. Rm is
// a 5-bit field in bits 20-16; `sz` (2 bits) is placed in bits 24-23. Marked
// mayLoad.
class sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand VecList,
RegisterOperand gprty>
: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> {
bits<5> Zt;
bits<3> Pg;
bits<5> Rn;
bits<5> Rm;
let Inst{31-25} = 0b1010010;
let Inst{24-23} = sz;
let Inst{22-21} = 0b01;
let Inst{20-16} = Rm;
let Inst{15-13} = 0;
let Inst{12-10} = Pg;
let Inst{9-5} = Rn;
let Inst{4-0} = Zt;
let mayLoad = 1;
// Defines the scalar-plus-scalar load instruction, an assembler alias that
// accepts the plain vector register form (`zprty`) in place of the list form,
// and a selection pattern that matches `Ld1ro` on a predicate plus an address
// decomposed by the complex pattern `AddrCP` into (base, offset).
multiclass sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand listty,
ZPRRegOp zprty, RegisterOperand gprty, ValueType Ty,
ValueType PredTy, SDNode Ld1ro, ComplexPattern AddrCP> {
def NAME : sve_mem_ldor_ss<sz, asm, listty, gprty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), (AddrCP GPR64sp:$base, gprty:$offset))),
(!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, gprty:$offset)>;
// SVE Interleave 128-bit Elements Group
// Encoding class for an unpredicated binary permute on ZPR128 operands with a
// separate destination register. The 2-bit `opc` is placed in bits 12-11 and
// the single-bit `P` in bit 10. No selection pattern is attached here.
class sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm>
: I<(outs ZPR128:$Zd), (ins ZPR128:$Zn, ZPR128:$Zm),
asm, "\t$Zd, $Zn, $Zm",
[]>, Sched<[]> {
bits<5> Zd;
bits<5> Zm;
bits<5> Zn;
let Inst{31-21} = 0b00000101101;
let Inst{20-16} = Zm;
let Inst{15-13} = 0b000;
let Inst{12-11} = opc;
let Inst{10} = P;
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
// Defines one instruction from the sve_int_perm_bin_perm_128_zz class and
// maps `op` to that same instruction for every listed vector type (integer
// and floating-point, 8- through 64-bit elements).
multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatternOperator op> {
def NAME : sve_int_perm_bin_perm_128_zz<opc, P, asm>;
def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME)>;
/// Addressing modes
// Each ComplexPattern below matches an i64 address and yields 2 operands; the
// string names the C++ selector that performs the match.
// Indexed forms: the template arguments give the accepted index range,
// [-8, 7] (signed 4-bit) for _s4 and [-32, 31] (signed 6-bit) for _s6.
def am_sve_indexed_s4 :ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-8,7>", [], [SDNPWantRoot]>;
def am_sve_indexed_s6 :ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-32,31>", [], [SDNPWantRoot]>;
// Register + register forms: the template argument (0-3) matches the lslN
// suffix of the pattern name.
// NOTE(review): presumably the left-shift applied to the offset register -
// confirm in SelectSVERegRegAddrMode.
def am_sve_regreg_lsl0 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<0>", []>;
def am_sve_regreg_lsl1 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<1>", []>;
def am_sve_regreg_lsl2 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<2>", []>;
def am_sve_regreg_lsl3 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<3>", []>;
// Predicated pseudo floating point two operand instructions.
// For each of the H, S and D element sizes this defines a _UNDEF_<size>
// pseudo (PredTwoOpPseudo with FalseLanesUndef, keyed to the real instruction
// name NAME # _<size>) and a selection pattern mapping `op` on
// (predicate, vector, vector) to that pseudo.
multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> {
def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _UNDEF_S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _UNDEF_D)>;
// Predicated pseudo integer two operand instructions.
// For each of the B, H, S and D element sizes this defines a _UNDEF_<size>
// pseudo (PredTwoOpPseudo with FalseLanesUndef, keyed to the real instruction
// name NAME # _<size>) and a selection pattern mapping `op` on
// (predicate, vector, vector) to that pseudo.
multiclass sve_int_bin_pred_bhsd<SDPatternOperator op> {
def _UNDEF_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _UNDEF_B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _UNDEF_H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
// As sve_int_bin_pred but when only i32 and i64 vector types are required.
// Defines only the _UNDEF_S and _UNDEF_D pseudos and their selection
// patterns; otherwise identical in shape to sve_int_bin_pred_bhsd.
multiclass sve_int_bin_pred_sd<SDPatternOperator op> {
def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 11454841cab7..5c1a4cb16568 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1,17162 +1,17173 @@
//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file implements the PPCISelLowering class.
#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCCCState.h"
#include "PPCCallingConv.h"
#include "PPCFrameLowering.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCRegisterInfo.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <list>
#include <utility>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "ppc-lowering"
// File-local boolean command-line switches for PPC lowering. All are marked
// cl::Hidden. No cl::init is given, so each presumably defaults to false -
// confirm against the LLVM CommandLine documentation.
static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
static cl::opt<bool> DisableSCO("disable-ppc-sco",
cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
cl::desc("use absolute jump tables on ppc"), cl::Hidden);
// Statistic counters for this file, declared with the STATISTIC macro (which
// relies on the DEBUG_TYPE "ppc-lowering" defined above).
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM");
STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocation probed");
// Forward declarations of file-local (static) helpers; their definitions are
// outside this chunk.
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
// FIXME: Remove this once the bug has been fixed!
// Declared extern: the option object itself is defined outside this chunk.
extern cl::opt<bool> ANDIGlueBug;
PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
const PPCSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
// On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
// arguments are at least 4/8 bytes aligned.
bool isPPC64 = Subtarget.isPPC64();
setMinStackArgumentAlignment(isPPC64 ? Align(8) : Align(4));
// Set up the register classes.
addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
if (!useSoftFloat()) {
if (hasSPE()) {
addRegisterClass(MVT::f32, &PPC::GPRCRegClass);
addRegisterClass(MVT::f64, &PPC::SPERCRegClass);
} else {
addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
// Match BITREVERSE to customized fast code sequence in the td file.
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
// Sub-word ATOMIC_CMP_SWAP need to ensure that the input is zero-extended.
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
// PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
if (Subtarget.isISA3_0()) {
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Legal);
setTruncStoreAction(MVT::f64, MVT::f16, Legal);
setTruncStoreAction(MVT::f32, MVT::f16, Legal);
} else {
// No extending loads from f16 or HW conversions back and forth.
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// PowerPC has pre-increment loads and stores.
setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
if (!Subtarget.hasSPE()) {
setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
// PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry.
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
for (MVT VT : ScalarIntVTs) {
setOperationAction(ISD::ADDC, VT, Legal);
setOperationAction(ISD::ADDE, VT, Legal);
setOperationAction(ISD::SUBC, VT, Legal);
setOperationAction(ISD::SUBE, VT, Legal);
if (Subtarget.useCRBits()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
if (isPPC64 || Subtarget.hasFPCVT()) {
setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
isPPC64 ? MVT::i64 : MVT::i32);
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
isPPC64 ? MVT::i64 : MVT::i32);
} else {
setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
// PowerPC does not support direct load/store of condition registers.
setOperationAction(ISD::LOAD, MVT::i1, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
// FIXME: Remove this once the ANDI glue bug is fixed:
if (ANDIGlueBug)
setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
setTruncStoreAction(VT, MVT::i1, Expand);
addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
// Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
// PPC (the libcall is not available).
setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom);
// We do not currently implement these libm ops for PowerPC.
setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);
setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
setOperationAction(ISD::FREM, MVT::ppcf128, Expand);
// PowerPC has no SREM/UREM instructions unless we are on P9
// On P9 we may use a hardware instruction to compute the remainder.
// When the result of both the remainder and the division is required it is
// more efficient to compute the remainder from the result of the division
// rather than use the remainder instruction. The instructions are legalized
// directly because the DivRemPairsPass performs the transformation at the IR
// level.
if (Subtarget.isISA3_0()) {
setOperationAction(ISD::SREM, MVT::i32, Legal);
setOperationAction(ISD::UREM, MVT::i32, Legal);
setOperationAction(ISD::SREM, MVT::i64, Legal);
setOperationAction(ISD::UREM, MVT::i64, Legal);
} else {
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
setOperationAction(ISD::SREM, MVT::i64, Expand);
setOperationAction(ISD::UREM, MVT::i64, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
// Handle constrained floating-point operations of scalar.
// TODO: Handle SPE specific operation.
setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal);
if (Subtarget.hasVSX())
setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Legal);
if (Subtarget.hasFSQRT()) {
setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
if (Subtarget.hasFPRND()) {
setOperationAction(ISD::STRICT_FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FFLOOR, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FROUND, MVT::f64, Legal);
// We don't support sin/cos/sqrt/fmod/pow
setOperationAction(ISD::FSIN , MVT::f64, Expand);
setOperationAction(ISD::FCOS , MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FREM , MVT::f64, Expand);
setOperationAction(ISD::FPOW , MVT::f64, Expand);
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::FREM , MVT::f32, Expand);
setOperationAction(ISD::FPOW , MVT::f32, Expand);
if (Subtarget.hasSPE()) {
setOperationAction(ISD::FMA , MVT::f64, Expand);
setOperationAction(ISD::FMA , MVT::f32, Expand);
} else {
setOperationAction(ISD::FMA , MVT::f64, Legal);
setOperationAction(ISD::FMA , MVT::f32, Legal);
setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
// If we're enabling GP optimizations, use hardware square root
if (!Subtarget.hasFSQRT() &&
!(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
setOperationAction(ISD::FSQRT, MVT::f64, Expand);
if (!Subtarget.hasFSQRT() &&
!(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
setOperationAction(ISD::FSQRT, MVT::f32, Expand);
if (Subtarget.hasFCPSGN()) {
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
} else {
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
if (Subtarget.hasFPRND()) {
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FROUND, MVT::f64, Legal);
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
// PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd
// to speed up scalar BSWAP64.
// CTPOP or CTTZ were introduced in P8/P9 respectively
setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
if (Subtarget.hasP9Vector())
setOperationAction(ISD::BSWAP, MVT::i64 , Custom);
setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
if (Subtarget.isISA3_0()) {
setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
} else {
setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
setOperationAction(ISD::CTPOP, MVT::i32 , Legal);
setOperationAction(ISD::CTPOP, MVT::i64 , Legal);
} else {
setOperationAction(ISD::CTPOP, MVT::i32 , Expand);
setOperationAction(ISD::CTPOP, MVT::i64 , Expand);
// PowerPC does not have ROTR
setOperationAction(ISD::ROTR, MVT::i32 , Expand);
setOperationAction(ISD::ROTR, MVT::i64 , Expand);
if (!Subtarget.useCRBits()) {
// PowerPC does not have Select
setOperationAction(ISD::SELECT, MVT::i32, Expand);
setOperationAction(ISD::SELECT, MVT::i64, Expand);
setOperationAction(ISD::SELECT, MVT::f32, Expand);
setOperationAction(ISD::SELECT, MVT::f64, Expand);
// PowerPC wants to turn select_cc of FP into fsel when possible.
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
// PowerPC wants to optimize integer setcc a bit
if (!Subtarget.useCRBits())
setOperationAction(ISD::SETCC, MVT::i32, Custom);
// PowerPC does not have BRCOND which requires SetCC
if (!Subtarget.useCRBits())
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
if (Subtarget.hasSPE()) {
// SPE has built-in conversions
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
} else {
// PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
// PowerPC does not have [U|S]INT_TO_FP
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
if (Subtarget.hasDirectMove() && isPPC64) {
setOperationAction(ISD::BITCAST, MVT::f32, Legal);
setOperationAction(ISD::BITCAST, MVT::i32, Legal);
setOperationAction(ISD::BITCAST, MVT::i64, Legal);
setOperationAction(ISD::BITCAST, MVT::f64, Legal);
if (TM.Options.UnsafeFPMath) {
setOperationAction(ISD::LRINT, MVT::f64, Legal);
setOperationAction(ISD::LRINT, MVT::f32, Legal);
setOperationAction(ISD::LLRINT, MVT::f64, Legal);
setOperationAction(ISD::LLRINT, MVT::f32, Legal);
setOperationAction(ISD::LROUND, MVT::f64, Legal);
setOperationAction(ISD::LROUND, MVT::f32, Legal);
setOperationAction(ISD::LLROUND, MVT::f64, Legal);
setOperationAction(ISD::LLROUND, MVT::f32, Legal);
} else {
setOperationAction(ISD::BITCAST, MVT::f32, Expand);
setOperationAction(ISD::BITCAST, MVT::i32, Expand);
setOperationAction(ISD::BITCAST, MVT::i64, Expand);
setOperationAction(ISD::BITCAST, MVT::f64, Expand);
// We cannot sextinreg(i1). Expand to shifts.
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
// SjLj exception handling but a light-weight setjmp/longjmp replacement to
// support continuation, user-level threading, etc. As a result, no
// other SjLj exception interfaces are implemented and please don't build
// your own exception handling based on them.
// LLVM/Clang supports zero-cost DWARF exception handling.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
// We want to legalize GlobalAddress and ConstantPool nodes into the
// appropriate instructions to materialize the address.
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
setOperationAction(ISD::JumpTable, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
// TRAP is legal.
setOperationAction(ISD::TRAP, MVT::Other, Legal);
// TRAMPOLINE is custom lowered.
setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
if (Subtarget.is64BitELFABI()) {
// VAARG always uses double-word chunks, so promote anything smaller.
setOperationAction(ISD::VAARG, MVT::i1, Promote);
AddPromotedToType(ISD::VAARG, MVT::i1, MVT::i64);
setOperationAction(ISD::VAARG, MVT::i8, Promote);
AddPromotedToType(ISD::VAARG, MVT::i8, MVT::i64);
setOperationAction(ISD::VAARG, MVT::i16, Promote);
AddPromotedToType(ISD::VAARG, MVT::i16, MVT::i64);
setOperationAction(ISD::VAARG, MVT::i32, Promote);
AddPromotedToType(ISD::VAARG, MVT::i32, MVT::i64);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
} else if (Subtarget.is32BitELFABI()) {
// VAARG is custom lowered with the 32-bit SVR4 ABI.
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::i64, Custom);
} else
setOperationAction(ISD::VAARG, MVT::Other, Expand);
// VACOPY is custom lowered with the 32-bit SVR4 ABI.
if (Subtarget.is32BitELFABI())
setOperationAction(ISD::VACOPY , MVT::Other, Custom);
setOperationAction(ISD::VACOPY , MVT::Other, Expand);
// Use the default implementation.
setOperationAction(ISD::VAEND , MVT::Other, Expand);
setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom);
setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
// To handle counter-based loop conditions.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
// Comparisons that require checking two conditions.
if (Subtarget.hasSPE()) {
setCondCodeAction(ISD::SETO, MVT::f32, Expand);
setCondCodeAction(ISD::SETO, MVT::f64, Expand);
setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
if (Subtarget.has64BitSupport()) {
// They also have instructions for converting between i64 and fp.
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
// This is just the low 32 bits of a (signed) fp->i64 conversion.
// We cannot do this with Promote because i64 is not a legal type.
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
if (Subtarget.hasLFIWAX() || Subtarget.isPPC64())
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
} else {
// PowerPC does not have FP_TO_UINT on 32-bit implementations.
if (Subtarget.hasSPE()) {
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
} else
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
// With the instructions enabled under FPCVT, we can do everything.
if (Subtarget.hasFPCVT()) {
if (Subtarget.has64BitSupport()) {
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
if (Subtarget.use64BitRegs()) {
// 64-bit PowerPC implementations can support i64 types directly
addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
// BUILD_PAIR can't be handled natively, and should be expanded to shl/or
setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
// 64-bit PowerPC wants to expand i128 shifts itself.
setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
} else {
// 32-bit PowerPC wants to expand i64 shifts itself.
setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
if (Subtarget.hasVSX()) {
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
if (Subtarget.hasAltivec()) {
for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
// First set operation action for all vector types to expand. Then we
// will selectively turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
// add/sub are legal for all supported vector VT's.
setOperationAction(ISD::ADD, VT, Legal);
setOperationAction(ISD::SUB, VT, Legal);
// For v2i64, these are only valid with P8Vector. This is corrected after
// the loop.
if (VT.getSizeInBits() <= 128 && VT.getScalarSizeInBits() <= 64) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
else {
setOperationAction(ISD::SMAX, VT, Expand);
setOperationAction(ISD::SMIN, VT, Expand);
setOperationAction(ISD::UMAX, VT, Expand);
setOperationAction(ISD::UMIN, VT, Expand);
if (Subtarget.hasVSX()) {
setOperationAction(ISD::FMAXNUM, VT, Legal);
setOperationAction(ISD::FMINNUM, VT, Legal);
// Vector instructions introduced in P8
if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
setOperationAction(ISD::CTPOP, VT, Legal);
setOperationAction(ISD::CTLZ, VT, Legal);
else {
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
// Vector instructions introduced in P9
if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
setOperationAction(ISD::CTTZ, VT, Legal);
setOperationAction(ISD::CTTZ, VT, Expand);
// We promote all shuffles to v16i8.
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);
// We promote all non-typed operations to v4i32.
setOperationAction(ISD::AND , VT, Promote);
AddPromotedToType (ISD::AND , VT, MVT::v4i32);
setOperationAction(ISD::OR , VT, Promote);
AddPromotedToType (ISD::OR , VT, MVT::v4i32);
setOperationAction(ISD::XOR , VT, Promote);
AddPromotedToType (ISD::XOR , VT, MVT::v4i32);
setOperationAction(ISD::LOAD , VT, Promote);
AddPromotedToType (ISD::LOAD , VT, MVT::v4i32);
setOperationAction(ISD::SELECT, VT, Promote);
AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
setOperationAction(ISD::VSELECT, VT, Legal);
setOperationAction(ISD::SELECT_CC, VT, Promote);
AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32);
setOperationAction(ISD::STORE, VT, Promote);
AddPromotedToType (ISD::STORE, VT, MVT::v4i32);
// No other operations are legal.
setOperationAction(ISD::MUL , VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::FDIV, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FNEG, VT, Expand);
setOperationAction(ISD::FSQRT, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FABS, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v4i32, Expand);
if (!Subtarget.hasP8Vector()) {
setOperationAction(ISD::SMAX, MVT::v2i64, Expand);
setOperationAction(ISD::SMIN, MVT::v2i64, Expand);
setOperationAction(ISD::UMAX, MVT::v2i64, Expand);
setOperationAction(ISD::UMIN, MVT::v2i64, Expand);
for (auto VT : {MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8})
setOperationAction(ISD::ABS, VT, Custom);
// We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
// with merges, splats, etc.
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
// Vector truncates to sub-word integer that fit in an Altivec/VSX register
// are cheap, so handle them before they get expanded to scalar.
setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
setOperationAction(ISD::AND , MVT::v4i32, Legal);
setOperationAction(ISD::OR , MVT::v4i32, Legal);
setOperationAction(ISD::XOR , MVT::v4i32, Legal);
setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
setOperationAction(ISD::SELECT, MVT::v4i32,
Subtarget.useCRBits() ? Legal : Expand);
setOperationAction(ISD::STORE , MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
// Without hasP8Altivec set, v2i64 SMAX isn't available.
// But ABS custom lowering requires SMAX support.
if (!Subtarget.hasP8Altivec())
setOperationAction(ISD::ABS, MVT::v2i64, Expand);
// Custom lowering ROTL v1i128 to VECTOR_SHUFFLE v16i8.
setOperationAction(ISD::ROTL, MVT::v1i128, Custom);
// With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).
if (Subtarget.hasAltivec())
for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8})
setOperationAction(ISD::ROTL, VT, Legal);
// With hasP8Altivec set, we can lower ISD::ROTL to vrld.
if (Subtarget.hasP8Altivec())
setOperationAction(ISD::ROTL, MVT::v2i64, Legal);
addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);
setOperationAction(ISD::MUL, MVT::v4f32, Legal);
setOperationAction(ISD::FMA, MVT::v4f32, Legal);
if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) {
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
if (Subtarget.hasP8Altivec())
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
// Altivec does not contain unordered floating-point compare instructions
setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);
if (Subtarget.hasVSX()) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
if (Subtarget.hasP8Vector()) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
if (Subtarget.hasDirectMove() && isPPC64) {
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
// The nearbyint variants are not allowed to raise the inexact exception
// so we can only code-gen them with unsafe math.
if (TM.Options.UnsafeFPMath) {
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
setOperationAction(ISD::FROUND, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
setOperationAction(ISD::FRINT, MVT::f32, Legal);
setOperationAction(ISD::MUL, MVT::v2f64, Legal);
setOperationAction(ISD::FMA, MVT::v2f64, Legal);
setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
// Share the Altivec comparison restrictions.
setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);
setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
setOperationAction(ISD::STORE, MVT::v2f64, Legal);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);
if (Subtarget.hasP8Vector())
addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);
addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);
addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);
if (Subtarget.hasP8Altivec()) {
setOperationAction(ISD::SHL, MVT::v2i64, Legal);
setOperationAction(ISD::SRA, MVT::v2i64, Legal);
setOperationAction(ISD::SRL, MVT::v2i64, Legal);
// 128 bit shifts can be accomplished via 3 instructions for SHL and
// SRL, but not for SRA because of the instructions available:
// VS{RL} and VS{RL}O. However due to direct move costs, it's not worth
// doing
setOperationAction(ISD::SHL, MVT::v1i128, Expand);
setOperationAction(ISD::SRL, MVT::v1i128, Expand);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
else {
setOperationAction(ISD::SHL, MVT::v2i64, Expand);
setOperationAction(ISD::SRA, MVT::v2i64, Expand);
setOperationAction(ISD::SRL, MVT::v2i64, Expand);
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
// VSX v2i64 only supports non-arithmetic operations.
setOperationAction(ISD::ADD, MVT::v2i64, Expand);
setOperationAction(ISD::SUB, MVT::v2i64, Expand);
setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
setOperationAction(ISD::STORE, MVT::v2i64, Promote);
AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
// Custom handling for partial vectors of integers converted to
// floating point. We already have optimal handling for v2i32 through
// the DAG combine, so those aren't necessary.
setOperationAction(ISD::UINT_TO_FP, MVT::v2i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i8, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
setOperationAction(ISD::FABS, MVT::v4f32, Legal);
setOperationAction(ISD::FABS, MVT::v2f64, Legal);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Legal);
if (Subtarget.hasDirectMove())
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
// Handle constrained floating-point operations on vectors.
// The predicate is `hasVSX` because Altivec instructions raise no
// exceptions, whereas VSX vector instructions do.
setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMAXNUM, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMINNUM, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMAXNUM, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMINNUM, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);
addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
if (Subtarget.hasP8Altivec()) {
addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
if (Subtarget.hasP9Vector()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
// 128 bit shifts can be accomplished via 3 instructions for SHL and
// SRL, but not for SRA because of the instructions available:
// VS{RL} and VS{RL}O.
setOperationAction(ISD::SHL, MVT::v1i128, Legal);
setOperationAction(ISD::SRL, MVT::v1i128, Legal);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
setOperationAction(ISD::FADD, MVT::f128, Legal);
setOperationAction(ISD::FSUB, MVT::f128, Legal);
setOperationAction(ISD::FDIV, MVT::f128, Legal);
setOperationAction(ISD::FMUL, MVT::f128, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
// No extending loads to f128 on PPC.
for (MVT FPT : MVT::fp_valuetypes())
setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
setOperationAction(ISD::FMA, MVT::f128, Legal);
setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
setCondCodeAction(ISD::SETONE, MVT::f128, Expand);
setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
setOperationAction(ISD::FRINT, MVT::f128, Legal);
setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
setOperationAction(ISD::FCEIL, MVT::f128, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
setOperationAction(ISD::FROUND, MVT::f128, Legal);
setOperationAction(ISD::SELECT, MVT::f128, Expand);
setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setOperationAction(ISD::BITCAST, MVT::i128, Custom);
// No implementation for these ops for PowerPC.
setOperationAction(ISD::FSIN, MVT::f128, Expand);
setOperationAction(ISD::FCOS, MVT::f128, Expand);
setOperationAction(ISD::FPOW, MVT::f128, Expand);
setOperationAction(ISD::FPOWI, MVT::f128, Expand);
setOperationAction(ISD::FREM, MVT::f128, Expand);
// Handle constrained floating-point operations of fp128
setOperationAction(ISD::STRICT_FADD, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FRINT, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FFLOOR, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FCEIL, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FTRUNC, MVT::f128, Legal);
setOperationAction(ISD::STRICT_FROUND, MVT::f128, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::BSWAP, MVT::v8i16, Legal);
setOperationAction(ISD::BSWAP, MVT::v4i32, Legal);
setOperationAction(ISD::BSWAP, MVT::v2i64, Legal);
setOperationAction(ISD::BSWAP, MVT::v1i128, Legal);
if (Subtarget.hasP9Altivec()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
if (Subtarget.hasQPX()) {
setOperationAction(ISD::FADD, MVT::v4f64, Legal);
setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::FREM, MVT::v4f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal);
setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand);
setOperationAction(ISD::LOAD , MVT::v4f64, Custom);
setOperationAction(ISD::STORE , MVT::v4f64, Custom);
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom);
if (!Subtarget.useCRBits())
setOperationAction(ISD::SELECT, MVT::v4f64, Expand);
setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand);
setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal);
setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand);
setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
setOperationAction(ISD::FNEG , MVT::v4f64, Legal);
setOperationAction(ISD::FABS , MVT::v4f64, Legal);
setOperationAction(ISD::FSIN , MVT::v4f64, Expand);
setOperationAction(ISD::FCOS , MVT::v4f64, Expand);
setOperationAction(ISD::FPOW , MVT::v4f64, Expand);
setOperationAction(ISD::FLOG , MVT::v4f64, Expand);
setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand);
setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand);
setOperationAction(ISD::FEXP , MVT::v4f64, Expand);
setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand);
setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal);
addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass);
setOperationAction(ISD::FADD, MVT::v4f32, Legal);
setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
setOperationAction(ISD::FREM, MVT::v4f32, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand);
setOperationAction(ISD::LOAD , MVT::v4f32, Custom);
setOperationAction(ISD::STORE , MVT::v4f32, Custom);
if (!Subtarget.useCRBits())
setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand);
setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal);
setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand);
setOperationAction(ISD::FNEG , MVT::v4f32, Legal);
setOperationAction(ISD::FABS , MVT::v4f32, Legal);
setOperationAction(ISD::FSIN , MVT::v4f32, Expand);
setOperationAction(ISD::FCOS , MVT::v4f32, Expand);
setOperationAction(ISD::FPOW , MVT::v4f32, Expand);
setOperationAction(ISD::FLOG , MVT::v4f32, Expand);
setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand);
setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand);
setOperationAction(ISD::FEXP , MVT::v4f32, Expand);
setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand);
setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal);
setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal);
addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass);
setOperationAction(ISD::AND , MVT::v4i1, Legal);
setOperationAction(ISD::OR , MVT::v4i1, Legal);
setOperationAction(ISD::XOR , MVT::v4i1, Legal);
if (!Subtarget.useCRBits())
setOperationAction(ISD::SELECT, MVT::v4i1, Expand);
setOperationAction(ISD::VSELECT, MVT::v4i1, Legal);
setOperationAction(ISD::LOAD , MVT::v4i1, Custom);
setOperationAction(ISD::STORE , MVT::v4i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand);
setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass);
setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
setOperationAction(ISD::FROUND, MVT::v4f64, Legal);
setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
// These need to set FE_INEXACT, and so cannot be vectorized here.
setOperationAction(ISD::FRINT, MVT::v4f64, Expand);
setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
if (TM.Options.UnsafeFPMath) {
setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
} else {
setOperationAction(ISD::FDIV, MVT::v4f64, Expand);
setOperationAction(ISD::FSQRT, MVT::v4f64, Expand);
setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
// TODO: Handle constrained floating-point operations of v4f64
if (Subtarget.has64BitSupport())
setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);
if (!isPPC64) {
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
if (Subtarget.hasAltivec()) {
// Altivec instructions set fields to all zeros or all ones.
if (!isPPC64) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
// We have target-specific dag combine patterns for the following nodes:
if (Subtarget.hasFPCVT())
if (Subtarget.useCRBits())
if (Subtarget.useCRBits()) {
// Use reciprocal estimates.
if (TM.Options.UnsafeFPMath) {
if (Subtarget.hasP9Altivec()) {
setLibcallName(RTLIB::LOG_F128, "logf128");
setLibcallName(RTLIB::LOG2_F128, "log2f128");
setLibcallName(RTLIB::LOG10_F128, "log10f128");
setLibcallName(RTLIB::EXP_F128, "expf128");
setLibcallName(RTLIB::EXP2_F128, "exp2f128");
setLibcallName(RTLIB::SIN_F128, "sinf128");
setLibcallName(RTLIB::COS_F128, "cosf128");
setLibcallName(RTLIB::POW_F128, "powf128");
setLibcallName(RTLIB::FMIN_F128, "fminf128");
setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
setLibcallName(RTLIB::POWI_F128, "__powikf2");
setLibcallName(RTLIB::REM_F128, "fmodf128");
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
if (Subtarget.useCRBits()) {
switch (Subtarget.getCPUDirective()) {
default: break;
case PPC::DIR_970:
case PPC::DIR_A2:
case PPC::DIR_E500:
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
case PPC::DIR_PWR4:
case PPC::DIR_PWR5:
case PPC::DIR_PWR5X:
case PPC::DIR_PWR6:
case PPC::DIR_PWR6X:
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
case PPC::DIR_PWR9:
case PPC::DIR_PWR10:
if (Subtarget.enableMachineScheduler())
// The Freescale cores do better with aggressive inlining of memcpy and
// friends. GCC uses same threshold of 128 bytes (= 32 word stores).
if (Subtarget.getCPUDirective() == PPC::DIR_E500mc ||
Subtarget.getCPUDirective() == PPC::DIR_E5500) {
MaxStoresPerMemset = 32;
MaxStoresPerMemsetOptSize = 16;
MaxStoresPerMemcpy = 32;
MaxStoresPerMemcpyOptSize = 8;
MaxStoresPerMemmove = 32;
MaxStoresPerMemmoveOptSize = 8;
} else if (Subtarget.getCPUDirective() == PPC::DIR_A2) {
// The A2 also benefits from (very) aggressive inlining of memcpy and
// friends. The overhead of the function call, even when warm, can be
// over one hundred cycles.
MaxStoresPerMemset = 128;
MaxStoresPerMemcpy = 128;
MaxStoresPerMemmove = 128;
MaxLoadsPerMemcmp = 128;
} else {
MaxLoadsPerMemcmp = 8;
MaxLoadsPerMemcmpOptSize = 4;
// Let the subtarget (CPU) decide if a predictable select is more expensive
// than the corresponding branch. This information is used in CGP to decide
// when to convert selects into branches.
PredictableSelectIsExpensive = Subtarget.isPredictableSelectIsExpensive();
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
///
/// Walks \p Ty recursively (vectors, arrays, structs) and raises \p MaxAlign
/// as needed; \p MaxAlign is never raised above \p MaxMaxAlign.
///
/// \param Ty          Type being inspected.
/// \param MaxAlign    In/out: largest alignment found so far.
/// \param MaxMaxAlign Upper bound; once reached, the search stops.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {
  // Nothing can raise the alignment any further.
  if (MaxAlign == MaxMaxAlign)
    return;

  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    // 256-bit and wider vectors want 32 bytes (when permitted); 128-bit and
    // wider vectors want 16 bytes.
    if (MaxMaxAlign >= 32 &&
        VTy->getPrimitiveSizeInBits().getFixedSize() >= 256)
      MaxAlign = Align(32);
    else if (VTy->getPrimitiveSizeInBits().getFixedSize() >= 128 &&
             MaxAlign < 16)
      MaxAlign = Align(16);
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    // An array is as aligned as its element type.
    Align EltAlign;
    getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    // A struct is as aligned as its most-aligned member.
    for (auto *EltTy : STy->elements()) {
      Align EltAlign;
      getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      // Stop scanning members once the cap has been reached.
      if (MaxAlign == MaxMaxAlign)
        break;
    }
  }
}
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
///
/// \param Ty Aggregate type being passed by value.
/// \param DL Data layout (not consulted here).
/// \return the alignment in bytes.
unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  // 16byte and wider vectors are passed on 16byte boundary.
  // The rest is 8 on PPC64 and 4 on PPC32 boundary.
  Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
  // Only vector-capable subtargets can need more; QPX raises the cap to 32.
  if (Subtarget.hasAltivec() || Subtarget.hasQPX())
    getMaxByValAlign(Ty, Alignment, Subtarget.hasQPX() ? Align(32) : Align(16));
  return Alignment.value();
}
/// Forward the subtarget's soft-float setting.
bool PPCTargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}
/// Forward the subtarget's SPE setting.
bool PPCTargetLowering::hasSPE() const {
  return Subtarget.hasSPE();
}
/// Answer the preferIncOfAddToSubOfNot query: true exactly when \p VT is a
/// scalar integer type.
bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  return VT.isScalarInteger();
}
/// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a specific
/// type is cheaper than a multiply followed by a shift.
/// This is true for words and doublewords on 64-bit PowerPC.
///
/// Any other case is deferred to the generic TargetLowering answer.
bool PPCTargetLowering::isMulhCheaperThanMulShift(EVT Type) const {
  if (Subtarget.isPPC64() && (isOperationLegal(ISD::MULHS, Type) ||
                              isOperationLegal(ISD::MULHU, Type)))
    return true;
  return TargetLowering::isMulhCheaperThanMulShift(Type);
}
/// Return the printable name of a PPC-specific SelectionDAG opcode, or nullptr
/// when \p Opcode has no entry in the table below.
// NOTE(review): the case list looks abbreviated relative to the full
// PPCISD::NodeType enumeration - confirm it against the enum declaration.
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((PPCISD::NodeType)Opcode) {
  case PPCISD::FSEL:            return "PPCISD::FSEL";
  case PPCISD::FRE:             return "PPCISD::FRE";
  case PPCISD::CMPB:            return "PPCISD::CMPB";
  case PPCISD::Hi:              return "PPCISD::Hi";
  case PPCISD::Lo:              return "PPCISD::Lo";
  case PPCISD::GlobalBaseReg:   return "PPCISD::GlobalBaseReg";
  case PPCISD::SRL:             return "PPCISD::SRL";
  case PPCISD::SRA:             return "PPCISD::SRA";
  case PPCISD::SHL:             return "PPCISD::SHL";
  case PPCISD::CALL:            return "PPCISD::CALL";
  case PPCISD::ANDI_rec_1_EQ_BIT:
    return "PPCISD::ANDI_rec_1_EQ_BIT";
  case PPCISD::ANDI_rec_1_GT_BIT:
    return "PPCISD::ANDI_rec_1_GT_BIT";
  case PPCISD::VCMP:            return "PPCISD::VCMP";
  case PPCISD::VCMPo:           return "PPCISD::VCMPo";
  case PPCISD::LBRX:            return "PPCISD::LBRX";
  case PPCISD::LXVD2X:          return "PPCISD::LXVD2X";
  case PPCISD::BDNZ:            return "PPCISD::BDNZ";
  case PPCISD::BDZ:             return "PPCISD::BDZ";
  case PPCISD::MFFS:            return "PPCISD::MFFS";
  case PPCISD::CR6SET:          return "PPCISD::CR6SET";
  case PPCISD::PPC32_GOT:       return "PPCISD::PPC32_GOT";
  case PPCISD::SC:              return "PPCISD::SC";
  case PPCISD::QVLFSb:          return "PPCISD::QVLFSb";
  case PPCISD::BUILD_FP128:     return "PPCISD::BUILD_FP128";
  }
  // Opcodes without a case above fall out of the switch.
  return nullptr;
}
/// Return the value type produced by a SETCC whose operands have type \p VT.
///
/// Scalars compare into i1 when CR bits are in use and into i32 otherwise.
/// Vectors compare into an i1 vector of the same length on QPX, and otherwise
/// into the integer vector with the same shape as \p VT.
EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
                                          EVT VT) const {
  if (!VT.isVector())
    return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;

  if (Subtarget.hasQPX())
    return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());

  return VT.changeVectorElementTypeToInteger();
}
/// Answer the enableAggressiveFMAFusion query; always true here. \p VT must
/// be a floating-point type (asserted).
bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
  return true;
}
// Node matching predicates, for use by the tblgen matching code.
/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
///
/// Recognizes a ConstantFP node directly, and also a (possibly extending)
/// load whose address operand is a constant-pool entry holding an FP zero.
static bool isFloatingPointZero(SDValue Op) {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isZero();

  if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
    // Maybe this has already been legalized into the constant pool?
    if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
      if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
        return CFP->getValueAPF().isZero();
  }
  return false;
}
/// isConstantOrUndef - Op is a shuffle-mask entry: negative means undef,
/// otherwise it is an element index. Return true if Op is undef or if it
/// matches the specified value.
static bool isConstantOrUndef(int Op, int Val) {
  return Op < 0 || Op == Val;
}
/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUHUM instruction.
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
///
/// The accepted mask selects one byte out of every halfword: byte 2*i+1 for
/// kind 0, byte 2*i for kind 2. Undef mask entries match anything.
bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                               SelectionDAG &DAG) {
  bool IsLE = DAG.getDataLayout().isLittleEndian();
  if (ShuffleKind == 0) {
    if (IsLE)
      return false;
    for (unsigned i = 0; i != 16; ++i)
      if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
        return false;
  } else if (ShuffleKind == 2) {
    if (!IsLE)
      return false;
    for (unsigned i = 0; i != 16; ++i)
      if (!isConstantOrUndef(N->getMaskElt(i), i*2))
        return false;
  } else if (ShuffleKind == 1) {
    // Unary form: both halves of the result repeat the same 8-byte pattern.
    unsigned j = IsLE ? 0 : 1;
    for (unsigned i = 0; i != 8; ++i)
      if (!isConstantOrUndef(N->getMaskElt(i),    i*2+j) ||
          !isConstantOrUndef(N->getMaskElt(i+8),  i*2+j))
        return false;
  }
  return true;
}
/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUWUM instruction.
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
///
/// The accepted mask selects one 2-byte half out of every 4-byte word: bytes
/// 2 and 3 of each word for kind 0, bytes 0 and 1 for kind 2. Undef mask
/// entries match anything.
bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                               SelectionDAG &DAG) {
  bool IsLE = DAG.getDataLayout().isLittleEndian();
  if (ShuffleKind == 0) {
    if (IsLE)
      return false;
    for (unsigned i = 0; i != 16; i += 2)
      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+2) ||
          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+3))
        return false;
  } else if (ShuffleKind == 2) {
    if (!IsLE)
      return false;
    for (unsigned i = 0; i != 16; i += 2)
      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2) ||
          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+1))
        return false;
  } else if (ShuffleKind == 1) {
    // Unary form: both halves of the result repeat the same 8-byte pattern.
    unsigned j = IsLE ? 0 : 2;
    for (unsigned i = 0; i != 8; i += 2)
      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+j)   ||
          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+j+1) ||
          !isConstantOrUndef(N->getMaskElt(i+8),  i*2+j)   ||
          !isConstantOrUndef(N->getMaskElt(i+9),  i*2+j+1))
        return false;
  }
  return true;
}
/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUDUM instruction, AND the VPKUDUM instruction exists for the
/// current subtarget.
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
///
/// The accepted mask selects one 4-byte half out of every 8-byte doubleword:
/// bytes 4..7 of each doubleword for kind 0, bytes 0..3 for kind 2. Undef
/// mask entries match anything.
bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                               SelectionDAG &DAG) {
  const PPCSubtarget& Subtarget =
      static_cast<const PPCSubtarget&>(DAG.getSubtarget());
  // Never match on subtargets without the P8 vector feature.
  if (!Subtarget.hasP8Vector())
    return false;

  bool IsLE = DAG.getDataLayout().isLittleEndian();
  if (ShuffleKind == 0) {
    if (IsLE)
      return false;
    for (unsigned i = 0; i != 16; i += 4)
      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+4) ||
          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+5) ||
          !isConstantOrUndef(N->getMaskElt(i+2),  i*2+6) ||
          !isConstantOrUndef(N->getMaskElt(i+3),  i*2+7))
        return false;
  } else if (ShuffleKind == 2) {
    if (!IsLE)
      return false;
    for (unsigned i = 0; i != 16; i += 4)
      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2) ||
          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+1) ||
          !isConstantOrUndef(N->getMaskElt(i+2),  i*2+2) ||
          !isConstantOrUndef(N->getMaskElt(i+3),  i*2+3))
        return false;
  } else if (ShuffleKind == 1) {
    // Unary form: both halves of the result repeat the same 8-byte pattern.
    unsigned j = IsLE ? 0 : 4;
    for (unsigned i = 0; i != 8; i += 4)
      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+j)   ||
          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+j+1) ||
          !isConstantOrUndef(N->getMaskElt(i+2),  i*2+j+2) ||
          !isConstantOrUndef(N->getMaskElt(i+3),  i*2+j+3) ||
          !isConstantOrUndef(N->getMaskElt(i+8),  i*2+j)   ||
          !isConstantOrUndef(N->getMaskElt(i+9),  i*2+j+1) ||
          !isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) ||
          !isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3))
        return false;
  }
  return true;
}
/// isVMerge - Common function, used to match vmrg* shuffles.
///
/// A merge interleaves UnitSize-byte units taken alternately from the left
/// and right inputs. Result unit 2*i must therefore come from the left input
/// starting at byte LHSStart + i*UnitSize, and result unit 2*i+1 from the
/// right input starting at byte RHSStart + i*UnitSize. Undef mask entries
/// match anything.
///
/// \param N        Shuffle node to inspect; must produce v16i8 to match.
/// \param UnitSize Size in bytes of the merged units (1, 2 or 4).
/// \param LHSStart Mask index of the first byte taken from the left input.
/// \param RHSStart Mask index of the first byte taken from the right input.
static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
                     unsigned LHSStart, unsigned RHSStart) {
  if (N->getValueType(0) != MVT::v16i8)
    return false;
  assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
         "Unsupported merge size!");

  for (unsigned i = 0; i != 8/UnitSize; ++i)     // Step over units
    for (unsigned j = 0; j != UnitSize; ++j) {   // Step over bytes within unit
      if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
                             LHSStart+j+i*UnitSize) ||
          !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
                             RHSStart+j+i*UnitSize))
        return false;
    }
  return true;
}
/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2).  For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
///
/// A ShuffleKind that does not apply to the current endianness never matches.
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                             unsigned ShuffleKind, SelectionDAG &DAG) {
  if (DAG.getDataLayout().isLittleEndian()) {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 0, 0);
    else if (ShuffleKind == 2) // swapped
      return isVMerge(N, UnitSize, 0, 16);
    else
      return false;
  } else {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 8, 8);
    else if (ShuffleKind == 0) // normal
      return isVMerge(N, UnitSize, 8, 24);
    else
      return false;
  }
}
/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2).  For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
///
/// A ShuffleKind that does not apply to the current endianness never matches.
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                             unsigned ShuffleKind, SelectionDAG &DAG) {
  if (DAG.getDataLayout().isLittleEndian()) {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 8, 8);
    else if (ShuffleKind == 2) // swapped
      return isVMerge(N, UnitSize, 8, 24);
    else
      return false;
  } else {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 0, 0);
    else if (ShuffleKind == 0) // normal
      return isVMerge(N, UnitSize, 0, 16);
    else
      return false;
  }
}
* Common function used to match vmrgew and vmrgow shuffles
* The indexOffset determines whether to look for even or odd words in
* the shuffle mask. This is based on the of the endianness of the target
* machine.
* - Little Endian:
* - Use offset of 0 to check for odd elements
* - Use offset of 4 to check for even elements
* - Big Endian:
* - Use offset of 0 to check for even elements
* - Use offset of 4 to check for odd elements
* A detailed description of the vector element ordering for little endian and
* big endian can be found at
* Targeting your applications - what little endian and big endian IBM XL C/C++
* compiler differences mean to you
* The mask to the shuffle vector instruction specifies the indices of the
* elements from the two input vectors to place in the result. The elements are
* numbered in array-access order, starting with the first vector. These vectors
* are always of type v16i8, thus each vector will contain 16 elements of size
* 8. More info on the shuffle vector can be found in the
* Language Reference.
* The RHSStartValue indicates whether the same input vectors are used (unary)
* or two different input vectors are used, based on the following:
* - If the instruction uses the same vector for both inputs, the range of the
* indices will be 0 to 15. In this case, the RHSStart value passed should
* be 0.
* - If the instruction has two different vectors then the range of the
* indices will be 0 to 31. In this case, the RHSStart value passed should
* be 16 (indices 0-15 specify elements in the first vector while indices 16
* to 31 specify elements in the second vector).
* \param[in] N The shuffle vector SD Node to analyze
* \param[in] IndexOffset Specifies whether to look for even or odd elements
* \param[in] RHSStartValue Specifies the starting index for the righthand input
* vector to the shuffle_vector instruction
* \return true iff this shuffle vector represents an even or odd word merge
static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
unsigned RHSStartValue) {
if (N->getValueType(0) != MVT::v16i8)
return false;
for (unsigned i = 0; i < 2; ++i)
for (unsigned j = 0; j < 4; ++j)
if (!isConstantOrUndef(N->getMaskElt(i*4+j),
i*RHSStartValue+j+IndexOffset) ||
return false;
return true;
* Determine if the specified shuffle mask is suitable for the vmrgew or
* vmrgow instructions.
* \param[in] N The shuffle vector SD Node to analyze
* \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
* \param[in] ShuffleKind Identify the type of merge:
* - 0 = big-endian merge with two different inputs;
* - 1 = either-endian merge with two identical inputs;
* - 2 = little-endian merge with two different inputs (inputs are swapped for
* little-endian merges).
* \param[in] DAG The current SelectionDAG
* \return true iff this shuffle mask
bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
unsigned ShuffleKind, SelectionDAG &DAG) {
if (DAG.getDataLayout().isLittleEndian()) {
unsigned indexOffset = CheckEven ? 4 : 0;
if (ShuffleKind == 1) // Unary
return isVMerge(N, indexOffset, 0);
else if (ShuffleKind == 2) // swapped
return isVMerge(N, indexOffset, 16);
return false;
else {
unsigned indexOffset = CheckEven ? 0 : 4;
if (ShuffleKind == 1) // Unary
return isVMerge(N, indexOffset, 0);
else if (ShuffleKind == 0) // Normal
return isVMerge(N, indexOffset, 16);
return false;
return false;
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
/// amount, otherwise return -1.
/// The ShuffleKind distinguishes between big-endian operations with two
/// different inputs (0), either-endian operations with two identical inputs
/// (1), and little-endian operations with two different inputs (2).  For the
/// latter, the input operands are swapped (see PPCInstrAltivec.td).
///
/// \param N Shuffle node to inspect; must produce v16i8 to match.
/// \return the shift amount, or -1 when the mask does not match.
int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
                             SelectionDAG &DAG) {
  if (N->getValueType(0) != MVT::v16i8)
    return -1;

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);

  // Find the first non-undef value in the shuffle mask.
  unsigned i;
  for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
    /*search*/;

  if (i == 16) return -1;  // all undef.

  // Otherwise, check to see if the rest of the elements are consecutively
  // numbered from this value.
  unsigned ShiftAmt = SVOp->getMaskElt(i);
  if (ShiftAmt < i) return -1;

  ShiftAmt -= i;
  bool isLE = DAG.getDataLayout().isLittleEndian();

  if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
    // Check the rest of the elements to see if they are consecutive.
    for (++i; i != 16; ++i)
      if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
        return -1;
  } else if (ShuffleKind == 1) {
    // Check the rest of the elements to see if they are consecutive; with a
    // single input the indices wrap around within 0..15.
    for (++i; i != 16; ++i)
      if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
        return -1;
  } else
    return -1;

  // On little-endian targets the shift amount is mirrored.
  if (isLE)
    ShiftAmt = 16 - ShiftAmt;

  return ShiftAmt;
}
/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a splat of a single element that is suitable for input to
/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
///
/// \param N       Shuffle node to inspect; must produce v16i8 (asserted).
/// \param EltSize Size in bytes of the splatted element (1, 2, 4 or 8).
bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
  assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) &&
         EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");

  // The consecutive indices need to specify an element, not part of two
  // different elements.  So abandon ship early if this isn't the case.
  if (N->getMaskElt(0) % EltSize != 0)
    return false;

  // This is a splat operation if each element of the permute is the same, and
  // if the value doesn't reference the second vector.
  unsigned ElementBase = N->getMaskElt(0);

  // FIXME: Handle UNDEF elements too!
  if (ElementBase >= 16)
    return false;

  // Check that the indices are consecutive, in the case of a multi-byte element
  // splatted with a v16i8 mask.
  for (unsigned i = 1; i != EltSize; ++i)
    if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
      return false;

  // Every later element must repeat the first one; an element whose leading
  // byte is undef is skipped entirely.
  for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
    if (N->getMaskElt(i) < 0) continue;
    for (unsigned j = 0; j != EltSize; ++j)
      if (N->getMaskElt(i+j) != N->getMaskElt(j))
        return false;
  }
  return true;
}
/// Check that the mask is shuffling N byte elements. Within each N byte
/// element of the mask, the indices could be either in increasing or
/// decreasing order as long as they are consecutive.
/// \param[in] N the shuffle vector SD Node to analyze
/// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/
/// Word/DoubleWord/QuadWord).
/// \param[in] StepLen the delta indices number among the N byte element, if
/// the mask is in increasing/decreasing order then it is 1/-1.
/// \return true iff the mask is shuffling N byte elements.
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width,
                                   int StepLen) {
  assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
         "Unexpected element width.");
  assert((StepLen == 1 || StepLen == -1) && "Unexpected step length.");

  unsigned NumOfElem = 16 / Width;
  unsigned MaskVal[16]; //  Width is never greater than 16
  for (unsigned i = 0; i < NumOfElem; ++i) {
    // The first index of each element must sit on an element boundary: the
    // low end when ascending, the high end when descending.
    MaskVal[0] = N->getMaskElt(i * Width);
    if ((StepLen == 1) && (MaskVal[0] % Width)) {
      return false;
    } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) {
      return false;
    }

    // The remaining indices must follow in steps of StepLen.
    for (unsigned int j = 1; j < Width; ++j) {
      MaskVal[j] = N->getMaskElt(i * Width + j);
      if (MaskVal[j] != MaskVal[j-1] + StepLen) {
        return false;
      }
    }
  }

  return true;
}
/// Check whether the mask of \p N describes inserting one word into an
/// otherwise unchanged vector, i.e. whether it fits an XXINSERTW.
///
/// \param[in]  N            Shuffle node to inspect.
/// \param[out] ShiftElts    Shift amount taken from the per-endianness table,
///                          indexed by the inserted word's position in its
///                          source.
/// \param[out] InsertAtByte Byte offset at which the word is inserted.
/// \param[out] Swap         Whether the two shuffle inputs must be swapped.
/// \param[in]  IsLE         True on little-endian targets.
/// \return true iff the mask matches. Note that in the single-input case
/// \p ShiftElts and \p Swap are written even when false is returned.
bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                          unsigned &InsertAtByte, bool &Swap, bool IsLE) {
  // Every word of the result must be a whole, ascending source word.
  if (!isNByteElemShuffleMask(N, 4, 1))
    return false;

  // Now we look at mask elements 0,4,8,12
  unsigned M0 = N->getMaskElt(0) / 4;
  unsigned M1 = N->getMaskElt(4) / 4;
  unsigned M2 = N->getMaskElt(8) / 4;
  unsigned M3 = N->getMaskElt(12) / 4;
  unsigned LittleEndianShifts[] = { 2, 1, 0, 3 };
  unsigned BigEndianShifts[] = { 3, 0, 1, 2 };

  // Below, let H and L be arbitrary elements of the shuffle mask
  // where H is in the range [4,7] and L is in the range [0,3].
  // H, 1, 2, 3 or L, 5, 6, 7
  if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) ||
      (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) {
    ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3];
    InsertAtByte = IsLE ? 12 : 0;
    Swap = M0 < 4;
    return true;
  }
  // 0, H, 2, 3 or 4, L, 6, 7
  if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) ||
      (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) {
    ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3];
    InsertAtByte = IsLE ? 8 : 4;
    Swap = M1 < 4;
    return true;
  }
  // 0, 1, H, 3 or 4, 5, L, 7
  if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) ||
      (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) {
    ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3];
    InsertAtByte = IsLE ? 4 : 8;
    Swap = M2 < 4;
    return true;
  }
  // 0, 1, 2, H or 4, 5, 6, L
  if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) ||
      (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) {
    ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3];
    InsertAtByte = IsLE ? 0 : 12;
    Swap = M3 < 4;
    return true;
  }

  // If both vector operands for the shuffle are the same vector, the mask will
  // contain only elements from the first one and the second one will be undef.
  if (N->getOperand(1).isUndef()) {
    ShiftElts = 0;
    Swap = true;
    unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;
    if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) {
      InsertAtByte = IsLE ? 12 : 0;
      return true;
    }
    if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) {
      InsertAtByte = IsLE ? 8 : 4;
      return true;
    }
    if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) {
      InsertAtByte = IsLE ? 4 : 8;
      return true;
    }
    if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) {
      InsertAtByte = IsLE ? 0 : 12;
      return true;
    }
  }

  return false;
}
/// Check whether the mask of \p N selects four consecutive words from the
/// concatenation of the inputs, i.e. whether it fits an XXSLDWI.
///
/// \param[in]  N         Shuffle node to inspect; must produce v16i8 (asserted).
/// \param[out] ShiftElts Number of words to shift by.
/// \param[out] Swap      Whether the two shuffle inputs must be swapped.
/// \param[in]  IsLE      True on little-endian targets.
/// \return true iff the mask matches.
bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                               bool &Swap, bool IsLE) {
  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
  // Ensure each byte index of the word is consecutive.
  if (!isNByteElemShuffleMask(N, 4, 1))
    return false;

  // Now we look at mask elements 0,4,8,12, which are the beginning of words.
  unsigned M0 = N->getMaskElt(0) / 4;
  unsigned M1 = N->getMaskElt(4) / 4;
  unsigned M2 = N->getMaskElt(8) / 4;
  unsigned M3 = N->getMaskElt(12) / 4;

  // If both vector operands for the shuffle are the same vector, the mask will
  // contain only elements from the first one and the second one will be undef.
  if (N->getOperand(1).isUndef()) {
    assert(M0 < 4 && "Indexing into an undef vector?");
    if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4)
      return false;

    ShiftElts = IsLE ? (4 - M0) % 4 : M0;
    Swap = false;
    return true;
  }

  // Ensure each word index of the ShuffleVector Mask is consecutive.
  if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8)
    return false;

  if (IsLE) {
    if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) {
      // Input vectors don't need to be swapped if the leading element
      // of the result is one of the 3 left elements of the second vector
      // (or if there is no shift to be done at all).
      Swap = false;
      ShiftElts = (8 - M0) % 8;
    } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) {
      // Input vectors need to be swapped if the leading element
      // of the result is one of the 3 left elements of the first vector
      // (or if we're shifting by 4 - thereby simply swapping the vectors).
      Swap = true;
      ShiftElts = (4 - M0) % 4;
    }

    return true;
  } else {                                          // BE
    if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) {
      // Input vectors don't need to be swapped if the leading element
      // of the result is one of the 4 elements of the first vector.
      Swap = false;
      ShiftElts = M0;
    } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) {
      // Input vectors need to be swapped if the leading element
      // of the result is one of the 4 elements of the right vector.
      Swap = true;
      ShiftElts = M0 - 4;
    }

    return true;
  }
}
/// Return true iff the mask of \p N reverses the bytes inside every
/// \p Width-byte element while leaving each element in place.
bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) {
  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
  // Each element must be a descending run of consecutive byte indices.
  if (!isNByteElemShuffleMask(N, Width, -1))
    return false;

  // Each run must start at the last byte of the element it lands in.
  for (int i = 0; i < 16; i += Width)
    if (N->getMaskElt(i) != i + Width - 1)
      return false;

  return true;
}
/// Return true iff the mask of \p N byte-reverses every 2-byte element.
bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) {
  return isXXBRShuffleMaskHelper(N, 2);
}
/// Return true iff the mask of \p N byte-reverses every 4-byte element.
bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) {
  return isXXBRShuffleMaskHelper(N, 4);
}
/// Return true iff the mask of \p N byte-reverses every 8-byte element.
bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) {
  return isXXBRShuffleMaskHelper(N, 8);
}
/// Return true iff the mask of \p N byte-reverses the whole 16-byte vector.
bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) {
  return isXXBRShuffleMaskHelper(N, 16);
}
/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap
/// if the inputs to the instruction should be swapped and set \p DM to the
/// value for the immediate.
/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI
/// AND element 0 of the result comes from the first input (LE) or second input
/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered.
/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle
/// mask.
bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
                                bool &Swap, bool IsLE) {
  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");

  // Ensure each byte index of the double word is consecutive.
  if (!isNByteElemShuffleMask(N, 8, 1))
    return false;

  // Doubleword indices (0-3) selected for each half of the result.
  unsigned M0 = N->getMaskElt(0) / 8;
  unsigned M1 = N->getMaskElt(8) / 8;
  assert(((M0 | M1) < 4) && "A mask element out of bounds?");

  // If both vector operands for the shuffle are the same vector, the mask will
  // contain only elements from the first one and the second one will be undef.
  if (N->getOperand(1).isUndef()) {
    if ((M0 | M1) < 2) {
      DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1);
      Swap = false;
      return true;
    } else
      return false;
  }

  if (IsLE) {
    if (M0 > 1 && M1 < 2) {
      Swap = false;
    } else if (M0 < 2 && M1 > 1) {
      M0 = (M0 + 2) % 4;
      M1 = (M1 + 2) % 4;
      Swap = true;
    } else
      return false;

    // Note: if control flow comes here that means Swap is already set above
    DM = (((~M1) & 1) << 1) + ((~M0) & 1);
    return true;
  } else { // BE
    if (M0 < 2 && M1 > 1) {
      Swap = false;
    } else if (M0 > 1 && M1 < 2) {
      M0 = (M0 + 2) % 4;
      M1 = (M1 + 2) % 4;
      Swap = true;
    } else
      return false;

    // Note: if control flow comes here that means Swap is already set above
    DM = (M0 << 1) + (M1 & 1);
    return true;
  }
}
/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
/// appropriate for PPC mnemonics (which have a big endian bias - namely
/// elements are counted from the left of the vector register).
///
/// \param N       Shuffle node; must satisfy isSplatShuffleMask (asserted).
/// \param EltSize Size in bytes of the splatted element.
unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
                                         SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  assert(isSplatShuffleMask(SVOp, EltSize));
  // On little-endian targets the element index is mirrored.
  if (DAG.getDataLayout().isLittleEndian())
    return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
  else
    return SVOp->getMaskElt(0) / EltSize;
}
/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
/// by using a vspltis[bhw] instruction of the specified element size, return
/// the constant being splatted.  The ByteSize field indicates the number of
/// bytes of each element [124] -> [bhw].
///
/// \return an i32 target constant holding the splat immediate, or an empty
/// SDValue when no such splat exists.
SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
  SDValue OpVal(nullptr, 0);

  // If ByteSize of the splat is bigger than the element size of the
  // build_vector, then we have a case where we are checking for a splat where
  // multiple elements of the buildvector are folded together into a single
  // logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8).
  unsigned EltSize = 16/N->getNumOperands();
  if (EltSize < ByteSize) {
    unsigned Multiple = ByteSize/EltSize;   // Number of BV entries per spltval.
    SDValue UniquedVals[4];
    assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");

    // See if all of the elements in the buildvector agree across.
    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
      if (N->getOperand(i).isUndef()) continue;
      // If the element isn't a constant, bail fully out.
      if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();

      if (!UniquedVals[i&(Multiple-1)].getNode())
        UniquedVals[i&(Multiple-1)] = N->getOperand(i);
      else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
        return SDValue();  // no match.
    }

    // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
    // either constant or undef values that are identical for each chunk.  See
    // if these chunks can form into a larger vspltis*.

    // Check to see if all of the leading entries are either 0 or -1.  If
    // neither, then this won't fit into the immediate field.
    bool LeadingZero = true;
    bool LeadingOnes = true;
    for (unsigned i = 0; i != Multiple-1; ++i) {
      if (!UniquedVals[i].getNode()) continue;  // Must have been undefs.

      LeadingZero &= isNullConstant(UniquedVals[i]);
      LeadingOnes &= isAllOnesConstant(UniquedVals[i]);
    }
    // Finally, check the least significant entry.
    if (LeadingZero) {
      if (!UniquedVals[Multiple-1].getNode())
        return DAG.getTargetConstant(0, SDLoc(N), MVT::i32);  // 0,0,0,undef
      int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
      if (Val < 16)                                   // 0,0,0,4 -> vspltisw(4)
        return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
    }
    if (LeadingOnes) {
      if (!UniquedVals[Multiple-1].getNode())
        return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef
      int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
      if (Val >= -16)                            // -1,-1,-1,-2 -> vspltisw(-2)
        return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
    }

    return SDValue();
  }

  // Check to see if this buildvec has a single non-undef value in its elements.
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    if (N->getOperand(i).isUndef()) continue;
    if (!OpVal.getNode())
      OpVal = N->getOperand(i);
    else if (OpVal != N->getOperand(i))
      return SDValue();
  }

  if (!OpVal.getNode()) return SDValue();  // All UNDEF: use implicit def.

  unsigned ValSizeInBytes = EltSize;
  uint64_t Value = 0;
  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
    Value = CN->getZExtValue();
  } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
    assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
    Value = FloatToBits(CN->getValueAPF().convertToFloat());
  }

  // If the splat value is larger than the element value, then we can never do
  // this splat.  The only case that we could fit the replicated bits into our
  // immediate field for would be zero, and we prefer to use vxor for it.
  if (ValSizeInBytes < ByteSize) return SDValue();

  // If the element value is larger than the splat value, check if it consists
  // of a repeated bit pattern of size ByteSize.
  if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8))
    return SDValue();

  // Properly sign extend the value.
  int MaskVal = SignExtend32(Value, ByteSize * 8);

  // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
  if (MaskVal == 0) return SDValue();

  // Finally, if this value fits in a 5 bit sext field, return it
  if (SignExtend32<5>(MaskVal) == MaskVal)
    return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32);
  return SDValue();
}
/// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift
/// amount, otherwise return -1.
///
/// \param N Shuffle node to inspect. It is cast to ShuffleVectorSDNode once
///          its value type has been accepted.
/// \returns The value of the first defined mask element minus its position,
///          or -1 when the value type is not v4f64/v4f32/v4i1, when all four
///          mask elements are undef, or when the remaining elements are not
///          consecutively numbered (undef elements are accepted).
int PPC::isQVALIGNIShuffleMask(SDNode *N) {
  EVT VT = N->getValueType(0);
  if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1)
    return -1;

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);

  // Find the first non-undef value in the shuffle mask. The loop body is
  // intentionally empty: the all-undef test must run after the scan, not as
  // the loop body, otherwise it can never fire and element 4 would be read.
  unsigned i;
  for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i)
    /*search*/;

  if (i == 4) return -1; // all undef.

  // Otherwise, check to see if the rest of the elements are consecutively
  // numbered from this value.
  unsigned ShiftAmt = SVOp->getMaskElt(i);
  if (ShiftAmt < i) return -1;
  ShiftAmt -= i;

  // Check the rest of the elements to see if they are consecutive.
  for (++i; i != 4; ++i)
    if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
      return -1;

  return ShiftAmt;
}
// Addressing Mode Selection
/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
/// or 64-bit immediate, and if the value can be accurately represented as a
/// sign extension from a 16-bit value. If so, this returns true and the
/// immediate.
///
/// \param N   Node to test; anything that is not a ConstantSDNode is rejected.
/// \param Imm Receives the constant truncated to 16 bits. It is written
///            whenever \p N is a constant, even if the function then returns
///            false because the truncation lost information.
/// \returns true when sign-extending \p Imm reproduces the constant at the
///          node's width (i32 is compared as int32_t, anything else as
///          int64_t).
bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
  if (!isa<ConstantSDNode>(N))
    return false;

  // Read the constant once and reuse it for the truncation and the
  // round-trip comparison.
  const uint64_t Raw = cast<ConstantSDNode>(N)->getZExtValue();
  Imm = (int16_t)Raw;
  if (N->getValueType(0) == MVT::i32)
    return Imm == (int32_t)Raw;
  return Imm == (int64_t)Raw;
}
/// Convenience overload: applies the SDNode form of isIntS16Immediate to the
/// node underlying \p Op, with the same meaning for \p Imm and the result.
bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
  return isIntS16Immediate(Op.getNode(), Imm);
}
/// SelectAddressEVXRegReg - Given the specified address, check to see if it can
/// be represented as an indexed [r+r] operation.
///
/// Scans the users of \p N; as soon as one of them is a memory node whose
/// memory type is f64, operands 0 and 1 of \p N are returned as \p Base and
/// \p Index.
///
/// \param N     Address node. Operands 0 and 1 are read without a check, so
///              it is assumed to have at least two operands (the caller
///              visible in this file only passes ISD::ADD nodes).
/// \param Base  Set to operand 0 of \p N on success, untouched otherwise.
/// \param Index Set to operand 1 of \p N on success, untouched otherwise.
/// \param DAG   Not used by this function.
/// \returns true if an f64 memory user was found.
bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
                                               SDValue &Index,
                                               SelectionDAG &DAG) const {
  for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
       UI != E; ++UI) {
    if (MemSDNode *Memop = dyn_cast<MemSDNode>(*UI)) {
      if (Memop->getMemoryVT() == MVT::f64) {
        Base = N.getOperand(0);
        Index = N.getOperand(1);
        return true;
      }
    }
  }

  return false;
}
/// SelectAddressRegReg - Given the specified address, check to see if it
/// can be represented as an indexed [r+r] operation. Returns false if it
/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
/// non-zero and N can be represented by a base register plus a signed 16-bit
/// displacement, make a more precise judgement by checking (displacement % \p
/// EncodingAlignment).
///
/// \param N     Address expression to classify.
/// \param Base  Set to operand 0 of \p N on success. It is also handed to the
///              initial SelectAddressPCRel probe as that call's out-parameter,
///              so it may be written even when this function returns false.
/// \param Index Set to operand 1 of \p N on success.
/// \param DAG   Used for known-bits queries on the operands of an OR.
/// \param EncodingAlignment Optional alignment that a 16-bit displacement must
///              satisfy for the [r+imm] form to be preferred.
/// \returns true if \p N should be selected as [r+r].
bool PPCTargetLowering::SelectAddressRegReg(
    SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG,
    MaybeAlign EncodingAlignment) const {
  // If we have a PC Relative target flag don't select as [reg+reg]. It will be
  // a [pc+imm].
  if (SelectAddressPCRel(N, Base))
    return false;

  int16_t Imm = 0;
  if (N.getOpcode() == ISD::ADD) {
    // Is there any SPE load/store (f64), which can't handle 16bit offset?
    // SPE load/store can only handle 8-bit offsets.
    if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
      return true;
    if (isIntS16Immediate(N.getOperand(1), Imm) &&
        (!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
      return false; // r+i
    if (N.getOperand(1).getOpcode() == PPCISD::Lo)
      return false; // r+i

    Base = N.getOperand(0);
    Index = N.getOperand(1);
    return true;
  } else if (N.getOpcode() == ISD::OR) {
    if (isIntS16Immediate(N.getOperand(1), Imm) &&
        (!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
      return false; // r+i can fold it if we can.

    // If this is an or of disjoint bitfields, we can codegen this as an add
    // (for better address arithmetic) if the LHS and RHS of the OR are provably
    // disjoint.
    KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));

    if (LHSKnown.Zero.getBoolValue()) {
      KnownBits RHSKnown = DAG.computeKnownBits(N.getOperand(1));
      // If all of the bits are known zero on the LHS or RHS, the add won't
      // carry.
      if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) {
        Base = N.getOperand(0);
        Index = N.getOperand(1);
        return true;
      }
    }
  }

  return false;
}
// If we happen to be doing an i64 load or store into a stack slot that has
// less than a 4-byte alignment, then the frame-index elimination may need to
// use an indexed load or store instruction (because the offset may not be a
// multiple of 4). The extra register needed to hold the offset comes from the
// register scavenger, and it is possible that the scavenger will need to use
// an emergency spill slot. As a result, we need to make sure that a spill slot
// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
// stack slot.
//
// DAG      - selection DAG whose MachineFunction and frame info are queried.
// FrameIdx - frame index of the stack slot being accessed.
// VT       - value type of the access; it is compared against MVT::i64.
//
// NOTE(review): in this listing the statements guarded by the three `if`
// conditions, the use of FuncInfo and the closing brace are absent.
// Presumably each guard is an early return -- confirm against the upstream
// file.
static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
// FIXME: This does not handle the LWA case.
if (VT != MVT::i64)
// NOTE: We'll exclude negative FIs here, which come from argument
// lowering, because there are no known test cases triggering this problem
// using packed structures (or similar). We can remove this exclusion if
// we find such a test case. The reason why this is so test-case driven is
// because this entire 'fixup' is only to prevent crashes (from the
// register scavenger) on not-really-valid inputs. For example, if we have:
// %a = alloca i1
// %b = bitcast i1* %a to i64*
// store i64* a, i64 b
// then the store should really be marked as 'align 1', but is not. If it
// were marked as 'align 1' then the indexed form would have been
// instruction-selected initially, and the problem this 'fixup' is preventing
// won't happen regardless.
if (FrameIdx < 0)
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
// Slots aligned to 4 bytes or more are tested for here.
if (MFI.getObjectAlign(FrameIdx) >= Align(4))
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
/// Returns true if the address N can be represented by a base register plus
/// a signed 16-bit displacement [r+imm], and if it is not better
/// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept
/// displacements that are multiples of that value.
///
/// \param N    Address expression to match.
/// \param Disp Receives the displacement operand (a target constant, or the
///             wrapped symbol for the PPCISD::Lo form).
/// \param Base Receives the base operand; frame-index bases are converted to
///             target frame indices and passed to fixupFuncForFI.
/// \param DAG  Selection DAG used to create the result nodes.
/// \param EncodingAlignment Optional alignment the displacement must satisfy.
///
/// NOTE(review): closing braces and a few continuation lines are absent from
/// this listing (flagged below); the nesting described here is inferred from
/// the `} else` lines that remain.
bool PPCTargetLowering::SelectAddressRegImm(
SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG,
MaybeAlign EncodingAlignment) const {
// FIXME dl should come from parent load or store, not from address
SDLoc dl(N);
// If we have a PC Relative target flag don't select as [reg+imm]. It will be
// a [pc+imm].
if (SelectAddressPCRel(N, Base))
return false;
// If this can be more profitably realized as r+r, fail.
// Disp and Base are only used as scratch out-parameters for this probe.
if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment))
return false;
if (N.getOpcode() == ISD::ADD) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
(!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
} else {
Base = N.getOperand(0);
return true; // [r+i]
} else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
// Match LOAD (ADD (X, Lo(G))).
// NOTE(review): the opening line of the assert that the next line continues
// is absent from this listing.
&& "Cannot handle constant offsets yet!");
Disp = N.getOperand(1).getOperand(0); // The global address.
assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
Disp.getOpcode() == ISD::TargetConstantPool ||
Disp.getOpcode() == ISD::TargetJumpTable);
Base = N.getOperand(0);
return true; // [&g+r]
} else if (N.getOpcode() == ISD::OR) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
(!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are
// provably disjoint.
KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
// Every bit set in imm must be known zero on the LHS.
if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
// If all of the bits are known zero on the LHS or RHS, the add won't
// carry.
if (FrameIndexSDNode *FI =
dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
} else {
Base = N.getOperand(0);
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
return true;
} else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
// Loading from a constant address.
// If this address fits entirely in a 16-bit sext immediate field, codegen
// this as "d, 0"
int16_t Imm;
if (isIntS16Immediate(CN, Imm) &&
(!EncodingAlignment || isAligned(*EncodingAlignment, Imm))) {
Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
// NOTE(review): the final argument of this getRegister call is absent from
// this listing.
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
return true;
// Handle 32-bit sext immediates with LIS + addr mode.
if ((CN->getValueType(0) == MVT::i32 ||
(int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
(!EncodingAlignment ||
isAligned(*EncodingAlignment, CN->getZExtValue()))) {
int Addr = (int)CN->getZExtValue();
// Otherwise, break this down into an LIS + disp.
Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32);
// Subtracting the sign-extended low half before shifting compensates for
// the low part being added back as a signed 16-bit displacement.
// NOTE(review): the final argument of this getTargetConstant call is absent
// from this listing.
Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl,
unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
return true;
// Fallback: use the whole address as the base with a zero displacement.
Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout()));
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
} else
Base = N;
return true; // [r+0]
/// SelectAddressRegRegOnly - Given the specified address, force it to be
/// represented as an indexed [r+r] operation.
///
/// \param N     Address expression to match.
/// \param Base  Receives the base operand.
/// \param Index Receives the index operand.
/// \param DAG   Selection DAG used to create the zero-register base.
/// \returns true on every path visible in this listing.
bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
SDValue &Index,
SelectionDAG &DAG) const {
// Check to see if we can easily represent this as an [r+r] address. This
// will fail if it thinks that the address is more profitably represented as
// reg+imm, e.g. where imm = 0.
if (SelectAddressRegReg(N, Base, Index, DAG))
return true;
// If the address is the result of an add, we will utilize the fact that the
// address calculation includes an implicit add. However, we can reduce
// register pressure if we do not materialize a constant just for use as the
// index register. We only get rid of the add if it is not an add of a
// value and a 16-bit signed constant and both have a single use.
int16_t imm = 0;
if (N.getOpcode() == ISD::ADD &&
(!isIntS16Immediate(N.getOperand(1), imm) ||
!N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {
Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
// Otherwise, do it the hard way, using R0 as the base register.
// NOTE(review): the final argument of this getRegister call (and the closing
// braces of this function) are absent from this listing.
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
Index = N;
return true;
/// Returns true if \p N is a node of type \p Ty whose target flags include
/// PPCII::MO_PCREL_FLAG.
///
/// \tparam Ty Node class to test for; it must provide getTargetFlags().
template <typename Ty> static bool isValidPCRelNode(SDValue N) {
  Ty *PCRelCand = dyn_cast<Ty>(N);
  return PCRelCand && (PCRelCand->getTargetFlags() & PPCII::MO_PCREL_FLAG);
}
/// Returns true if this address is a PC Relative address.
/// PC Relative addresses are marked with the flag PPCII::MO_PCREL_FLAG
/// or if the node opcode is PPCISD::MAT_PCREL_ADDR.
///
/// \param N    Address node to classify.
/// \param Base Always overwritten with \p N, including when the result is
///             false.
bool PPCTargetLowering::SelectAddressPCRel(SDValue N, SDValue &Base) const {
// This is a materialize PC Relative node. Always select this as PC Relative.
Base = N;
if (N.getOpcode() == PPCISD::MAT_PCREL_ADDR)
return true;
// NOTE(review): the last operand of this `||` chain is absent from this
// listing.
if (isValidPCRelNode<ConstantPoolSDNode>(N) ||
isValidPCRelNode<GlobalAddressSDNode>(N) ||
isValidPCRelNode<JumpTableSDNode>(N) ||
return true;
return false;
/// Returns true if we should use a direct load into vector instruction
/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
///
/// \param N  Candidate node; anything that is not a LoadSDNode is rejected.
/// \param ST Subtarget queried for P8/P9 vector support.
///
/// NOTE(review): the switch below has lost lines in this listing (no `break`,
/// `default:` or closing brace is visible), as has the tail of the final
/// condition; read the control flow with that in mind.
static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
// If there are any other uses other than scalar to vector, then we should
// keep it as a scalar load -> direct move pattern to prevent multiple
// loads.
LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
if (!LD)
return false;
EVT MemVT = LD->getMemoryVT();
if (!MemVT.isSimple())
return false;
switch(MemVT.getSimpleVT().SimpleTy) {
case MVT::i64:
case MVT::i32:
if (!ST.hasP8Vector())
return false;
case MVT::i16:
case MVT::i8:
if (!ST.hasP9Vector())
return false;
return false;
// The loaded value (result 0) must have exactly one user.
SDValue LoadedVal(N, 0);
if (!LoadedVal.hasOneUse())
return false;
for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
UI != UE; ++UI)
if (UI.getUse().get().getResNo() == 0 &&
UI->getOpcode() != ISD::SCALAR_TO_VECTOR &&
return false;
return true;
/// getPreIndexedAddressParts - Returns true if the node's address can be
/// legally represented as a pre-indexed load / store address. The base
/// pointer, offset pointer and addressing mode are passed by reference.
///
/// \param N      Load or store node; any other node kind is rejected.
/// \param Base   Receives the base pointer.
/// \param Offset Receives the offset.
/// \param AM     Addressing mode out-parameter (no assignment to it is
///               visible in this listing).
/// \param DAG    Selection DAG passed to the address-matching helpers.
///
/// NOTE(review): several lines are absent from this listing, including the
/// declaration of VT, closing braces and the tail of the SEXTLOAD condition.
bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const {
// Pre-increment forms are skipped entirely when DisablePPCPreinc is set.
if (DisablePPCPreinc) return false;
bool isLoad = true;
SDValue Ptr;
unsigned Alignment;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
Ptr = LD->getBasePtr();
VT = LD->getMemoryVT();
Alignment = LD->getAlignment();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
Ptr = ST->getBasePtr();
VT = ST->getMemoryVT();
Alignment = ST->getAlignment();
isLoad = false;
} else
return false;
// Do not generate pre-inc forms for specific loads that feed scalar_to_vector
// instructions because we can fold these into a more efficient instruction
// instead, (such as LXSD).
if (isLoad && usePartialVectorLoads(N, Subtarget)) {
return false;
// PowerPC doesn't have preinc load/store instructions for vectors (except
// for QPX, which does have preinc r+r forms).
if (VT.isVector()) {
if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) {
return false;
// Note the argument order: Offset is passed in the helper's Base position
// and Base in its Index position.
} else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) {
return true;
if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
// Common code will reject creating a pre-inc form if the base pointer
// is a frame index, or if N is a store and the base pointer is either
// the same as or a predecessor of the value being stored. Check for
// those situations here, and try with swapped Base/Offset instead.
bool Swap = false;
if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))
Swap = true;
else if (!isLoad) {
SDValue Val = cast<StoreSDNode>(N)->getValue();
if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))
Swap = true;
if (Swap)
std::swap(Base, Offset);
return true;
// LDU/STU can only handle immediates that are a multiple of 4.
if (VT != MVT::i64) {
if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, None))
return false;
} else {
// LDU/STU need an address with at least 4-byte alignment.
if (Alignment < 4)
return false;
if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, Align(4)))
return false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
// PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of
// sext i32 to i64 when addr mode is r+i.
if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
LD->getExtensionType() == ISD::SEXTLOAD &&
return false;
return true;
// LowerOperation implementation
/// Set \p HiOpFlags and \p LoOpFlags to the target MO flags used when
/// referencing a label. The function returns void; the flags are the only
/// outputs. They start as PPCII::MO_HA / PPCII::MO_LO.
///
/// \param IsPIC     Whether the PIC relocation model is in use.
/// \param Subtarget Not referenced in the lines visible here.
/// \param GV        Optional global value; not referenced in the lines visible
///                  here.
///
/// NOTE(review): the body of the `if (IsPIC)` block and the closing braces are
/// absent from this listing, so how the flags change under PIC cannot be
/// stated from here.
static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
unsigned &HiOpFlags, unsigned &LoOpFlags,
const GlobalValue *GV = nullptr) {
HiOpFlags = PPCII::MO_HA;
LoOpFlags = PPCII::MO_LO;
// Don't use the pic base if not in PIC relocation model.
if (IsPIC) {
/// Build the address of a label as hi(&label) + lo(&label).
///
/// \param HiPart Target node carrying the high-part flag; it also supplies the
///               debug location and pointer type of the result.
/// \param LoPart Target node carrying the low-part flag.
/// \param isPIC  When true, PPCISD::GlobalBaseReg is added to the high part.
/// \param DAG    Selection DAG used to create the nodes.
/// \returns An ISD::ADD of the (possibly PIC-adjusted) PPCISD::Hi node and
///          the PPCISD::Lo node.
static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
                             SelectionDAG &DAG) {
  SDLoc DL(HiPart);
  EVT PtrVT = HiPart.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, PtrVT);

  SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
  SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);

  // With PIC, the first instruction is actually "GR+hi(&G)".
  if (isPIC)
    Hi = DAG.getNode(ISD::ADD, DL, PtrVT,
                     DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi);

  // Generate non-pic code that has direct accesses to the constant pool.
  // The address of the global is just (hi(&g)+lo(&g)).
  return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
}
// Fetches the PPCFunctionInfo of MF.
// NOTE(review): the statement that uses FuncInfo and the closing brace are
// absent from this listing; presumably a flag is set on it -- confirm against
// the upstream file.
static void setUsesTOCBasePtr(MachineFunction &MF) {
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
// SelectionDAG overload of setUsesTOCBasePtr.
// NOTE(review): the body and closing brace are absent from this listing;
// presumably it forwards to the MachineFunction overload -- confirm against
// the upstream file.
static void setUsesTOCBasePtr(SelectionDAG &DAG) {
/// Build a PPCISD::TOC_ENTRY memory-intrinsic node for \p GA.
///
/// The second operand is X2 on 64-bit targets, R2 on 32-bit AIX, and a
/// PPCISD::GlobalBaseReg node otherwise. The node produces a pointer-sized
/// value (i64 or i32) plus a chain and uses GOT pointer info.
///
/// \param DAG Selection DAG used to create the nodes.
/// \param dl  Debug location for the created nodes.
/// \param GA  Target address node to look up.
///
/// NOTE(review): the trailing arguments of getMemIntrinsicNode and the closing
/// brace are absent from this listing.
SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
SDValue GA) const {
const bool Is64Bit = Subtarget.isPPC64();
EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT)
: Subtarget.isAIXABI()
? DAG.getRegister(PPC::R2, VT)
: DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);
SDValue Ops[] = { GA, Reg };
return DAG.getMemIntrinsicNode(
PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
MachinePointerInfo::getGOT(DAG.getMachineFunction()), None,
/// Lower a constant-pool address.
///
/// Strategies visible below, in order: a PC-relative materialization, a TOC
/// entry for the 64-bit ELF / AIX ABIs, a TOC entry with the PIC flag for PIC
/// SVR4, and finally a hi/lo label reference.
///
/// \param Op  Operand cast to ConstantPoolSDNode.
/// \param DAG Selection DAG used to create the nodes.
///
/// NOTE(review): closing braces are absent from this listing, and `DL` is used
/// without a visible declaration.
SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
const Constant *C = CP->getConstVal();
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
if (Subtarget.isUsingPCRelativeCalls()) {
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue ConstPool = DAG.getTargetConstantPool(
C, Ty, CP->getAlign(), CP->getOffset(), PPCII::MO_PCREL_FLAG);
return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, ConstPool);
SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0);
return getTOCEntry(DAG, SDLoc(CP), GA);
unsigned MOHiFlag, MOLoFlag;
bool IsPIC = isPositionIndependent();
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA =
DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), PPCII::MO_PIC_FLAG);
return getTOCEntry(DAG, SDLoc(CP), GA);
// Default: address is hi(&cp) + lo(&cp), built by LowerLabelRef.
SDValue CPIHi =
DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOHiFlag);
SDValue CPILo =
DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOLoFlag);
return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);
// For 64-bit PowerPC, prefer the more compact relative encodings.
// This trades 32 bits per jump table entry for one or two instructions
// on the jump site.
//
// Returns MachineJumpTableInfo::EK_LabelDifference32 when
// isJumpTableRelative() holds, and the TargetLowering default otherwise.
unsigned PPCTargetLowering::getJumpTableEncoding() const {
  if (isJumpTableRelative())
    return MachineJumpTableInfo::EK_LabelDifference32;

  return TargetLowering::getJumpTableEncoding();
}
/// Decide whether jump tables hold relative entries.
///
/// The UseAbsoluteJumpTables setting takes precedence and forces absolute
/// tables. Otherwise PPC64 and AIX targets always use relative tables, and
/// every other target defers to the TargetLowering default.
bool PPCTargetLowering::isJumpTableRelative() const {
  if (UseAbsoluteJumpTables)
    return false;
  if (Subtarget.isPPC64() || Subtarget.isAIXABI())
    return true;
  return TargetLowering::isJumpTableRelative();
}
/// Return the node that relative jump-table entries are based on.
///
/// Targets that are not PPC64, and AIX targets, use the TargetLowering
/// default, as do the Small and Medium code models. The remaining path builds
/// a PPCISD::GlobalBaseReg node.
///
/// NOTE(review): the `default:` label, the remaining arguments of the final
/// getNode call and the closing braces are absent from this listing.
SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
switch (getTargetMachine().getCodeModel()) {
case CodeModel::Small:
case CodeModel::Medium:
return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(),
/// MC-level counterpart of getPICJumpTableRelocBase: returns the expression
/// that relative jump-table entries are based on.
///
/// Targets that are not PPC64, and AIX targets, use the TargetLowering
/// default, as do the Small and Medium code models. The remaining path
/// returns a symbol reference to the function's PIC base symbol.
///
/// NOTE(review): the `default:` label and closing braces are absent from this
/// listing.
const MCExpr *
PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
unsigned JTI,
MCContext &Ctx) const {
if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
switch (getTargetMachine().getCodeModel()) {
case CodeModel::Small:
case CodeModel::Medium:
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
/// Lower a jump-table address.
///
/// Strategies visible below, in order: a PC-relative materialization, a TOC
/// entry for the 64-bit ELF / AIX ABIs, a TOC entry for PIC SVR4, and finally
/// a hi/lo label reference.
///
/// NOTE(review): closing braces, the declaration of `DL` and the tail of the
/// PIC getTargetJumpTable call are absent from this listing.
SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
// isUsingPCRelativeCalls() returns true when PCRelative is enabled
if (Subtarget.isUsingPCRelativeCalls()) {
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue GA =
DAG.getTargetJumpTable(JT->getIndex(), Ty, PPCII::MO_PCREL_FLAG);
SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
return MatAddr;
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
return getTOCEntry(DAG, SDLoc(JT), GA);
unsigned MOHiFlag, MOLoFlag;
bool IsPIC = isPositionIndependent();
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
return getTOCEntry(DAG, SDLoc(GA), GA);
// Default: address is hi(&jt) + lo(&jt), built by LowerLabelRef.
SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG);
/// Lower a block address.
///
/// Strategies visible below, in order: a PC-relative materialization, a TOC
/// entry for the 64-bit ELF / AIX ABIs, a TOC (.got) entry for 32-bit PIC ELF,
/// and finally a hi/lo label reference.
///
/// NOTE(review): closing braces, the declaration of `DL`, the tail of the
/// PC-relative getTargetBlockAddress call and the leading arguments of the
/// 32-bit getTOCEntry call are absent from this listing.
SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
const BlockAddress *BA = BASDN->getBlockAddress();
// isUsingPCRelativeCalls() returns true when PCRelative is enabled
if (Subtarget.isUsingPCRelativeCalls()) {
EVT Ty = getPointerTy(DAG.getDataLayout());
SDValue GA = DAG.getTargetBlockAddress(BA, Ty, BASDN->getOffset(),
SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
return MatAddr;
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual BlockAddress is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
return getTOCEntry(DAG, SDLoc(BASDN), GA);
// 32-bit position-independent ELF stores the BlockAddress in the .got.
if (Subtarget.is32BitELFABI() && isPositionIndependent())
return getTOCEntry(
DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()));
unsigned MOHiFlag, MOLoFlag;
bool IsPIC = isPositionIndependent();
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
// Default: address is hi(&ba) + lo(&ba); note the offset passed here is 0.
SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG);
/// Lower the address of a thread-local global.
///
/// Emulated TLS is delegated to LowerToTLSEmulatedModel. Otherwise the TLS
/// model reported by the target machine for the global selects one of four
/// sequences (local-exec, initial-exec, general-dynamic, local-dynamic); any
/// other model is unreachable.
///
/// NOTE(review): many continuation lines are absent from this listing (target
/// flag arguments of getTargetGlobalAddress, the 64-bit GOT pointer
/// computations, `else` keywords, the TLS-address nodes of the dynamic models
/// and all closing braces). The comments below describe only what remains
/// visible.
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
// FIXME: TLS addresses currently use medium model code sequences,
// which is the most useful form. Eventually support for small and
// large models could be added if users need it, at the cost of
// additional complexity.
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool is64bit = Subtarget.isPPC64();
const Module *M = DAG.getMachineFunction().getFunction().getParent();
PICLevel::Level picLevel = M->getPICLevel();
const TargetMachine &TM = getTargetMachine();
TLSModel::Model Model = TM.getTLSModel(GV);
// Local-exec: Hi/Lo pair relative to the thread pointer register
// (X13 on 64-bit, R2 on 32-bit).
if (Model == TLSModel::LocalExec) {
SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64)
: DAG.getRegister(PPC::R2, MVT::i32);
SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
// Initial-exec: load the thread-pointer offset from the GOT, then ADD_TLS.
if (Model == TLSModel::InitialExec) {
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
SDValue GOTPtr;
if (is64bit) {
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
PtrVT, GOTReg, TGA);
} else {
if (!TM.isPositionIndependent())
GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
else if (picLevel == PICLevel::SmallPIC)
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
PtrVT, TGA, GOTPtr);
return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
// General-dynamic.
if (Model == TLSModel::GeneralDynamic) {
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
SDValue GOTPtr;
if (is64bit) {
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
} else {
if (picLevel == PICLevel::SmallPIC)
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT,
// Local-dynamic: the result is ADDI_DTPREL_L over ADDIS_DTPREL_HA.
if (Model == TLSModel::LocalDynamic) {
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
SDValue GOTPtr;
if (is64bit) {
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
} else {
if (picLevel == PICLevel::SmallPIC)
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl,
PtrVT, TLSAddr, TGA);
return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
llvm_unreachable("Unknown TLS model!");
/// Lower the address of a global value.
///
/// Strategies visible below, in order: PC-relative (a load through a
/// materialized address when the global is accessed GOT-indirectly, otherwise
/// a direct MAT_PCREL_ADDR), a TOC entry for the 64-bit ELF / AIX ABIs, a TOC
/// entry for PIC SVR4, and finally a hi/lo label reference.
///
/// NOTE(review): closing braces, the declaration of `DL`, the definition of
/// `MatPCRel` and the tails of several calls are absent from this listing.
SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GSDN->getGlobal();
// 64-bit SVR4 ABI & AIX ABI code is always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
if (Subtarget.isUsingPCRelativeCalls()) {
EVT Ty = getPointerTy(DAG.getDataLayout());
if (isAccessedAsGotIndirect(Op)) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
SDValue Load = DAG.getLoad(MVT::i64, DL, DAG.getEntryNode(), MatPCRel,
return Load;
} else {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
return getTOCEntry(DAG, DL, GA);
unsigned MOHiFlag, MOLoFlag;
bool IsPIC = isPositionIndependent();
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV);
if (IsPIC && Subtarget.isSVR4ABI()) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
return getTOCEntry(DAG, DL, GA);
// Default: address is hi(&g) + lo(&g), built by LowerLabelRef.
SDValue GAHi =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
SDValue GALo =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
return LowerLabelRef(GAHi, GALo, IsPIC, DAG);
/// Custom lowering for SETCC.
///
/// Returning an empty SDValue means "no custom lowering"; returning \p Op
/// leaves the node unchanged.
///
/// \param Op  The SETCC node; operand 2 is the condition code.
/// \param DAG Selection DAG used to create replacement nodes.
///
/// NOTE(review): closing braces and the tails of two calls (the inner
/// getSetCC and the XOR getNode) are absent from this listing.
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDLoc dl(Op);
if (Op.getValueType() == MVT::v2i64) {
// When the operands themselves are v2i64 values, we need to do something
// special because VSX has no underlying comparison operations for these.
if (Op.getOperand(0).getValueType() == MVT::v2i64) {
// Equality can be handled by casting to the legal type for Altivec
// comparisons, everything else needs to be expanded.
if (CC == ISD::SETEQ || CC == ISD::SETNE) {
return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
DAG.getSetCC(dl, MVT::v4i32,
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)),
return SDValue();
// We handle most of these in the usual way.
return Op;
// If we're comparing for equality to zero, expose the fact that this is
// implemented as a ctlz/srl pair on ppc, so that the dag combiner can
// fold the new nodes.
if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
return V;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
// Leave comparisons against 0 and -1 alone for now, since they're usually
// optimized. FIXME: revisit this when we can custom lower all setcc
// optimizations.
if (C->isAllOnesValue() || C->isNullValue())
return SDValue();
// If we have an integer seteq/setne, turn it into a compare against zero
// by xor'ing the rhs with the lhs, which is faster than setting a
// condition register, reading it back out, and masking the correct bit. The
// normal approach here uses sub to do this instead of xor. Using xor exposes
// the result to other bit-twiddling opportunities.
EVT LHSVT = Op.getOperand(0).getValueType();
if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
EVT VT = Op.getValueType();
// Despite its name, Sub holds an XOR node.
SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC);
return SDValue();
/// Lower VAARG for 32-bit PowerPC (asserted below).
///
/// va_list layout as read by this code: a one-byte gpr index at offset 0, a
/// one-byte fpr index at offset 1, the overflow-area pointer at offset 4 and
/// the register-save-area pointer at offset 8. The argument is loaded from the
/// register save area while the relevant index is below 8, and from the
/// overflow area otherwise; the index and overflow pointer are written back.
///
/// \param Op  VAARG node: operand 0 is the chain, operand 1 the va_list
///            pointer, operand 2 the source value.
/// \param DAG Selection DAG used to create the nodes.
/// \returns The load of the argument value.
///
/// NOTE(review): closing braces and the last argument of several getNode
/// calls are absent from this listing.
SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDNode *Node = Op.getNode();
EVT VT = Node->getValueType(0);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue InChain = Node->getOperand(0);
SDValue VAListPtr = Node->getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
SDLoc dl(Node);
assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");
// gpr_index
SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
VAListPtr, MachinePointerInfo(SV), MVT::i8);
InChain = GprIndex.getValue(1);
if (VT == MVT::i64) {
// Check if GprIndex is even
SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex,
DAG.getConstant(1, dl, MVT::i32));
SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd,
DAG.getConstant(0, dl, MVT::i32), ISD::SETNE);
SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex,
DAG.getConstant(1, dl, MVT::i32));
// Align GprIndex to be even if it isn't
GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne,
// fpr index is 1 byte after gpr
SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
DAG.getConstant(1, dl, MVT::i32));
// fpr
SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
FprPtr, MachinePointerInfo(SV), MVT::i8);
InChain = FprIndex.getValue(1);
SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
DAG.getConstant(8, dl, MVT::i32));
SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
DAG.getConstant(4, dl, MVT::i32));
// areas
SDValue OverflowArea =
DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo());
InChain = OverflowArea.getValue(1);
SDValue RegSaveArea =
DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo());
InChain = RegSaveArea.getValue(1);
// CC is true while the index is < 8 (use reg_save_area); otherwise the
// overflow_area is selected.
SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex,
DAG.getConstant(8, dl, MVT::i32), ISD::SETLT);
// adjustment constant gpr_index * 4/8
SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32,
VT.isInteger() ? GprIndex : FprIndex,
DAG.getConstant(VT.isInteger() ? 4 : 8, dl,
// OurReg = RegSaveArea + RegConstant
SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea,
// Floating types are 32 bytes into RegSaveArea
if (VT.isFloatingPoint())
OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg,
DAG.getConstant(32, dl, MVT::i32));
// increase {f,g}pr_index by 1 (or 2 if VT is i64)
SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32,
VT.isInteger() ? GprIndex : FprIndex,
DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl,
// Write the updated index back as a single byte.
InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
VT.isInteger() ? VAListPtr : FprPtr,
MachinePointerInfo(SV), MVT::i8);
// determine if we should load from reg_save_area or overflow_area
SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);
// increase overflow_area by 4/8 if gpr/fpr > 8
SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea,
DAG.getConstant(VT.isInteger() ? 4 : 8,
dl, MVT::i32));
OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr,
MachinePointerInfo(), MVT::i32);
return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo());
/// Lower a va_copy for the 32-bit SVR4 ABI by copying the whole va_list
/// object with a fixed-size, always-inlined memcpy.
///
/// Operand 0 of \p Op is used as the chain; operands 1 and 2 are passed to
/// getMemcpy as the destination and source pointers respectively.
/// \returns the node produced by SelectionDAG::getMemcpy.
SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");

  // We have to copy the entire va_list struct:
  // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte
  //
  // The three flags are isVolatile=false, AlwaysInline=true,
  // isTailCall=false. getMemcpy needs pointer info for both the destination
  // and the source; neither is known here, so both are left empty.
  return DAG.getMemcpy(Op.getOperand(0), Op, Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(12, SDLoc(Op), MVT::i32), Align(8),
                       false, true, false, MachinePointerInfo(),
                       MachinePointerInfo());
}
/// Lower ADJUST_TRAMPOLINE. No adjustment is applied on this target: the
/// node's first operand is handed back unchanged. On the AIX ABI the
/// operation is rejected with a fatal error.
SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  const bool OnAIX = Subtarget.isAIXABI();
  if (OnAIX)
    report_fatal_error("ADJUST_TRAMPOLINE operation is not supported on AIX.");

  // Pass the incoming value straight through.
  SDValue PassThrough = Op.getOperand(0);
  return PassThrough;
}
/// Lower INIT_TRAMPOLINE into a call to the runtime helper
/// __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg). The operation is
/// rejected with a fatal error on the AIX ABI.
///
/// Operands of \p Op as read below: 0 = chain, 1 = trampoline, 2 = nested
/// function, 3 = 'nest' parameter value.
/// \returns the second member of the pair produced by LowerCallTo
/// (presumably the call's output chain - confirm against LowerCallTo).
// NOTE(review): in this listing the TrampSize constant is assigned to
// Entry.Node but never pushed onto Args, and the statement that configures CLI
// begins mid-expression. Lines appear to be missing from this copy; restore
// them from upstream before building.
SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SelectionDAG &DAG) const {
if (Subtarget.isAIXABI())
report_fatal_error("INIT_TRAMPOLINE operation is not supported on AIX.");
SDValue Chain = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
bool isPPC64 = (PtrVT == MVT::i64);
// Every argument of the helper call is typed as a pointer-sized integer.
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
// Entry is reused for each argument; only its Node changes between pushes.
Entry.Ty = IntPtrTy;
Entry.Node = Trmp; Args.push_back(Entry);
// TrampSize == (isPPC64 ? 48 : 40);
Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl,
isPPC64 ? MVT::i64 : MVT::i32);
Entry.Node = FPtr; Args.push_back(Entry);
Entry.Node = Nest; Args.push_back(Entry);
// Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
TargetLowering::CallLoweringInfo CLI(DAG);
// C calling convention, void return type, callee named by external symbol.
CallingConv::C, Type::getVoidTy(*DAG.getContext()),
DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.second;
/// Lower va_start.
///
/// On PPC64 and on the AIX ABI the va_list is a single pointer: the address
/// of the VarArgsFrameIndex slot is stored to the location given by operand 1.
/// Otherwise (32-bit SVR4) the four fields of the va_list struct described
/// below are initialised with a chain of four stores.
///
/// Operands of \p Op as read below: 0 = chain, 1 = address of the va_list,
/// 2 = source value used for the stores' MachinePointerInfo.
/// \returns the last store of the sequence.
// NOTE(review): several calls in this listing are cut off before their final
// arguments and the end of the PPC64/AIX `if` block is not visible; lines
// appear to be missing from this copy.
SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
EVT PtrVT = getPointerTy(MF.getDataLayout());
SDLoc dl(Op);
if (Subtarget.isPPC64() || Subtarget.isAIXABI()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
// For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
// We suppose the given va_list is already allocated.
// typedef struct {
// char gpr; /* index into the array of 8 GPRs
// * stored in the register save area
// * gpr=0 corresponds to r3,
// * gpr=1 to r4, etc.
// */
// char fpr; /* index into the array of 8 FPRs
// * stored in the register save area
// * fpr=0 corresponds to f1,
// * fpr=1 to f2, etc.
// */
// char *overflow_arg_area;
// /* location on stack that holds
// * the next overflow argument
// */
// char *reg_save_area;
// /* where r3:r10 and f1:f8 (if saved)
// * are stored
// */
// } va_list[1];
// Initial values for the four fields, taken from the function info that
// formal-argument lowering recorded.
SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
// Distances between consecutive fields, in bytes:
//   gpr -> fpr                         : FPROffset   (1)
//   fpr -> overflow_arg_area           : StackOffset (pointer size - 1)
//   overflow_arg_area -> reg_save_area : FrameOffset (pointer size)
uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT);
uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT);
uint64_t FPROffset = 1;
SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
// Store first byte : number of int regs
SDValue firstStore =
DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1),
MachinePointerInfo(SV), MVT::i8);
// nextOffset is the running byte offset from the start of the va_list; it
// is passed to MachinePointerInfo for each later store.
uint64_t nextOffset = FPROffset;
SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
// Store second byte : number of float regs
SDValue secondStore =
DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
MachinePointerInfo(SV, nextOffset), MVT::i8);
nextOffset += StackOffset;
nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);
// Store second word : arguments given on stack
SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
MachinePointerInfo(SV, nextOffset));
nextOffset += FrameOffset;
nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);
// Store third word : arguments given in registers
return DAG.getStore(thirdStore, dl, FR, nextPtr,
MachinePointerInfo(SV, nextOffset));
/// FPR - The set of FP registers that should be allocated for arguments
/// on Darwin and AIX.
static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
                                PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10,
                                PPC::F11, PPC::F12, PPC::F13};

/// QFPR - The set of QPX registers that should be allocated for arguments.
///
/// This table must stay parallel to FPR: LowerFormalArguments_64SVR4 indexes
/// both with the same counter (QFPR_idx is a reference to FPR_idx) and bounds
/// it by the number of FPRs (13). Entry i therefore has to be the QPX register
/// overlapping FPR[i], and all thirteen entries QF1..QF13 are required. A
/// shorter table would be read out of bounds and would map the first vector
/// argument to the wrong register.
static const MCPhysReg QFPR[] = {
    PPC::QF1, PPC::QF2, PPC::QF3,  PPC::QF4,  PPC::QF5,  PPC::QF6, PPC::QF7,
    PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13};
/// CalculateStackSlotSize - Compute how many bytes of the parameter area this
/// argument occupies.
///
/// \param ArgVT       value type of the argument; its store size is used
///                    unless the argument is passed byval.
/// \param Flags       argument flags; byval arguments take their byval size.
/// \param PtrByteSize pointer size in bytes, the granularity of a stack slot.
/// \returns the size in bytes, rounded up to a multiple of \p PtrByteSize
///          except for members of a consecutive-register array.
static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
                                       unsigned PtrByteSize) {
  unsigned SlotBytes = ArgVT.getStoreSize();
  if (Flags.isByVal())
    SlotBytes = Flags.getByValSize();

  // Array members are always packed, so they keep their exact size.
  if (Flags.isInConsecutiveRegs())
    return SlotBytes;

  // Everything else is rounded up to a whole number of pointer-sized slots.
  const unsigned NumSlots = (SlotBytes + PtrByteSize - 1) / PtrByteSize;
  return NumSlots * PtrByteSize;
}
/// CalculateStackSlotAlignment - Calculates the alignment of this argument
/// on the stack.
///
/// \param ArgVT       value type of the (possibly split) argument part.
/// \param OrigVT      original value type of the argument before splitting.
/// \param Flags       argument flags (byval, consecutive-register, split).
/// \param PtrByteSize pointer size in bytes; this is the default alignment.
/// \returns the alignment to apply to the argument's offset in the parameter
///          area.
// NOTE(review): in this listing the call that reports a non-multiple byval
// alignment has lost its opening line, and the two assignments at the end are
// not separated by a visible `else`. Lines appear to be missing from this
// copy; do not infer the branch structure from it.
static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
ISD::ArgFlagsTy Flags,
unsigned PtrByteSize) {
// Start from pointer alignment and raise it for the special cases below.
Align Alignment(PtrByteSize);
// Altivec parameters are padded to a 16 byte boundary.
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
Alignment = Align(16);
// QPX vector types stored in double-precision are padded to a 32 byte
// boundary.
else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1)
Alignment = Align(32);
// ByVal parameters are aligned as requested.
if (Flags.isByVal()) {
auto BVAlign = Flags.getNonZeroByValAlign();
// Only a request larger than pointer alignment changes the result.
if (BVAlign > PtrByteSize) {
if (BVAlign.value() % PtrByteSize != 0)
"ByVal alignment is not a multiple of the pointer size");
Alignment = BVAlign;
// Array members are always packed to their original alignment.
if (Flags.isInConsecutiveRegs()) {
// If the array member was split into multiple registers, the first
// needs to be aligned to the size of the full type. (Except for
// ppcf128, which is only aligned as its f64 components.)
if (Flags.isSplit() && OrigVT != MVT::ppcf128)
Alignment = Align(OrigVT.getStoreSize());
Alignment = Align(ArgVT.getStoreSize());
return Alignment;
/// CalculateStackSlotUsed - Return whether this argument will use its
/// stack slot (instead of being passed in registers). ArgOffset,
/// AvailableFPRs, and AvailableVRs must hold the current argument
/// position, and will be updated to account for this argument.
///
/// \param ArgVT         value type of the argument part.
/// \param OrigVT        original value type before splitting.
/// \param Flags         argument flags.
/// \param PtrByteSize   pointer size in bytes.
/// \param LinkageSize   size of the linkage area preceding the parameter area.
/// \param ParamAreaSize size of the register-backed part of the parameter area.
/// \param ArgOffset     [in,out] running offset; aligned and advanced here.
/// \param AvailableFPRs [in,out] FP argument registers still free.
/// \param AvailableVRs  [in,out] vector argument registers still free.
/// \param HasQPX        whether QPX vector types go in the FP register file.
/// \returns true if the argument (or part of it) lives in memory.
// NOTE(review): the bodies of the two `AvailableXXX > 0` branches show only
// `return false;` in this listing; the statements that consume a register are
// not visible. Lines appear to be missing from this copy.
static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
ISD::ArgFlagsTy Flags,
unsigned PtrByteSize,
unsigned LinkageSize,
unsigned ParamAreaSize,
unsigned &ArgOffset,
unsigned &AvailableFPRs,
unsigned &AvailableVRs, bool HasQPX) {
bool UseMemory = false;
// Respect alignment of argument on the stack.
Align Alignment =
CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
ArgOffset = alignTo(ArgOffset, Alignment);
// If there's no space left in the argument save area, we must
// use memory (this check also catches zero-sized arguments).
if (ArgOffset >= LinkageSize + ParamAreaSize)
UseMemory = true;
// Allocate argument on the stack.
ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
// The last member of a packed array rounds the offset back up to a slot
// boundary.
if (Flags.isInConsecutiveRegsLast())
ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
// If we overran the argument save area, we must use memory
// (this check catches arguments passed partially in memory)
if (ArgOffset > LinkageSize + ParamAreaSize)
UseMemory = true;
// However, if the argument is actually passed in an FPR or a VR,
// we don't use memory after all.
if (!Flags.isByVal()) {
if (ArgVT == MVT::f32 || ArgVT == MVT::f64 ||
// QPX registers overlap with the scalar FP registers.
(HasQPX && (ArgVT == MVT::v4f32 ||
ArgVT == MVT::v4f64 ||
ArgVT == MVT::v4i1)))
if (AvailableFPRs > 0) {
return false;
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
if (AvailableVRs > 0) {
return false;
return UseMemory;
/// EnsureStackAlignment - Return \p NumBytes rounded up to the stack alignment
/// reported by the target's frame lowering.
///
/// \param Lowering frame lowering queried for the stack alignment.
/// \param NumBytes stack frame size in bytes before rounding.
static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
                                     unsigned NumBytes) {
  const auto StackAlign = Lowering->getStackAlign();
  return alignTo(NumBytes, StackAlign);
}
/// Dispatch formal-argument lowering to the ABI-specific implementation.
///
/// The ABI is chosen from the subtarget in this order: AIX, 64-bit ELF,
/// 32-bit ELF; anything else falls through to the Darwin lowering. Every
/// parameter is forwarded unchanged, including the output vector \p InVals.
///
/// \returns whatever the selected implementation returns.
SDValue PPCTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  if (Subtarget.isAIXABI())
    return LowerFormalArguments_AIX(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                    InVals);
  if (Subtarget.is64BitELFABI())
    return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                       InVals);
  if (Subtarget.is32BitELFABI())
    return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                       InVals);

  return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                     InVals);
}
/// Lower the incoming formal arguments of a function for the 32-bit SVR4 ABI.
///
/// Locations are assigned with the CC_PPC32_SVR4 calling-convention function.
/// Register arguments are copied out of live-in physical registers, memory
/// arguments are loaded from fixed stack objects, and byval aggregates are
/// laid out in a second pass with CC_PPC32_SVR4_ByVal. For variadic functions
/// the eight GPR argument registers and the FPR argument registers are also
/// stored to a stack object so that va_arg can read them.
///
/// \returns the incoming chain, or a TokenFactor over the register spill
///          stores when any were collected in MemOps.
// NOTE(review): many statements in this listing are cut off or have lost
// their `else` / `break` / closing-brace lines (for example the CCInfo
// constructor call, the switch over ValVT, and the calls that record the
// vararg frame indices). Lines appear to be missing from this copy; restore
// them from upstream before building.
SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// 32-bit SVR4 ABI Stack Frame Layout:
// +-----------------------------------+
// +--> | Back chain |
// | +-----------------------------------+
// | | Floating-point register save area |
// | +-----------------------------------+
// | | General register save area |
// | +-----------------------------------+
// | | CR save word |
// | +-----------------------------------+
// | | VRSAVE save word |
// | +-----------------------------------+
// | | Alignment padding |
// | +-----------------------------------+
// | | Vector register save area |
// | +-----------------------------------+
// | | Local variable space |
// | +-----------------------------------+
// | | Parameter list area |
// | +-----------------------------------+
// | | LR save word |
// | +-----------------------------------+
// SP--> +--- | Back chain |
// +-----------------------------------+
// Specifications:
// System V Application Binary Interface PowerPC Processor Supplement
// AltiVec Technology Programming Interface Manual
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
const Align PtrAlign(4);
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
// Reserve space for the linkage area on the stack.
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
CCInfo.AllocateStack(LinkageSize, PtrAlign);
if (useSoftFloat())
CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
// Arguments stored in registers.
if (VA.isRegLoc()) {
// Pick the register class that matches the value type and the available
// subtarget features.
const TargetRegisterClass *RC;
EVT ValVT = VA.getValVT();
switch (ValVT.getSimpleVT().SimpleTy) {
llvm_unreachable("ValVT not supported by formal arguments Lowering");
case MVT::i1:
case MVT::i32:
RC = &PPC::GPRCRegClass;
case MVT::f32:
if (Subtarget.hasP8Vector())
RC = &PPC::VSSRCRegClass;
else if (Subtarget.hasSPE())
RC = &PPC::GPRCRegClass;
RC = &PPC::F4RCRegClass;
case MVT::f64:
if (Subtarget.hasVSX())
RC = &PPC::VSFRCRegClass;
else if (Subtarget.hasSPE())
// SPE passes doubles in GPR pairs.
RC = &PPC::GPRCRegClass;
RC = &PPC::F8RCRegClass;
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
RC = &PPC::VRRCRegClass;
case MVT::v4f32:
RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass;
case MVT::v2f64:
case MVT::v2i64:
RC = &PPC::VRRCRegClass;
case MVT::v4f64:
RC = &PPC::QFRCRegClass;
case MVT::v4i1:
RC = &PPC::QBRCRegClass;
SDValue ArgValue;
// Transform the arguments stored in physical registers into
// virtual ones.
if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) {
// An SPE double arrives in two consecutive locations; consume both (note
// the ++i) and rebuild the f64 from the two i32 halves.
assert(i + 1 < e && "No second half of double precision argument");
unsigned RegLo = MF.addLiveIn(VA.getLocReg(), RC);
unsigned RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC);
SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32);
SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32);
if (!Subtarget.isLittleEndian())
std::swap (ArgValueLo, ArgValueHi);
ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo,
} else {
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
// An i1 is copied out as i32 and truncated afterwards.
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
ValVT == MVT::i1 ? MVT::i32 : ValVT);
if (ValVT == MVT::i1)
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
} else {
// Argument stored in memory.
// Get the extended size of the argument type in stack
unsigned ArgSize = VA.getLocVT().getStoreSize();
// Get the actual size of the argument type
unsigned ObjSize = VA.getValVT().getStoreSize();
unsigned ArgOffset = VA.getLocMemOffset();
// Stack objects in PPC32 are right justified.
ArgOffset += ArgSize - ObjSize;
int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, isImmutable);
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
// Assign locations to all of the incoming aggregate by value arguments.
// Aggregates passed by value are stored in the local variable space of the
// caller's stack frame, right above the parameter list area.
SmallVector<CCValAssign, 16> ByValArgLocs;
CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrAlign);
CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);
// Area that is at least reserved in the caller of this function.
unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
MinReservedArea = std::max(MinReservedArea, LinkageSize);
// Set the size that is at least reserved in caller of this function. Tail
// call optimized function's reserved stack space needs to be aligned so that
// taking the difference between two stack areas will result in an aligned
// stack.
MinReservedArea =
EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
// Stores emitted for the vararg register save area are collected here and
// joined into the returned chain at the end.
SmallVector<SDValue, 8> MemOps;
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
if (isVarArg) {
static const MCPhysReg GPArgRegs[] = {
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10,
const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);
static const MCPhysReg FPArgRegs[] = {
PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
// Without hardware FP argument registers there is nothing to save for
// them.
if (useSoftFloat() || hasSPE())
NumFPArgRegs = 0;
// Make room for NumGPArgRegs and NumFPArgRegs.
int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;
CCInfo.getNextStackOffset(), true));
MFI.CreateStackObject(Depth, Align(8), false));
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// The fixed integer arguments of a variadic function are stored to the
// VarArgsFrameIndex on the stack so that they may be loaded by
// dereferencing the result of va_next.
for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
// Get an existing live-in vreg, or add a new one.
unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
if (!VReg)
VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
// Increment the address by four for the next argument to store
SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
// FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
// is set.
// The double arguments are stored to the VarArgsFrameIndex
// on the stack.
for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
// Get an existing live-in vreg, or add a new one.
unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
if (!VReg)
VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
// Increment the address by eight for the next argument to store
SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl,
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
// Tie all register spill stores into the chain handed back to the caller.
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
return Chain;
/// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
/// value to MVT::i64 and then truncate to the correct register size.
///
/// \param Flags    argument flags; the sext/zext flag selects which assertion
///                 node, if any, is attached to the incoming i64 value.
/// \param ObjectVT the argument's real (narrow) value type.
/// \param DAG      the DAG in which the nodes are created.
/// \param ArgVal   the i64 value copied out of the argument register.
/// \param dl       debug location for the created nodes.
/// \returns \p ArgVal truncated to \p ObjectVT.
SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
                                             EVT ObjectVT, SelectionDAG &DAG,
                                             SDValue ArgVal,
                                             const SDLoc &dl) const {
  // AssertSext/AssertZext take the narrow type as a VALUETYPE operand. It
  // records that the upper bits of the i64 are already a sign or zero
  // extension from ObjectVT.
  if (Flags.isSExt())
    ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
                         DAG.getValueType(ObjectVT));
  else if (Flags.isZExt())
    ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
                         DAG.getValueType(ObjectVT));

  return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
}
/// Lower the incoming formal arguments of a function for the 64-bit SVR4
/// (ELFv1 / ELFv2) ABI.
///
/// A first pass over \p Ins decides whether the caller is guaranteed to have
/// allocated a parameter save area. A second pass walks the arguments in
/// order, tracking the running parameter-area offset and the next free
/// GPR / FPR / VR index, and either copies each argument out of its register
/// or marks it to be loaded from its stack slot. For variadic functions that
/// use va_start, the remaining GPR argument registers are stored to the
/// parameter save area.
///
/// \returns the incoming chain, or a TokenFactor over the stores collected in
///          MemOps when there are any.
// NOTE(review): many statements in this listing are cut off or have lost
// their `else` / `break` / closing-brace lines (for example the register
// tables, the end of the ComputeArgOffset lambda, and the index increments
// after FPR / VR copies). Lines appear to be missing from this copy; restore
// them from upstream before building.
SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// TODO: add description of PPC stack frame format, or at least some docs.
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
assert(!(CallConv == CallingConv::Fast && isVarArg) &&
"fastcc not supported on varargs functions");
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
unsigned PtrByteSize = 8;
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
const unsigned Num_GPR_Regs = array_lengthof(GPR);
const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
const unsigned Num_VR_Regs = array_lengthof(VR);
// QPX vector arguments share the FP register budget.
const unsigned Num_QFPR_Regs = Num_FPR_Regs;
// Do a first pass over the arguments to determine whether the ABI
// guarantees that our caller has allocated the parameter save area
// on its stack frame. In the ELFv1 ABI, this is always the case;
// in the ELFv2 ABI, it is true if this is a vararg function or if
// any parameter is located in a stack slot.
bool HasParameterArea = !isELFv2ABI || isVarArg;
unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
unsigned NumBytes = LinkageSize;
unsigned AvailableFPRs = Num_FPR_Regs;
unsigned AvailableVRs = Num_VR_Regs;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
if (Ins[i].Flags.isNest())
if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
PtrByteSize, LinkageSize, ParamAreaSize,
NumBytes, AvailableFPRs, AvailableVRs,
HasParameterArea = true;
// Add DAG nodes to load the arguments or copy them out of registers. On
// entry to a function on PPC, the arguments start after the linkage area,
// although the first ones are often in registers.
unsigned ArgOffset = LinkageSize;
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
// QFPR_idx is an alias of FPR_idx: FPR[] and QFPR[] are indexed in lockstep.
unsigned &QFPR_idx = FPR_idx;
SmallVector<SDValue, 8> MemOps;
Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
SDValue ArgVal;
bool needsLoad = false;
EVT ObjectVT = Ins[ArgNo].VT;
EVT OrigVT = Ins[ArgNo].ArgVT;
unsigned ObjSize = ObjectVT.getStoreSize();
unsigned ArgSize = ObjSize;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
// Keep FuncArg pointing at the IR argument this InputArg came from.
if (Ins[ArgNo].isOrigArg()) {
std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[ArgNo].getOrigArgIndex();
// We re-align the argument offset for each argument, except when using the
// fast calling convention, when we need to make sure we do that only when
// we'll actually use a stack slot.
unsigned CurArgOffset;
Align Alignment;
auto ComputeArgOffset = [&]() {
/* Respect alignment of argument on the stack. */
Alignment =
CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
ArgOffset = alignTo(ArgOffset, Alignment);
CurArgOffset = ArgOffset;
if (CallConv != CallingConv::Fast) {
/* Compute GPR index associated with argument offset. */
GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
// FIXME the codegen can be much improved in some cases.
// We do not have to keep everything in memory.
if (Flags.isByVal()) {
assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
if (CallConv == CallingConv::Fast)
// ObjSize is the true size, ArgSize rounded up to multiple of registers.
ObjSize = Flags.getByValSize();
ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
// Empty aggregate parameters do not take up registers. Examples:
// struct { } a;
// union { } b;
// int c[0];
// etc. However, we have to provide a place-holder in InVals, so
// pretend we have an 8-byte item at the current address for that
// purpose.
if (!ObjSize) {
int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
// Create a stack object covering all stack doublewords occupied
// by the argument. If the argument is (fully or partially) on
// the stack, or if the argument is fully in registers but the
// caller has allocated the parameter save anyway, we can refer
// directly to the caller's stack frame. Otherwise, create a
// local copy in our own frame.
int FI;
if (HasParameterArea ||
ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);
FI = MFI.CreateStackObject(ArgSize, Alignment, false);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
// Handle aggregates smaller than 8 bytes.
if (ObjSize < PtrByteSize) {
// The value of the object is its address, which differs from the
// address of the enclosing doubleword on big-endian systems.
SDValue Arg = FIN;
if (!isLittleEndian) {
SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT);
Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store;
if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
EVT ObjType = (ObjSize == 1 ? MVT::i8 :
(ObjSize == 2 ? MVT::i16 : MVT::i32));
Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
MachinePointerInfo(&*FuncArg), ObjType);
} else {
// For sizes that don't fit a truncating store (3, 5, 6, 7),
// store the whole register as-is to the parameter save area
// slot.
Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
// Whether we copied from a register or not, advance the offset
// into the parameter save area by a full doubleword.
ArgOffset += PtrByteSize;
// The value of the object is its address, which is the address of
// its first stack doubleword.
// Store whatever pieces of the object are in registers to memory.
for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
if (GPR_idx == Num_GPR_Regs)
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Addr = FIN;
if (j) {
SDValue Off = DAG.getConstant(j, dl, PtrVT);
Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
MachinePointerInfo(&*FuncArg, j));
ArgOffset += ArgSize;
// Non-byval arguments: dispatch on the simple value type.
switch (ObjectVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unhandled argument type!");
case MVT::i1:
case MVT::i32:
case MVT::i64:
if (Flags.isNest()) {
// The 'nest' parameter, if any, is passed in R11.
unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
// These can be scalar arguments or elements of an integer array type
// passed directly. Clang may use those instead of "byval" aggregate
// types to avoid forcing arguments to memory unnecessarily.
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
} else {
if (CallConv == CallingConv::Fast)
needsLoad = true;
ArgSize = PtrByteSize;
// Under fastcc the offset only advances when a stack slot is actually used.
if (CallConv != CallingConv::Fast || needsLoad)
ArgOffset += 8;
case MVT::f32:
case MVT::f64:
// These can be scalar arguments or elements of a float array type
// passed directly. The latter are used to implement ELFv2 homogeneous
// float aggregates.
if (FPR_idx != Num_FPR_Regs) {
unsigned VReg;
if (ObjectVT == MVT::f32)
VReg = MF.addLiveIn(FPR[FPR_idx],
? &PPC::VSSRCRegClass
: &PPC::F4RCRegClass);
VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX()
? &PPC::VSFRCRegClass
: &PPC::F8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
} else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
// FIXME: We may want to re-enable this for CallingConv::Fast on the P8
// once we support fp <-> gpr moves.
// This can only ever happen in the presence of f32 array types,
// since otherwise we never run out of FPRs before running out
// of GPRs.
unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::f32) {
// Two f32 array elements share one GPR; shift the upper half down when
// this element's offset selects it.
if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
DAG.getConstant(32, dl, MVT::i32));
ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
} else {
if (CallConv == CallingConv::Fast)
needsLoad = true;
// When passing an array of floats, the array occupies consecutive
// space in the argument area; only round up to the next doubleword
// at the end of the array. Otherwise, each float takes 8 bytes.
if (CallConv != CallingConv::Fast || needsLoad) {
ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
ArgOffset += ArgSize;
if (Flags.isInConsecutiveRegsLast())
ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
case MVT::f128:
if (!Subtarget.hasQPX()) {
// These can be scalar arguments or elements of a vector array type
// passed directly. The latter are used to implement ELFv2 homogeneous
// vector aggregates.
if (VR_idx != Num_VR_Regs) {
unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
} else {
if (CallConv == CallingConv::Fast)
needsLoad = true;
if (CallConv != CallingConv::Fast || needsLoad)
ArgOffset += 16;
} // not QPX
assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
"Invalid QPX parameter type");
case MVT::v4f64:
case MVT::v4i1:
// QPX vectors are treated like their scalar floating-point subregisters
// (except that they're larger).
unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32;
if (QFPR_idx != Num_QFPR_Regs) {
const TargetRegisterClass *RC;
switch (ObjectVT.getSimpleVT().SimpleTy) {
case MVT::v4f64: RC = &PPC::QFRCRegClass; break;
case MVT::v4f32: RC = &PPC::QSRCRegClass; break;
default: RC = &PPC::QBRCRegClass; break;
unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
} else {
if (CallConv == CallingConv::Fast)
needsLoad = true;
if (CallConv != CallingConv::Fast || needsLoad)
ArgOffset += Sz;
// We need to load the argument to a virtual register if we determined
// above that we ran out of physical registers of the appropriate type.
if (needsLoad) {
// On big-endian targets a value narrower than its slot sits at the high
// end of the slot, so the load address is bumped by the size difference.
if (ObjSize < ArgSize && !isLittleEndian)
CurArgOffset += ArgSize - ObjSize;
int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
// Area that is at least reserved in the caller of this function.
unsigned MinReservedArea;
if (HasParameterArea)
MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
MinReservedArea = LinkageSize;
// Set the size that is at least reserved in caller of this function. Tail
// call optimized functions' reserved stack space needs to be aligned so that
// taking the difference between two stack areas will result in an aligned
// stack.
MinReservedArea =
EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
// On ELFv2ABI spec, it writes:
// C programs that are intended to be *portable* across different compilers
// and architectures must use the header file <stdarg.h> to deal with variable
// argument lists.
if (isVarArg && MFI.hasVAStart()) {
int Depth = ArgOffset;
MFI.CreateFixedObject(PtrByteSize, Depth, true));
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// If this function is vararg, store any remaining integer argument regs
// to their spots on the stack so that they may be loaded by dereferencing
// the result of va_next.
for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
GPR_idx < Num_GPR_Regs; ++GPR_idx) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
// Increment the address by PtrByteSize (8) for the next argument to store
SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
// Tie all stores emitted above into the chain handed back to the caller.
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
return Chain;
/// Lower the incoming formal arguments of a function for the Darwin PowerPC
/// ABI. Walks \p Ins, taking each argument either from a live-in GPR/FPR/VR or
/// from a fixed stack object, tracks the minimum area reserved by the caller,
/// and for vararg functions stores the remaining argument GPRs to the stack.
/// Returns the (possibly token-factored) chain.
///
/// NOTE(review): this copy of the function is visibly missing lines (closing
/// braces, `else` arms, `break`s, array terminators and some statements). The
/// comments describe only the statements that are present.
SDValue PPCTargetLowering::LowerFormalArguments_Darwin(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
// TODO: add description of PPC stack frame format, or at least some docs.
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
EVT PtrVT = getPointerTy(MF.getDataLayout());
bool isPPC64 = PtrVT == MVT::i64;
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
// Pointer size in bytes for the current target: 8 on PPC64, otherwise 4.
unsigned PtrByteSize = isPPC64 ? 8 : 4;
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
// Arguments start right after the linkage area.
unsigned ArgOffset = LinkageSize;
// Area that is at least reserved in caller of this function.
unsigned MinReservedArea = ArgOffset;
static const MCPhysReg GPR_32[] = { // 32-bit registers.
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10,
static const MCPhysReg GPR_64[] = { // 64-bit registers.
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
// No FPRs are available for argument passing when soft-float is in use.
const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
const unsigned Num_VR_Regs = array_lengthof( VR);
// Index of the next unused register in each argument register class.
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
// In 32-bit non-varargs functions, the stack space for vectors is after the
// stack space for non-vectors. We do not use this space unless we have
// too many vectors to fit in registers, something that only occurs in
// constructed examples:), but we have to walk the arglist to figure
// that out...for the pathological case, compute VecArgOffset as the
// start of the vector parameter area. Computing VecArgOffset is the
// entire point of the following loop.
unsigned VecArgOffset = ArgOffset;
if (!isVarArg && !isPPC64) {
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
++ArgNo) {
EVT ObjectVT = Ins[ArgNo].VT;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
if (Flags.isByVal()) {
// ObjSize is the true size, ArgSize rounded up to multiple of regs.
unsigned ObjSize = Flags.getByValSize();
unsigned ArgSize =
((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
VecArgOffset += ArgSize;
switch(ObjectVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unhandled argument type!");
case MVT::i1:
case MVT::i32:
case MVT::f32:
VecArgOffset += 4;
case MVT::i64: // PPC64
case MVT::f64:
// FIXME: We are guaranteed to be !isPPC64 at this point.
// Does MVT::i64 apply?
VecArgOffset += 8;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
// Nothing to do, we're only looking at Nonvector args here.
// We've found where the vector parameter area in memory is. Skip the
// first 12 parameters; these don't use that memory.
// Round up to a 16-byte boundary, then skip 12 slots of 16 bytes each.
VecArgOffset = ((VecArgOffset+15)/16)*16;
VecArgOffset += 12*16;
// Add DAG nodes to load the arguments or copy them out of registers. On
// entry to a function on PPC, the arguments start after the linkage area,
// although the first ones are often in registers.
SmallVector<SDValue, 8> MemOps;
// Number of Altivec vector arguments seen in 32-bit non-vararg functions;
// their stack space is added to MinReservedArea after the main loop.
unsigned nAltivecParamsAtEnd = 0;
Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
unsigned CurArgIdx = 0;
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
SDValue ArgVal;
bool needsLoad = false;
EVT ObjectVT = Ins[ArgNo].VT;
unsigned ObjSize = ObjectVT.getSizeInBits()/8;
unsigned ArgSize = ObjSize;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
// Keep FuncArg in step with the IR argument this input originates from.
if (Ins[ArgNo].isOrigArg()) {
std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[ArgNo].getOrigArgIndex();
unsigned CurArgOffset = ArgOffset;
// Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
if (isVarArg || isPPC64) {
MinReservedArea = ((MinReservedArea+15)/16)*16;
MinReservedArea += CalculateStackSlotSize(ObjectVT,
} else nAltivecParamsAtEnd++;
} else
// Calculate min reserved area.
MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
// FIXME the codegen can be much improved in some cases.
// We do not have to keep everything in memory.
if (Flags.isByVal()) {
assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
// ObjSize is the true size, ArgSize rounded up to multiple of registers.
ObjSize = Flags.getByValSize();
ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
// Objects of size 1 and 2 are right justified, everything else is
// left justified. This means the memory address is adjusted forwards.
if (ObjSize==1 || ObjSize==2) {
CurArgOffset = CurArgOffset + (4 - ObjSize);
// The value of the object is its address.
int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
// 1- and 2-byte byval objects: if a GPR is still available, the register
// value is written to the object's slot with a truncating store.
if (ObjSize==1 || ObjSize==2) {
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg;
if (isPPC64)
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
SDValue Store =
DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo(&*FuncArg), ObjType);
ArgOffset += PtrByteSize;
for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
// Store whatever pieces of the object are in registers
// to memory. ArgOffset will be the address of the beginning
// of the object.
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg;
if (isPPC64)
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo(&*FuncArg, j));
ArgOffset += PtrByteSize;
} else {
// Out of GPRs: advance ArgOffset past the rest of the object.
ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
switch (ObjectVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unhandled argument type!");
case MVT::i1:
case MVT::i32:
if (!isPPC64) {
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
if (ObjectVT == MVT::i1)
ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal);
} else {
needsLoad = true;
ArgSize = PtrByteSize;
// All int arguments reserve stack space in the Darwin ABI.
ArgOffset += PtrByteSize;
case MVT::i64: // PPC64
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
} else {
needsLoad = true;
ArgSize = PtrByteSize;
// All int arguments reserve stack space in the Darwin ABI.
ArgOffset += 8;
case MVT::f32:
case MVT::f64:
// Every 4 bytes of argument space consumes one of the GPRs available for
// argument passing.
if (GPR_idx != Num_GPR_Regs) {
if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
if (FPR_idx != Num_FPR_Regs) {
unsigned VReg;
if (ObjectVT == MVT::f32)
VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
} else {
needsLoad = true;
// All FP arguments reserve stack space in the Darwin ABI.
ArgOffset += isPPC64 ? 8 : ObjSize;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
// Note that vector arguments in registers don't reserve stack space,
// except in varargs functions.
if (VR_idx != Num_VR_Regs) {
unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
if (isVarArg) {
while ((ArgOffset % 16) != 0) {
ArgOffset += PtrByteSize;
if (GPR_idx != Num_GPR_Regs)
ArgOffset += 16;
GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
} else {
if (!isVarArg && !isPPC64) {
// Vectors go after all the nonvectors.
CurArgOffset = VecArgOffset;
VecArgOffset += 16;
} else {
// Vectors are aligned.
ArgOffset = ((ArgOffset+15)/16)*16;
CurArgOffset = ArgOffset;
ArgOffset += 16;
needsLoad = true;
// We need to load the argument to a virtual register if we determined above
// that we ran out of physical registers of the appropriate type.
if (needsLoad) {
// The object is placed in the last ObjSize bytes of its ArgSize-byte slot.
int FI = MFI.CreateFixedObject(ObjSize,
CurArgOffset + (ArgSize - ObjSize),
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
// Allow for Altivec parameters at the end, if needed.
if (nAltivecParamsAtEnd) {
MinReservedArea = ((MinReservedArea+15)/16)*16;
MinReservedArea += 16*nAltivecParamsAtEnd;
// Area that is at least reserved in the caller of this function.
// NOTE(review): the 8 * PtrByteSize term presumably covers one slot per
// argument GPR (Num_GPR_Regs is 8) -- confirm.
MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);
// Set the size that is at least reserved in caller of this function. Tail
// call optimized functions' reserved stack space needs to be aligned so that
// taking the difference between two stack areas will result in an aligned
// stack.
MinReservedArea =
EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
if (isVarArg) {
int Depth = ArgOffset;
Depth, true));
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// If this function is vararg, store any remaining integer argument regs
// to their spots on the stack so that they may be loaded by dereferencing
// the result of va_next.
for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
unsigned VReg;
if (isPPC64)
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
// Increment the address by the pointer size in bytes
// (PtrVT.getSizeInBits()/8) for the next argument to store.
SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
// Merge all pending memory operations into a single chain value.
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
return Chain;
/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
/// adjusted to accommodate the arguments for the tailcall.
///
/// \param DAG        Selection DAG, used to reach the function's
///                   PPCFunctionInfo.
/// \param isTailCall When false, 0 is returned immediately.
/// \param ParamSize  Size subtracted from the caller's minimum reserved area.
/// \returns the caller's minimum reserved area minus \p ParamSize, as a
///          signed value.
static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
unsigned ParamSize) {
if (!isTailCall) return 0;
PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
unsigned CallerMinReservedArea = FI->getMinReservedArea();
// Both operands are converted to int first so the difference can be negative.
int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
// Remember only if the new adjustment is bigger.
// NOTE(review): the statement this condition is meant to guard appears to be
// missing here; as written the `if` only guards the return below -- confirm
// against the upstream source.
if (SPDiff < FI->getTailCallSPDelta())
return SPDiff;
static bool isFunctionGlobalAddress(SDValue Callee);
/// Decide whether a call from \p Caller to \p Callee can be assumed to use the
/// same TOC base. Each check below returns false as soon as sharing cannot be
/// established; true is returned for medium/large code models once the callee
/// checks pass, or when the section checks at the end all pass.
///
/// \param Caller Function containing the call; must not use PC-relative calls
///               (asserted in builds without NDEBUG).
/// \param Callee Call target node; only GlobalAddressSDNode callees can yield
///               true.
/// \param TM     Target machine, queried for subtargets, DSO-locality, code
///               model and function-sections.
///
/// NOTE(review): the `#endif` matching `#ifndef NDEBUG` and several closing
/// braces are not present in this copy.
static bool callsShareTOCBase(const Function *Caller, SDValue Callee,
const TargetMachine &TM) {
// It does not make sense to call callsShareTOCBase() with a caller that
// is PC Relative since PC Relative callers do not have a TOC.
#ifndef NDEBUG
const PPCSubtarget *STICaller = &TM.getSubtarget<PPCSubtarget>(*Caller);
assert(!STICaller->isUsingPCRelativeCalls() &&
"PC Relative callers do not have a TOC and cannot share a TOC Base");
// Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
// don't have enough information to determine if the caller and callee share
// the same TOC base, so we have to pessimistically assume they don't for
// correctness.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (!G)
return false;
const GlobalValue *GV = G->getGlobal();
// If the callee is preemptable, then the static linker will use a plt-stub
// which saves the toc to the stack, and needs a nop after the call
// instruction to convert to a toc-restore.
if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
return false;
// Functions with PC Relative enabled may clobber the TOC in the same DSO.
// We may need a TOC restore in the situation where the caller requires a
// valid TOC but the callee is PC Relative and does not.
const Function *F = dyn_cast<Function>(GV);
const GlobalAlias *Alias = dyn_cast<GlobalAlias>(GV);
// If we have an Alias we can try to get the function from there.
if (Alias) {
const GlobalObject *GlobalObj = Alias->getBaseObject();
F = dyn_cast<Function>(GlobalObj);
// If we still have no valid function pointer we do not have enough
// information to determine if the callee uses PC Relative calls so we must
// assume that it does.
if (!F)
return false;
// If the callee uses PC Relative we cannot guarantee that the callee won't
// clobber the TOC of the caller and so we must assume that the two
// functions do not share a TOC base.
const PPCSubtarget *STICallee = &TM.getSubtarget<PPCSubtarget>(*F);
if (STICallee->isUsingPCRelativeCalls())
return false;
// The medium and large code models are expected to provide a sufficiently
// large TOC to provide all data addressing needs of a module with a
// single TOC.
if (CodeModel::Medium == TM.getCodeModel() ||
CodeModel::Large == TM.getCodeModel())
return true;
// Otherwise we need to ensure callee and caller are in the same section,
// since the linker may allocate multiple TOCs, and we don't know which
// sections will belong to the same TOC base.
if (!GV->isStrongDefinitionForLinker())
return false;
// Any explicitly-specified sections and section prefixes must also match.
// Also, if we're using -ffunction-sections, then each function is always in
// a different section (the same is true for COMDAT functions).
if (TM.getFunctionSections() || GV->hasComdat() || Caller->hasComdat() ||
GV->getSection() != Caller->getSection())
return false;
// Section prefixes are only compared when the callee value is a Function.
if (const auto *F = dyn_cast<Function>(GV)) {
if (F->getSectionPrefix() != Caller->getSectionPrefix())
return false;
return true;
/// Scan the outgoing arguments in \p Outs and report whether any of them,
/// other than 'nest' arguments, is flagged by CalculateStackSlotUsed.
/// Uses an 8-byte pointer size, the 8 GPRs X3-X10, 13 FPRs and the vector
/// registers V2-V13 as the available argument registers.
///
/// NOTE(review): the array terminators and the tail of the
/// CalculateStackSlotUsed call are not present in this copy.
static bool
needStackSlotPassParameters(const PPCSubtarget &Subtarget,
const SmallVectorImpl<ISD::OutputArg> &Outs) {
const unsigned PtrByteSize = 8;
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
const unsigned NumGPRs = array_lengthof(GPR);
const unsigned NumFPRs = 13;
const unsigned NumVRs = array_lengthof(VR);
// Parameter area size: one pointer-sized slot per argument GPR.
const unsigned ParamAreaSize = NumGPRs * PtrByteSize;
// Running state handed to CalculateStackSlotUsed for every argument.
unsigned NumBytes = LinkageSize;
unsigned AvailableFPRs = NumFPRs;
unsigned AvailableVRs = NumVRs;
for (const ISD::OutputArg& Param : Outs) {
// 'nest' arguments are skipped entirely.
if (Param.Flags.isNest()) continue;
if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags,
PtrByteSize, LinkageSize, ParamAreaSize,
NumBytes, AvailableFPRs, AvailableVRs,
return true;
return false;
/// Compare the argument list of call \p CB with the formal arguments of
/// \p CallerFn, position by position. Returns false when the argument counts
/// differ or a position fails the checks below, and true when the loop
/// completes.
///
/// NOTE(review): the bodies/tails of the two `if` conditions inside the loop
/// are not present in this copy, so the exact per-argument acceptance rule
/// cannot be read from here.
static bool hasSameArgumentList(const Function *CallerFn, const CallBase &CB) {
if (CB.arg_size() != CallerFn->arg_size())
return false;
// Walk the call's arguments and the caller's formals in lock step.
auto CalleeArgIter = CB.arg_begin();
auto CalleeArgEnd = CB.arg_end();
Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
const Value* CalleeArg = *CalleeArgIter;
const Value* CallerArg = &(*CallerArgIter);
if (CalleeArg == CallerArg)
// e.g. @caller([4 x i64] %a, [4 x i64] %b) {
// tail call @callee([4 x i64] undef, [4 x i64] %b)
// }
// 1st argument of callee is undef and has the same type as caller.
if (CalleeArg->getType() == CallerArg->getType() &&
return false;
return true;
// Decide whether the caller/callee calling-convention pair permits TCO.
// Returns true only when both conventions are ccc or fastcc and the caller is
// either ccc or uses the same convention as the callee.
static bool
areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
                                    CallingConv::ID CalleeCC) {
  // Only ccc and fastcc take part in tail calls; reject anything else on
  // either side of the call.
  const bool CallerIsC = CallerCC == CallingConv::C;
  const bool CallerOk = CallerIsC || CallerCC == CallingConv::Fast;
  const bool CalleeOk =
      CalleeCC == CallingConv::C || CalleeCC == CallingConv::Fast;
  if (!(CallerOk && CalleeOk))
    return false;

  // A ccc caller may tail call ccc and fastcc callees alike. A fastcc caller
  // may have less stack space than a non-fastcc caller with the same
  // signature, so it is limited to callees of its own convention.
  if (CallerIsC)
    return true;
  return CallerCC == CalleeCC;
}
/// Decide whether a call may be lowered as a tail call / sibling call on
/// 64-bit SVR4. The checks run in order and the first failing one returns
/// false: tail-call options, varargs, calling conventions, byval arguments on
/// either side, stack-passed parameters across differing conventions, and
/// (without PC-relative calls) indirect callees and TOC sharing.
///
/// \param Callee   Call target node.
/// \param CalleeCC Calling convention of the callee.
/// \param CB       IR call site; may be null (see the comment near the end).
/// \param isVarArg True for variadic calls, which are never eligible.
/// \param Outs     Outgoing arguments of the call.
/// \param Ins      Incoming arguments of the calling function.
/// \param DAG      Selection DAG, used to reach the calling function.
bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
SDValue Callee, CallingConv::ID CalleeCC, const CallBase *CB, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
// With DisableSCO set, only guaranteed tail-call optimization can still apply.
if (DisableSCO && !TailCallOpt) return false;
// Variadic argument functions are not supported.
if (isVarArg) return false;
auto &Caller = DAG.getMachineFunction().getFunction();
// Check that the calling conventions are compatible for tco.
if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC))
return false;
// Caller contains any byval parameter is not supported.
if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
return false;
// Callee contains any byval parameter is not supported, too.
// Note: This is a quick work around, because in some cases, e.g.
// caller's stack size > callee's stack size, we are still able to apply
// sibling call optimization. For example, gcc is able to do SCO for caller1
// in the following example, but not for caller2.
// struct test {
// long int a;
// char ary[56];
// } gTest;
// __attribute__((noinline)) int callee(struct test v, struct test *b) {
// b->a = v.a;
// return 0;
// }
// void caller1(struct test a, struct test c, struct test *b) {
// callee(gTest, b); }
// void caller2(struct test *b) { callee(gTest, b); }
if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
return false;
// If callee and caller use different calling conventions, we cannot pass
// parameters on stack since offsets for the parameter area may be different.
if (Caller.getCallingConv() != CalleeCC &&
needStackSlotPassParameters(Subtarget, Outs))
return false;
// All variants of 64-bit ELF ABIs without PC-Relative addressing require that
// the caller and callee share the same TOC for TCO/SCO. If the caller and
// callee potentially have different TOC bases then we cannot tail call since
// we need to restore the TOC pointer after the call.
// ref:
// We cannot guarantee this for indirect calls or calls to external functions.
// When PC-Relative addressing is used, the concept of the TOC is no longer
// applicable so this check is not required.
// Check first for indirect calls.
if (!Subtarget.isUsingPCRelativeCalls() &&
!isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee))
return false;
// Check if we share the TOC base.
if (!Subtarget.isUsingPCRelativeCalls() &&
!callsShareTOCBase(&Caller, Callee, getTargetMachine()))
return false;
// TCO allows altering callee ABI, so we don't have to check further.
if (CalleeCC == CallingConv::Fast && TailCallOpt)
return true;
// From here on only sibling-call optimization remains.
if (DisableSCO) return false;
// If callee use the same argument list that caller is using, then we can
// apply SCO on this case. If it is not, then we need to check if callee needs
// stack for passing arguments.
// PC Relative tail calls may not have a CallBase.
// If there is no CallBase we cannot verify if we have the same argument
// list so assume that we don't have the same argument list.
if (CB && !hasSameArgumentList(&Caller, *CB) &&
needStackSlotPassParameters(Subtarget, Outs))
return false;
else if (!CB && needStackSlotPassParameters(Subtarget, Outs))
return false;
return true;
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
///
/// Returns false unless GuaranteedTailCallOpt is enabled and the call is not
/// variadic. For fastcc-to-fastcc calls, byval arguments in \p Ins reject the
/// call; non-PIC relocation models accept it; under PIC only hidden or
/// protected global callees are accepted.
///
/// NOTE(review): the line carrying the return type and several closing braces
/// are not present in this copy.
PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
CallingConv::ID CalleeCC,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const {
if (!getTargetMachine().Options.GuaranteedTailCallOpt)
return false;
// Variable argument functions are not supported.
if (isVarArg)
return false;
MachineFunction &MF = DAG.getMachineFunction();
CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
// Only fastcc callers calling fastcc callees are considered.
if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
// Functions containing by val parameters are not supported.
for (unsigned i = 0; i != Ins.size(); i++) {
ISD::ArgFlagsTy Flags = Ins[i].Flags;
if (Flags.isByVal()) return false;
// Non-PIC/GOT tail calls are supported.
if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
return true;
// At the moment we can only do local tail calls (in same module, hidden
// or protected) if we are generating PIC.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
return G->getGlobal()->hasHiddenVisibility()
|| G->getGlobal()->hasProtectedVisibility();
return false;
/// isBLACompatibleAddress - Return the immediate to use if the specified
/// 32-bit value is representable in the immediate field of a BxA instruction.
/// Returns nullptr when \p Op is not a ConstantSDNode, when its low two bits
/// are not zero, or when it is not the sign extension of its low 26 bits.
///
/// NOTE(review): the return expression is truncated in this copy.
static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C) return nullptr;
int Addr = C->getZExtValue();
if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
SignExtend32<26>(Addr) != Addr)
return nullptr; // Top 6 bits have to be sext of immediate.
// The value handed on is the address shifted right by two bits.
return DAG
(int)C->getZExtValue() >> 2, SDLoc(Op),
namespace {
/// Record describing one tail-call argument that is to be written to a fixed
/// stack object: filled in by CalculateTailCallArgDest and consumed by
/// StoreTailCallArgumentsToStackSlot.
struct TailCallArgumentInfo {
// Argument value to be stored.
SDValue Arg;
// Frame-index node used as the store address.
SDValue FrameIdxOp;
// Index of the fixed stack object backing FrameIdxOp.
int FrameIdx = 0;
TailCallArgumentInfo() = default;
} // end anonymous namespace
/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
/// Iterates over \p TailCallArgs and, for each entry, uses its value, its
/// frame-index node and its fixed stack object index to address the store.
///
/// NOTE(review): the head of the store statement inside the loop is not
/// present in this copy; \p MemOpChains is not referenced by the visible code.
static void StoreTailCallArgumentsToStackSlot(
SelectionDAG &DAG, SDValue Chain,
const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
SDValue Arg = TailCallArgs[i].Arg;
SDValue FIN = TailCallArgs[i].FrameIdxOp;
int FI = TailCallArgs[i].FrameIdx;
// Store relative to framepointer.
Chain, dl, Arg, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
/// the appropriate stack slot for the tail call optimized function call.
///
/// When \p SPDiff is non-zero, a fixed stack object is created at
/// SPDiff + the frame lowering's return-save offset and \p OldRetAddr is
/// stored into it. Returns the resulting chain.
///
/// NOTE(review): only the return address is stored by the visible code;
/// \p OldFP is not referenced here.
static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
SDValue OldRetAddr, SDValue OldFP,
int SPDiff, const SDLoc &dl) {
if (SPDiff) {
// Calculate the new stack slot for the return address.
MachineFunction &MF = DAG.getMachineFunction();
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
const PPCFrameLowering *FL = Subtarget.getFrameLowering();
bool isPPC64 = Subtarget.isPPC64();
// The slot is pointer-sized: 8 bytes on PPC64, otherwise 4.
int SlotSize = isPPC64 ? 8 : 4;
int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize,
NewRetAddrLoc, true);
EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(MF, NewRetAddr));
return Chain;
/// CalculateTailCallArgDest - Remember Argument for later processing. Calculate
/// the position of the argument.
///
/// Creates a fixed stack object of the argument's size (in whole bytes) at
/// \p ArgOffset + \p SPDiff and fills a TailCallArgumentInfo with the
/// argument, the frame-index node and the object index.
///
/// NOTE(review): the statement that hands Info to \p TailCallArguments is not
/// present in this copy.
static void
CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
SDValue Arg, int SPDiff, unsigned ArgOffset,
SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
int Offset = ArgOffset + SPDiff;
// Size of the value in bytes, rounding any partial byte up.
uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8;
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
SDValue FIN = DAG.getFrameIndex(FI, VT);
TailCallArgumentInfo Info;
Info.Arg = Arg;
Info.FrameIdxOp = FIN;
Info.FrameIdx = FI;
/// EmitTailCallLoadFPAndRetAddr - Emit load from frame pointer and return
/// address stack slot. Returns the chain as result and the loaded values in
/// LROpOut/FPOpOut. Used when tail calling.
///
/// When \p SPDiff is non-zero, the return address is loaded from the frame
/// index given by getReturnAddrFrameIndex and the chain is replaced by the
/// load's chain result (value 1).
///
/// NOTE(review): the visible code assigns only \p LROpOut; \p FPOpOut is not
/// written here.
SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
SDValue &FPOpOut, const SDLoc &dl) const {
if (SPDiff) {
// Load the LR and FP stack slot for later adjusting.
EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
LROpOut = getReturnAddrFrameIndex(DAG);
LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());
Chain = SDValue(LROpOut.getNode(), 1);
return Chain;
/// CreateCopyOfByValArgument - Emit a memcpy that copies a byval aggregate
/// from address "Src" to address "Dst". The number of bytes and the alignment
/// both come from the argument flags. The copy is what gets passed as the
/// byval function parameter.
/// The source is not always a whole object: it can be the tail of a larger
/// object, i.e. the part that did not fit in registers.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  // Byte count of the copy, materialized as an i32 constant node.
  const unsigned ByValBytes = Flags.getByValSize();
  SDValue ByValSize = DAG.getConstant(ByValBytes, dl, MVT::i32);

  SDValue Copy = DAG.getMemcpy(Chain, dl, Dst, Src, ByValSize,
                               Flags.getNonZeroByValAlign(), false, false,
                               false, MachinePointerInfo(),
                               MachinePointerInfo());
  return Copy;
}
/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
/// tail calls.
///
/// For non-tail calls a store of \p Arg to \p PtrOff is built; for vector
/// arguments \p PtrOff is first recomputed as stack pointer + \p ArgOffset.
/// For tail calls the argument's destination is recorded through
/// CalculateTailCallArgDest instead.
///
/// NOTE(review): the `else` between the two StackPtr assignments, the head of
/// the store statement and the tail of the CalculateTailCallArgDest call are
/// not present in this copy.
static void LowerMemOpCallTo(
SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
if (!isTailCall) {
if (isVector) {
// Vector arguments are addressed relative to the stack pointer register
// (X1 as i64, R1 as i32).
SDValue StackPtr;
if (isPPC64)
StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
DAG.getConstant(ArgOffset, dl, PtrVT));
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
// Calculate and remember argument location.
} else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
/// Emit the DAG nodes that precede a tail call node: store the recorded
/// tail-call arguments to their stack slots, store the return address to its
/// new slot, and close the call sequence with CALLSEQ_END.
///
/// \param InFlag  Reset to an empty SDValue on entry; on exit holds result 1
///                of the CALLSEQ_END node.
/// \param Chain   Updated in place as nodes are added.
/// \param SPDiff  Forwarded to EmitTailCallStoreFPAndRetAddr.
/// \param NumBytes First size operand given to CALLSEQ_END (the second is 0).
/// \param LROp, FPOp Forwarded to EmitTailCallStoreFPAndRetAddr.
/// \param TailCallArguments Arguments to be stored to their stack slots.
static void
PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
SDValue FPOp,
SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
// Emit a sequence of copyto/copyfrom virtual registers for arguments that
// might overwrite each other in case of tail call optimization.
SmallVector<SDValue, 8> MemOpChains2;
// Do not flag preceding copytoreg stuff together with the following stuff.
InFlag = SDValue();
StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
MemOpChains2, dl);
// Merge the argument stores, if any, into a single chain value.
if (!MemOpChains2.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);
// Emit callseq_end just before tailcall node.
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
InFlag = Chain.getValue(1);
// Returns true when Callee is a global address node that refers to a value of
// function type, i.e. something that can be called by name (as opposed to
// something that must hold a descriptor for an indirect call).
static bool isFunctionGlobalAddress(SDValue Callee) {
  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee);
  if (!GA)
    return false;

  // TLS address nodes are rejected regardless of the value they refer to.
  const unsigned Opc = Callee.getOpcode();
  if (Opc == ISD::GlobalTLSAddress || Opc == ISD::TargetGlobalTLSAddress)
    return false;

  return GA->getGlobal()->getValueType()->isFunctionTy();
}
/// Copy the values returned by a call out of their assigned physical
/// registers. The return-value convention is RetCC_PPC_Cold for SVR4 targets
/// with CallingConv::Cold and RetCC_PPC otherwise. Every location must be a
/// register location (asserted). Returns the updated chain.
///
/// NOTE(review): several lines are not present in this copy (the tail of the
/// CCState constructor, the head of the analysis call, the trailing arguments
/// of some getCopyFromReg/getNode calls, the `break`s in the switch and the
/// statement that records Val), so only the visible steps are described.
SDValue PPCTargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
? RetCC_PPC_Cold
: RetCC_PPC);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Val;
// With SPE, an f64 result occupies two consecutive locations that are read
// as two i32 halves and recombined with BUILD_SPE64.
if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
Chain = Lo.getValue(1);
InFlag = Lo.getValue(2);
VA = RVLocs[++i]; // skip ahead to next loc
SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
Chain = Hi.getValue(1);
InFlag = Hi.getValue(2);
// The two halves are swapped on big-endian subtargets.
if (!Subtarget.isLittleEndian())
std::swap (Lo, Hi);
Val = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, Lo, Hi);
} else {
Val = DAG.getCopyFromReg(Chain, dl,
VA.getLocReg(), VA.getLocVT(), InFlag);
// Thread the chain (result 1) and glue (result 2) through each copy.
Chain = Val.getValue(1);
InFlag = Val.getValue(2);
// Convert from the location type back to the value type where they differ.
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::AExt:
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
case CCValAssign::ZExt:
Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
case CCValAssign::SExt:
Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
return Chain;
static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
const PPCSubtarget &Subtarget, bool isPatchPoint) {
// PatchPoint calls are not indirect.
if (isPatchPoint)
return false;
if (isFunctionGlobalAddress(Callee) || dyn_cast<ExternalSymbolSDNode>(Callee))
return false;
// Darwin, and 32-bit ELF can use a BLA. The descriptor based ABIs can not
// becuase the immediate function pointer points to a descriptor instead of
// a function entry point. The ELFv2 ABI cannot use a BLA because the function
// pointer immediate points to the global entry point, while the BLA would
// need to jump to the local entry point (see rL211174).
if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI() &&
isBLACompatibleAddress(Callee, DAG))
return false;
return true;
// A TOC save/restore around calls is required on AIX, and on the 64-bit ELF
// ABIs unless PC-relative calls are in use.
static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) {
  if (Subtarget.isAIXABI())
    return true;
  if (!Subtarget.is64BitELFABI())
    return false;
  return !Subtarget.isUsingPCRelativeCalls();
}
/// Select the PPCISD opcode used for a call, based on the call flags
/// (tail call, indirect call), the subtarget's ABI and whether caller and
/// callee share a TOC base. The fall-through result is PPCISD::CALL.
///
/// NOTE(review): several return statements and the second arms of the two
/// conditional expressions are not present in this copy, so the opcode chosen
/// for the tail-call, PC-relative and non-shared-TOC cases cannot be read
/// from here.
static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
const Function &Caller,
const SDValue &Callee,
const PPCSubtarget &Subtarget,
const TargetMachine &TM) {
if (CFlags.IsTailCall)
// This is a call through a function pointer.
if (CFlags.IsIndirect) {
// AIX and the 64-bit ELF ABIs need to maintain the TOC pointer across
// indirect calls. The save of the caller's TOC pointer to the stack will be
// inserted into the DAG as part of call lowering. The restore of the TOC
// pointer is modeled by using a pseudo instruction for the call opcode that
// represents the 2 instruction sequence of an indirect branch and link,
// immediately followed by a load of the TOC pointer from the stack save
// slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
// as it is not saved or used.
return isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
if (Subtarget.isUsingPCRelativeCalls()) {
assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
// The ABIs that maintain a TOC pointer across calls need to have a nop
// immediately following the call instruction if the caller and callee may
// have different TOC bases. At link time if the linker determines the calls
// may not share a TOC base, the call is redirected to a trampoline inserted
// by the linker. The trampoline will (among other things) save the callers
// TOC pointer at an ABI designated offset in the linkage area and the linker
// will rewrite the nop to be a load of the TOC pointer from the linkage area
// into gpr2.
if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
return callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL
return PPCISD::CALL;
// Rewrites the callee of a direct call into the node the call instruction
// will reference:
//  - a BLA-compatible absolute address, on ABIs without function descriptors
//    other than ELFv2;
//  - a target global address or target external symbol on non-AIX targets,
//    flagged with MO_PLT when the PLT is used;
//  - on AIX, an MCSymbol for the "."-prefixed function entry point.
// Any other callee is returned unchanged.
static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
const SDLoc &dl, const PPCSubtarget &Subtarget) {
if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI())
if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG))
return SDValue(Dest, 0);
// Returns true if the callee is local, and false otherwise.
auto isLocalCallee = [&]() {
const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
// GV stays null when the callee is not a global address node.
const GlobalValue *GV = G ? G->getGlobal() : nullptr;
return DAG.getTarget().shouldAssumeDSOLocal(*Mod, GV) &&
// The PLT is only used in 32-bit ELF PIC mode. Attempting to use the PLT in
// a static relocation model causes some versions of GNU LD (2.17.50, at
// least) to force BSS-PLT, instead of secure-PLT, even if all objects are
// built with secure-PLT.
bool UsePlt =
Subtarget.is32BitELFABI() && !isLocalCallee() &&
Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_;
// On AIX, direct function calls reference the symbol for the function's
// entry point, which is named by prepending a "." before the function's
// C-linkage name.
const auto getAIXFuncEntryPointSymbolSDNode =
[&](StringRef FuncName, bool IsDeclaration,
const XCOFF::StorageClass &SC) {
auto &Context = DAG.getMachineFunction().getMMI().getContext();
MCSymbolXCOFF *S = cast<MCSymbolXCOFF>(
Context.getOrCreateSymbol(Twine(".") + Twine(FuncName)));
if (IsDeclaration && !S->hasRepresentedCsectSet()) {
// On AIX, an undefined symbol needs to be associated with a
// MCSectionXCOFF to get the correct storage mapping class.
// In this case, XCOFF::XMC_PR.
MCSectionXCOFF *Sec = Context.getXCOFFSection(
S->getSymbolTableName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC,
return DAG.getMCSymbol(S, PtrVT);
// Callee is the global address of a function.
if (isFunctionGlobalAddress(Callee)) {
const GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
const GlobalValue *GV = G->getGlobal();
if (!Subtarget.isAIXABI())
return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0,
UsePlt ? PPCII::MO_PLT : 0);
assert(!isa<GlobalIFunc>(GV) && "IFunc is not supported on AIX.");
const GlobalObject *GO = cast<GlobalObject>(GV);
const XCOFF::StorageClass SC =
return getAIXFuncEntryPointSymbolSDNode(GO->getName(), GO->isDeclaration(),
// Callee is an external symbol referenced by name.
if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *SymName = S->getSymbol();
if (!Subtarget.isAIXABI())
return DAG.getTargetExternalSymbol(SymName, Callee.getValueType(),
UsePlt ? PPCII::MO_PLT : 0);
// If there exists a user-declared function whose name is the same as the
// ExternalSymbol's, then we pick up the user-declared version.
const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
if (const Function *F =
dyn_cast_or_null<Function>(Mod->getNamedValue(SymName))) {
const XCOFF::StorageClass SC =
return getAIXFuncEntryPointSymbolSDNode(F->getName(), F->isDeclaration(),
// No matching function in the module: treat the symbol as an external
// declaration.
return getAIXFuncEntryPointSymbolSDNode(SymName, true, XCOFF::C_EXT);
// No transformation needed.
assert(Callee.getNode() && "What no callee?");
return Callee;
// Returns the output chain of a CALLSEQ_START node. The chain is the node's
// last result value, unless that last value is glue, in which case the chain
// is the value just before it.
static SDValue getOutputChainFromCallSeq(SDValue CallSeqStart) {
  assert(CallSeqStart.getOpcode() == ISD::CALLSEQ_START &&
         "Expected a CALLSEQ_STARTSDNode.");

  const unsigned NumResults = CallSeqStart->getNumValues();
  SDValue Tail = CallSeqStart.getValue(NumResults - 1);
  const bool TailIsGlue = Tail.getValueType() == MVT::Glue;
  return TailIsGlue ? CallSeqStart.getValue(NumResults - 2) : Tail;
}
// Builds the MTCTR node that moves a function's address into the count
// register ahead of an indirect call instruction. On return, Chain is the
// node's chain result and Glue is its glue result.
static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
                                SDValue &Glue, SDValue &Chain,
                                const SDLoc &dl) {
  EVT ReturnTypes[] = {MVT::Other, MVT::Glue};
  SDValue MTCTROps[] = {Chain, Callee, Glue};

  // The incoming glue is passed as an operand only when it is set.
  const unsigned NumOperands = Glue.getNode() ? 3 : 2;
  Chain = DAG.getNode(PPCISD::MTCTR, dl, makeArrayRef(ReturnTypes, 2),
                      makeArrayRef(MTCTROps, NumOperands));

  // Result 0 is the chain; result 1 is the glue.
  Glue = Chain.getValue(1);
}
// Prepares an indirect call on ABIs that use function descriptors. Three
// loads are issued from the descriptor addressed by Callee (entry point, TOC
// anchor, environment pointer), all chained on the output chain of
// CallSeqStart. The TOC anchor and, unless hasNest is set, the environment
// pointer are then copied into their registers with the copies glued
// together. Finally the loaded entry point is handed to
// prepareIndirectCall(). Chain and Glue are updated in place.
static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
SDValue &Glue, SDValue &Chain,
SDValue CallSeqStart,
const CallBase *CB, const SDLoc &dl,
bool hasNest,
const PPCSubtarget &Subtarget) {
// Function pointers in the 64-bit SVR4 ABI do not point to the function
// entry point, but to the function descriptor (the function entry point
// address is part of the function descriptor though).
// The function descriptor is a three doubleword structure with the
// following fields: function entry point, TOC base address and
// environment pointer.
// Thus for a call through a function pointer, the following actions need
// to be performed:
// 1. Save the TOC of the caller in the TOC save area of its stack
// frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
// 2. Load the address of the function entry point from the function
// descriptor.
// 3. Load the TOC of the callee from the function descriptor into r2.
// 4. Load the environment pointer from the function descriptor into
// r11.
// 5. Branch to the function entry point address.
// 6. On return of the callee, the TOC of the caller needs to be
// restored (this is done in FinishCall()).
// The loads are scheduled at the beginning of the call sequence, and the
// register copies are flagged together to ensure that no other
// operations can be scheduled in between. E.g. without flagging the
// copies together, a TOC access in the caller could be scheduled between
// the assignment of the callee TOC and the branch to the callee, which leads
// to incorrect code.
// Start by loading the function address from the descriptor.
SDValue LDChain = getOutputChainFromCallSeq(CallSeqStart);
auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
? (MachineMemOperand::MODereferenceable |
: MachineMemOperand::MONone;
// The pointer info is based on the called operand when the call site is
// known, and is empty otherwise.
MachinePointerInfo MPI(CB ? CB->getCalledOperand() : nullptr);
// Registers used in building the DAG.
const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister();
const MCRegister TOCReg = Subtarget.getTOCPointerRegister();
// Offsets of descriptor members.
const unsigned TOCAnchorOffset = Subtarget.descriptorTOCAnchorOffset();
const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset();
// Descriptor members are register-sized and use the matching alignment.
const MVT RegVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
const unsigned Alignment = Subtarget.isPPC64() ? 8 : 4;
// One load for the function's entry point address.
SDValue LoadFuncPtr = DAG.getLoad(RegVT, dl, LDChain, Callee, MPI,
Alignment, MMOFlags);
// One for loading the TOC anchor for the module that contains the called
// function.
SDValue TOCOff = DAG.getIntPtrConstant(TOCAnchorOffset, dl);
SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, Callee, TOCOff);
SDValue TOCPtr =
DAG.getLoad(RegVT, dl, LDChain, AddTOC,
MPI.getWithOffset(TOCAnchorOffset), Alignment, MMOFlags);
// One for loading the environment pointer.
SDValue PtrOff = DAG.getIntPtrConstant(EnvPtrOffset, dl);
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, RegVT, Callee, PtrOff);
SDValue LoadEnvPtr =
DAG.getLoad(RegVT, dl, LDChain, AddPtr,
MPI.getWithOffset(EnvPtrOffset), Alignment, MMOFlags);
// Then copy the newly loaded TOC anchor to the TOC pointer.
SDValue TOCVal = DAG.getCopyToReg(Chain, dl, TOCReg, TOCPtr, Glue);
Chain = TOCVal.getValue(0);
Glue = TOCVal.getValue(1);
// If the function call has an explicit 'nest' parameter, it takes the
// place of the environment pointer.
assert((!hasNest || !Subtarget.isAIXABI()) &&
"Nest parameter is not supported on AIX.");
if (!hasNest) {
SDValue EnvVal = DAG.getCopyToReg(Chain, dl, EnvPtrReg, LoadEnvPtr, Glue);
Chain = EnvVal.getValue(0);
Glue = EnvVal.getValue(1);
// The rest of the indirect call sequence is the same as the non-descriptor
// DAG.
prepareIndirectCall(DAG, LoadFuncPtr, Glue, Chain, dl);
// Fills Ops with the operand list of the call node. As described by the
// comments below, the order is: chain; callee (direct calls) or the TOC
// restore address (indirect calls that need a TOC save/restore); the
// environment pointer register; for tail calls the CTR register and the stack
// pointer delta; the argument registers; the TOC pointer register where
// applicable; CR1EQ for 32-bit SVR4 vararg calls; the call-preserved register
// mask; and finally the glue, when valid.
static void
buildCallOperands(SmallVectorImpl<SDValue> &Ops,
PPCTargetLowering::CallFlags CFlags, const SDLoc &dl,
SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff,
const PPCSubtarget &Subtarget) {
const bool IsPPC64 = Subtarget.isPPC64();
// MVT for a general purpose register.
const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
// First operand is always the chain.
// If it's a direct call pass the callee as the second operand.
if (!CFlags.IsIndirect)
else {
assert(!CFlags.IsPatchPoint && "Patch point calls are not indirect.");
// For the TOC based ABIs, we have saved the TOC pointer to the linkage area
// on the stack (this would have been done in `LowerCall_64SVR4` or
// `LowerCall_AIX`). The call instruction is a pseudo instruction that
// represents both the indirect branch and a load that restores the TOC
// pointer from the linkage area. The operand for the TOC restore is an add
// of the TOC save offset to the stack pointer. This must be the second
// operand: after the chain input but before any other variadic arguments.
// For 64-bit ELFv2 ABI with PCRel, do not restore the TOC as it is not
// saved or used.
if (isTOCSaveRestoreRequired(Subtarget)) {
const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
SDValue StackPtr = DAG.getRegister(StackPtrReg, RegVT);
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, StackPtr, TOCOff);
// Add the register used for the environment pointer.
if (Subtarget.usesFunctionDescriptors() && !CFlags.HasNest)
// Add CTR register as callee so a bctr can be emitted later.
if (CFlags.IsTailCall)
Ops.push_back(DAG.getRegister(IsPPC64 ? PPC::CTR8 : PPC::CTR, RegVT));
// If this is a tail call add stack pointer delta.
if (CFlags.IsTailCall)
Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
// We cannot add R2/X2 as an operand here for PATCHPOINT, because there is
// no way to mark dependencies as implicit here.
// We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) &&
!CFlags.IsPatchPoint && !Subtarget.isUsingPCRelativeCalls())
Ops.push_back(DAG.getRegister(Subtarget.getTOCPointerRegister(), RegVT));
// Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
if (CFlags.IsVarArg && Subtarget.is32BitELFABI())
Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask =
TRI->getCallPreservedMask(DAG.getMachineFunction(), CFlags.CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
// If the glue is valid, it is the last operand.
if (Glue.getNode())
// Emits the call node and finishes the call sequence. The opcode comes from
// getCallOpcode(). A direct callee is rewritten by transformCallee(); an
// indirect callee is prepared by prepareDescriptorIndirectCall() or
// prepareIndirectCall(). The operand list is built by buildCallOperands().
// A tail call returns the call node directly, with only a chain result.
// Otherwise the call is followed by CALLSEQ_END and its results are lowered
// through LowerCallResult() into InVals.
SDValue PPCTargetLowering::FinishCall(
CallFlags CFlags, const SDLoc &dl, SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue,
SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<SDValue> &InVals, const CallBase *CB) const {
if ((Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls()) ||
unsigned CallOpc =
getCallOpcode(CFlags, DAG.getMachineFunction().getFunction(), Callee,
Subtarget, DAG.getTarget());
// Direct calls rewrite the callee node; indirect calls set up the branch
// target (and, with function descriptors, the TOC and environment pointer).
if (!CFlags.IsIndirect)
Callee = transformCallee(Callee, DAG, dl, Subtarget);
else if (Subtarget.usesFunctionDescriptors())
prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CB,
dl, CFlags.HasNest, Subtarget);
prepareIndirectCall(DAG, Callee, Glue, Chain, dl);
// Build the operand list for the call instruction.
SmallVector<SDValue, 8> Ops;
buildCallOperands(Ops, CFlags, dl, DAG, RegsToPass, Glue, Chain, Callee,
SPDiff, Subtarget);
// Emit tail call.
if (CFlags.IsTailCall) {
// Indirect tail call when using PC Relative calls do not have the same
// constraints.
assert(((Callee.getOpcode() == ISD::Register &&
cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
Callee.getOpcode() == ISD::TargetExternalSymbol ||
Callee.getOpcode() == ISD::TargetGlobalAddress ||
isa<ConstantSDNode>(Callee) ||
(CFlags.IsIndirect && Subtarget.isUsingPCRelativeCalls())) &&
"Expecting a global address, external symbol, absolute value, "
"register or an indirect tail call when PC Relative calls are "
// PC Relative calls also use TC_RETURN as the way to mark tail calls.
assert(CallOpc == PPCISD::TC_RETURN &&
"Unexpected call opcode for a tail call.");
return DAG.getNode(CallOpc, dl, MVT::Other, Ops);
// Non-tail call: the call node produces a chain and a glue result.
std::array<EVT, 2> ReturnTypes = {{MVT::Other, MVT::Glue}};
Chain = DAG.getNode(CallOpc, dl, ReturnTypes, Ops);
DAG.addNoMergeSiteInfo(Chain.getNode(), CFlags.NoMerge);
Glue = Chain.getValue(1);
// When performing tail call optimization the callee pops its arguments off
// the stack. Account for this here so these bytes can be pushed back on in
// PPCFrameLowering::eliminateCallFramePseudoInstr.
int BytesCalleePops = (CFlags.CallConv == CallingConv::Fast &&
? NumBytes
: 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
DAG.getIntPtrConstant(BytesCalleePops, dl, true),
Glue, dl);
Glue = Chain.getValue(1);
return LowerCallResult(Chain, Glue, CFlags.CallConv, CFlags.IsVarArg, Ins, dl,
DAG, InVals);
// Entry point for lowering a call. It unpacks CLI, re-checks tail-call
// eligibility (CLI.IsTailCall is updated through the isTailCall reference),
// reports a fatal error when a musttail call site cannot be tail called,
// assembles the CallFlags, and dispatches to the ABI-specific routine:
// LowerCall_64SVR4, LowerCall_32SVR4, LowerCall_AIX or LowerCall_Darwin.
PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &dl = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
// Bound by reference so the decision below is written back into CLI.
bool &isTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool isVarArg = CLI.IsVarArg;
bool isPatchPoint = CLI.IsPatchPoint;
const CallBase *CB = CLI.CB;
if (isTailCall) {
// With long calls, only musttail call sites stay candidates for a tail call.
if (Subtarget.useLongCalls() && !(CB && CB->isMustTailCall()))
isTailCall = false;
else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
isTailCall = IsEligibleForTailCallOptimization_64SVR4(
Callee, CallConv, CB, isVarArg, Outs, Ins, DAG);
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
Ins, DAG);
if (isTailCall) {
if (!getTargetMachine().Options.GuaranteedTailCallOpt)
// PC Relative calls no longer guarantee that the callee is a Global
// Address Node. The callee could be an indirect tail call in which
// case the SDValue for the callee could be a load (to load the address
// of a function pointer) or it may be a register copy (to move the
// address of the callee from a function parameter into a virtual
// register). It may also be an ExternalSymbolSDNode (e.g. memcpy).
assert((Subtarget.isUsingPCRelativeCalls() ||
isa<GlobalAddressSDNode>(Callee)) &&
"Callee should be an llvm::Function object.");
LLVM_DEBUG(dbgs() << "TCO caller: " << DAG.getMachineFunction().getName()
<< "\nTCO callee: ");
// A musttail call site that could not be lowered as a tail call is fatal.
if (!isTailCall && CB && CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// When long calls (i.e. indirect calls) are always used, calls are always
// made via function pointer. If we have a function name, first translate it
// into a pointer.
if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) &&
Callee = LowerGlobalAddress(Callee, DAG);
CallFlags CFlags(
CallConv, isTailCall, isVarArg, isPatchPoint,
isIndirectCall(Callee, DAG, Subtarget, isPatchPoint),
// hasNest
Subtarget.is64BitELFABI() &&
any_of(Outs, [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }),
// Dispatch on the ABI; the 64-bit SVR4 check has to precede the generic
// SVR4 check.
if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
InVals, CB);
if (Subtarget.isSVR4ABI())
return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
InVals, CB);
if (Subtarget.isAIXABI())
return LowerCall_AIX(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
InVals, CB);
return LowerCall_Darwin(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
InVals, CB);
// Lowers an outgoing call for the 32-bit SVR4 ABI. It assigns a location to
// every outgoing argument (fixed and variable arguments are analyzed
// separately for vararg calls), copies byval aggregates into the caller's
// frame and passes their address, places the remaining arguments in registers
// or in the parameter list area, sets or clears CR bit 6 for vararg calls,
// and hands the call over to FinishCall().
SDValue PPCTargetLowering::LowerCall_32SVR4(
SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
const CallBase *CB) const {
// See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
// of the 32-bit SVR4 ABI stack frame layout.
const CallingConv::ID CallConv = CFlags.CallConv;
const bool IsVarArg = CFlags.IsVarArg;
const bool IsTailCall = CFlags.IsTailCall;
assert((CallConv == CallingConv::C ||
CallConv == CallingConv::Cold ||
CallConv == CallingConv::Fast) && "Unknown calling convention!");
const Align PtrAlign(4);
MachineFunction &MF = DAG.getMachineFunction();
// Mark this function as potentially containing a function that contains a
// tail call. As a consequence the frame pointer will be used for dynamicalloc
// and restoring the caller's stack pointer in this function's epilog. This is
// done because by tail calling the called function might overwrite the value
// in this function's (MF) stack pointer stack slot 0(SP).
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
CallConv == CallingConv::Fast)
// Count how many bytes are to be pushed on the stack, including the linkage
// area, parameter list area and the part of the local variable space which
// contains copies of aggregates which are passed by value.
// Assign locations to all of the outgoing arguments.
SmallVector<CCValAssign, 16> ArgLocs;
PPCCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
// Reserve space for the linkage area on the stack.
if (useSoftFloat())
if (IsVarArg) {
// Handle fixed and variable vector arguments differently.
// Fixed vector arguments go into registers as long as registers are
// available. Variable vector arguments always go into memory.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
bool Result;
if (Outs[i].IsFixed) {
Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
} else {
Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
ArgFlags, CCInfo);
// A true Result means the argument could not be assigned a location.
if (Result) {
#ifndef NDEBUG
errs() << "Call operand #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << "\n";
} else {
// All arguments are treated the same.
CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
// Assign locations to all of the outgoing aggregate by value arguments.
SmallVector<CCValAssign, 16> ByValArgLocs;
CCState CCByValInfo(CallConv, IsVarArg, MF, ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrAlign);
CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);
// Size of the linkage area, parameter list area and the part of the local
// variable space where copies of aggregates which are passed by value are
// stored.
unsigned NumBytes = CCByValInfo.getNextStackOffset();
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
int SPDiff = CalculateTailCallSPDiff(DAG, IsTailCall, NumBytes);
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be moved somewhere else
// later.
SDValue LROp, FPOp;
Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
// Set up a copy of the stack pointer for use loading and storing any
// arguments that may not fit in the registers available for argument
// passing.
SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
SmallVector<SDValue, 8> MemOpChains;
// Set once any argument is passed in a floating-point register location.
bool seenFloatArg = false;
// Walk the register/memloc assignments, inserting copies/loads.
// i - Tracks the index into the list of registers allocated for the call
// RealArgIdx - Tracks the index into the list of actual function arguments
// j - Tracks the index into the list of byval arguments
for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size();
i != e;
++i, ++RealArgIdx) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[RealArgIdx];
ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags;
if (Flags.isByVal()) {
// Argument is an aggregate which is passed by value, thus we need to
// create a copy of it in the local variable space of the current stack
// frame (which is the stack frame of the caller) and pass the address of
// this copy to the callee.
assert((j < ByValArgLocs.size()) && "Index out of bounds!");
CCValAssign &ByValVA = ByValArgLocs[j++];
assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
// Memory reserved in the local variable space of the caller's stack frame.
unsigned LocMemOffset = ByValVA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
StackPtr, PtrOff);
// Create a copy of the argument in the local area of the current
// stack frame.
SDValue MemcpyCall =
CreateCopyOfByValArgument(Arg, PtrOff,
Flags, DAG, dl);
// This must go outside the CALLSEQ_START..END.
SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0,
Chain = CallSeqStart = NewCallSeqStart;
// Pass the address of the aggregate copy on the stack either in a
// physical register or in the parameter list area of the current stack
// frame to the callee.
Arg = PtrOff;
// When useCRBits() is true, there can be i1 arguments.
// It is because getRegisterType(MVT::i1) => MVT::i1,
// and for other integer types getRegisterType() => MVT::i32.
// Extend i1 and ensure callee will get i32.
if (Arg.getValueType() == MVT::i1)
Arg = DAG.getNode(Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
dl, MVT::i32, Arg);
if (VA.isRegLoc()) {
seenFloatArg |= VA.getLocVT().isFloatingPoint();
// Put argument in a physical register.
// With SPE, an f64 argument is split into two i32 halves via EXTRACT_SPE;
// the index of the half extracted first depends on endianness.
if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) {
bool IsLE = Subtarget.isLittleEndian();
SDValue SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
DAG.getIntPtrConstant(IsLE ? 0 : 1, dl));
RegsToPass.push_back(std::make_pair(VA.getLocReg(), SVal.getValue(0)));
SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
DAG.getIntPtrConstant(IsLE ? 1 : 0, dl));
} else
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
// Put argument in the parameter list area of the current stack frame.
unsigned LocMemOffset = VA.getLocMemOffset();
if (!IsTailCall) {
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
StackPtr, PtrOff);
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
} else {
// Calculate and remember argument location.
CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
// Merge the chains in MemOpChains into a single token.
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
// Set CR bit 6 to true if this is a vararg call with floating args passed in
// registers.
if (IsVarArg) {
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, InFlag };
// InFlag is passed as an operand only when it has been set.
Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1));
InFlag = Chain.getValue(1);
if (IsTailCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
Callee, SPDiff, NumBytes, Ins, InVals, CB);
// Copy an argument into memory, being careful to do this outside the
// call sequence for the call to which the argument belongs.
// The copy is created with CreateCopyOfByValArgument(), and a new
// CALLSEQ_START is built on top of it using the frame size taken from
// operand 1 of the existing CallSeqStart. The new CALLSEQ_START is returned.
SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) const {
SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
Flags, DAG, dl);
// The MEMCPY must go outside the CALLSEQ_START..END.
int64_t FrameSize = CallSeqStart.getConstantOperandVal(1);
SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0,
return NewCallSeqStart;
SDValue PPCTargetLowering::LowerCall_64SVR4(
SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
const CallBase *CB) const {
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
bool IsSibCall = false;
bool IsFastCall = CFlags.CallConv == CallingConv::Fast;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
unsigned PtrByteSize = 8;
MachineFunction &MF = DAG.getMachineFunction();
if (CFlags.IsTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
IsSibCall = true;
// Mark this function as potentially containing a function that contains a
// tail call. As a consequence the frame pointer will be used for dynamicalloc
// and restoring the callers stack pointer in this functions epilog. This is
// done because by tail calling the called function might overwrite the value
// in this function's (MF) stack pointer stack slot 0(SP).
if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
assert(!(IsFastCall && CFlags.IsVarArg) &&
"fastcc not supported on varargs functions");
// Count how many bytes are to be pushed on the stack, including the linkage
// area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
// reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
// area is 32 bytes reserved space for [SP][CR][LR][TOC].
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
unsigned NumBytes = LinkageSize;
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
unsigned &QFPR_idx = FPR_idx;
static const MCPhysReg GPR[] = {
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
const unsigned NumGPRs = array_lengthof(GPR);
const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
const unsigned NumVRs = array_lengthof(VR);
const unsigned NumQFPRs = NumFPRs;
// On ELFv2, we can avoid allocating the parameter area if all the arguments
// can be passed to the callee in registers.
// For the fast calling convention, there is another check below.
// Note: We should keep consistent with LowerFormalArguments_64SVR4()
bool HasParameterArea = !isELFv2ABI || CFlags.IsVarArg || IsFastCall;
if (!HasParameterArea) {
unsigned ParamAreaSize = NumGPRs * PtrByteSize;
unsigned AvailableFPRs = NumFPRs;
unsigned AvailableVRs = NumVRs;
unsigned NumBytesTmp = NumBytes;
for (unsigned i = 0; i != NumOps; ++i) {
if (Outs[i].Flags.isNest()) continue;
if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags,
PtrByteSize, LinkageSize, ParamAreaSize,
NumBytesTmp, AvailableFPRs, AvailableVRs,
HasParameterArea = true;
// When using the fast calling convention, we don't provide backing for
// arguments that will be in registers.
unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
// Avoid allocating parameter area for fastcc functions if all the arguments
// can be passed in the registers.
if (IsFastCall)
HasParameterArea = false;
// Add up all the space actually used.
for (unsigned i = 0; i != NumOps; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
EVT ArgVT = Outs[i].VT;
EVT OrigVT = Outs[i].ArgVT;
if (Flags.isNest())
if (IsFastCall) {
if (Flags.isByVal()) {
NumGPRsUsed += (Flags.getByValSize()+7)/8;
if (NumGPRsUsed > NumGPRs)
HasParameterArea = true;
} else {
switch (ArgVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i1:
case MVT::i32:
case MVT::i64:
if (++NumGPRsUsed <= NumGPRs)
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
case MVT::f128:
if (++NumVRsUsed <= NumVRs)
case MVT::v4f32:
// When using QPX, this is handled like a FP register, otherwise, it
// is an Altivec register.
if (Subtarget.hasQPX()) {
if (++NumFPRsUsed <= NumFPRs)
} else {
if (++NumVRsUsed <= NumVRs)
case MVT::f32:
case MVT::f64:
case MVT::v4f64: // QPX
case MVT::v4i1: // QPX
if (++NumFPRsUsed <= NumFPRs)
HasParameterArea = true;
/* Respect alignment of argument on the stack. */
auto Alignement =
CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
NumBytes = alignTo(NumBytes, Alignement);
NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
if (Flags.isInConsecutiveRegsLast())
NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
unsigned NumBytesActuallyUsed = NumBytes;
// In the old ELFv1 ABI,
// the prolog code of the callee may store up to 8 GPR argument registers to
// the stack, allowing va_start to index over them in memory if its varargs.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
// In the ELFv2 ABI, we allocate the parameter area iff a callee
// really requires memory operands, e.g. a vararg function.
if (HasParameterArea)
NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
NumBytes = LinkageSize;
// Tail call needs the stack to be aligned.
if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
int SPDiff = 0;
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
if (!IsSibCall)
SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes);
// To protect arguments on the stack from being clobbered in a tail call,
// force all the loads to happen before doing any other lowering.
if (CFlags.IsTailCall)
Chain = DAG.getStackArgumentTokenFactor(Chain);
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be move somewhere else
// later.
SDValue LROp, FPOp;
Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
// Set up a copy of the stack pointer for use loading and storing any
// arguments that may not fit in the registers available for argument
// passing.
SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
// Figure out which arguments are going to go in registers, and which in
// memory. Also, if this is a vararg function, floating point operations
// must be stored to our stack, and loaded into integer regs as well, if
// any integer regs are available for argument passing.
unsigned ArgOffset = LinkageSize;
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
SmallVector<SDValue, 8> MemOpChains;
for (unsigned i = 0; i != NumOps; ++i) {
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
EVT ArgVT = Outs[i].VT;
EVT OrigVT = Outs[i].ArgVT;
// PtrOff will be used to store the current argument to the stack if a
// register cannot be found for it.
SDValue PtrOff;
// We re-align the argument offset for each argument, except when using the
// fast calling convention, when we need to make sure we do that only when
// we'll actually use a stack slot.
auto ComputePtrOff = [&]() {
/* Respect alignment of argument on the stack. */
auto Alignment =
CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
ArgOffset = alignTo(ArgOffset, Alignment);
PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
if (!IsFastCall) {
/* Compute GPR index associated with argument offset. */
GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
GPR_idx = std::min(GPR_idx, NumGPRs);
// Promote integers to 64-bit values.
if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
// FIXME: Should this use ANY_EXTEND if neither sext nor zext?
unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
// FIXME memcpy is used way more than necessary. Correctness first.
// Note: "by value" is code for passing a structure by value, not
// basic types.
if (Flags.isByVal()) {
// Note: Size includes alignment padding, so
// struct x { short a; char b; }
// will have Size = 4. With #pragma pack(1), it will have Size = 3.
// These are the proper values we need for right-justifying the
// aggregate in a parameter register.
unsigned Size = Flags.getByValSize();
// An empty aggregate parameter takes up no storage and no
// registers.
if (Size == 0)
if (IsFastCall)
// All aggregates smaller than 8 bytes must be passed right-justified.
if (Size==1 || Size==2 || Size==4) {
EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
if (GPR_idx != NumGPRs) {
SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
MachinePointerInfo(), VT);
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
if (GPR_idx == NumGPRs && Size < 8) {
SDValue AddPtr = PtrOff;
if (!isLittleEndian) {
SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
Flags, DAG, dl);
ArgOffset += PtrByteSize;
// Copy entire object into memory. There are cases where gcc-generated
// code assumes it is there, even if it could be put entirely into
// registers. (This is not what the doc says.)
// FIXME: The above statement is likely due to a misunderstanding of the
// documents. All arguments must be copied into the parameter area BY
// THE CALLEE in the event that the callee takes the address of any
// formal argument. That has not yet been implemented. However, it is
// reasonable to use the stack area as a staging area for the register
// load.
// Skip this for small aggregates, as we will use the same slot for a
// right-justified copy, below.
if (Size >= 8)
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
Flags, DAG, dl);
// When a register is available, pass a small aggregate right-justified.
if (Size < 8 && GPR_idx != NumGPRs) {
// The easiest way to get this right-justified in a register
// is to copy the structure into the rightmost portion of a
// local variable slot, then load the whole slot into the
// register.
// FIXME: The memcpy seems to produce pretty awful code for
// small aggregates, particularly for packed ones.
// FIXME: It would be preferable to use the slot in the
// parameter save area instead of a new local variable.
SDValue AddPtr = PtrOff;
if (!isLittleEndian) {
SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType());
AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
Flags, DAG, dl);
// Load the slot into the register.
SDValue Load =
DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo());
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
// Done with this argument.
ArgOffset += PtrByteSize;
// For aggregates larger than PtrByteSize, copy the pieces of the
// object that fit into registers from the parameter save area.
for (unsigned j=0; j<Size; j+=PtrByteSize) {
SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
if (GPR_idx != NumGPRs) {
SDValue Load =
DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
} else {
ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
switch (Arg.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i1:
case MVT::i32:
case MVT::i64:
if (Flags.isNest()) {
// The 'nest' parameter, if any, is passed in R11.
RegsToPass.push_back(std::make_pair(PPC::X11, Arg));
// These can be scalar arguments or elements of an integer array type
// passed directly. Clang may use those instead of "byval" aggregate
// types to avoid forcing arguments to memory unnecessarily.
if (GPR_idx != NumGPRs) {
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
} else {
if (IsFastCall)
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
if (IsFastCall)
ArgOffset += PtrByteSize;
if (!IsFastCall)
ArgOffset += PtrByteSize;
case MVT::f32:
case MVT::f64: {
// These can be scalar arguments or elements of a float array type
// passed directly. The latter are used to implement ELFv2 homogenous
// float aggregates.
// Named arguments go into FPRs first, and once they overflow, the
// remaining arguments go into GPRs and then the parameter save area.
// Unnamed arguments for vararg functions always go to GPRs and
// then the parameter save area. For now, put all arguments to vararg
// routines always in both locations (FPR *and* GPR or stack slot).
bool NeedGPROrStack = CFlags.IsVarArg || FPR_idx == NumFPRs;
bool NeededLoad = false;
// First load the argument into the next available FPR.
if (FPR_idx != NumFPRs)
RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
// Next, load the argument into GPR or stack slot if needed.
if (!NeedGPROrStack)
else if (GPR_idx != NumGPRs && !IsFastCall) {
// FIXME: We may want to re-enable this for CallingConv::Fast on the P8
// once we support fp <-> gpr moves.
// In the non-vararg case, this can only ever happen in the
// presence of f32 array types, since otherwise we never run
// out of FPRs before running out of GPRs.
SDValue ArgVal;
// Double values are always passed in a single GPR.
if (Arg.getValueType() != MVT::f32) {
ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
// Non-array float values are extended and passed in a GPR.
} else if (!Flags.isInConsecutiveRegs()) {
ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
// If we have an array of floats, we collect every odd element
// together with its predecessor into one GPR.
} else if (ArgOffset % PtrByteSize != 0) {
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
if (!isLittleEndian)
std::swap(Lo, Hi);
ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
// The final element, if even, goes into the first half of a GPR.
} else if (Flags.isInConsecutiveRegsLast()) {
ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
if (!isLittleEndian)
ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
DAG.getConstant(32, dl, MVT::i32));
// Non-final even elements are skipped; they will be handled
// together with the subsequent argument on the next go-around.
} else
ArgVal = SDValue();
if (ArgVal.getNode())
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));
} else {
if (IsFastCall)
// Single-precision floating-point values are mapped to the
// second (rightmost) word of the stack doubleword.
if (Arg.getValueType() == MVT::f32 &&
!isLittleEndian && !Flags.isInConsecutiveRegs()) {
SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
NeededLoad = true;
// When passing an array of floats, the array occupies consecutive
// space in the argument area; only round up to the next doubleword
// at the end of the array. Otherwise, each float takes 8 bytes.
if (!IsFastCall || NeededLoad) {
ArgOffset += (Arg.getValueType() == MVT::f32 &&
Flags.isInConsecutiveRegs()) ? 4 : 8;
if (Flags.isInConsecutiveRegsLast())
ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
case MVT::v2f64:
case MVT::v2i64:
case MVT::v1i128:
case MVT::f128:
if (!Subtarget.hasQPX()) {
// These can be scalar arguments or elements of a vector array type
// passed directly. The latter are used to implement ELFv2 homogenous
// vector aggregates.
// For a varargs call, named arguments go into VRs or on the stack as
// usual; unnamed arguments always go to the stack or the corresponding
// GPRs when within range. For now, we always put the value in both
// locations (or even all three).
if (CFlags.IsVarArg) {
assert(HasParameterArea &&
"Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
if (VR_idx != NumVRs) {
SDValue Load =
DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
ArgOffset += 16;
for (unsigned i=0; i<16; i+=PtrByteSize) {
if (GPR_idx == NumGPRs)
SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
DAG.getConstant(i, dl, PtrVT));
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
// Non-varargs Altivec params go into VRs or on the stack.
if (VR_idx != NumVRs) {
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
} else {
if (IsFastCall)
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
if (IsFastCall)
ArgOffset += 16;
if (!IsFastCall)
ArgOffset += 16;
} // not QPX
assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 &&
"Invalid QPX parameter type");
case MVT::v4f64:
case MVT::v4i1: {
bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
if (CFlags.IsVarArg) {
assert(HasParameterArea &&
"Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
if (QFPR_idx != NumQFPRs) {
SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store,
PtrOff, MachinePointerInfo());
RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load));
ArgOffset += (IsF32 ? 16 : 32);
for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) {
if (GPR_idx == NumGPRs)
SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
DAG.getConstant(i, dl, PtrVT));
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
// Non-varargs QPX params go into registers or on the stack.
if (QFPR_idx != NumQFPRs) {
RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg));
} else {
if (IsFastCall)
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
if (IsFastCall)
ArgOffset += (IsF32 ? 16 : 32);
if (!IsFastCall)
ArgOffset += (IsF32 ? 16 : 32);
assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
"mismatch in size of parameter area");
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// Check if this is an indirect call (MTCTR/BCTRL).
// See prepareDescriptorIndirectCall and buildCallOperands for more
// information about calls through function pointers in the 64-bit SVR4 ABI.
if (CFlags.IsIndirect) {
// For 64-bit ELFv2 ABI with PCRel, do not save the TOC of the
// caller in the TOC save area.
if (isTOCSaveRestoreRequired(Subtarget)) {
assert(!CFlags.IsTailCall && "Indirect tails calls not supported");
// Load r2 into a virtual register and store it to the TOC save area.
SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
// TOC save area offset.
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr,
DAG.getMachineFunction(), TOCSaveOffset));
// In the ELFv2 ABI, R12 must contain the address of an indirect callee.
// This does not mean the MTCTR instruction must use R12; it's easier
// to model this as an extra parameter, so do that.
if (isELFv2ABI && !CFlags.IsPatchPoint)
RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
if (CFlags.IsTailCall && !IsSibCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
Callee, SPDiff, NumBytes, Ins, InVals, CB);
/// Lower an outgoing call for the Darwin PowerPC ABI. The same routine serves
/// 32-bit and 64-bit code; the mode is derived from the pointer type.
///
/// As far as this listing shows, the routine:
///   1. sizes the outgoing area (linkage area plus parameter area, never less
///      than room for 8 pointer-sized slots),
///   2. opens the call sequence with CALLSEQ_START,
///   3. walks the outgoing arguments, placing each one in a GPR, FPR or VR
///      (recorded in RegsToPass) and/or in its stack slot (stores collected in
///      MemOpChains / TailCallArguments),
///   4. glues the register copies together and hands over to FinishCall.
///
/// NOTE(review): this copy of the function appears to have lost lines
/// (closing braces, else/break/continue-style statements, and the tail of a
/// few calls). Check the upstream file before relying on the control flow
/// exactly as it is laid out here.
SDValue PPCTargetLowering::LowerCall_Darwin(
SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
const CallBase *CB) const {
unsigned NumOps = Outs.size();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// A 64-bit pointer type selects the 64-bit register set and 8-byte slots.
bool isPPC64 = PtrVT == MVT::i64;
unsigned PtrByteSize = isPPC64 ? 8 : 4;
MachineFunction &MF = DAG.getMachineFunction();
// Mark this function as potentially containing a function that contains a
// tail call. As a consequence the frame pointer will be used for dynamicalloc
// and restoring the caller's stack pointer in this function's epilog. This is
// done because by tail calling the called function might overwrite the value
// in this function's (MF) stack pointer stack slot 0(SP).
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
CFlags.CallConv == CallingConv::Fast)
// NOTE(review): the statement controlled by the `if` above does not appear
// in this listing.
// Count how many bytes are to be pushed on the stack, including the linkage
// area, and parameter passing area. We start with 24/48 bytes, which is
// prereserved space for [SP][CR][LR][3 x unused].
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
unsigned NumBytes = LinkageSize;
// Add up all the space actually used.
// In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
// they all go in registers, but we must reserve stack space for them for
// possible use by the caller. In varargs or 64-bit calls, parameters are
// assigned stack space in order, with padding so Altivec parameters are
// 16-byte aligned.
unsigned nAltivecParamsAtEnd = 0;
// First pass over the arguments: only sizes are computed, nothing is emitted.
for (unsigned i = 0; i != NumOps; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
EVT ArgVT = Outs[i].VT;
// Varargs Altivec parameters are padded to a 16 byte boundary.
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
if (!CFlags.IsVarArg && !isPPC64) {
// Non-varargs Altivec parameters go after all the non-Altivec
// parameters; handle those later so we know how much padding we need.
// Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
// (Rounds NumBytes up to the next multiple of 16.)
NumBytes = ((NumBytes+15)/16)*16;
NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
// Allow for Altivec parameters at the end, if needed.
if (nAltivecParamsAtEnd) {
NumBytes = ((NumBytes+15)/16)*16;
NumBytes += 16*nAltivecParamsAtEnd;
// The prolog code of the callee may store up to 8 GPR argument registers to
// the stack, allowing va_start to index over them in memory if it's varargs.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
// Tail call needs the stack to be aligned.
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
CFlags.CallConv == CallingConv::Fast)
NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
int SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes);
// To protect arguments on the stack from being clobbered in a tail call,
// force all the loads to happen before doing any other lowering.
if (CFlags.IsTailCall)
Chain = DAG.getStackArgumentTokenFactor(Chain);
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
// Remember the CALLSEQ_START node; the by-value copies below are created
// through createMemcpyOutsideCallSeq and assign both Chain and CallSeqStart.
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so they can be moved somewhere
// else later.
SDValue LROp, FPOp;
Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
// Set up a copy of the stack pointer for use loading and storing any
// arguments that may not fit in the registers available for argument
// passing.
SDValue StackPtr;
// NOTE(review): as listed, the R1 assignment below always overwrites the X1
// one; an `else` between them appears to be missing from this listing.
if (isPPC64)
StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
// Figure out which arguments are going to go in registers, and which in
// memory. Also, if this is a vararg function, floating point operations
// must be stored to our stack, and loaded into integer regs as well, if
// any integer regs are available for argument passing.
unsigned ArgOffset = LinkageSize;
// Next free register in each class.
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
static const MCPhysReg GPR_32[] = { // 32-bit registers.
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10,
static const MCPhysReg GPR_64[] = { // 64-bit registers.
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
static const MCPhysReg VR[] = {
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
// Both GPR tables list the same number of registers, so the 32-bit table's
// length is used for either mode.
const unsigned NumGPRs = array_lengthof(GPR_32);
// The FPR table indexed below is not declared inside this function;
// presumably it is a file-level array with at least 13 entries - verify.
const unsigned NumFPRs = 13;
const unsigned NumVRs = array_lengthof(VR);
const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
// Register copies to emit just before the call (physical register, value).
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
// Chains of the stack stores; merged into one TokenFactor after the loop.
SmallVector<SDValue, 8> MemOpChains;
// Second pass over the arguments: actually place each one.
for (unsigned i = 0; i != NumOps; ++i) {
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// PtrOff will be used to store the current argument to the stack if a
// register cannot be found for it.
SDValue PtrOff;
// PtrOff = StackPtr + ArgOffset, i.e. the address of this argument's slot.
PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
// On PPC64, promote integers to 64-bit values.
if (isPPC64 && Arg.getValueType() == MVT::i32) {
// FIXME: Should this use ANY_EXTEND if neither sext nor zext?
unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
// FIXME memcpy is used way more than necessary. Correctness first.
// Note: "by value" is code for passing a structure by value, not
// basic types.
if (Flags.isByVal()) {
unsigned Size = Flags.getByValSize();
// Very small objects are passed right-justified. Everything else is
// passed left-justified.
if (Size==1 || Size==2) {
EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
if (GPR_idx != NumGPRs) {
// A GPR is free: extending-load the 1- or 2-byte object into it.
SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
MachinePointerInfo(), VT);
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
} else {
// No GPR left: copy the object to the end of its slot
// (slot address + PtrByteSize - Size).
SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
Flags, DAG, dl);
ArgOffset += PtrByteSize;
// Copy entire object into memory. There are cases where gcc-generated
// code assumes it is there, even if it could be put entirely into
// registers. (This is not what the doc says.)
Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
Flags, DAG, dl);
// For small aggregates (Darwin only) and aggregates >= PtrByteSize,
// copy the pieces of the object that fit into registers from the
// parameter save area.
for (unsigned j=0; j<Size; j+=PtrByteSize) {
SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
if (GPR_idx != NumGPRs) {
SDValue Load =
DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
ArgOffset += PtrByteSize;
} else {
// Out of GPRs: account for the remaining bytes, rounded up to whole
// pointer-sized slots.
ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
// Not a by-value aggregate: dispatch on the scalar/vector value type.
switch (Arg.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i1:
case MVT::i32:
case MVT::i64:
if (GPR_idx != NumGPRs) {
// i1 values are zero-extended to pointer width before going in a GPR.
if (Arg.getValueType() == MVT::i1)
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg);
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
} else {
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
ArgOffset += PtrByteSize;
case MVT::f32:
case MVT::f64:
if (FPR_idx != NumFPRs) {
RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
if (CFlags.IsVarArg) {
// Varargs: also store the value to its stack slot so it can be reloaded
// into integer registers below.
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
// Float varargs are always shadowed in available integer registers
if (GPR_idx != NumGPRs) {
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
// In 32-bit mode an f64 takes a second GPR, loaded from slot address + 4.
if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
} else {
// If we have any FPRs remaining, we may also have GPRs remaining.
// Args passed in FPRs consume either 1 (f32) or 2 (f64) available
// GPRs.
if (GPR_idx != NumGPRs)
if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
!isPPC64) // PPC64 has 64-bit GPR's obviously :)
} else
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
if (isPPC64)
ArgOffset += 8;
ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
if (CFlags.IsVarArg) {
// These go aligned on the stack, or in the corresponding R registers
// when within range. The Darwin PPC ABI doc claims they also go in
// V registers; in fact gcc does this only for arguments that are
// prototyped, not for those that match the ... We do it for all
// arguments, seems to work.
// Advance ArgOffset one pointer-sized slot at a time until it is a
// multiple of 16.
while (ArgOffset % 16 !=0) {
ArgOffset += PtrByteSize;
if (GPR_idx != NumGPRs)
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
DAG.getConstant(ArgOffset, dl, PtrVT));
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
if (VR_idx != NumVRs) {
SDValue Load =
DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
ArgOffset += 16;
// Reload the 16 stored bytes one pointer-sized piece at a time into GPRs.
for (unsigned i=0; i<16; i+=PtrByteSize) {
if (GPR_idx == NumGPRs)
SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
DAG.getConstant(i, dl, PtrVT));
SDValue Load =
DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
// Non-varargs Altivec params generally go in registers, but have
// stack space allocated at the end.
if (VR_idx != NumVRs) {
// Doesn't have GPR space allocated.
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
} else if (nAltivecParamsAtEnd==0) {
// We are emitting Altivec params in order.
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
ArgOffset += 16;
// If all Altivec parameters fit in registers, as they usually do,
// they get stack space following the non-Altivec parameters. We
// don't track this here because nobody below needs it.
// If there are more Altivec parameters than fit in registers emit
// the stores here.
if (!CFlags.IsVarArg && nAltivecParamsAtEnd > NumVRs) {
// j counts the vector arguments seen so far in the loop below.
unsigned j = 0;
// Offset is aligned; skip 1st 12 params which go in V registers.
ArgOffset = ((ArgOffset+15)/16)*16;
ArgOffset += 12*16;
for (unsigned i = 0; i != NumOps; ++i) {
SDValue Arg = OutVals[i];
EVT ArgType = Outs[i].VT;
if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 ||
ArgType==MVT::v8i16 || ArgType==MVT::v16i8) {
// Only vector arguments past the first NumVRs are stored to the stack.
if (++j > NumVRs) {
SDValue PtrOff;
// We are emitting Altivec params in order.
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
ArgOffset += 16;
// Merge every pending stack store into a single chain.
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// On Darwin, R12 must contain the address of an indirect callee. This does
// not mean the MTCTR instruction must use R12; it's easier to model this as
// an extra parameter, so do that.
if (CFlags.IsIndirect) {
assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
PPC::R12), Callee));
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
// Value #1 of the copy node is fed to the next copy as InFlag.
InFlag = Chain.getValue(1);
if (CFlags.IsTailCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
Callee, SPDiff, NumBytes, Ins, InVals, CB);
/// Calling-convention assignment routine for AIX: for one argument value,
/// reserves parameter-save-area stack space and GPRs/FPRs in \p State and
/// records the resulting locations with State.addLoc.
///
/// On the paths visible here the function returns false after recording the
/// argument's location(s); the final statement returns true.
///
/// f128, nest and vector arguments, as well as by-value aggregates aligned
/// more strictly than a pointer, are rejected with report_fatal_error.
///
/// NOTE(review): this copy appears to be missing lines (closing braces, the
/// tail of several calls, and what is presumably the switch's default label
/// and break statements). Check the upstream file for exact control flow.
static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
CCState &State) {
const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
const bool IsPPC64 = Subtarget.isPPC64();
// Pointer alignment doubles as the width of one GPR-sized stack slot.
const Align PtrAlign = IsPPC64 ? Align(8) : Align(4);
const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
assert((!ValVT.isInteger() ||
(ValVT.getSizeInBits() <= RegVT.getSizeInBits())) &&
"Integer argument exceeds register size: should have been legalized");
if (ValVT == MVT::f128)
report_fatal_error("f128 is unimplemented on AIX.");
if (ArgFlags.isNest())
report_fatal_error("Nest arguments are unimplemented.");
if (ValVT.isVector() || LocVT.isVector())
report_fatal_error("Vector arguments are unimplemented on AIX.");
static const MCPhysReg GPR_32[] = {// 32-bit registers.
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10};
static const MCPhysReg GPR_64[] = {// 64-bit registers.
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10};
if (ArgFlags.isByVal()) {
if (ArgFlags.getNonZeroByValAlign() > PtrAlign)
report_fatal_error("Pass-by-value arguments with alignment greater than "
"register width are not supported.");
const unsigned ByValSize = ArgFlags.getByValSize();
// An empty aggregate parameter takes up no storage and no registers,
// but needs a MemLoc for a stack slot for the formal arguments side.
if (ByValSize == 0) {
State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
State.getNextStackOffset(), RegVT,
return false;
// Reserve stack for the whole aggregate, rounded up to pointer alignment,
// then walk it one pointer-sized piece at a time, giving each piece a GPR
// while any are still available.
const unsigned StackSize = alignTo(ByValSize, PtrAlign);
unsigned Offset = State.AllocateStack(StackSize, PtrAlign);
for (const unsigned E = Offset + StackSize; Offset < E;
Offset += PtrAlign.value()) {
if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
else {
State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
return false;
// Arguments always reserve parameter save area.
switch (ValVT.SimpleTy) {
report_fatal_error("Unhandled value type for argument.");
case MVT::i64:
// i64 arguments should have been split to i32 for PPC32.
assert(IsPPC64 && "PPC32 should have split i64 values.");
case MVT::i1:
case MVT::i32: {
// One pointer-sized stack slot is reserved whether or not a GPR is free.
const unsigned Offset = State.AllocateStack(PtrAlign.value(), PtrAlign);
// AIX integer arguments are always passed in register width.
if (ValVT.getSizeInBits() < RegVT.getSizeInBits())
LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
: CCValAssign::LocInfo::ZExt;
if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, RegVT, LocInfo));
return false;
case MVT::f32:
case MVT::f64: {
// Parameter save area (PSA) is reserved even if the float passes in fpr.
const unsigned StoreSize = LocVT.getStoreSize();
// Floats are always 4-byte aligned in the PSA on AIX.
// This includes f64 in 64-bit mode for ABI compatibility.
const unsigned Offset =
State.AllocateStack(IsPPC64 ? 8 : StoreSize, Align(4));
// FPR is not declared inside this function; presumably a file-level
// register table - verify. FReg is tested against zero below to see whether
// an FPR was obtained.
unsigned FReg = State.AllocateReg(FPR);
if (FReg)
State.addLoc(CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, LocInfo));
// Reserve and initialize GPRs or initialize the PSA as required.
for (unsigned I = 0; I < StoreSize; I += PtrAlign.value()) {
if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
assert(FReg && "An FPR should be available when a GPR is reserved.");
if (State.isVarArg()) {
// Successfully reserved GPRs are only initialized for vararg calls.
// Custom handling is required for:
// f64 in PPC32 needs to be split into 2 GPRs.
// f32 in PPC64 needs to occupy only lower 32 bits of 64-bit GPR.
CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));
} else {
// If there are insufficient GPRs, the PSA needs to be initialized.
// Initialization occurs even if an FPR was initialized for
// compatibility with the AIX XL compiler. The full memory for the
// argument will be initialized even if a prior word is saved in GPR.
// A custom memLoc is used when the argument also passes in FPR so
// that the callee handling can skip over it easily.
FReg ? CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT,
: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
return false;
return true;
// Maps the simple value type of a formal argument to the register class used
// to receive it:
//   i1 / i32 / i64 -> G8RC when IsPPC64, otherwise GPRC
//   f32            -> F4RC
//   f64            -> F8RC
// Asserts that an i64 never reaches this point in 32-bit mode.
static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT,
bool IsPPC64) {
assert((IsPPC64 || SVT != MVT::i64) &&
"i64 should have been split for 32-bit codegen.");
switch (SVT) {
// NOTE(review): presumably this is the `default:` arm for unhandled types;
// the label itself is not visible in this listing - confirm.
report_fatal_error("Unexpected value type for formal argument");
case MVT::i1:
case MVT::i32:
case MVT::i64:
// All integer types share the general purpose register class of the
// current pointer width.
return IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
case MVT::f32:
return &PPC::F4RCRegClass;
case MVT::f64:
return &PPC::F8RCRegClass;
// Narrows an incoming scalar integer argument from its location type (LocVT)
// to its value type (ValVT). When the argument flags mark it as sign- or
// zero-extended, the value is first wrapped in an AssertSext / AssertZext node
// of type LocVT; a TRUNCATE to ValVT is then returned.
// Preconditions (asserted): both types are scalar integers and ValVT is
// strictly narrower than LocVT.
static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT,
SelectionDAG &DAG, SDValue ArgValue,
MVT LocVT, const SDLoc &dl) {
assert(ValVT.isScalarInteger() && LocVT.isScalarInteger());
assert(ValVT.getSizeInBits() < LocVT.getSizeInBits());
// NOTE(review): the trailing operand of both getNode calls below is not
// visible in this listing - confirm against the full source.
if (Flags.isSExt())
ArgValue = DAG.getNode(ISD::AssertSext, dl, LocVT, ArgValue,
else if (Flags.isZExt())
ArgValue = DAG.getNode(ISD::AssertZext, dl, LocVT, ArgValue,
return DAG.getNode(ISD::TRUNCATE, dl, ValVT, ArgValue);
// Converts a general purpose argument register into a stack offset: the
// linkage size reported by FL plus the register's index (relative to R3 or X3)
// times the register width (4 bytes for GPRC, 8 bytes for G8RC).
// Asserts that the register lies in R3..R10 or X3..X10; any register outside
// both GPR classes is treated as unreachable.
static unsigned mapArgRegToOffsetAIX(unsigned Reg, const PPCFrameLowering *FL) {
const unsigned LASize = FL->getLinkageSize();
// 32-bit argument registers.
if (PPC::GPRCRegClass.contains(Reg)) {
assert(Reg >= PPC::R3 && Reg <= PPC::R10 &&
"Reg must be a valid argument register!");
return LASize + 4 * (Reg - PPC::R3);
// 64-bit argument registers.
if (PPC::G8RCRegClass.contains(Reg)) {
assert(Reg >= PPC::X3 && Reg <= PPC::X10 &&
"Reg must be a valid argument register!");
return LASize + 8 * (Reg - PPC::X3);
llvm_unreachable("Only general purpose registers expected.");
// AIX ABI Stack Frame Layout:
// Low Memory +--------------------------------------------+
// SP +---> | Back chain | ---+
// | +--------------------------------------------+ |
// | | Saved Condition Register | |
// | +--------------------------------------------+ |
// | | Saved Linkage Register | |
// | +--------------------------------------------+ | Linkage Area
// | | Reserved for compilers | |
// | +--------------------------------------------+ |
// | | Reserved for binders | |
// | +--------------------------------------------+ |
// | | Saved TOC pointer | ---+
// | +--------------------------------------------+
// | | Parameter save area |
// | +--------------------------------------------+
// | | Alloca space |
// | +--------------------------------------------+
// | | Local variable space |
// | +--------------------------------------------+
// | | Float/int conversion temporary |
// | +--------------------------------------------+
// | | Save area for AltiVec registers |
// | +--------------------------------------------+
// | | AltiVec alignment padding |
// | +--------------------------------------------+
// | | Save area for VRSAVE register |
// | +--------------------------------------------+
// | | Save area for General Purpose registers |
// | +--------------------------------------------+
// | | Save area for Floating Point registers |
// | +--------------------------------------------+
// +---- | Back chain |
// High Memory +--------------------------------------------+
// Specifications:
// AIX 7.2 Assembler Language Reference
// Subroutine linkage convention
// Lowers the incoming (formal) arguments of a function for the AIX ABI.
//
// The argument locations are computed with CC_AIX after reserving the linkage
// area, then each location is turned into DAG nodes:
//   * by-value aggregates get a fixed stack object; pieces passed in GPRs are
//     stored into that object,
//   * plain register locations become live-in copies (truncated when the
//     value type is a narrower integer than the location type),
//   * memory locations become loads from fixed stack objects.
// For variadic functions the remaining GPR argument registers are stored to
// the stack. Stores collected in MemOps are merged into the returned chain
// through a TokenFactor.
//
// Only the C, Cold and Fast calling conventions are accepted (asserted);
// guaranteed tail calls, soft float and QPX are reported as fatal errors.
//
// NOTE(review): several statements of this function (closing braces,
// `continue`s, InVals / MemOps pushes) are not visible in this listing; the
// comments below describe only what is shown.
SDValue PPCTargetLowering::LowerFormalArguments_AIX(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold ||
CallConv == CallingConv::Fast) &&
"Unexpected calling convention!");
if (getTargetMachine().Options.GuaranteedTailCallOpt)
report_fatal_error("Tail call support is unimplemented on AIX.");
if (useSoftFloat())
report_fatal_error("Soft float support is unimplemented on AIX.");
const PPCSubtarget &Subtarget =
static_cast<const PPCSubtarget &>(DAG.getSubtarget());
if (Subtarget.hasQPX())
report_fatal_error("QPX support is not supported on AIX.");
const bool IsPPC64 = Subtarget.isPPC64();
const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
const EVT PtrVT = getPointerTy(MF.getDataLayout());
// Reserve space for the linkage area on the stack.
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);
SmallVector<SDValue, 8> MemOps;
// The index is advanced inside the body because one argument may own several
// consecutive entries of ArgLocs.
for (size_t I = 0, End = ArgLocs.size(); I != End; /* No increment here */) {
CCValAssign &VA = ArgLocs[I++];
MVT LocVT = VA.getLocVT();
ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;
// For compatibility with the AIX XL compiler, the float args in the
// parameter save area are initialized even if the argument is available
// in register. The caller is required to initialize both the register
// and memory, however, the callee can choose to expect it in either.
// The memloc is dismissed here because the argument is retrieved from
// the register.
if (VA.isMemLoc() && VA.needsCustom())
// By-value aggregate passed entirely in memory: describe it with a fixed
// stack object at the location's memory offset.
if (Flags.isByVal() && VA.isMemLoc()) {
const unsigned Size =
alignTo(Flags.getByValSize() ? Flags.getByValSize() : PtrByteSize,
const int FI = MF.getFrameInfo().CreateFixedObject(
Size, VA.getLocMemOffset(), /* IsImmutable */ false,
/* IsAliased */ true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
// By-value aggregate that starts in a register.
if (Flags.isByVal()) {
assert(VA.isRegLoc() && "MemLocs should already be handled.");
const MCPhysReg ArgReg = VA.getLocReg();
const PPCFrameLowering *FL = Subtarget.getFrameLowering();
if (Flags.getNonZeroByValAlign() > PtrByteSize)
report_fatal_error("Over aligned byvals not supported yet.");
const unsigned StackSize = alignTo(Flags.getByValSize(), PtrByteSize);
// The fixed object is placed at the stack offset that corresponds to the
// first register of the aggregate.
const int FI = MF.getFrameInfo().CreateFixedObject(
StackSize, mapArgRegToOffsetAIX(ArgReg, FL), /* IsImmutable */ false,
/* IsAliased */ true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
// Add live ins for all the RegLocs for the same ByVal.
const TargetRegisterClass *RegClass =
IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
// Helper: marks PhysReg as live-in and stores its full contents into the
// aggregate's stack object at the given byte Offset.
auto HandleRegLoc = [&, RegClass, LocVT](const MCPhysReg PhysReg,
unsigned Offset) {
const unsigned VReg = MF.addLiveIn(PhysReg, RegClass);
// Since the callers side has left justified the aggregate in the
// register, we can simply store the entire register into the stack
// slot.
SDValue CopyFrom = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
// The store to the fixedstack object is needed because accessing a
// field of the ByVal will use a gep and load. Ideally we will optimize
// to extracting the value from the register directly, and elide the
// stores when the arguments address is not taken, but that will need to
// be future work.
SDValue Store =
DAG.getStore(CopyFrom.getValue(1), dl, CopyFrom,
DAG.getObjectPtrOffset(dl, FIN, Offset),
MachinePointerInfo::getFixedStack(MF, FI, Offset));
unsigned Offset = 0;
HandleRegLoc(VA.getLocReg(), Offset);
Offset += PtrByteSize;
// Consume the following register locations that belong to the same
// aggregate, one pointer-sized piece at a time.
for (; Offset != StackSize && ArgLocs[I].isRegLoc();
Offset += PtrByteSize) {
assert(ArgLocs[I].getValNo() == VA.getValNo() &&
"RegLocs should be for ByVal argument.");
const CCValAssign RL = ArgLocs[I++];
HandleRegLoc(RL.getLocReg(), Offset);
// Any bytes not covered by registers must be described by one MemLoc.
if (Offset != StackSize) {
assert(ArgLocs[I].getValNo() == VA.getValNo() &&
"Expected MemLoc for remaining bytes.");
assert(ArgLocs[I].isMemLoc() && "Expected MemLoc for remaining bytes.");
// Consume the MemLoc. The InVal has already been emitted, so nothing
// more needs to be done.
EVT ValVT = VA.getValVT();
// Ordinary argument passed in a register.
if (VA.isRegLoc() && !VA.needsCustom()) {
MVT::SimpleValueType SVT = ValVT.getSimpleVT().SimpleTy;
unsigned VReg =
MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64));
SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
// Integers narrower than the register are truncated back to ValVT.
if (ValVT.isScalarInteger() &&
(ValVT.getSizeInBits() < LocVT.getSizeInBits())) {
ArgValue =
truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
// Ordinary argument passed on the stack.
if (VA.isMemLoc()) {
const unsigned LocSize = LocVT.getStoreSize();
const unsigned ValSize = ValVT.getStoreSize();
assert((ValSize <= LocSize) &&
"Object size is larger than size of MemLoc");
int CurArgOffset = VA.getLocMemOffset();
// Objects are right-justified because AIX is big-endian.
if (LocSize > ValSize)
CurArgOffset += LocSize - ValSize;
// Potential tail calls could cause overwriting of argument stack slots.
const bool IsImmutable =
!(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
int FI = MFI.CreateFixedObject(ValSize, CurArgOffset, IsImmutable);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue ArgValue =
DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo());
// On AIX a minimum of 8 words is saved to the parameter save area.
const unsigned MinParameterSaveArea = 8 * PtrByteSize;
// Area that is at least reserved in the caller of this function.
unsigned CallerReservedArea =
std::max(CCInfo.getNextStackOffset(), LinkageSize + MinParameterSaveArea);
// Set the size that is at least reserved in caller of this function. Tail
// call optimized function's reserved stack space needs to be aligned so
// that taking the difference between two stack areas will result in an
// aligned stack.
CallerReservedArea =
EnsureStackAlignment(Subtarget.getFrameLowering(), CallerReservedArea);
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
if (isVarArg) {
// NOTE(review): the beginning of the statement that consumes this fixed
// object is not visible in this listing - confirm.
MFI.CreateFixedObject(PtrByteSize, CCInfo.getNextStackOffset(), true));
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
static const MCPhysReg GPR_32[] = {PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10};
static const MCPhysReg GPR_64[] = {PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10};
const unsigned NumGPArgRegs = array_lengthof(IsPPC64 ? GPR_64 : GPR_32);
// The fixed integer arguments of a variadic function are stored to the
// VarArgsFrameIndex on the stack so that they may be loaded by
// dereferencing the result of va_next.
// The first register index is derived from how much of the parameter save
// area the analysed arguments already occupy.
for (unsigned GPRIndex =
(CCInfo.getNextStackOffset() - LinkageSize) / PtrByteSize;
GPRIndex < NumGPArgRegs; ++GPRIndex) {
const unsigned VReg =
IsPPC64 ? MF.addLiveIn(GPR_64[GPRIndex], &PPC::G8RCRegClass)
: MF.addLiveIn(GPR_32[GPRIndex], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
// Increment the address for the next argument to store.
SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
// Merge all pending stores into the chain handed back to the caller.
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
return Chain;
// Lowers an outgoing call for the AIX ABI.
//
// The argument locations are computed with CC_AIX after reserving the linkage
// area. Each location is then materialised:
//   * by-value aggregates are loaded piecewise into GPRs; bytes that do not
//     pass in registers are copied to the stack with a memcpy placed outside
//     the call sequence, and a trailing partial register is assembled
//     left-justified from power-of-2 sized loads,
//   * ordinary register locations are queued in RegsToPass (after the
//     extension requested by the location info),
//   * memory locations are stored relative to the stack pointer,
//   * custom register locations carry vararg floating point values in GPRs.
// For indirect calls the TOC base is saved to the stack. Finally the queued
// registers are copied with glued CopyToReg nodes and FinishCall emits the
// call itself.
//
// Only the C, Cold and Fast calling conventions are accepted (asserted);
// patchpoints, QPX and Altivec are reported as fatal errors.
//
// NOTE(review): several statements of this function (closing braces,
// `continue`s, `default:` / `break` in the switch, the start of some
// push_back calls) are not visible in this listing; the comments below
// describe only what is shown.
SDValue PPCTargetLowering::LowerCall_AIX(
SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
const CallBase *CB) const {
// See PPCTargetLowering::LowerFormalArguments_AIX() for a description of the
// AIX ABI stack frame layout.
assert((CFlags.CallConv == CallingConv::C ||
CFlags.CallConv == CallingConv::Cold ||
CFlags.CallConv == CallingConv::Fast) &&
"Unexpected calling convention!");
if (CFlags.IsPatchPoint)
report_fatal_error("This call type is unimplemented on AIX.");
const PPCSubtarget& Subtarget =
static_cast<const PPCSubtarget&>(DAG.getSubtarget());
if (Subtarget.hasQPX())
report_fatal_error("QPX is not supported on AIX.");
if (Subtarget.hasAltivec())
report_fatal_error("Altivec support is unimplemented on AIX.");
MachineFunction &MF = DAG.getMachineFunction();
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
// Reserve space for the linkage save area (LSA) on the stack.
// In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
// [SP][CR][LR][2 x reserved][TOC].
// The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64.
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
const bool IsPPC64 = Subtarget.isPPC64();
const EVT PtrVT = getPointerTy(DAG.getDataLayout());
const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
CCInfo.AnalyzeCallOperands(Outs, CC_AIX);
// The prolog code of the callee may store up to 8 GPR argument registers to
// the stack, allowing va_start to index over them in memory if the callee
// is variadic.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize;
const unsigned NumBytes = std::max(LinkageSize + MinParameterSaveAreaSize,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass.
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
// Set up a copy of the stack pointer for loading and storing any
// arguments that may not fit in the registers available for argument
// passing.
const SDValue StackPtr = IsPPC64 ? DAG.getRegister(PPC::X1, MVT::i64)
: DAG.getRegister(PPC::R1, MVT::i32);
// The index is advanced inside the body because one argument may own several
// consecutive entries of ArgLocs.
for (unsigned I = 0, E = ArgLocs.size(); I != E;) {
const unsigned ValNo = ArgLocs[I].getValNo();
SDValue Arg = OutVals[ValNo];
ISD::ArgFlagsTy Flags = Outs[ValNo].Flags;
if (Flags.isByVal()) {
const unsigned ByValSize = Flags.getByValSize();
// Nothing to do for zero-sized ByVals on the caller side.
if (!ByValSize) {
// Helper: zero-extending load of type VT from the aggregate at byte
// LoadOffset, producing a pointer-width value.
auto GetLoad = [&](EVT VT, unsigned LoadOffset) {
return DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain,
(LoadOffset != 0)
? DAG.getObjectPtrOffset(dl, Arg, LoadOffset)
: Arg,
MachinePointerInfo(), VT);
unsigned LoadOffset = 0;
// Initialize registers, which are fully occupied by the by-val argument.
while (LoadOffset + PtrByteSize <= ByValSize && ArgLocs[I].isRegLoc()) {
SDValue Load = GetLoad(PtrVT, LoadOffset);
LoadOffset += PtrByteSize;
const CCValAssign &ByValVA = ArgLocs[I++];
assert(ByValVA.getValNo() == ValNo &&
"Unexpected location for pass-by-value argument.");
RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), Load));
if (LoadOffset == ByValSize)
// There must be one more loc to handle the remainder.
assert(ArgLocs[I].getValNo() == ValNo &&
"Expected additional location for by-value argument.");
if (ArgLocs[I].isMemLoc()) {
assert(LoadOffset < ByValSize && "Unexpected memloc for by-val arg.");
const CCValAssign &ByValVA = ArgLocs[I++];
ISD::ArgFlagsTy MemcpyFlags = Flags;
// Only memcpy the bytes that don't pass in register.
MemcpyFlags.setByValSize(ByValSize - LoadOffset);
Chain = CallSeqStart = createMemcpyOutsideCallSeq(
(LoadOffset != 0) ? DAG.getObjectPtrOffset(dl, Arg, LoadOffset)
: Arg,
DAG.getObjectPtrOffset(dl, StackPtr, ByValVA.getLocMemOffset()),
CallSeqStart, MemcpyFlags, DAG, dl);
// Initialize the final register residue.
// Any residue that occupies the final by-val arg register must be
// left-justified on AIX. Loads must be a power-of-2 size and cannot be
// larger than the ByValSize. For example: a 7 byte by-val arg requires 4,
// 2 and 1 byte loads.
const unsigned ResidueBytes = ByValSize % PtrByteSize;
assert(ResidueBytes != 0 && LoadOffset + PtrByteSize > ByValSize &&
"Unexpected register residue for by-value argument.");
SDValue ResidueVal;
for (unsigned Bytes = 0; Bytes != ResidueBytes;) {
// Largest power-of-2 chunk that still fits in the remaining residue.
const unsigned N = PowerOf2Floor(ResidueBytes - Bytes);
const MVT VT =
N == 1 ? MVT::i8
: ((N == 2) ? MVT::i16 : (N == 4 ? MVT::i32 : MVT::i64));
SDValue Load = GetLoad(VT, LoadOffset);
LoadOffset += N;
Bytes += N;
// By-val arguments are passed left-justified in register.
// Every load here needs to be shifted, otherwise a full register load
// should have been used.
assert(PtrVT.getSimpleVT().getSizeInBits() > (Bytes * 8) &&
"Unexpected load emitted during handling of pass-by-value "
unsigned NumSHLBits = PtrVT.getSimpleVT().getSizeInBits() - (Bytes * 8);
EVT ShiftAmountTy =
getShiftAmountTy(Load->getValueType(0), DAG.getDataLayout());
SDValue SHLAmt = DAG.getConstant(NumSHLBits, dl, ShiftAmountTy);
SDValue ShiftedLoad =
DAG.getNode(ISD::SHL, dl, Load.getValueType(), Load, SHLAmt);
// Accumulate the shifted pieces into a single register value.
ResidueVal = ResidueVal ? DAG.getNode(ISD::OR, dl, PtrVT, ResidueVal,
: ShiftedLoad;
const CCValAssign &ByValVA = ArgLocs[I++];
RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), ResidueVal));
// Non-by-value argument: take its next location.
CCValAssign &VA = ArgLocs[I++];
const MVT LocVT = VA.getLocVT();
const MVT ValVT = VA.getValVT();
// Apply the extension requested by the location info.
switch (VA.getLocInfo()) {
report_fatal_error("Unexpected argument extension type.");
case CCValAssign::Full:
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
if (VA.isRegLoc() && !VA.needsCustom()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
if (VA.isMemLoc()) {
SDValue PtrOff =
DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
// Custom handling is used for GPR initializations for vararg float
// arguments.
assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg &&
ValVT.isFloatingPoint() && LocVT.isInteger() &&
"Unexpected register handling for calling convention.");
// Reinterpret the floating point bits as an integer of the same width.
SDValue ArgAsInt =
DAG.getBitcast(MVT::getIntegerVT(ValVT.getSizeInBits()), Arg);
if (Arg.getValueType().getStoreSize() == LocVT.getStoreSize())
// f32 in 32-bit GPR
// f64 in 64-bit GPR
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgAsInt));
else if (Arg.getValueType().getSizeInBits() < LocVT.getSizeInBits())
// f32 in 64-bit GPR.
VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, LocVT)));
else {
// f64 in two 32-bit GPRs
// The 2 GPRs are marked custom and expected to be adjacent in ArgLocs.
assert(Arg.getValueType() == MVT::f64 && CFlags.IsVarArg && !IsPPC64 &&
"Unexpected custom register for argument!");
CCValAssign &GPR1 = VA;
// The first GPR receives the most significant word.
SDValue MSWAsI64 = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgAsInt,
DAG.getConstant(32, dl, MVT::i8));
GPR1.getLocReg(), DAG.getZExtOrTrunc(MSWAsI64, dl, MVT::i32)));
if (I != E) {
// If only 1 GPR was available, there will only be one custom GPR and
// the argument will also pass in memory.
CCValAssign &PeekArg = ArgLocs[I];
// Only consume the next location as the second GPR when it is a register
// location that belongs to this same argument (ValNo); a register
// location of a different argument must be left for the next iteration.
if (PeekArg.isRegLoc() && PeekArg.getValNo() == ValNo) {
assert(PeekArg.needsCustom() && "A second custom GPR is expected.");
CCValAssign &GPR2 = ArgLocs[I++];
// The second GPR receives the least significant word.
GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32)));
// Merge all pending argument stores into the chain.
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
// For indirect calls, we need to save the TOC base to the stack for
// restoration after the call.
if (CFlags.IsIndirect) {
assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister();
const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
const MVT PtrVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
const unsigned TOCSaveOffset =
SDValue Val = DAG.getCopyFromReg(Chain, dl, TOCBaseReg, PtrVT);
SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
SDValue StackPtr = DAG.getRegister(StackPtrReg, PtrVT);
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
Chain = DAG.getStore(
Val.getValue(1), dl, Val, AddPtr,
MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (auto Reg : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, InFlag);
InFlag = Chain.getValue(1);
const int SPDiff = 0;
return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
Callee, SPDiff, NumBytes, Ins, InVals, CB);
// Checks whether the return values described by Outs can be lowered with the
// return calling convention: RetCC_PPC_Cold for the Cold calling convention on
// an SVR4 ABI subtarget, RetCC_PPC otherwise. The result is whatever
// CCState::CheckReturn reports.
// NOTE(review): the return-type line of this definition is not visible in
// this listing.
PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(
Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
? RetCC_PPC_Cold
: RetCC_PPC);
// Lowers a function return: each return value is extended as requested by its
// assigned location, copied into its return register with glued CopyToReg
// nodes, and the registers are appended to the operands of a
// PPCISD::RET_FLAG node, which is returned.
// On SPE subtargets an f64 value is split into two i32 halves that occupy two
// consecutive return locations.
// NOTE(review): the return-type line and several statements (`break`s,
// braces, part of the CCState setup) are not visible in this listing.
PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
(Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
? RetCC_PPC_Cold
: RetCC_PPC);
SDValue Flag;
// Operand 0 is a placeholder for the chain; it is overwritten once all
// copies have been emitted.
SmallVector<SDValue, 4> RetOps(1, Chain);
// Copy the result values into the output registers.
// `i` indexes the locations while RealResIdx indexes the values; they
// diverge when one value occupies two locations (SPE f64 below).
for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[RealResIdx];
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::AExt:
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
bool isLittleEndian = Subtarget.isLittleEndian();
// Legalize ret f64 -> ret 2 x i32.
// The half selected first depends on endianness (index 0 on little
// endian, index 1 on big endian).
SDValue SVal =
DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
DAG.getIntPtrConstant(isLittleEndian ? 0 : 1, dl));
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
// The other half goes into the register of the following location.
SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
DAG.getIntPtrConstant(isLittleEndian ? 1 : 0, dl));
Flag = Chain.getValue(1);
VA = RVLocs[++i]; // skip ahead to next loc
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
} else
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
// NOTE(review): the statement guarded by this `if` is not visible in this
// listing - confirm.
if (Flag.getNode())
return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
// Builds a PPCISD::DYNAREAOFFSET node whose operands are the incoming chain
// and the frame pointer save frame index; the result type is the value type of
// Op.
// NOTE(review): the first line of this signature (return type and function
// name) is not visible in this listing - presumably the lowering hook for
// GET_DYNAMIC_AREA_OFFSET; confirm.
SelectionDAG &DAG) const {
SDLoc dl(Op);
// Get the correct type for integers.
EVT IntVT = Op.getValueType();
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue FPSIdx = getFramePointerFrameIndex(DAG);
// Build a DYNAREAOFFSET node.
SDValue Ops[2] = {Chain, FPSIdx};
SDVTList VTs = DAG.getVTList(IntVT);
return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
// Lowers STACKRESTORE: loads the word currently stored at the stack pointer,
// copies the saved stack pointer value (operand 1 of Op) into the stack
// pointer register (X1 on PPC64, R1 otherwise), and stores the previously
// loaded word at the restored stack pointer. Returns the store node.
SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
SelectionDAG &DAG) const {
// When we pop the dynamic allocation we need to restore the SP link.
SDLoc dl(Op);
// Get the correct type for pointers.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Construct the stack pointer operand.
bool isPPC64 = Subtarget.isPPC64();
unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
SDValue StackPtr = DAG.getRegister(SP, PtrVT);
// Get the operands for the STACKRESTORE.
SDValue Chain = Op.getOperand(0);
SDValue SaveSP = Op.getOperand(1);
// Load the old link SP.
SDValue LoadLinkSP =
DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo());
// Restore the stack pointer.
// The copy is chained after the load so the old link is read first.
Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);
// Store the old link SP.
return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo());
// Returns a frame-index node of pointer type for the return address save
// slot. The index is read from PPCFunctionInfo; when it is still zero, a
// fixed object (8 bytes on PPC64, otherwise 4) is created at the return save
// offset reported by the frame lowering.
SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Get the current return address save index.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
int RASI = FI->getReturnAddrSaveIndex();
// If the return address save index hasn't been defined yet.
if (!RASI) {
// Find out the fixed offset of the return address save area.
int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
// Allocate the frame index for the return address save area.
RASI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, LROffset, false);
// Save the result.
// NOTE(review): the statement that records RASI in PPCFunctionInfo is not
// visible in this listing - confirm.
return DAG.getFrameIndex(RASI, PtrVT);
// Returns a frame-index node of pointer type for the frame pointer save slot.
// The index is read from PPCFunctionInfo; when it is still zero, a fixed
// object (8 bytes on PPC64, otherwise 4) is created at the frame pointer save
// offset reported by the frame lowering.
// NOTE(review): the return-type line of this definition is not visible in
// this listing.
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Get current frame pointer save index. The users of this index will be
// primarily DYNALLOC instructions.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
int FPSI = FI->getFramePointerSaveIndex();
// If the frame pointer save index hasn't been defined yet.
if (!FPSI) {
// Find out the fixed offset of the frame pointer save area.
int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
// Allocate the frame index for frame pointer save area.
FPSI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
// Save the result.
// NOTE(review): the statement that records FPSI in PPCFunctionInfo is not
// visible in this listing - confirm.
return DAG.getFrameIndex(FPSI, PtrVT);
// Lowers DYNAMIC_STACKALLOC into a target node taking the chain, the negated
// allocation size and the frame pointer save frame index. The node is
// PPCISD::PROBED_ALLOCA when hasInlineStackProbe(MF) is true and
// PPCISD::DYNALLOC otherwise; it produces a pointer value and a chain.
SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
SDLoc dl(Op);
// Get the correct type for pointers.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Negate the size.
SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
DAG.getConstant(0, dl, PtrVT), Size);
// Construct a node for the frame pointer save index.
SDValue FPSIdx = getFramePointerFrameIndex(DAG);
SDValue Ops[3] = { Chain, NegSize, FPSIdx };
SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
if (hasInlineStackProbe(MF))
return DAG.getNode(PPCISD::PROBED_ALLOCA, dl, VTs, Ops);
return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops);
// Lowers EH_DWARF_CFA to the frame index of a freshly created fixed stack
// object located at offset 0 (8 bytes on PPC64, otherwise 4).
SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool isPPC64 = Subtarget.isPPC64();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
int FI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 8 : 4, 0, false);
return DAG.getFrameIndex(FI, PtrVT);
// Lowers EH_SJLJ_SETJMP by forwarding operands 0 and 1 of Op to a node that
// produces an i32 result and a chain.
// NOTE(review): the line that starts the node-construction statement is not
// visible in this listing - presumably a PPCISD::EH_SJLJ_SETJMP node is
// returned, mirroring lowerEH_SJLJ_LONGJMP; confirm.
SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
DAG.getVTList(MVT::i32, MVT::Other),
Op.getOperand(0), Op.getOperand(1));
// Lowers EH_SJLJ_LONGJMP to a PPCISD::EH_SJLJ_LONGJMP node that takes
// operands 0 and 1 of Op and produces only a chain (MVT::Other).
SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
Op.getOperand(0), Op.getOperand(1));
// Custom lowering for loads. Vector loads are delegated to LowerVectorLoad;
// the only other supported case (asserted) is an i1 load, which is emitted as
// an any-extending i8 load to the pointer type followed by a TRUNCATE to i1.
// Returns the merged pair {truncated value, chain of the new load}.
SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVectorLoad(Op, DAG);
assert(Op.getValueType() == MVT::i1 &&
"Custom lowering only for i1 loads");
// First, load 8 bits into a pointer-type value, then truncate to 1 bit.
SDLoc dl(Op);
LoadSDNode *LD = cast<LoadSDNode>(Op);
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
MachineMemOperand *MMO = LD->getMemOperand();
SDValue NewLD =
DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain,
BasePtr, MVT::i8, MMO);
SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD);
// Result 1 of the new load is its output chain.
SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
return DAG.getMergeValues(Ops, dl);
// Custom lowering for stores. Vector stores are delegated to
// LowerVectorStore; the only other supported case (asserted) is an i1 store,
// which is emitted as a zero-extension to the pointer type followed by a
// truncating store of 8 bits.
SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if (Op.getOperand(1).getValueType().isVector())
return LowerVectorStore(Op, DAG);
assert(Op.getOperand(1).getValueType() == MVT::i1 &&
"Custom lowering only for i1 stores");
// First, zero extend to the pointer type, then use a truncating store to
// 8 bits.
SDLoc dl(Op);
StoreSDNode *ST = cast<StoreSDNode>(Op);
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
SDValue Value = ST->getValue();
MachineMemOperand *MMO = ST->getMemOperand();
// NOTE(review): the final operand of this getNode call is not visible in
// this listing - confirm.
Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()),
return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO);
// FIXME: Remove this once the ANDI glue bug is fixed:
// Custom lowering of a truncate whose result type is i1 (asserted): the
// operand is wrapped in a PPCISD::ANDI_rec_1_GT_BIT node producing the i1.
SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType() == MVT::i1 &&
"Custom lowering only for i1 results");
SDLoc DL(Op);
return DAG.getNode(PPCISD::ANDI_rec_1_GT_BIT, DL, MVT::i1, Op.getOperand(0));
// Lowers a vector truncate to a vector shuffle. The source (at most 128 bits,
// asserted) is widened to 128 bits when necessary, bitcast to a 128-bit vector
// of the target element type, and shuffled so that the kept sub-elements come
// first; the remaining lanes are filled from the undef second shuffle operand.
SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
SelectionDAG &DAG) const {
// Implements a vector truncate that fits in a vector register as a shuffle.
// We want to legalize vector truncates down to where the source fits in
// a vector register (and target is therefore smaller than vector register
// size). At that point legalization will try to custom lower the sub-legal
// result and get here - where we can contain the truncate as a single target
// operation.
// For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:
// <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
// We will implement it for big-endian ordering as this (where x denotes
// undefined):
// < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
// < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
// The same operation in little-endian ordering will be:
// <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
// <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>
assert(Op.getValueType().isVector() && "Vector type expected.");
SDLoc DL(Op);
SDValue N1 = Op.getOperand(0);
unsigned SrcSize = N1.getValueType().getSizeInBits();
assert(SrcSize <= 128 && "Source must fit in an Altivec/VSX vector");
SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL);
EVT TrgVT = Op.getValueType();
unsigned TrgNumElts = TrgVT.getVectorNumElements();
EVT EltVT = TrgVT.getVectorElementType();
// Number of target-typed elements that fit in a 128-bit register.
unsigned WideNumElts = 128 / EltVT.getSizeInBits();
EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
// First list the elements we want to keep.
// SizeMult is the number of target-sized pieces per source element.
unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
SmallVector<int, 16> ShuffV;
// Little endian keeps the first piece of each source element.
// NOTE(review): presumably an `else` separates the two loops below (the
// second keeps the last piece of each element); it is not visible in this
// listing - confirm.
if (Subtarget.isLittleEndian())
for (unsigned i = 0; i < TrgNumElts; ++i)
ShuffV.push_back(i * SizeMult);
for (unsigned i = 1; i <= TrgNumElts; ++i)
ShuffV.push_back(i * SizeMult - 1);
// Populate the remaining elements with undefs.
// Index WideNumElts + 1 selects a lane of the second (undef) operand.
for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
// ShuffV.push_back(i + WideNumElts);
ShuffV.push_back(WideNumElts + 1);
SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc);
return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV);
/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
/// possible.
///
/// As read below, operands 0 and 1 are the values being compared, operands 2
/// and 3 are the "true" and "false" results, and operand 4 holds the condition
/// code. Whenever no rewrite applies, Op itself is returned.
///
/// NOTE(review): this copy of the function has lost lines (closing braces, the
/// tail of the first condition, and possibly break/fallthrough markers);
/// compare against the upstream source before relying on its exact structure.
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
// Not FP? Not a fsel.
if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
return Op;
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
EVT ResVT = Op.getValueType();
EVT CmpVT = Op.getOperand(0).getValueType();
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
SDLoc dl(Op);
SDNodeFlags Flags = Op.getNode()->getFlags();
// We have xsmaxcdp/xsmincdp which are OK to emit even in the
// presence of infinities.
// This only applies when the select returns the very values it compares
// (LHS == TV and RHS == FV), i.e. the node is a max/min idiom.
if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) {
switch (CC) {
case ISD::SETGT:
return DAG.getNode(PPCISD::XSMAXCDP, dl, Op.getValueType(), LHS, RHS);
case ISD::SETLT:
return DAG.getNode(PPCISD::XSMINCDP, dl, Op.getValueType(), LHS, RHS);
// We might be able to do better than this under some circumstances, but in
// general, fsel-based lowering of select is a finite-math-only optimization.
// For more information, see section F.3 of the 2.06 ISA specification.
// With ISA 3.0
// Bail out unless both "no infinities" and "no NaNs" are guaranteed, either
// globally through the target options or locally through the node flags.
if ((!DAG.getTarget().Options.NoInfsFPMath && !Flags.hasNoInfs()) ||
(!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs()))
return Op;
// If the RHS of the comparison is a 0.0, we don't need to do the
// subtraction at all.
SDValue Sel1;
if (isFloatingPointZero(RHS))
switch (CC) {
default: break; // SETUO etc aren't handled by fsel.
case ISD::SETNE:
// SETNE is SETEQ with the two select results exchanged; no break is visible
// here, so control continues into the SETEQ case.
std::swap(TV, FV);
case ISD::SETEQ:
// Equality with zero is built from two fsel nodes: the inner one tests LHS,
// the outer one tests -LHS, and FV is produced if either test fails.
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV);
case ISD::SETLT:
std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
case ISD::SETGE:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
case ISD::SETGT:
std::swap(TV, FV); // setle is done as fsel on -LHS below, swap operands for setgt
case ISD::SETLE:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
// General case: the comparison is turned into a subtraction whose result is
// fed to fsel. SETLT/SETGE use LHS - RHS, SETGT/SETLE use RHS - LHS, and the
// "less/greater than" forms pass the select results in (FV, TV) order.
SDValue Cmp;
switch (CC) {
default: break; // SETUO etc aren't handled by fsel.
case ISD::SETNE:
// As above: exchange the results, then reuse the SETEQ sequence.
std::swap(TV, FV);
case ISD::SETEQ:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);
case ISD::SETLT:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
case ISD::SETGE:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
case ISD::SETGT:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
case ISD::SETLE:
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
// No fsel form exists for the remaining condition codes.
return Op;
/// Convert the floating-point operand of Op to an integer "through memory":
/// the converted value is stored to a fresh stack temporary, and the chain,
/// pointer and alignment a caller needs in order to load the result back are
/// recorded in RLI.
///
/// NOTE(review): this copy has lost lines (the opcode arms of both
/// conditional expressions, closing braces, and possibly break statements and
/// further RLI assignments); check the upstream source for the full text.
void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
SelectionDAG &DAG,
const SDLoc &dl) const {
SDValue Src = Op.getOperand(0);
// The conversion nodes built below take an f64 input, so widen f32 first.
if (Src.getValueType() == MVT::f32)
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
SDValue Tmp;
switch (Op.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
case MVT::i32:
Tmp = DAG.getNode(
Op.getOpcode() == ISD::FP_TO_SINT
dl, MVT::f64, Src);
case MVT::i64:
assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
"i64 FP_TO_UINT is supported only with FPCVT");
Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
dl, MVT::f64, Src);
// Convert the FP value to an int value through memory.
// A 4-byte slot is used only for an i32 result when STFIWX is available and
// the conversion is signed or FPCVT is present; otherwise the slot is f64
// sized.
bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
(Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT());
SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
// Emit a store to the stack slot.
SDValue Chain;
Align Alignment(DAG.getEVTAlign(Tmp.getValueType()));
if (i32Stack) {
// Store just the 32-bit integer word with STFIWX, described by a 4-byte,
// 4-aligned store memory operand.
MachineFunction &MF = DAG.getMachineFunction();
Alignment = Align(4);
MachineMemOperand *MMO =
MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Alignment);
SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
} else
Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI, Alignment);
// Result is a load from the stack slot. If loading 4 bytes, make sure to
// add in a bias on big endian.
// NOTE(review): the pointer is advanced by 4 regardless of endianness while
// only the MachinePointerInfo offset is endian-dependent -- confirm that this
// path is never taken on little-endian subtargets.
if (Op.getValueType() == MVT::i32 && !i32Stack) {
FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
DAG.getConstant(4, dl, FIPtr.getValueType()));
MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4);
// Hand the store's chain and the (possibly biased) slot address to the caller.
RLI.Chain = Chain;
RLI.Ptr = FIPtr;
RLI.Alignment = Alignment;
/// Custom lowers floating point to integer conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
///
/// The converted value is produced as an f64 node and then moved to an
/// integer value of the requested width with an MFVSR node, which is returned.
///
/// NOTE(review): this copy has lost lines (the opcode arms of both
/// conditional expressions, closing braces, and possibly break statements).
SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
SelectionDAG &DAG,
const SDLoc &dl) const {
SDValue Src = Op.getOperand(0);
// The conversion nodes built below take an f64 input, so widen f32 first.
if (Src.getValueType() == MVT::f32)
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
SDValue Tmp;
switch (Op.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
case MVT::i32:
Tmp = DAG.getNode(
Op.getOpcode() == ISD::FP_TO_SINT
dl, MVT::f64, Src);
Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp);
case MVT::i64:
assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
"i64 FP_TO_UINT is supported only with FPCVT");
Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
dl, MVT::f64, Src);
Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp);
return Tmp;
/// Custom lowering entry point for FP_TO_SINT / FP_TO_UINT.
///
/// f128 sources are returned untouched, ppcf128 -> i32 is expanded by hand,
/// and every other case either uses the direct-move sequence or converts
/// through a stack slot and loads the integer result back.
///
/// NOTE(review): this copy has lost lines (closing braces and the trailing
/// arguments of the last two calls in the FP_TO_UINT expansion).
SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const {
// FP to INT conversions are legal for f128.
if (Op->getOperand(0).getValueType() == MVT::f128)
return Op;
// Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
// PPC (the libcall is not available).
if (Op.getOperand(0).getValueType() == MVT::ppcf128) {
if (Op.getValueType() == MVT::i32) {
if (Op.getOpcode() == ISD::FP_TO_SINT) {
// Split the ppcf128 value into its two f64 elements.
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
MVT::f64, Op.getOperand(0),
DAG.getIntPtrConstant(0, dl));
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
MVT::f64, Op.getOperand(0),
DAG.getIntPtrConstant(1, dl));
// Add the two halves of the long double in round-to-zero mode.
SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
// Now use a smaller FP_TO_SINT.
return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
if (Op.getOpcode() == ISD::FP_TO_UINT) {
// Build the ppcf128 constant that the comment below refers to as 2^31.
const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));
SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128);
// X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
// FIXME: generated code sucks.
// TODO: Are there fast-math-flags to propagate to this FSUB?
SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128,
Op.getOperand(0), Tmp);
True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True);
True = DAG.getNode(ISD::ADD, dl, MVT::i32, True,
DAG.getConstant(0x80000000, dl, MVT::i32));
SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
return DAG.getSelectCC(dl, Op.getOperand(0), Tmp, True, False,
return SDValue();
// With direct moves on a 64-bit subtarget no stack traffic is needed.
if (Subtarget.hasDirectMove() && Subtarget.isPPC64())
return LowerFP_TO_INTDirectMove(Op, DAG, dl);
// Otherwise convert through a stack slot and load the result back using the
// information LowerFP_TO_INTForReuse recorded in RLI.
ReuseLoadInfo RLI;
LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI,
RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
// We're trying to insert a regular store, S, and then a load, L. If the
// incoming value, O, is a load, we might just be able to have our load use the
// address used by O. However, we don't know if anything else will store to
// that address before we can load from it. To prevent this situation, we need
// to insert our load, L, into the chain as a peer of O. To do this, we give L
// the same chain operand as O, we create a token factor from the chain results
// of O and L, and we replace all uses of O's chain result with that token
// factor (see spliceIntoChain below for this last part).
//
// Returns true and fills in RLI when the value Op can be re-read from memory:
// either Op is an FP-to-integer conversion (which is then lowered through a
// stack slot), or Op is a load with extension type ET and memory type MemVT.
//
// NOTE(review): this copy has lost lines (the head of the first `if`, the
// tail of the load-rejection condition, the offset operand of the ADD, and
// closing braces); check the upstream source for the exact conditions.
bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
ReuseLoadInfo &RLI,
SelectionDAG &DAG,
ISD::LoadExtType ET) const {
SDLoc dl(Op);
// FP_TO_UINT qualifies only with FPCVT or when the result is i32.
bool ValidFPToUint = Op.getOpcode() == ISD::FP_TO_UINT &&
(Subtarget.hasFPCVT() || Op.getValueType() == MVT::i32);
(ValidFPToUint || Op.getOpcode() == ISD::FP_TO_SINT) &&
Op.getOperand(0).getValueType())) {
LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
return true;
// Otherwise Op must be a non-volatile load with the requested extension kind
// and memory type.
LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);
if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
return false;
if (LD->getMemoryVT() != MemVT)
return false;
RLI.Ptr = LD->getBasePtr();
// For an indexed load the effective address is the base plus an offset.
if (LD->isIndexed() && !LD->getOffset().isUndef()) {
assert(LD->getAddressingMode() == ISD::PRE_INC &&
"Non-pre-inc AM on PPC?");
RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
// Copy the original load's chain and memory attributes so the new load can
// be issued with the same properties.
RLI.Chain = LD->getChain();
RLI.MPI = LD->getPointerInfo();
RLI.IsDereferenceable = LD->isDereferenceable();
RLI.IsInvariant = LD->isInvariant();
RLI.Alignment = LD->getAlign();
RLI.AAInfo = LD->getAAInfo();
RLI.Ranges = LD->getRanges();
// The chain result is value 2 of an indexed load and value 1 otherwise.
RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
return true;
// Given the head of the old chain, ResChain, insert a token factor containing
// it and NewResChain, and make users of ResChain now be users of that token
// factor.
// TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead.
//
// NOTE(review): the statement guarded by `if (!ResChain)` is missing from this
// copy (presumably an early return); as printed the `if` would guard the SDLoc
// declaration. Restore it from the upstream source.
void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
SDValue NewResChain,
SelectionDAG &DAG) const {
if (!ResChain)
SDLoc dl(NewResChain);
// The token factor is first built with an undef placeholder instead of
// ResChain, so that the replace-all-uses call below does not also rewrite
// the token factor's own operand.
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
NewResChain, DAG.getUNDEF(MVT::Other));
assert(TF.getNode() != NewResChain.getNode() &&
"A new TF really is required here");
DAG.ReplaceAllUsesOfValueWith(ResChain, TF);
// Now give the token factor its real operands.
DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
/// Analyze profitability of direct move
/// prefer float load to int load plus direct move
/// when there is no integer use of int load
///
/// Returns true when the source of the conversion Op is not a load, when it
/// is a load of at most 2 bytes on a subtarget without P9 vector support, or
/// when the loaded value has a user other than SINT_TO_FP / UINT_TO_FP.
///
/// NOTE(review): the statement guarded by the result-number test inside the
/// loop is missing from this copy (presumably `continue;`), as are the
/// closing braces.
bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
SDNode *Origin = Op.getOperand(0).getNode();
if (Origin->getOpcode() != ISD::LOAD)
return true;
// If there is no LXSIBZX/LXSIHZX, like Power8,
// prefer direct move if the memory size is 1 or 2 bytes.
MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand();
if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2)
return true;
for (SDNode::use_iterator UI = Origin->use_begin(),
UE = Origin->use_end();
UI != UE; ++UI) {
// Only look at the users of the loaded value.
if (UI.getUse().get().getResNo() != 0)
// Any user that is not an int-to-FP conversion is an integer use of the load.
if (UI->getOpcode() != ISD::SINT_TO_FP &&
UI->getOpcode() != ISD::UINT_TO_FP)
return true;
return false;
/// Custom lowers integer to floating point conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
///
/// The result type must be f32 or f64 and the subtarget must have FPCVT (both
/// are asserted). The integer source is first moved into an f64 value, which
/// is then converted with the opcode selected in ConvOp.
///
/// NOTE(review): this copy has lost lines (the unsigned arm of the ConvOp
/// selection, the head of the move for the 32-bit case, and closing braces).
SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
SelectionDAG &DAG,
const SDLoc &dl) const {
assert((Op.getValueType() == MVT::f32 ||
Op.getValueType() == MVT::f64) &&
"Invalid floating point type as target of conversion");
assert(Subtarget.hasFPCVT() &&
"Int to FP conversions with direct moves require FPCVT");
SDValue FP;
SDValue Src = Op.getOperand(0);
// Classify the conversion: result precision, source width and signedness.
bool SinglePrec = Op.getValueType() == MVT::f32;
bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;
bool Signed = Op.getOpcode() == ISD::SINT_TO_FP;
unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) :
if (WordInt) {
dl, MVT::f64, Src);
FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP);
else {
// Non-i32 source: move it with MTVSRA, then convert.
FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src);
FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP);
return FP;
/// Pad the sub-128-bit vector \p Vec out to a full 128-bit vector with the same
/// element type. \p Vec supplies the leading elements; the remainder is filled
/// by concatenating undef vectors of \p Vec's own type.
static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {
  const EVT NarrowVT = Vec.getValueType();
  assert(NarrowVT.isVector() && "Expected a vector type.");
  assert(NarrowVT.getSizeInBits() < 128 && "Vector is already full width.");

  // Work out how many elements a 128-bit vector of this element type holds,
  // and how many copies of the narrow type are needed to reach that count.
  const EVT ScalarVT = NarrowVT.getVectorElementType();
  const unsigned FullEltCount = 128 / ScalarVT.getSizeInBits();
  const unsigned NumPieces = FullEltCount / NarrowVT.getVectorNumElements();
  const EVT FullVT =
      EVT::getVectorVT(*DAG.getContext(), ScalarVT, FullEltCount);

  // Piece 0 is the real data; every later piece is undef.
  SmallVector<SDValue, 16> Pieces(NumPieces);
  Pieces[0] = Vec;
  const SDValue Padding = DAG.getUNDEF(NarrowVT);
  for (unsigned Slot = 1; Slot != NumPieces; ++Slot)
    Pieces[Slot] = Padding;

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, FullVT, Pieces);
}
/// Lower a vector SINT_TO_FP / UINT_TO_FP whose result is v2f64 or v4f32
/// (asserted). The integer source is widened to 128 bits, its elements are
/// shuffled into the positions of a v4i32 / v2i64 vector, the result is
/// sign-extended in register (signed) or simply bitcast (unsigned, where the
/// second shuffle input is a zero vector), and the conversion is re-emitted
/// on that intermediate vector.
///
/// NOTE(review): this copy has lost lines (presumably an `else` between the
/// little-endian and the following shuffle-mask loop, the trailing arguments
/// of the getVectorVT and SIGN_EXTEND_INREG calls, and closing braces).
SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const {
unsigned Opc = Op.getOpcode();
assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP) &&
"Unexpected conversion type");
assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) &&
"Supports conversions to v2f64/v4f32 only.");
bool SignedConv = Opc == ISD::SINT_TO_FP;
bool FourEltRes = Op.getValueType() == MVT::v4f32;
SDValue Wide = widenVec(DAG, Op.getOperand(0), dl);
EVT WideVT = Wide.getValueType();
unsigned WideNumElts = WideVT.getVectorNumElements();
MVT IntermediateVT = FourEltRes ? MVT::v4i32 : MVT::v2i64;
// Start with a mask that takes every element from the second shuffle input
// (indices WideNumElts and above select from it).
SmallVector<int, 16> ShuffV;
for (unsigned i = 0; i < WideNumElts; ++i)
ShuffV.push_back(i + WideNumElts);
// Stride is the number of narrow elements per result element; SaveElts is
// the number of source elements that carry real data.
int Stride = FourEltRes ? WideNumElts / 4 : WideNumElts / 2;
int SaveElts = FourEltRes ? 4 : 2;
// Place source element i at the first (little endian) or last slot of the
// i-th group of Stride elements.
if (Subtarget.isLittleEndian())
for (int i = 0; i < SaveElts; i++)
ShuffV[i * Stride] = i;
for (int i = 1; i <= SaveElts; i++)
ShuffV[i * Stride - 1] = i - 1;
// Unsigned conversions fill the other slots from a zero vector; signed ones
// leave them undef and sign-extend afterwards.
SDValue ShuffleSrc2 =
SignedConv ? DAG.getUNDEF(WideVT) : DAG.getConstant(0, dl, WideVT);
SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV);
SDValue Extend;
if (SignedConv) {
Arrange = DAG.getBitcast(IntermediateVT, Arrange);
EVT ExtVT = Op.getOperand(0).getValueType();
if (Subtarget.hasP9Altivec())
ExtVT = EVT::getVectorVT(*DAG.getContext(), WideVT.getVectorElementType(),
Extend = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, IntermediateVT, Arrange,
} else
Extend = DAG.getNode(ISD::BITCAST, dl, IntermediateVT, Arrange);
return DAG.getNode(Opc, dl, Op.getValueType(), Extend);
/// Custom lowering entry point for SINT_TO_FP / UINT_TO_FP.
///
/// In order, the code below handles: vector results (delegated to
/// LowerINT_TO_FPVector), f128 results (returned unchanged), QPX v4i1
/// sources, non-f32/f64 results (declined with an empty SDValue), i1 sources
/// (a select between 1.0 and 0.0), the direct-move path, i64 sources, and
/// finally i32 sources.
///
/// NOTE(review): this copy has lost many lines (closing braces, the signed
/// arms of the FCFOp selection, the MachinePointerInfo arguments and
/// `RLI.MPI =` heads around the stack stores, the opcode arms of the LFIW*
/// selections, and the operand of the SIGN_EXTEND). Treat the text as a
/// guide and consult the upstream source for the exact statements.
SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT InVT = Op.getOperand(0).getValueType();
EVT OutVT = Op.getValueType();
if (OutVT.isVector() && OutVT.isFloatingPoint() &&
isOperationCustom(Op.getOpcode(), InVT))
return LowerINT_TO_FPVector(Op, DAG, dl);
// Conversions to f128 are legal.
if (Op.getValueType() == MVT::f128)
return Op;
if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64)
return SDValue();
SDValue Value = Op.getOperand(0);
// The values are now known to be -1 (false) or 1 (true). To convert this
// into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
// The arithmetic was done in v4f64; round if the requested type differs.
if (Op.getValueType() != MVT::v4f64)
Value = DAG.getNode(ISD::FP_ROUND, dl,
Op.getValueType(), Value,
DAG.getIntPtrConstant(1, dl));
return Value;
// Don't handle ppc_fp128 here; let it be lowered to a libcall.
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
return SDValue();
// An i1 source can only yield 1.0 or 0.0, so select between the constants.
if (Op.getOperand(0).getValueType() == MVT::i1)
return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0),
DAG.getConstantFP(1.0, dl, Op.getValueType()),
DAG.getConstantFP(0.0, dl, Op.getValueType()));
// If we have direct moves, we can do all the conversion, skip the store/load
// however, without FPCVT we can't do most conversions.
if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&
Subtarget.isPPC64() && Subtarget.hasFPCVT())
return LowerINT_TO_FPDirectMove(Op, DAG, dl);
assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
"UINT_TO_FP is supported only with FPCVT");
// If we have FCFIDS, then use it when converting to single-precision.
// Otherwise, convert to double-precision and then round.
unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
: (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
? MVT::f32
: MVT::f64;
if (Op.getOperand(0).getValueType() == MVT::i64) {
SDValue SINT = Op.getOperand(0);
// When converting to single-precision, we actually need to convert
// to double-precision first and then round to single-precision.
// To avoid double-rounding effects during that operation, we have
// to prepare the input operand. Bits that might be truncated when
// converting to double-precision are replaced by a bit that won't
// be lost at this stage, but is below the single-precision rounding
// position.
// However, if -enable-unsafe-fp-math is in effect, accept double
// rounding to avoid the extra overhead.
if (Op.getValueType() == MVT::f32 &&
!Subtarget.hasFPCVT() &&
!DAG.getTarget().Options.UnsafeFPMath) {
// Twiddle input to make sure the low 11 bits are zero. (If this
// is the case, we are guaranteed the value will fit into the 53 bit
// mantissa of an IEEE double-precision value without rounding.)
// If any of those low 11 bits were not zero originally, make sure
// bit 12 (value 2048) is set instead, so that the final rounding
// to single-precision gets the correct result.
SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
SINT, DAG.getConstant(2047, dl, MVT::i64));
Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
Round, DAG.getConstant(2047, dl, MVT::i64));
Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
Round = DAG.getNode(ISD::AND, dl, MVT::i64,
Round, DAG.getConstant(-2048, dl, MVT::i64));
// However, we cannot use that value unconditionally: if the magnitude
// of the input value is small, the bit-twiddling we did above might
// end up visibly changing the output. Fortunately, in that case, we
// don't need to twiddle bits since the original input will convert
// exactly to double-precision floating-point already. Therefore,
// construct a conditional to use the original value if the top 11
// bits are all sign-bit copies, and use the rounded value computed
// above otherwise.
// (SINT >> 53) + 1 is 0 or 1 exactly when those top bits are all copies of
// the sign bit, so an unsigned "> 1" test selects the rounded value.
SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
SINT, DAG.getConstant(53, dl, MVT::i32));
Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
Cond, DAG.getConstant(1, dl, MVT::i64));
Cond = DAG.getSetCC(
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);
SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
// Obtain the integer bits as an f64 value (Bits), preferring to re-load
// them from memory when the source already lives there.
ReuseLoadInfo RLI;
SDValue Bits;
MachineFunction &MF = DAG.getMachineFunction();
if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {
// Plain 64-bit case: load the same location as an f64.
Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI,
RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
} else if (Subtarget.hasLFIWAX() &&
canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {
// Sign-extending 32-bit load: re-load the word with LFIWAX.
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = { RLI.Chain, RLI.Ptr };
Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl,
DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
} else if (Subtarget.hasFPCVT() &&
canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) {
// Zero-extending 32-bit load: re-load the word with LFIWZX.
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = { RLI.Chain, RLI.Ptr };
Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl,
DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
} else if (((Subtarget.hasLFIWAX() &&
SINT.getOpcode() == ISD::SIGN_EXTEND) ||
(Subtarget.hasFPCVT() &&
SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
SINT.getOperand(0).getValueType() == MVT::i32) {
// The i64 is an extension of an i32 value: store that i32 to a 4-byte stack
// slot and load it back as a word with the extension folded into the load.
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
int FrameIdx = MFI.CreateStackObject(4, Align(4), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Store =
DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
DAG.getMachineFunction(), FrameIdx));
assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
"Expected an i32 store");
RLI.Ptr = FIdx;
RLI.Chain = Store;
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
RLI.Alignment = Align(4);
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = { RLI.Chain, RLI.Ptr };
Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ?
dl, DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
} else
// Nothing to reuse: reinterpret the i64 bits as an f64.
Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);
// Without FPCVT the conversion produced an f64; round it for f32 results.
if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
FP = DAG.getNode(ISD::FP_ROUND, dl,
MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
return FP;
assert(Op.getOperand(0).getValueType() == MVT::i32 &&
"Unhandled INT_TO_FP type in custom expander!");
// Since we only generate this in 64-bit mode, we can take advantage of
// 64-bit registers. In particular, sign extend the input value into the
// 64-bit register with extsw, store the WHOLE 64-bit value into the stack
// then lfd it and fcfid it.
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT PtrVT = getPointerTy(MF.getDataLayout());
SDValue Ld;
if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
// A word load into an FP register is available: reuse the source's memory
// location if possible, otherwise spill the i32 to a 4-byte stack slot.
ReuseLoadInfo RLI;
bool ReusingLoad;
if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI,
DAG))) {
int FrameIdx = MFI.CreateStackObject(4, Align(4), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Store =
DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
DAG.getMachineFunction(), FrameIdx));
assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
"Expected an i32 store");
RLI.Ptr = FIdx;
RLI.Chain = Store;
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
RLI.Alignment = Align(4);
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
RLI.Alignment, RLI.AAInfo, RLI.Ranges);
SDValue Ops[] = { RLI.Chain, RLI.Ptr };
Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
dl, DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i32, MMO);
// Only a reused load has an existing chain result to splice into.
if (ReusingLoad)
spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);
} else {
assert(Subtarget.isPPC64() &&
"i32->FP without LFIWAX supported only on PPC64");
int FrameIdx = MFI.CreateStackObject(8, Align(8), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64,
// STD the extended value into the stack slot.
SDValue Store = DAG.getStore(
DAG.getEntryNode(), dl, Ext64, FIdx,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
// Load the value as a double.
Ld = DAG.getLoad(
MVT::f64, dl, Store, FIdx,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
// FCFID it and return it.
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
// Without FPCVT the conversion produced an f64; round it for f32 results.
if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
DAG.getIntPtrConstant(0, dl));
return FP;
/// Lower FLT_ROUNDS_: read the FPSCR with MFFS, translate its two rounding
/// mode bits into the numbering FLT_ROUNDS uses, and return the translated
/// value merged with the updated chain.
///
/// \param Op  The FLT_ROUNDS_ node; operand 0 is the incoming chain.
/// \param DAG The DAG the replacement nodes are created in.
/// \returns A merge of {rounding mode as Op's value type, output chain}.
SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc dl(Op);
  /*
   The rounding mode is in bits 30:31 of FPSCR, and has the following
   settings:
     00 Round to nearest
     01 Round to 0
     10 Round to +inf
     11 Round to -inf

   FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

   To perform the conversion, we do:
     ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
  */

  MachineFunction &MF = DAG.getMachineFunction();
  EVT VT = Op.getValueType();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Save FP Control Word to register
  SDValue Chain = Op.getOperand(0);
  SDValue MFFS = DAG.getNode(PPCISD::MFFS, dl, {MVT::f64, MVT::Other}, Chain);
  Chain = MFFS.getValue(1);

  // Save FP register to stack slot
  int SSFI = MF.getFrameInfo().CreateStackObject(8, Align(8), false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  Chain = DAG.getStore(Chain, dl, MFFS, StackSlot, MachinePointerInfo());

  // Load FP Control Word from low 32 bits of stack slot. The low-order word
  // of the stored doubleword sits at byte offset 4 on big-endian subtargets
  // but at offset 0 on little-endian ones.
  const unsigned LowWordOffset = Subtarget.isLittleEndian() ? 0 : 4;
  SDValue Offset = DAG.getConstant(LowWordOffset, dl, PtrVT);
  SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Offset);
  SDValue CWD = DAG.getLoad(MVT::i32, dl, Chain, Addr, MachinePointerInfo());
  Chain = CWD.getValue(1);

  // Transform as necessary
  SDValue CWD1 =
    DAG.getNode(ISD::AND, dl, MVT::i32,
                CWD, DAG.getConstant(3, dl, MVT::i32));
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, dl, MVT::i32,
                DAG.getNode(ISD::AND, dl, MVT::i32,
                            DAG.getNode(ISD::XOR, dl, MVT::i32,
                                        CWD, DAG.getConstant(3, dl, MVT::i32)),
                            DAG.getConstant(3, dl, MVT::i32)),
                DAG.getConstant(1, dl, MVT::i32));

  SDValue RetVal =
    DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);

  // The value was computed as i32; zero-extend or truncate it to whatever
  // integer width the node was asked to produce (a no-op for i32).
  RetVal = DAG.getZExtOrTrunc(RetVal, dl, VT);

  return DAG.getMergeValues({RetVal, Chain}, dl);
}
/// Expand a double-width left shift whose value is supplied as two parts.
/// Operand 0 is the low part, operand 1 the high part and operand 2 the shift
/// amount; the result is the merged pair {low, high}.
SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
  const EVT PartVT = Op.getValueType();
  const unsigned PartBits = PartVT.getSizeInBits();
  SDLoc Loc(Op);
  assert(Op.getNumOperands() == 3 &&
         PartVT == Op.getOperand(1).getValueType() &&
         "Unexpected SHL!");

  // The expansion is pure logical ops and relies on how the PPC shift nodes
  // treat shift amounts that are not smaller than the register width.
  SDValue LoIn = Op.getOperand(0);
  SDValue HiIn = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  const EVT ShAmtVT = ShAmt.getValueType();

  // High part, first contribution: (Hi << Amt) | (Lo >> (Width - Amt)).
  SDValue WidthC = DAG.getConstant(PartBits, Loc, ShAmtVT);
  SDValue WidthMinusAmt = DAG.getNode(ISD::SUB, Loc, ShAmtVT, WidthC, ShAmt);
  SDValue HiShifted = DAG.getNode(PPCISD::SHL, Loc, PartVT, HiIn, ShAmt);
  SDValue LoCarry = DAG.getNode(PPCISD::SRL, Loc, PartVT, LoIn, WidthMinusAmt);
  SDValue HiNear = DAG.getNode(ISD::OR, Loc, PartVT, HiShifted, LoCarry);

  // High part, second contribution: Lo << (Amt - Width).
  SDValue NegWidthC = DAG.getConstant(-PartBits, Loc, ShAmtVT);
  SDValue AmtMinusWidth = DAG.getNode(ISD::ADD, Loc, ShAmtVT, ShAmt, NegWidthC);
  SDValue HiFar = DAG.getNode(PPCISD::SHL, Loc, PartVT, LoIn, AmtMinusWidth);

  SDValue ResultHi = DAG.getNode(ISD::OR, Loc, PartVT, HiNear, HiFar);
  SDValue ResultLo = DAG.getNode(PPCISD::SHL, Loc, PartVT, LoIn, ShAmt);
  return DAG.getMergeValues({ResultLo, ResultHi}, Loc);
}
/// Expand a double-width logical right shift whose value is supplied as two
/// parts. Operand 0 is the low part, operand 1 the high part and operand 2
/// the shift amount; the result is the merged pair {low, high}.
SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
  const EVT PartVT = Op.getValueType();
  SDLoc Loc(Op);
  const unsigned PartBits = PartVT.getSizeInBits();
  assert(Op.getNumOperands() == 3 &&
         PartVT == Op.getOperand(1).getValueType() &&
         "Unexpected SRL!");

  // The expansion is pure logical ops and relies on how the PPC shift nodes
  // treat shift amounts that are not smaller than the register width.
  SDValue LoIn = Op.getOperand(0);
  SDValue HiIn = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  const EVT ShAmtVT = ShAmt.getValueType();

  // Low part, first contribution: (Lo >> Amt) | (Hi << (Width - Amt)).
  SDValue WidthC = DAG.getConstant(PartBits, Loc, ShAmtVT);
  SDValue WidthMinusAmt = DAG.getNode(ISD::SUB, Loc, ShAmtVT, WidthC, ShAmt);
  SDValue LoShifted = DAG.getNode(PPCISD::SRL, Loc, PartVT, LoIn, ShAmt);
  SDValue HiCarry = DAG.getNode(PPCISD::SHL, Loc, PartVT, HiIn, WidthMinusAmt);
  SDValue LoNear = DAG.getNode(ISD::OR, Loc, PartVT, LoShifted, HiCarry);

  // Low part, second contribution: Hi >> (Amt - Width).
  SDValue NegWidthC = DAG.getConstant(-PartBits, Loc, ShAmtVT);
  SDValue AmtMinusWidth = DAG.getNode(ISD::ADD, Loc, ShAmtVT, ShAmt, NegWidthC);
  SDValue LoFar = DAG.getNode(PPCISD::SRL, Loc, PartVT, HiIn, AmtMinusWidth);

  SDValue ResultLo = DAG.getNode(ISD::OR, Loc, PartVT, LoNear, LoFar);
  SDValue ResultHi = DAG.getNode(PPCISD::SRL, Loc, PartVT, HiIn, ShAmt);
  return DAG.getMergeValues({ResultLo, ResultHi}, Loc);
}
/// Expand a double-width arithmetic right shift whose value is supplied as two
/// parts. Operand 0 is the low part, operand 1 the high part and operand 2
/// the shift amount; the result is the merged pair {low, high}.
SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  const EVT PartVT = Op.getValueType();
  const unsigned PartBits = PartVT.getSizeInBits();
  assert(Op.getNumOperands() == 3 &&
         PartVT == Op.getOperand(1).getValueType() &&
         "Unexpected SRA!");

  // Unlike the logical shifts, the low result needs a select_cc after the
  // logical ops to choose between its two candidates.
  SDValue LoIn = Op.getOperand(0);
  SDValue HiIn = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  const EVT ShAmtVT = ShAmt.getValueType();

  // Candidate for the low part: (Lo >> Amt) | (Hi << (Width - Amt)).
  SDValue WidthC = DAG.getConstant(PartBits, Loc, ShAmtVT);
  SDValue WidthMinusAmt = DAG.getNode(ISD::SUB, Loc, ShAmtVT, WidthC, ShAmt);
  SDValue LoShifted = DAG.getNode(PPCISD::SRL, Loc, PartVT, LoIn, ShAmt);
  SDValue HiCarry = DAG.getNode(PPCISD::SHL, Loc, PartVT, HiIn, WidthMinusAmt);
  SDValue LoNear = DAG.getNode(ISD::OR, Loc, PartVT, LoShifted, HiCarry);

  // Other candidate for the low part: Hi >>a (Amt - Width).
  SDValue NegWidthC = DAG.getConstant(-PartBits, Loc, ShAmtVT);
  SDValue AmtMinusWidth = DAG.getNode(ISD::ADD, Loc, ShAmtVT, ShAmt, NegWidthC);
  SDValue LoFar = DAG.getNode(PPCISD::SRA, Loc, PartVT, HiIn, AmtMinusWidth);

  SDValue ResultHi = DAG.getNode(PPCISD::SRA, Loc, PartVT, HiIn, ShAmt);

  // (Amt - Width) <= 0 picks the first candidate, otherwise the second.
  SDValue ZeroC = DAG.getConstant(0, Loc, ShAmtVT);
  SDValue ResultLo =
      DAG.getSelectCC(Loc, AmtMinusWidth, ZeroC, LoNear, LoFar, ISD::SETLE);
  return DAG.getMergeValues({ResultLo, ResultHi}, Loc);
}
// Vector related lowering.
/// getCanonicalConstSplat - Build a canonical splat immediate of Val with an
/// element size of SplatSize. Cast the result to VT.
///
/// \param Val       The value to splat.
/// \param SplatSize Element size in bytes; it indexes the table of canonical
///                  vector types below, so only 1, 2 and 4 are meaningful.
/// \param VT        Requested result type, or MVT::Other to get the canonical
///                  type for SplatSize.
/// \returns The constant splat, bitcast to the requested type.
static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT,
                                      SelectionDAG &DAG, const SDLoc &dl) {
  assert((SplatSize == 1 || SplatSize == 2 || SplatSize == 4) &&
         "Unexpected splat element size");

  static const MVT VTys[] = { // canonical VT to use for each size.
    MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
  };

  EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];

  // For a splat with all ones, turn it to vspltisb 0xFF to canonicalize.
  // The mask is built from a 64-bit one: with a 32-bit `unsigned long` (for
  // example on LLP64 hosts) a shift by 32 for SplatSize == 4 is undefined.
  if (Val == ((1ULL << (SplatSize * 8)) - 1)) {
    SplatSize = 1;
    Val = 0xFF;
  }

  EVT CanonicalVT = VTys[SplatSize-1];

  // Build a canonical splat for this value.
  return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT));
}
/// BuildIntrinsicOp (unary form) - Create an INTRINSIC_WO_CHAIN node that
/// applies intrinsic \p IID to \p Op. The node's type is \p DestVT, or the
/// type of \p Op when \p DestVT is left as MVT::Other.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
                                const SDLoc &dl, EVT DestVT = MVT::Other) {
  const EVT ResultVT = (DestVT == MVT::Other) ? Op.getValueType() : DestVT;
  // The intrinsic ID travels as the node's first operand, as an i32 constant.
  SDValue IntrinsicID = DAG.getConstant(IID, dl, MVT::i32);
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResultVT, IntrinsicID, Op);
}
/// BuildIntrinsicOp (binary form) - Create an INTRINSIC_WO_CHAIN node that
/// applies intrinsic \p IID to \p LHS and \p RHS. The node's type is
/// \p DestVT, or the type of \p LHS when \p DestVT is left as MVT::Other.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
                                SelectionDAG &DAG, const SDLoc &dl,
                                EVT DestVT = MVT::Other) {
  const EVT ResultVT = (DestVT == MVT::Other) ? LHS.getValueType() : DestVT;
  // The intrinsic ID travels as the node's first operand, as an i32 constant.
  SDValue IntrinsicID = DAG.getConstant(IID, dl, MVT::i32);
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResultVT, IntrinsicID, LHS,
                     RHS);
}
/// BuildIntrinsicOp (ternary form) - Create an INTRINSIC_WO_CHAIN node that
/// applies intrinsic \p IID to \p Op0, \p Op1 and \p Op2. The node's type is
/// \p DestVT, or the type of \p Op0 when \p DestVT is left as MVT::Other.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
                                SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
                                EVT DestVT = MVT::Other) {
  const EVT ResultVT = (DestVT == MVT::Other) ? Op0.getValueType() : DestVT;
  // The intrinsic ID travels as the node's first operand, as an i32 constant.
  SDValue IntrinsicID = DAG.getConstant(IID, dl, MVT::i32);
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResultVT, IntrinsicID, Op0,
                     Op1, Op2);
}
/// BuildVSLDOI - Build the VECTOR_SHUFFLE that corresponds to a vsldoi by
/// \p Amt bytes of \p LHS and \p RHS, and return it bitcast to \p VT.
static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
                           SelectionDAG &DAG, const SDLoc &dl) {
  // The shuffle works on bytes, so view both inputs as v16i8.
  SDValue BytesA = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
  SDValue BytesB = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);

  // Result byte k comes from byte (k + Amt) of the concatenated inputs.
  int ByteMask[16];
  unsigned SourceByte = Amt;
  for (int &Slot : ByteMask)
    Slot = SourceByte++;

  SDValue Shuffled =
      DAG.getVectorShuffle(MVT::v16i8, dl, BytesA, BytesB, ByteMask);
  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffled);
}
/// Decide whether a BUILD_VECTOR node should be kept as a BUILD_VECTOR (true)
/// or expanded (false).
/// \param V - pointer to the BuildVectorSDNode being matched
/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?
/// \param HasP8Vector - does this subtarget have P8 vector support? (it gates
///        the v4f32 case)
///
/// Expansion is chosen when the vector type is not one of the supported ones,
/// when the node is built from constants, when any operand is undef, or when
/// the node is a "load-and-splat". Every other node is kept.
static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
                                            bool HasDirectMove,
                                            bool HasP8Vector) {
  const EVT VecVT = V->getValueType(0);
  const bool IsP8FloatVT = HasP8Vector && VecVT == MVT::v4f32;
  const bool IsDirectMoveIntVT =
      HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32);
  if (VecVT != MVT::v2f64 && !IsP8FloatVT && !IsDirectMoveIntVT)
    return false;

  // The caller has already ruled out constant splats, so a constant node at
  // this point is made of differing constants.
  if (V->isConstant())
    return false;

  // An element counts as "loaded" if it is a load, or a floating point
  // truncation / conversion to int applied directly to a load.
  auto ComesFromLoad = [](SDValue Elt) {
    const unsigned Opc = Elt.getOpcode();
    if (Opc == ISD::LOAD)
      return true;
    if (Opc != ISD::FP_ROUND && Opc != ISD::FP_TO_SINT &&
        Opc != ISD::FP_TO_UINT)
      return false;
    return Elt.getOperand(0).getOpcode() == ISD::LOAD;
  };

  const SDValue FirstElt = V->getOperand(0);
  bool SawLoad = false;
  bool AllSame = true;
  for (unsigned Idx = 0, NumElts = V->getNumOperands(); Idx != NumElts; ++Idx) {
    const SDValue Elt = V->getOperand(Idx);
    if (Elt.isUndef())
      return false;

    // Once a loaded element has been seen the flag stays set.
    SawLoad = SawLoad || ComesFromLoad(Elt);

    // Differing operands, or a non-load input with users besides this node,
    // mean the node is not a splat.
    const bool SharedNonLoad = !SawLoad && !V->isOnlyUserOf(Elt.getNode());
    if (Elt != FirstElt || SharedNonLoad)
      AllSame = false;
  }

  // Only a load-and-splat is better off expanded.
  return !AllSame || !SawLoad;
}
// Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128.
// Any other shape (result type not f128, operand not a BUILD_PAIR, or either
// half of the pair not i64) yields an empty SDValue.
SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Op0 = Op->getOperand(0);
// Bail out unless the node matches the exact pattern described above.
if ((Op.getValueType() != MVT::f128) ||
(Op0.getOpcode() != ISD::BUILD_PAIR) ||
(Op0.getOperand(0).getValueType() != MVT::i64) ||
(Op0.getOperand(1).getValueType() != MVT::i64))
return SDValue();
// NOTE(review): the statement below is cut off in this listing (its trailing
// operand(s) and closing parenthesis are not shown) - confirm against the
// upstream file.
return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Op0.getOperand(0),
// Look through an optional BITCAST and then an optional SCALAR_TO_VECTOR /
// SCALAR_TO_VECTOR_PERMUTED wrapper to find the load feeding Op.
// Returns a pointer to the load's SDValue when ISD::isNormalLoad() accepts it,
// nullptr otherwise. In the new ('+') version, IsPermuted is written only when
// a scalar-to-vector wrapper is peeled: true for the PERMUTED form, false for
// the plain one. It is left untouched otherwise, so callers pre-initialize it.
-static const SDValue *getNormalLoadInput(const SDValue &Op) {
+static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {
const SDValue *InputLoad = &Op;
// Step 1: skip a bitcast, if present.
if (InputLoad->getOpcode() == ISD::BITCAST)
InputLoad = &InputLoad->getOperand(0);
// Step 2: skip a scalar-to-vector wrapper, recording whether it was the
// permuted variant.
if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
- InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED)
+ InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {
+ IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;
InputLoad = &InputLoad->getOperand(0);
+ }
// Step 3: whatever remains must be a load, and a "normal" one at that.
if (InputLoad->getOpcode() != ISD::LOAD)
return nullptr;
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
return ISD::isNormalLoad(LD) ? InputLoad : nullptr;
// Convert the argument APFloat to a single precision APFloat if there is no
// loss in information during the conversion to single precision APFloat and the
// resulting number is not a denormal number. Return true if successful.
// On failure ArgAPFloat is left unmodified: the conversion is performed on a
// copy and only written back on success.
bool llvm::convertToNonDenormSingle(APFloat &ArgAPFloat) {
// Work on a copy so the caller's value survives a failed conversion.
APFloat APFloatToConvert = ArgAPFloat;
bool LosesInfo = true;
// NOTE(review): the call below is cut off in this listing (its last argument
// and closing parenthesis are not shown); presumably it passes &LosesInfo -
// confirm against the upstream file.
APFloatToConvert.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
// Accept only an exact conversion whose result is not a denormal.
bool Success = (!LosesInfo && !APFloatToConvert.isDenormal());
if (Success)
ArgAPFloat = APFloatToConvert;
return Success;
// Bitcast the argument APInt to a double and convert it to a single precision
// APFloat, bitcast the APFloat to an APInt and assign it to the original
// argument if there is no loss in information during the conversion from
// double to single precision APFloat and the resulting number is not a denormal
// number. Return true if successful.
// On failure ArgAPInt is left unmodified.
bool llvm::convertToNonDenormSingle(APInt &ArgAPInt) {
// Reinterpret the integer's bits as a double.
double DpValue = ArgAPInt.bitsToDouble();
APFloat APFloatDp(DpValue);
// Delegate the actual narrowing (and its checks) to the APFloat overload.
bool Success = convertToNonDenormSingle(APFloatDp);
// On success APFloatDp holds the narrowed value; store its bit pattern back.
if (Success)
ArgAPInt = APFloatDp.bitcastToAPInt();
return Success;
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
// this case more efficiently than a constant pool load, lower it to the
// sequence of ops that should be used.
//
// Rough order of the strategies tried below:
//   1. QPX v4i1 vectors (constant-pool load, or store/reload + compare).
//   2. Constant v2f64 splats that shrink to a non-denormal single (prefixed
//      instructions only).
//   3. Non-constant-splat / wide-splat vectors: load-and-splat, or keep the
//      BUILD_VECTOR when an efficient VSX pattern exists.
//   4. Constant splats of up to 32 bits: a cascade of one-, two- and
//      three-instruction sequences built around vsplti*.
// NOTE(review): this listing appears to have lost a number of short lines
// (closing braces, 'else', 'break', continuation lines), so several statements
// below are visibly cut off - confirm details against the upstream file.
SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) {
// We first build an i32 vector, load it into a QPX register,
// then convert it to a floating-point vector and compare it
// to a zero vector to get the boolean result.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
// 16-byte, 16-byte-aligned stack slot used as the staging area.
int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
assert(BVN->getNumOperands() == 4 &&
"BUILD_VECTOR for v4i1 does not have 4 operands");
// Determine whether every non-undef operand is a constant.
bool IsConst = true;
for (unsigned i = 0; i < 4; ++i) {
if (BVN->getOperand(i).isUndef()) continue;
if (!isa<ConstantSDNode>(BVN->getOperand(i))) {
IsConst = false;
if (IsConst) {
// All-constant case: materialize a float constant-pool vector built from
// the One / NegOne values below (undef lanes stay undef).
Constant *One =
ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0);
Constant *NegOne =
ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0);
Constant *CV[4];
for (unsigned i = 0; i < 4; ++i) {
if (BVN->getOperand(i).isUndef())
CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext()));
else if (isNullConstant(BVN->getOperand(i)))
CV[i] = NegOne;
CV[i] = One;
Constant *CP = ConstantVector::get(CV);
SDValue CPIdx =
DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), Align(16));
SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other});
return DAG.getMemIntrinsicNode(
PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32,
// Non-constant case: store each defined element as an i32 at offset 4*i of
// the stack slot (truncating or any-extending as needed).
SmallVector<SDValue, 4> Stores;
for (unsigned i = 0; i < 4; ++i) {
if (BVN->getOperand(i).isUndef()) continue;
unsigned Offset = 4*i;
SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize();
if (StoreSize > 4) {
DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx,
PtrInfo.getWithOffset(Offset), MVT::i32));
} else {
SDValue StoreValue = BVN->getOperand(i);
if (StoreSize < 4)
StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue);
Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx,
// Join the individual stores into one chain for the reload to depend on.
SDValue StoreChain;
if (!Stores.empty())
StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
StoreChain = DAG.getEntryNode();
// Now load from v4i32 into the QPX register; this will extend it to
// v4i64 but not yet convert it to a floating point. Nevertheless, this
// is typed as v4f64 because the QPX register integer states are not
// explicitly represented.
SDValue Ops[] = {StoreChain,
DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32),
SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other});
SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN,
dl, VTs, Ops, MVT::v4i32, PtrInfo);
LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32),
// The v4i1 result is the lane-wise SETEQ compare of the converted vector
// against zero.
SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64);
return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ);
// All other QPX vectors are handled by generic code.
if (Subtarget.hasQPX())
return SDValue();
// Check if this is a splat of a constant value.
APInt APSplatBits, APSplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
bool BVNIsConstantSplat =
BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
HasAnyUndefs, 0, !Subtarget.isLittleEndian());
// If it is a splat of a double, check if we can shrink it to a 32 bit
// non-denormal float which when converted back to double gives us the same
// double. This is to exploit the XXSPLTIDP instruction.
// Note: convertToNonDenormSingle() rewrites APSplatBits in place on success.
if (BVNIsConstantSplat && Subtarget.hasPrefixInstrs() &&
(SplatBitSize == 64) && (Op->getValueType(0) == MVT::v2f64) &&
convertToNonDenormSingle(APSplatBits)) {
SDValue SplatNode = DAG.getNode(
DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
return DAG.getBitcast(Op.getValueType(), SplatNode);
if (!BVNIsConstantSplat || SplatBitSize > 32) {
- const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
+ bool IsPermutedLoad = false;
+ const SDValue *InputLoad =
+ getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);
// Handle load-and-splat patterns as we have instructions that will do this
// in one go.
if (InputLoad && DAG.isSplatValue(Op, true)) {
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
// We have handling for 4 and 8 byte elements.
unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits();
// Checking for a single use of this load, we have to check for vector
// width (128 bits) / ElementSize uses (since each operand of the
// BUILD_VECTOR is a separate use of the value.
if (InputLoad->getNode()->hasNUsesOfValue(128 / ElementSize, 0) &&
((Subtarget.hasVSX() && ElementSize == 64) ||
(Subtarget.hasP9Vector() && ElementSize == 32))) {
SDValue Ops[] = {
LD->getChain(), // Chain
LD->getBasePtr(), // Ptr
DAG.getValueType(Op.getValueType()) // VT
DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl,
DAG.getVTList(Op.getValueType(), MVT::Other),
Ops, LD->getMemoryVT(), LD->getMemOperand());
// BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
// lowered to VSX instructions under certain conditions.
// Without VSX, there is no pattern more efficient than expanding the node.
if (Subtarget.hasVSX() &&
haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(),
return Op;
return SDValue();
// From here on the node is a constant splat of at most 32 bits.
uint64_t SplatBits = APSplatBits.getZExtValue();
uint64_t SplatUndef = APSplatUndef.getZExtValue();
unsigned SplatSize = SplatBitSize / 8;
// First, handle single instruction cases.
// All zeros?
if (SplatBits == 0) {
// Canonicalize all zero vectors to be v4i32.
if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
SDValue Z = DAG.getConstant(0, dl, MVT::v4i32);
Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
return Op;
// We have XXSPLTIW for constant splats four bytes wide.
// Given vector length is a multiple of 4, 2-byte splats can be replaced
// with 4-byte splats. We replicate the SplatBits in case of 2-byte splat to
// make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be
// turned into a 4-byte splat of 0xABABABAB.
// Note: the compound assignment below also updates SplatBits itself.
if (Subtarget.hasPrefixInstrs() && SplatSize == 2)
return getCanonicalConstSplat((SplatBits |= SplatBits << 16), SplatSize * 2,
Op.getValueType(), DAG, dl);
if (Subtarget.hasPrefixInstrs() && SplatSize == 4)
return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
// We have XXSPLTIB for constant splats one byte wide.
if (Subtarget.hasP9Vector() && SplatSize == 1)
return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
// If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
if (SextVal >= -16 && SextVal <= 15)
return getCanonicalConstSplat(SextVal, SplatSize, Op.getValueType(), DAG,
// Two instruction sequences.
// If this value is in the range [-32,30] and is even, use:
// VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
// If this value is in the range [17,31] and is odd, use:
// VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
// If this value is in the range [-31,-17] and is odd, use:
// VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
// Note the last two are three-instruction sequences.
if (SextVal >= -32 && SextVal <= 31) {
// To avoid having these optimizations undone by constant folding,
// we convert to a pseudo that will be expanded later into one of
// the above forms.
SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32);
// Pick the vector type whose element width matches the splat size.
EVT VT = (SplatSize == 1 ? MVT::v16i8 :
(SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32);
SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
if (VT == Op.getValueType())
return RetVal;
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
// If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is
// 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important
// for fneg/fabs.
if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
// Make -1 and vspltisw -1:
SDValue OnesV = getCanonicalConstSplat(-1, 4, MVT::v4i32, DAG, dl);
// Make the VSLW intrinsic, computing 0x8000_0000.
SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
OnesV, DAG, dl);
// xor by OnesV to invert it.
Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
// Check to see if this is a wide variety of vsplti*, binop self cases.
static const signed char SplatCsts[] = {
-1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
-8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
// Indirect through the SplatCsts array so that we favor 'vsplti -1' for
// cases which are ambiguous (e.g. formation of 0x8000_0000). 'vsplti -1'
int i = SplatCsts[idx];
// Figure out what shift amount will be used by altivec if shifted by i in
// this splat size.
unsigned TypeShiftAmt = i & (SplatBitSize-1);
// In each case below, IIDs is indexed by SplatSize-1 (element size in bytes
// minus one); index 2 is a 0 placeholder.
// vsplti + shl self.
if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
// vsplti + srl self.
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
// vsplti + sra self.
// NOTE(review): this condition is textually identical to the "srl self" test
// just above (an unsigned, i.e. logical, shift). If that branch returns on a
// match, this one looks unreachable - confirm whether an arithmetic shift of
// i was intended here.
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
// vsplti + rol self.
if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
// For the three vsldoi cases below the shift amount is mirrored on
// little-endian subtargets (16 - amount).
// t = vsplti c, result = vsldoi t, t, 1
if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
// t = vsplti c, result = vsldoi t, t, 2
if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
// t = vsplti c, result = vsldoi t, t, 3
if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
// Nothing matched: let the default expansion handle the node.
return SDValue();
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
///
/// \param PFEntry - packed table entry: bits [29:26] hold the operation
///                  number, bits [25:13] the table index for the left
///                  sub-shuffle and bits [12:0] the table index for the right
///                  sub-shuffle.
/// \param LHS, RHS - the original shuffle inputs; an OP_COPY entry returns one
///                   of them unchanged.
/// NOTE(review): in this listing the enum body, the OpLHS/OpRHS declarations
/// and most 'case' labels of the switch are not shown - confirm against the
/// upstream file.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
const SDLoc &dl) {
// Unpack the three fields of the table entry.
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
enum {
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
if (OpNum == OP_COPY) {
// The IDs compared here are base-9 encodings of the masks <0,1,2,3> (LHS)
// and <4,5,6,7> (RHS).
if (LHSID == (1*9+2)*9+3) return LHS;
assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
return RHS;
// Recursively build both sub-shuffles from their own table entries.
OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
// Byte-granular mask for the final v16i8 shuffle; indices 16..31 appear in
// the first two patterns below (presumably selecting from OpRHS - TODO
// confirm).
int ShufIdxs[16];
switch (OpNum) {
default: llvm_unreachable("Unknown i32 permute!");
ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3;
ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7;
ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
// The four loops below each repeat one 4-byte group (bytes 0-3, 4-7, 8-11,
// 12-15 respectively) across the whole mask.
for (unsigned i = 0; i != 16; ++i)
ShufIdxs[i] = (i&3)+0;
for (unsigned i = 0; i != 16; ++i)
ShufIdxs[i] = (i&3)+4;
for (unsigned i = 0; i != 16; ++i)
ShufIdxs[i] = (i&3)+8;
for (unsigned i = 0; i != 16; ++i)
ShufIdxs[i] = (i&3)+12;
// The vsldoi variants are delegated to BuildVSLDOI with byte amounts 4, 8, 12.
case OP_VSLDOI4:
return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
case OP_VSLDOI8:
return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
case OP_VSLDOI12:
return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
// Emit the shuffle on v16i8 and bitcast back to the sub-shuffle's type.
EVT VT = OpLHS.getValueType();
OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);
OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);
SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
return DAG.getNode(ISD::BITCAST, dl, VT, T);
/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
/// SDValue.
///
/// The shuffle qualifies when exactly one byte of the mask comes from "the
/// other" vector while all remaining bytes are in their original order. On
/// success the result is a PPCISD::VECINSERT on v16i8, preceded by a
/// PPCISD::VECSHL when the source byte must first be moved into position.
SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
SelectionDAG &DAG) const {
const unsigned BytesInVector = 16;
bool IsLE = Subtarget.isLittleEndian();
SDLoc dl(N);
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
unsigned ShiftElts = 0, InsertAtByte = 0;
bool Swap = false;
// Shifts required to get the byte we want at element 7.
// Both tables are indexed by the source byte's index within its vector.
unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
0, 15, 14, 13, 12, 11, 10, 9};
unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
1, 2, 3, 4, 5, 6, 7, 8};
ArrayRef<int> Mask = N->getMask();
int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
// For each mask element, find out if we're just inserting something
// from V2 into V1 or vice versa.
// Possible permutations inserting an element from V2 into V1:
// X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
// 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
// ...
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
// Inserting from V1 into V2 will be similar, except mask range will be
// [16,31].
bool FoundCandidate = false;
// If both vector operands for the shuffle are the same vector, the mask
// will contain only elements from the first one and the second one will be
// undef.
// In that case the only acceptable source byte is the one already sitting in
// the insert-source position: element 8 on little endian, 7 on big endian.
unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
// Go through the mask of bytes to find an element that's being moved
// from one vector to the other.
for (unsigned i = 0; i < BytesInVector; ++i) {
unsigned CurrentElement = Mask[i];
// If 2nd operand is undefined, we should only look for element
// VINSERTBSrcElem in the Mask.
if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
bool OtherElementsInOrder = true;
// Examine the other elements in the Mask to see if they're in original
// order.
for (unsigned j = 0; j < BytesInVector; ++j) {
if (j == i)
// If CurrentElement is from V1 [0,15], then we expect the rest of the Mask
// to be from V2 [16,31] and vice versa. Unless the 2nd operand is undefined,
// in which case we assume we're always picking from the 1st operand.
int MaskOffset =
(!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
if (Mask[j] != OriginalOrder[j] + MaskOffset) {
OtherElementsInOrder = false;
// If other elements are in original order, we record the number of shifts
// we need to get the element we want into element 7. Also record which byte
// in the vector we should insert into.
if (OtherElementsInOrder) {
// If 2nd operand is undefined, we assume no shifts and no swapping.
if (V2.isUndef()) {
ShiftElts = 0;
Swap = false;
} else {
// Only need the last 4-bits for shifts because operands will be swapped if CurrentElement is >= 2^4.
ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
: BigEndianShifts[CurrentElement & 0xF];
// Swap when the inserted byte comes from V1, i.e. V2 supplies the other 15.
Swap = CurrentElement < BytesInVector;
// Byte positions are counted from the opposite end on little endian.
InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
FoundCandidate = true;
if (!FoundCandidate)
return SDValue();
// Candidate found, construct the proper SDAG sequence with VINSERTB,
// optionally with VECSHL if shift is required.
if (Swap)
std::swap(V1, V2);
// Single-input shuffle: insert from V1 into itself.
if (V2.isUndef())
V2 = V1;
if (ShiftElts) {
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
/// SDValue.
///
/// Half-word analogue of the byte insert: the byte mask is first reduced to
/// eight 4-bit half-word indices packed into one 32-bit word, which is then
/// compared against the identity orders 0..7 / 8..15 with one nibble masked
/// out. On success the result is a PPCISD::VECINSERT on v8i16 (optionally fed
/// by a PPCISD::VECSHL), bitcast back to v16i8.
SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
SelectionDAG &DAG) const {
const unsigned NumHalfWords = 8;
const unsigned BytesInVector = NumHalfWords * 2;
// Check that the shuffle is on half-words.
if (!isNByteElemShuffleMask(N, 2, 1))
return SDValue();
bool IsLE = Subtarget.isLittleEndian();
SDLoc dl(N);
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
unsigned ShiftElts = 0, InsertAtByte = 0;
bool Swap = false;
// Shifts required to get the half-word we want at element 3.
// Both tables are indexed by the source half-word's index within its vector.
unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};
uint32_t Mask = 0;
// Identity orders, one nibble per half-word, most significant nibble first:
// 0x01234567 selects half-words 0..7, 0x89ABCDEF selects half-words 8..15.
uint32_t OriginalOrderLow = 0x1234567;
uint32_t OriginalOrderHigh = 0x89ABCDEF;
// Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a
// 32-bit space, only need 4-bit nibbles per element.
for (unsigned i = 0; i < NumHalfWords; ++i) {
unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
// Byte index / 2 gives the half-word index.
Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift);
// For each mask element, find out if we're just inserting something
// from V2 into V1 or vice versa. Possible permutations inserting an element
// from V2 into V1:
// X, 1, 2, 3, 4, 5, 6, 7
// 0, X, 2, 3, 4, 5, 6, 7
// 0, 1, X, 3, 4, 5, 6, 7
// 0, 1, 2, X, 4, 5, 6, 7
// 0, 1, 2, 3, X, 5, 6, 7
// 0, 1, 2, 3, 4, X, 6, 7
// 0, 1, 2, 3, 4, 5, X, 7
// 0, 1, 2, 3, 4, 5, 6, X
// Inserting from V1 into V2 will be similar, except mask range will be [8,15].
bool FoundCandidate = false;
// Go through the mask of half-words to find an element that's being moved
// from one vector to the other.
for (unsigned i = 0; i < NumHalfWords; ++i) {
unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
// MaskOneElt is the candidate nibble; MaskOtherElts keeps every other nibble.
uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
uint32_t MaskOtherElts = ~(0xF << MaskShift);
uint32_t TargetOrder = 0x0;
// If both vector operands for the shuffle are the same vector, the mask
// will contain only elements from the first one and the second one will be
// undef.
if (V2.isUndef()) {
ShiftElts = 0;
// With a single input only the half-word already in the insert-source
// position qualifies: element 4 on little endian, 3 on big endian.
unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
TargetOrder = OriginalOrderLow;
Swap = false;
// Skip if not the correct element or mask of other elements don't equal
// to our expected order.
if (MaskOneElt == VINSERTHSrcElem &&
(Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
FoundCandidate = true;
} else { // If both operands are defined.
// Target order is [8,15] if the current mask is between [0,7].
TargetOrder =
(MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
// Skip if mask of other elements don't equal our expected order.
if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
// We only need the last 3 bits for the number of shifts.
ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
: BigEndianShifts[MaskOneElt & 0x7];
InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
// Swap when the inserted half-word comes from V1.
Swap = MaskOneElt < NumHalfWords;
FoundCandidate = true;
if (!FoundCandidate)
return SDValue();
// Candidate found, construct the proper SDAG sequence with VINSERTH,
// optionally with VECSHL if shift is required.
if (Swap)
std::swap(V1, V2);
// Single-input shuffle: insert from V1 into itself.
if (V2.isUndef())
V2 = V1;
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
if (ShiftElts) {
// Double ShiftElts because we're left shifting on v16i8 type.
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
DAG.getConstant(2 * ShiftElts, dl, MVT::i32));
SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl);
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
// No shift needed: insert straight from V2.
SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
/// return the default SDValue.
///
/// Requirements checked below: a 4-byte-element shuffle, one input that is a
/// BUILD_VECTOR constant splat of at most 32 bits, and a mask that keeps
/// either the even or the odd words of the other input.
/// NOTE(review): 'DL' is used below but its declaration is not visible in this
/// listing (presumably an SDLoc built from SVN) - confirm against upstream.
SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG) const {
// The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles
// to v16i8. Peek through the bitcasts to get the actual operands.
SDValue LHS = peekThroughBitcasts(SVN->getOperand(0));
SDValue RHS = peekThroughBitcasts(SVN->getOperand(1));
auto ShuffleMask = SVN->getMask();
SDValue VecShuffle(SVN, 0);
// Check that we have a four byte shuffle.
if (!isNByteElemShuffleMask(SVN, 4, 1))
return SDValue();
// Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx.
// The mask is re-read from the commuted shuffle so it matches the swapped
// operands.
if (RHS->getOpcode() != ISD::BUILD_VECTOR) {
std::swap(LHS, RHS);
VecShuffle = DAG.getCommutedVectorShuffle(*SVN);
ShuffleMask = cast<ShuffleVectorSDNode>(VecShuffle)->getMask();
// Ensure that the RHS is a vector of constants.
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
if (!BVN)
return SDValue();
// Check if RHS is a splat of 4-bytes (or smaller).
APInt APSplatValue, APSplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (!BVN->isConstantSplat(APSplatValue, APSplatUndef, SplatBitSize,
HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
SplatBitSize > 32)
return SDValue();
// Check that the shuffle mask matches the semantics of XXSPLTI32DX.
// The instruction splats a constant C into two words of the source vector
// producing { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C }.
// Thus we check that the shuffle mask is the equivalent of
// <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively.
// Note: the check above of isNByteElemShuffleMask() ensures that the bytes
// within each word are consecutive, so we only need to check the first byte.
// The mask is byte-granular, so word k starts at ShuffleMask[4*k]; a value
// above 15 that is a multiple of 4 is the first byte of some RHS word.
SDValue Index;
bool IsLE = Subtarget.isLittleEndian();
if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&
(ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&
ShuffleMask[4] > 15 && ShuffleMask[12] > 15))
Index = DAG.getTargetConstant(IsLE ? 0 : 1, DL, MVT::i32);
else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) &&
(ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&
ShuffleMask[0] > 15 && ShuffleMask[8] > 15))
Index = DAG.getTargetConstant(IsLE ? 1 : 0, DL, MVT::i32);
return SDValue();
// If the splat is narrower than 32-bits, we need to get the 32-bit value
// for XXSPLTI32DX.
// Each iteration doubles the replicated width (8 -> 16 -> 32 bits).
unsigned SplatVal = APSplatValue.getZExtValue();
for (; SplatBitSize < 32; SplatBitSize <<= 1)
SplatVal |= (SplatVal << SplatBitSize);
// Build the node on v2i64 and hand back a v16i8 result.
SDValue SplatNode = DAG.getNode(
PPCISD::XXSPLTI32DX, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, LHS),
Index, DAG.getTargetConstant(SplatVal, DL, MVT::i32));
return DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, SplatNode);
/// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
/// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is
/// a multiple of 8. Otherwise convert it to a scalar rotation(i128)
/// i.e (or (shl x, C1), (srl x, 128-C1)).
///
/// The rotate amount is read as a constant from operand 0 of the (bitcast
/// stripped) second operand of the ROTL node.
SDValue PPCTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::ROTL && "Should only be called for ISD::ROTL");
assert(Op.getValueType() == MVT::v1i128 &&
"Only set v1i128 as custom, other type shouldn't reach here!");
SDLoc dl(Op);
SDValue N0 = peekThroughBitcasts(Op.getOperand(0));
SDValue N1 = peekThroughBitcasts(Op.getOperand(1));
unsigned SHLAmt = N1.getConstantOperandVal(0);
if (SHLAmt % 8 == 0) {
// Whole-byte rotate: start from the identity byte order 0..15 and rotate it
// left by SHLAmt / 8 positions to obtain the shuffle mask.
SmallVector<int, 16> Mask(16, 0);
std::iota(Mask.begin(), Mask.end(), 0);
std::rotate(Mask.begin(), Mask.begin() + SHLAmt / 8, Mask.end());
if (SDValue Shuffle =
DAG.getVectorShuffle(MVT::v16i8, dl, DAG.getBitcast(MVT::v16i8, N0),
DAG.getUNDEF(MVT::v16i8), Mask))
return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, Shuffle);
// General case: scalar i128 rotate expressed as (x << C) | (x >> (128 - C)).
SDValue ArgVal = DAG.getBitcast(MVT::i128, N0);
SDValue SHLOp = DAG.getNode(ISD::SHL, dl, MVT::i128, ArgVal,
DAG.getConstant(SHLAmt, dl, MVT::i32));
SDValue SRLOp = DAG.getNode(ISD::SRL, dl, MVT::i128, ArgVal,
DAG.getConstant(128 - SHLAmt, dl, MVT::i32));
SDValue OROp = DAG.getNode(ISD::OR, dl, MVT::i128, SHLOp, SRLOp);
return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, OROp);
/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
/// is a shuffle we can handle in a single instruction, return it. Otherwise,
/// return the code it can be lowered into. Worst case, it can always be
/// lowered into a vperm.
SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
// Any nodes that were combined in the target-independent combiner prior
// to vector legalization will not be sent to the target combine. Try to
// combine it here.
if (SDValue NewShuffle = combineVectorShuffle(SVOp, DAG)) {
// A combine result that is no longer a shuffle is returned as-is.
if (!isa<ShuffleVectorSDNode>(NewShuffle))
return NewShuffle;
// Otherwise keep lowering, using the combined shuffle and its operands.
Op = NewShuffle;
SVOp = cast<ShuffleVectorSDNode>(Op);
V1 = Op.getOperand(0);
V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned ShiftElts, InsertAtByte;
bool Swap = false;
// If this is a load-and-splat, we can do that with a single instruction
// in some cases. However if the load has multiple uses, we don't want to
// combine it because that will just produce multiple loads.
- const SDValue *InputLoad = getNormalLoadInput(V1);
+ bool IsPermutedLoad = false;
+ const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad);
if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
(PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
InputLoad->hasOneUse()) {
bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
int SplatIdx =
PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
+ // The splat index for permuted loads will be in the left half of the vector
+ // which is strictly wider than the loaded value by 8 bytes. So we need to
+ // adjust the splat index to point to the correct address in memory.
+ if (IsPermutedLoad) {
+ assert(isLittleEndian && "Unexpected permuted load on big endian target");
+ SplatIdx += IsFourByte ? 2 : 1;
+ assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
+ "Splat of a value outside of the loaded memory");
+ }
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
// For 4-byte load-and-splat, we need Power9.
if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
// Byte offset of the splatted element from the load's base pointer.
// NOTE(review): as this excerpt reads, the 8-byte computation below is
// assigned unconditionally and overrides the 4-byte one; an `else`
// presumably separates the two in the full source -- confirm.
uint64_t Offset = 0;
if (IsFourByte)
Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
- // If we are loading a partial vector, it does not make sense to adjust
- // the base pointer. This happens with (splat (s_to_v_permuted (ld))).
- if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64))
- Offset = 0;
SDValue BasePtr = LD->getBasePtr();
if (Offset != 0)
BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
BasePtr, DAG.getIntPtrConstant(Offset, dl));
SDValue Ops[] = {
LD->getChain(), // Chain
BasePtr, // BasePtr
DAG.getValueType(Op.getValueType()) // VT
DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other);
SDValue LdSplt =
DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL,
Ops, LD->getMemoryVT(), LD->getMemOperand());
// Bitcast back if the splat type differs from the shuffle's result type.
if (LdSplt.getValueType() != SVOp->getValueType(0))
LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt);
return LdSplt;
// Power9 vector: a mask matching XXINSERTW becomes an optional VECSHL of the
// inserted operand followed by a VECINSERT at InsertAtByte.
if (Subtarget.hasP9Vector() &&
PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
isLittleEndian)) {
if (Swap)
std::swap(V1, V2);
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2);
if (ShiftElts) {
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
// With prefixed instructions available, try the XXSPLTI32DX-based lowering.
if (Subtarget.hasPrefixInstrs()) {
SDValue SplatInsertNode;
if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG)))
return SplatInsertNode;
// Power9 Altivec: try the VINSERTH lowering, then the VINSERTB lowering.
if (Subtarget.hasP9Altivec()) {
SDValue NewISDNode;
if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
return NewISDNode;
if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))
return NewISDNode;
// VSX: a mask matching XXSLDWI becomes a VECSHL of the v4i32-bitcast inputs
// with ShiftElts as the amount. An undef V2 is replaced by V1.
if (Subtarget.hasVSX() &&
PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
if (Swap)
std::swap(V1, V2);
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue Conv2 =
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2);
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl);
// VSX: a mask matching XXPERMDI becomes an XXPERMDI node on the
// v2i64-bitcast inputs. An undef V2 is replaced by V1.
if (Subtarget.hasVSX() &&
PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
if (Swap)
std::swap(V1, V2);
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
SDValue Conv2 =
DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2);
SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2,
DAG.getConstant(ShiftElts, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI);
// Power9 vector: masks recognized by the isXXBR{H,W,D,Q}ShuffleMask helpers
// are lowered to a BSWAP on the v8i16, v4i32, v2i64, or v1i128 view of V1.
if (Subtarget.hasP9Vector()) {
if (PPC::isXXBRHShuffleMask(SVOp)) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
SDValue ReveHWord = DAG.getNode(ISD::BSWAP, dl, MVT::v8i16, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord);
} else if (PPC::isXXBRWShuffleMask(SVOp)) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue ReveWord = DAG.getNode(ISD::BSWAP, dl, MVT::v4i32, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord);
} else if (PPC::isXXBRDShuffleMask(SVOp)) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
SDValue ReveDWord = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord);
} else if (PPC::isXXBRQShuffleMask(SVOp)) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1);
SDValue ReveQWord = DAG.getNode(ISD::BSWAP, dl, MVT::v1i128, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord);
if (Subtarget.hasVSX()) {
// A 4-byte splat of a single input becomes XXSPLT, using the index
// returned by getSplatIdxForPPCMnemonics.
if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG);
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
DAG.getConstant(SplatIdx, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat);
// Left shifts of 8 bytes are actually swaps. Convert accordingly.
if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) {
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);
// QPX: only four-element vector types are handled; anything else yields an
// empty SDValue.
if (Subtarget.hasQPX()) {
if (VT.getVectorNumElements() != 4)
return SDValue();
if (V2.isUndef()) V2 = V1;
// A mask matching QVALIGNI is emitted directly; a splat becomes QVESPLATI.
int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp);
if (AlignIdx != -1) {
return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2,
DAG.getConstant(AlignIdx, dl, MVT::i32));
} else if (SVOp->isSplat()) {
int SplatIdx = SVOp->getSplatIndex();
// Indices 4..7 name elements of the second input; swap the inputs so the
// splat source is V1 and rebase the index.
if (SplatIdx >= 4) {
std::swap(V1, V2);
SplatIdx -= 4;
return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
DAG.getConstant(SplatIdx, dl, MVT::i32));
// Lower this into a qvgpci/qvfperm pair.
// Compute the qvgpci literal
// Each of the four mask entries occupies 3 bits, with entry 0 in the highest
// position. Undef (negative) entries use the identity index i.
unsigned idx = 0;
for (unsigned i = 0; i < 4; ++i) {
int m = SVOp->getMaskElt(i);
unsigned mm = m >= 0 ? (unsigned) m : i;
idx |= mm << (3-i)*3;
SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64,
DAG.getConstant(idx, dl, MVT::i32));
return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3);
// Cases that are handled by instructions that take permute immediates
// (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
// selected by the instruction selector.
if (V2.isUndef()) {
if (PPC::isSplatShuffleMask(SVOp, 1) ||
PPC::isSplatShuffleMask(SVOp, 2) ||
PPC::isSplatShuffleMask(SVOp, 4) ||
PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||
PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||
PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||
PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||
PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||
PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) ||
(Subtarget.hasP8Altivec() && (
PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) {
return Op;
// Altivec has a variety of "shuffle immediates" that take two vector inputs
// and produce a fixed permutation. If any of these match, do not lower to
// VPERM.
// ShuffleKind passed to the mask predicates: 2 on little endian, 0 on big
// endian. (The undef-V2 checks above pass 1.)
unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||
PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||
PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
(Subtarget.hasP8Altivec() && (
PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) ||
PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG))))
return Op;
// Check to see if this is a shuffle of 4-byte values. If so, we can use our
// perfect shuffle table to emit an optimal matching sequence.
ArrayRef<int> PermMask = SVOp->getMask();
unsigned PFIndexes[4];
bool isFourElementShuffle = true;
for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
unsigned EltNo = 8; // Start out undef.
for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
if (PermMask[i*4+j] < 0)
continue; // Undef, ignore it.
unsigned ByteSource = PermMask[i*4+j];
// Each byte must keep its position within its 4-byte source element.
if ((ByteSource & 3) != j) {
isFourElementShuffle = false;
// The first defined byte fixes the source element; later bytes must agree.
if (EltNo == 8) {
EltNo = ByteSource/4;
} else if (EltNo != ByteSource/4) {
isFourElementShuffle = false;
PFIndexes[i] = EltNo;
// If this shuffle can be expressed as a shuffle of 4-byte elements, use the
// perfect shuffle vector to determine if it is cost effective to do this as
// discrete instructions, or whether we should use a vperm.
// For now, we skip this for little endian until such time as we have a
// little-endian perfect shuffle table.
if (isFourElementShuffle && !isLittleEndian) {
// Compute the index in the perfect shuffle table.
unsigned PFTableIndex =
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
// The cost is held in the top two bits of the table entry.
unsigned Cost = (PFEntry >> 30);
// Determining when to avoid vperm is tricky. Many things affect the cost
// of vperm, particularly how many times the perm mask needs to be computed.
// For example, if the perm mask can be hoisted out of a loop or is already
// used (perhaps because there are multiple permutes with the same shuffle
// mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of
// the loop requires an extra register.
// As a compromise, we only emit discrete instructions if the shuffle can be
// generated in 3 or fewer operations. When we have loop information
// available, if this block is within a loop, we should avoid using vperm
// for 3-operation perms and use a constant pool load instead.
if (Cost < 3)
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
// Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
// vector that will get spilled to the constant pool.
if (V2.isUndef()) V2 = V1;
// The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
// that it is in input element units, not in bytes. Convert now.
// For little endian, the order of the input vectors is reversed, and
// the permutation mask is complemented with respect to 31. This is
// necessary to produce proper semantics with the big-endian-biased vperm
// instruction.
EVT EltVT = V1.getValueType().getVectorElementType();
unsigned BytesPerElement = EltVT.getSizeInBits()/8;
SmallVector<SDValue, 16> ResultMask;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
// Undef mask entries are treated as source element 0.
unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
for (unsigned j = 0; j != BytesPerElement; ++j)
if (isLittleEndian)
ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j),
dl, MVT::i32));
ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl,
SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
LLVM_DEBUG(dbgs() << "Emitting a VPERM for the following shuffle:\n");
LLVM_DEBUG(dbgs() << "With the following permute control vector:\n");
// Little endian passes the inputs to VPERM in swapped order.
if (isLittleEndian)
return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
V2, V1, VPermMask);
return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
V1, V2, VPermMask);
/// getVectorCompareInfo - Given an intrinsic, return false if it is not a
/// vector comparison. If it is, return true and fill in Opc/isDot with
/// information about the intrinsic.
///
/// \p CompareOpc is reset to -1 and \p isDot to false before the intrinsic is
/// examined. The "_p" (predicate) forms set \p isDot to true. Intrinsics that
/// are gated on a subtarget feature (P8 Altivec, P9 Altivec, or VSX) consult
/// \p Subtarget before an opcode is assigned.
static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
bool &isDot, const PPCSubtarget &Subtarget) {
unsigned IntrinsicID =
CompareOpc = -1;
isDot = false;
switch (IntrinsicID) {
return false;
// Comparison predicates.
case Intrinsic::ppc_altivec_vcmpbfp_p:
CompareOpc = 966;
isDot = true;
case Intrinsic::ppc_altivec_vcmpeqfp_p:
CompareOpc = 198;
isDot = true;
case Intrinsic::ppc_altivec_vcmpequb_p:
CompareOpc = 6;
isDot = true;
case Intrinsic::ppc_altivec_vcmpequh_p:
CompareOpc = 70;
isDot = true;
case Intrinsic::ppc_altivec_vcmpequw_p:
CompareOpc = 134;
isDot = true;
case Intrinsic::ppc_altivec_vcmpequd_p:
// Doubleword equality is only recognized with P8 Altivec.
if (Subtarget.hasP8Altivec()) {
CompareOpc = 199;
isDot = true;
} else
return false;
case Intrinsic::ppc_altivec_vcmpneb_p:
case Intrinsic::ppc_altivec_vcmpneh_p:
case Intrinsic::ppc_altivec_vcmpnew_p:
case Intrinsic::ppc_altivec_vcmpnezb_p:
case Intrinsic::ppc_altivec_vcmpnezh_p:
case Intrinsic::ppc_altivec_vcmpnezw_p:
// These six compares share the P9 Altivec check; the inner switch picks the
// opcode for each.
if (Subtarget.hasP9Altivec()) {
switch (IntrinsicID) {
llvm_unreachable("Unknown comparison intrinsic.");
case Intrinsic::ppc_altivec_vcmpneb_p:
CompareOpc = 7;
case Intrinsic::ppc_altivec_vcmpneh_p:
CompareOpc = 71;
case Intrinsic::ppc_altivec_vcmpnew_p:
CompareOpc = 135;
case Intrinsic::ppc_altivec_vcmpnezb_p:
CompareOpc = 263;
case Intrinsic::ppc_altivec_vcmpnezh_p:
CompareOpc = 327;
case Intrinsic::ppc_altivec_vcmpnezw_p:
CompareOpc = 391;
isDot = true;
} else
return false;
case Intrinsic::ppc_altivec_vcmpgefp_p:
CompareOpc = 454;
isDot = true;
case Intrinsic::ppc_altivec_vcmpgtfp_p:
CompareOpc = 710;
isDot = true;
case Intrinsic::ppc_altivec_vcmpgtsb_p:
CompareOpc = 774;
isDot = true;
case Intrinsic::ppc_altivec_vcmpgtsh_p:
CompareOpc = 838;
isDot = true;
case Intrinsic::ppc_altivec_vcmpgtsw_p:
CompareOpc = 902;
isDot = true;
case Intrinsic::ppc_altivec_vcmpgtsd_p:
// Signed doubleword greater-than is only recognized with P8 Altivec.
if (Subtarget.hasP8Altivec()) {
CompareOpc = 967;
isDot = true;
} else
return false;
case Intrinsic::ppc_altivec_vcmpgtub_p:
CompareOpc = 518;
isDot = true;
case Intrinsic::ppc_altivec_vcmpgtuh_p:
CompareOpc = 582;
isDot = true;
case Intrinsic::ppc_altivec_vcmpgtuw_p:
CompareOpc = 646;
isDot = true;
case Intrinsic::ppc_altivec_vcmpgtud_p:
// Unsigned doubleword greater-than is only recognized with P8 Altivec.
if (Subtarget.hasP8Altivec()) {
CompareOpc = 711;
isDot = true;
} else
return false;
// VSX predicate comparisons use the same infrastructure
case Intrinsic::ppc_vsx_xvcmpeqdp_p:
case Intrinsic::ppc_vsx_xvcmpgedp_p:
case Intrinsic::ppc_vsx_xvcmpgtdp_p:
case Intrinsic::ppc_vsx_xvcmpeqsp_p:
case Intrinsic::ppc_vsx_xvcmpgesp_p:
case Intrinsic::ppc_vsx_xvcmpgtsp_p:
// All six VSX compares share the VSX check; the inner switch picks the opcode.
if (Subtarget.hasVSX()) {
switch (IntrinsicID) {
case Intrinsic::ppc_vsx_xvcmpeqdp_p:
CompareOpc = 99;
case Intrinsic::ppc_vsx_xvcmpgedp_p:
CompareOpc = 115;
case Intrinsic::ppc_vsx_xvcmpgtdp_p:
CompareOpc = 107;
case Intrinsic::ppc_vsx_xvcmpeqsp_p:
CompareOpc = 67;
case Intrinsic::ppc_vsx_xvcmpgesp_p:
CompareOpc = 83;
case Intrinsic::ppc_vsx_xvcmpgtsp_p:
CompareOpc = 75;
isDot = true;
} else
return false;
// Normal Comparisons.
// These use the same opcode numbers as the "_p" forms above and leave isDot
// at false.
case Intrinsic::ppc_altivec_vcmpbfp:
CompareOpc = 966;
case Intrinsic::ppc_altivec_vcmpeqfp:
CompareOpc = 198;
case Intrinsic::ppc_altivec_vcmpequb:
CompareOpc = 6;
case Intrinsic::ppc_altivec_vcmpequh:
CompareOpc = 70;
case Intrinsic::ppc_altivec_vcmpequw:
CompareOpc = 134;
case Intrinsic::ppc_altivec_vcmpequd:
if (Subtarget.hasP8Altivec())
CompareOpc = 199;
return false;
case Intrinsic::ppc_altivec_vcmpneb:
case Intrinsic::ppc_altivec_vcmpneh:
case Intrinsic::ppc_altivec_vcmpnew:
case Intrinsic::ppc_altivec_vcmpnezb:
case Intrinsic::ppc_altivec_vcmpnezh:
case Intrinsic::ppc_altivec_vcmpnezw:
if (Subtarget.hasP9Altivec())
switch (IntrinsicID) {
llvm_unreachable("Unknown comparison intrinsic.");
case Intrinsic::ppc_altivec_vcmpneb:
CompareOpc = 7;
case Intrinsic::ppc_altivec_vcmpneh:
CompareOpc = 71;
case Intrinsic::ppc_altivec_vcmpnew:
CompareOpc = 135;
case Intrinsic::ppc_altivec_vcmpnezb:
CompareOpc = 263;
case Intrinsic::ppc_altivec_vcmpnezh:
CompareOpc = 327;
case Intrinsic::ppc_altivec_vcmpnezw:
CompareOpc = 391;
return false;
case Intrinsic::ppc_altivec_vcmpgefp:
CompareOpc = 454;
case Intrinsic::ppc_altivec_vcmpgtfp:
CompareOpc = 710;
case Intrinsic::ppc_altivec_vcmpgtsb:
CompareOpc = 774;
case Intrinsic::ppc_altivec_vcmpgtsh:
CompareOpc = 838;
case Intrinsic::ppc_altivec_vcmpgtsw:
CompareOpc = 902;
case Intrinsic::ppc_altivec_vcmpgtsd:
if (Subtarget.hasP8Altivec())
CompareOpc = 967;
return false;
case Intrinsic::ppc_altivec_vcmpgtub:
CompareOpc = 518;
case Intrinsic::ppc_altivec_vcmpgtuh:
CompareOpc = 582;
case Intrinsic::ppc_altivec_vcmpgtuw:
CompareOpc = 646;
case Intrinsic::ppc_altivec_vcmpgtud:
if (Subtarget.hasP8Altivec())
CompareOpc = 711;
return false;
return true;
/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
/// lower, do it, otherwise return null.
///
/// Two families are handled here: Intrinsic::thread_pointer, which becomes a
/// read of X13 (64-bit) or R2 (32-bit), and the vector comparison intrinsics
/// recognized by getVectorCompareInfo. Every other intrinsic yields an empty
/// SDValue.
SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrinsicID =
SDLoc dl(Op);
if (IntrinsicID == Intrinsic::thread_pointer) {
// Reads the thread pointer register, used for __builtin_thread_pointer.
if (Subtarget.isPPC64())
return DAG.getRegister(PPC::X13, MVT::i64);
return DAG.getRegister(PPC::R2, MVT::i32);
// If this is a lowered altivec predicate compare, CompareOpc is set to the
// opcode number of the comparison.
int CompareOpc;
bool isDot;
if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))
return SDValue(); // Don't custom lower most intrinsics.
// If this is a non-dot comparison, make the VCMP node and we are done.
// The compare inputs are operands 1 and 2; the result is bitcast to the
// intrinsic's own result type.
if (!isDot) {
SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
Op.getOperand(1), Op.getOperand(2),
DAG.getConstant(CompareOpc, dl, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp);
// Create the PPCISD altivec 'dot' comparison node.
// For the dot form the compare inputs are operands 2 and 3; operand 1 is the
// selector read by the switch further down.
SDValue Ops[] = {
Op.getOperand(2), // LHS
Op.getOperand(3), // RHS
DAG.getConstant(CompareOpc, dl, MVT::i32)
// The node produces the vector result plus a glue value.
EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
// Now that we have the comparison, emit a copy from the CR to a GPR.
// This is flagged to the above dot comparison.
SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,
DAG.getRegister(PPC::CR6, MVT::i32),
// Unpack the result based on how the target uses it.
unsigned BitNo; // Bit # of CR6.
bool InvertBit; // Invert result?
switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
default: // Can't happen, don't crash on invalid number though.
case 0: // Return the value of the EQ bit of CR6.
BitNo = 0; InvertBit = false;
case 1: // Return the inverted value of the EQ bit of CR6.
BitNo = 0; InvertBit = true;
case 2: // Return the value of the LT bit of CR6.
BitNo = 2; InvertBit = false;
case 3: // Return the inverted value of the LT bit of CR6.
BitNo = 2; InvertBit = true;
// Shift the bit into the low position.
Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32));
// Isolate the bit.
Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
DAG.getConstant(1, dl, MVT::i32));
// If we are supposed to, toggle the bit.
if (InvertBit)
Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
DAG.getConstant(1, dl, MVT::i32));
return Flags;
/// Custom lowering for void intrinsics. As shown here, only
/// Intrinsic::ppc_cfence is handled: it becomes a CFENCE8 machine node whose
/// input is the intrinsic's argument any-extended to i64. An empty SDValue is
/// returned otherwise.
SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
// SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to
// the beginning of the argument list.
// ArgStart is the operand index of the intrinsic ID: 0 when operand 0 is a
// constant, 1 when another operand precedes it.
int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1;
SDLoc DL(Op);
switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) {
case Intrinsic::ppc_cfence: {
assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
assert(Subtarget.isPPC64() && "Only 64-bit is supported for now.");
return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
Op.getOperand(ArgStart + 1)),
return SDValue();
// Lower scalar BSWAP64 to xxbrd.
// The scalar is placed in a v2i64 BUILD_VECTOR, the vector is byte-swapped,
// and one i64 element is extracted again: element 1 on little endian,
// element 0 otherwise.
SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
Op = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Op);
int VectorIndex = 0;
if (Subtarget.isLittleEndian())
VectorIndex = 1;
Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
DAG.getTargetConstant(VectorIndex, dl, MVT::i32));
return Op;
// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
// compared to a value that is atomically loaded (atomic loads zero-extend).
// The node is returned unchanged when the memory type is 32 bits or wider, or
// when the compare operand's high bits are already known to be zero.
SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
"Expecting an atomic compare-and-swap here.");
SDLoc dl(Op);
auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
EVT MemVT = AtomicNode->getMemoryVT();
// Only sub-32-bit memory types need the compare operand adjusted.
if (MemVT.getSizeInBits() >= 32)
return Op;
SDValue CmpOp = Op.getOperand(2);
// If this is already correctly zero-extended, leave it alone.
// HighBits covers the bits of a 32-bit value above the memory type's width.
auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits());
if (DAG.MaskedValueIsZero(CmpOp, HighBits))
return Op;
// Clear the high bits of the compare operand.
// MaskVal has exactly the low MemVT-width bits set.
unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
SDValue NewCmpOp =
DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp,
DAG.getConstant(MaskVal, dl, MVT::i32));
// Replace the existing compare operand with the properly zero-extended one.
SmallVector<SDValue, 4> Ops;
for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
Ops[2] = NewCmpOp;
// The replacement node reuses the original memory operand and memory type.
MachineMemOperand *MMO = AtomicNode->getMemOperand();
SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other);
auto NodeTy =
return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
/// Lowers SCALAR_TO_VECTOR through memory: the scalar operand is stored at the
/// start of a fresh 16-byte, 16-byte-aligned stack slot, and the slot is then
/// loaded back as the vector result type.
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
// Create a stack slot that is 16-byte aligned.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
// Store the input value into Value#0 of the stack slot.
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
// Load it out.
// The load is chained on the store so it observes the stored value.
return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
/// Custom lowering of INSERT_VECTOR_ELT. A non-constant index yields an empty
/// SDValue. For v8i16 and v16i8 with a constant index, the scalar is moved
/// with MTVSRZ and inserted with VECINSERT at the element's byte offset;
/// other types are returned unchanged.
SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
"Should only be called for ISD::INSERT_VECTOR_ELT");
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
// We have legal lowering for constant indices but not for variable ones.
if (!C)
return SDValue();
EVT VT = Op.getValueType();
SDLoc dl(Op);
// V1 is the vector being modified; V2 is the scalar to insert.
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
// We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
if (VT == MVT::v8i16 || VT == MVT::v16i8) {
SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
unsigned InsertAtElement = C->getZExtValue();
unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
// On little endian the byte offset is mirrored within the 16-byte vector.
if (Subtarget.isLittleEndian()) {
InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz,
DAG.getConstant(InsertAtByte, dl, MVT::i32));
return Op;
/// Lowers EXTRACT_VECTOR_ELT from a v4i1 vector (asserted). The vector is
/// converted to v4f64, remapped to 0/1 values, converted to integers and
/// stored to a 16-byte stack slot; the requested element is then loaded back
/// as an i32. The index operand is read as a constant. The i32 is truncated
/// to i1 only when the subtarget uses CR bits.
SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
SDNode *N = Op.getNode();
assert(N->getOperand(0).getValueType() == MVT::v4i1 &&
"Unknown extract_vector_elt type");
SDValue Value = N->getOperand(0);
// The first part of this is like the store lowering except that we don't
// need to track the chain.
// The values are now known to be -1 (false) or 1 (true). To convert this
// into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
// FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
// understand how to form the extending load.
SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
// Now convert to an integer and store.
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
// The store starts from the entry node rather than an incoming chain.
SDValue StoreChain = DAG.getEntryNode();
SDValue Ops[] = {StoreChain,
DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
Value, FIdx};
SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
dl, VTs, Ops, MVT::v4i32, PtrInfo);
// Extract the value requested.
// Each stored element occupies 4 bytes of the slot.
unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
SDValue IntVal =
DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset));
// Without CR bits the i32 is returned directly; otherwise it is truncated to
// i1.
if (!Subtarget.useCRBits())
return IntVal;
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal);
/// Lowering for QPX v4i1 loads
///
/// v4f64 and v4f32 loads are also handled here. A load whose alignment is at
/// least the memory type's store size is returned unchanged; otherwise it is
/// split into four scalar loads that are recombined with BUILD_VECTOR. A v4i1
/// load is built from four i8 loads extended to i32. The returned value
/// merges the loaded vector with the combined load chain, plus the updated
/// pointer for an indexed load.
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
SDValue LoadChain = LN->getChain();
SDValue BasePtr = LN->getBasePtr();
if (Op.getValueType() == MVT::v4f64 ||
Op.getValueType() == MVT::v4f32) {
EVT MemVT = LN->getMemoryVT();
unsigned Alignment = LN->getAlignment();
// If this load is properly aligned, then it is legal.
if (Alignment >= MemVT.getStoreSize())
return Op;
EVT ScalarVT = Op.getValueType().getScalarType(),
ScalarMemVT = MemVT.getScalarType();
// Stride is the byte distance between consecutive elements in memory.
unsigned Stride = ScalarMemVT.getStoreSize();
SDValue Vals[4], LoadChains[4];
for (unsigned Idx = 0; Idx < 4; ++Idx) {
SDValue Load;
// An extending load is used when the in-memory scalar type differs from the
// result's scalar type.
if (ScalarVT != ScalarMemVT)
Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain,
LN->getPointerInfo().getWithOffset(Idx * Stride),
ScalarMemVT, MinAlign(Alignment, Idx * Stride),
LN->getMemOperand()->getFlags(), LN->getAAInfo());
Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr,
LN->getPointerInfo().getWithOffset(Idx * Stride),
MinAlign(Alignment, Idx * Stride),
LN->getMemOperand()->getFlags(), LN->getAAInfo());
// Only the first element load carries the indexed (pre-increment) form.
if (Idx == 0 && LN->isIndexed()) {
assert(LN->getAddressingMode() == ISD::PRE_INC &&
"Unknown addressing mode on vector load");
Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(),
Vals[Idx] = Load;
LoadChains[Idx] = Load.getValue(1);
// Advance the base pointer to the next element.
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(Stride, dl,
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals);
// An indexed load also returns result 1 of the first element load.
if (LN->isIndexed()) {
SDValue RetOps[] = { Value, Vals[0].getValue(1), TF };
return DAG.getMergeValues(RetOps, dl);
SDValue RetOps[] = { Value, TF };
return DAG.getMergeValues(RetOps, dl);
assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower");
assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported");
// To lower v4i1 from a byte array, we load the byte elements of the
// vector and then reuse the BUILD_VECTOR logic.
SDValue VectElmts[4], VectElmtChains[4];
for (unsigned i = 0; i < 4; ++i) {
// Element i is read from byte offset i of the original address.
SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
VectElmts[i] = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx,
LN->getPointerInfo().getWithOffset(i), MVT::i8,
/* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo());
VectElmtChains[i] = VectElmts[i].getValue(1);
LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains);
SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts);
SDValue RVals[] = { Value, LoadChain };
return DAG.getMergeValues(RVals, dl);
/// Lowering for QPX v4i1 stores
///
/// v4f64 and v4f32 stores are also handled here. A store whose alignment is
/// at least the memory type's store size is returned unchanged; otherwise it
/// is split into four scalar stores joined by a TokenFactor. A v4i1 store
/// converts the vector to integers, spills it to a stack slot, reloads the
/// four i32 elements and stores each one truncated to a byte at consecutive
/// addresses.
SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
SDValue StoreChain = SN->getChain();
SDValue BasePtr = SN->getBasePtr();
SDValue Value = SN->getValue();
if (Value.getValueType() == MVT::v4f64 ||
Value.getValueType() == MVT::v4f32) {
EVT MemVT = SN->getMemoryVT();
unsigned Alignment = SN->getAlignment();
// If this store is properly aligned, then it is legal.
if (Alignment >= MemVT.getStoreSize())
return Op;
EVT ScalarVT = Value.getValueType().getScalarType(),
ScalarMemVT = MemVT.getScalarType();
// Stride is the byte distance between consecutive elements in memory.
unsigned Stride = ScalarMemVT.getStoreSize();
SDValue Stores[4];
for (unsigned Idx = 0; Idx < 4; ++Idx) {
SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
DAG.getVectorIdxConstant(Idx, dl));
SDValue Store;
// A truncating store is used when the in-memory scalar type differs from the
// value's scalar type.
if (ScalarVT != ScalarMemVT)
Store =
DAG.getTruncStore(StoreChain, dl, Ex, BasePtr,
SN->getPointerInfo().getWithOffset(Idx * Stride),
ScalarMemVT, MinAlign(Alignment, Idx * Stride),
SN->getMemOperand()->getFlags(), SN->getAAInfo());
Store = DAG.getStore(StoreChain, dl, Ex, BasePtr,
SN->getPointerInfo().getWithOffset(Idx * Stride),
MinAlign(Alignment, Idx * Stride),
SN->getMemOperand()->getFlags(), SN->getAAInfo());
// Only the first element store carries the indexed (pre-increment) form.
if (Idx == 0 && SN->isIndexed()) {
assert(SN->getAddressingMode() == ISD::PRE_INC &&
"Unknown addressing mode on vector store");
Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(),
// Advance the base pointer to the next element.
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(Stride, dl,
Stores[Idx] = Store;
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
// An indexed store also returns result 1 of the first element store.
if (SN->isIndexed()) {
SDValue RetOps[] = { TF, Stores[0].getValue(1) };
return DAG.getMergeValues(RetOps, dl);
return TF;
assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported");
assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower");
// The values are now known to be -1 (false) or 1 (true). To convert this
// into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
// FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
// understand how to form the extending load.
SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
// Now convert to an integer and store.
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
// The stack-slot store is chained on the original store's incoming chain.
SDValue Ops[] = {StoreChain,
DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
Value, FIdx};
SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
dl, VTs, Ops, MVT::v4i32, PtrInfo);
// Move data into the byte array.
// First reload the four i32 elements from the stack slot, 4 bytes apart.
SDValue Loads[4], LoadChains[4];
for (unsigned i = 0; i < 4; ++i) {
unsigned Offset = 4*i;
SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
LoadChains[i] = Loads[i].getValue(1);
StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
// Then store each element, truncated to i8, at byte offset i of the
// destination.
SDValue Stores[4];
for (unsigned i = 0; i < 4; ++i) {
SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
Stores[i] = DAG.getTruncStore(
StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i),
MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(),
StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
return StoreChain;
/// Custom lowering of vector MUL for v4i32 and v16i8, built from Altivec
/// multiply intrinsics. Any other type reaches llvm_unreachable.
SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
if (Op.getValueType() == MVT::v4i32) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
SDValue Zero = getCanonicalConstSplat(0, 1, MVT::v4i32, DAG, dl);
// +16 as shift amt.
// NOTE(review): presumably only the low bits of each element are used as the
// rotate/shift amount, so a splat of -16 acts as 16 -- confirm.
SDValue Neg16 = getCanonicalConstSplat(-16, 4, MVT::v4i32, DAG, dl);
SDValue RHSSwap = // = vrlw RHS, 16
BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
// Shrinkify inputs to v8i16.
LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);
// Low parts multiplied together, generating 32-bit results (we ignore the
// top parts).
SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
LHS, RHS, DAG, dl, MVT::v4i32);
// vmsumuhm with a zero accumulator sums the halfword products of LHS and the
// half-swapped RHS into each 32-bit element.
SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
// Shift the high parts up 16 bits.
HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
Neg16, DAG, dl);
return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
} else if (Op.getValueType() == MVT::v16i8) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
bool isLittleEndian = Subtarget.isLittleEndian();
// Multiply the even 8-bit parts, producing 16-bit sums.
SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
LHS, RHS, DAG, dl, MVT::v8i16);
EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);
// Multiply the odd 8-bit parts, producing 16-bit sums.
SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
LHS, RHS, DAG, dl, MVT::v8i16);
OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);
// Merge the results together. Because vmuleub and vmuloub are
// instructions with a big-endian bias, we must reverse the
// element numbering and reverse the meaning of "odd" and "even"
// when generating little endian code.
// The mask interleaves one byte from each 16-bit product: even-numbered bytes
// on little endian, odd-numbered bytes otherwise. Indices of 16 and above
// select from the second shuffle input.
int Ops[16];
for (unsigned i = 0; i != 8; ++i) {
if (isLittleEndian) {
Ops[i*2 ] = 2*i;
Ops[i*2+1] = 2*i+16;
} else {
Ops[i*2 ] = 2*i+1;
Ops[i*2+1] = 2*i+1+16;
if (isLittleEndian)
return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops);
return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
} else {
llvm_unreachable("Unknown mul to lower!");
/// Custom-lower a vector ISD::ABS as a signed max of the operand and its
/// negation.
///
/// The asserts restrict the input to ISD::ABS on v2i64, v4i32, v8i16 or v16i8,
/// and v2i64 additionally requires Subtarget.hasP8Altivec(). The negation is
/// built as (0 - X) with ISD::SUB, and the max uses the AltiVec vmaxs{b,h,w,d}
/// intrinsic selected by the value type.
SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS");
EVT VT = Op.getValueType();
assert(VT.isVector() &&
"Only set vector abs as custom, scalar abs shouldn't reach here!");
assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
VT == MVT::v16i8) &&
"Unexpected vector element type!");
assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) &&
"Current subtarget doesn't support smax v2i64!");
// For vector abs, it can be lowered to:
// abs x
// ==>
// y = -x
// smax(x, y)
SDLoc dl(Op);
SDValue X = Op.getOperand(0);
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue Y = DAG.getNode(ISD::SUB, dl, VT, Zero, X);
// SMAX patch
// hasn't landed yet, so use intrinsic first here.
// TODO: Should use SMAX directly once SMAX patch landed
// vmaxsw is the starting choice and is kept for v4i32; the other three
// accepted types override it below.
Intrinsic::ID BifID = Intrinsic::ppc_altivec_vmaxsw;
if (VT == MVT::v2i64)
BifID = Intrinsic::ppc_altivec_vmaxsd;
else if (VT == MVT::v8i16)
BifID = Intrinsic::ppc_altivec_vmaxsh;
else if (VT == MVT::v16i8)
BifID = Intrinsic::ppc_altivec_vmaxsb;
return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
// Custom lowering for fpext v2f32 to v2f64.
//
// Returns an empty SDValue when the node is not a v2f32 -> v2f64 extend or
// when the operand does not have one of the shapes handled by the switch on
// the operand's opcode. Every successful path produces a
// PPCISD::FP_EXTEND_HALF node of type v2f64 whose second operand is an i32
// constant.
SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::FP_EXTEND &&
"Should only be called for ISD::FP_EXTEND");
// FIXME: handle extends from half precision float vectors on P9.
// We only want to custom lower an extend from v2f32 to v2f64.
if (Op.getValueType() != MVT::v2f64 ||
Op.getOperand(0).getValueType() != MVT::v2f32)
return SDValue();
SDLoc dl(Op);
SDValue Op0 = Op.getOperand(0);
switch (Op0.getOpcode()) {
// NOTE(review): the labels for the first two branches of this switch are not
// visible in this copy of the source. The assert below shows that the second
// branch expects a two-operand node whose operand 1 is a constant index —
// confirm the exact opcodes against upstream.
return SDValue();
assert(Op0.getNumOperands() == 2 &&
isa<ConstantSDNode>(Op0->getOperand(1)) &&
"Node should have 2 operands with second one being a constant!");
if (Op0.getOperand(0).getValueType() != MVT::v4f32)
return SDValue();
// Custom lower is only done for high or low doubleword.
int Idx = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
if (Idx % 2 != 0)
return SDValue();
// Since input is v4f32, at this point Idx is either 0 or 2.
// Shift to get the doubleword position we want.
int DWord = Idx >> 1;
// High and low word positions are different on little endian.
if (Subtarget.isLittleEndian())
DWord ^= 0x1;
return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64,
Op0.getOperand(0), DAG.getConstant(DWord, dl, MVT::i32));
// Arithmetic on two loads: each load is re-emitted as a PPCISD::LD_VSX_LH
// memory-intrinsic node producing v4f32, the arithmetic is redone in v4f32,
// and the result is extended with a constant 0 as the second operand.
case ISD::FADD:
case ISD::FMUL:
case ISD::FSUB: {
SDValue NewLoad[2];
for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
// Ensure both input are loads.
SDValue LdOp = Op0.getOperand(i);
if (LdOp.getOpcode() != ISD::LOAD)
return SDValue();
// Generate new load node.
LoadSDNode *LD = cast<LoadSDNode>(LdOp);
SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
NewLoad[i] = DAG.getMemIntrinsicNode(
PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
LD->getMemoryVT(), LD->getMemOperand());
SDValue NewOp =
DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32, NewLoad[0],
NewLoad[1], Op0.getNode()->getFlags());
return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewOp,
DAG.getConstant(0, dl, MVT::i32));
// Direct load: same PPCISD::LD_VSX_LH replacement, then extend with a
// constant 0 as the second operand.
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(Op0);
SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
SDValue NewLd = DAG.getMemIntrinsicNode(
PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
LD->getMemoryVT(), LD->getMemOperand());
return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewLd,
DAG.getConstant(0, dl, MVT::i32));
llvm_unreachable("ERROR:Should return for all cases within swtich.");
/// LowerOperation - Provide custom lowering hooks for some operations.
///
/// Dispatches on Op.getOpcode() to the matching Lower* member function and
/// returns its result. An opcode without a case reaches llvm_unreachable.
/// ISD::INTRINSIC_W_CHAIN returns an empty SDValue instead of calling a
/// helper.
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Wasn't expecting to be able to lower this!");
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
// Variable argument lowering.
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, DAG);
// Exception handling lowering.
case ISD::EH_DWARF_CFA: return LowerEH_DWARF_CFA(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, SDLoc(Op));
case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
// Lower 64-bit shifts.
case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG);
case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);
case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);
// Vector-related lowering.
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::ABS: return LowerABS(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::ROTL: return LowerROTL(Op, DAG);
// For counter-based loop handling.
case ISD::INTRINSIC_W_CHAIN: return SDValue();
case ISD::BITCAST: return LowerBITCAST(Op, DAG);
// Frame & Return address.
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
// NOTE(review): the case labels for the LowerINTRINSIC_VOID and
// LowerATOMIC_CMP_SWAP returns below are not visible in this copy of the
// source — confirm against upstream.
return LowerINTRINSIC_VOID(Op, DAG);
case ISD::BSWAP:
return LowerBSWAP(Op, DAG);
return LowerATOMIC_CMP_SWAP(Op, DAG);
/// Custom type-legalization hook: switches on N->getOpcode() and pushes
/// replacement values for node N onto a `Results` container.
///
/// NOTE(review): this copy of the function is missing lines, including the
/// `Results` parameter declaration, most case labels and the statements that
/// end each case. The comments below describe only what the remaining lines
/// show; confirm the control flow against upstream before relying on it.
void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
SelectionDAG &DAG) const {
SDLoc dl(N);
switch (N->getOpcode()) {
llvm_unreachable("Do not know how to custom type legalize this operation!");
// Builds a PPCISD::READ_TIME_BASE node yielding two i32 values plus a chain,
// and combines the two i32 values into an i64 with ISD::BUILD_PAIR.
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, RTB, RTB.getValue(1)));
// Intrinsic handling: the node is rebuilt with the setcc result type and the
// value is truncated back to i1.
if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
assert(N->getValueType(0) == MVT::i1 &&
"Unexpected result type for CTR decrement intrinsic");
EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewInt));
case ISD::VAARG: {
if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
EVT VT = N->getValueType(0);
if (VT == MVT::i64) {
SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG);
// LowerFP_TO_INT() can only handle f32 and f64.
if (N->getOperand(0).getValueType() == MVT::ppcf128)
Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
// Vector truncate: forwarded to LowerTRUNCATEVector when the conditions below
// hold (the tail of the condition is not visible in this copy).
EVT TrgVT = N->getValueType(0);
EVT OpVT = N->getOperand(0).getValueType();
if (TrgVT.isVector() &&
isOperationCustom(N->getOpcode(), TrgVT) &&
OpVT.getSizeInBits() <= 128 &&
Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG));
// Don't handle bitcast here.
SDValue Lowered = LowerFP_EXTEND(SDValue(N, 0), DAG);
if (Lowered)
// Other Lowering Code
/// Emit a call with no arguments to the intrinsic \p Id at the builder's
/// current insertion point.
///
/// The declaration is looked up in (or added to) the module that owns the
/// builder's insert block. Returns the value produced by
/// Builder.CreateCall.
static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Function *Func = Intrinsic::getDeclaration(M, Id);
return Builder.CreateCall(Func, {});
// The mappings for emitLeading/TrailingFence are taken from an external
// reference whose link is not present in this copy of the source.
//
// Emits the fence that precedes an atomic operation with ordering \p Ord:
//   - SequentiallyConsistent    -> call to Intrinsic::ppc_sync
//   - release or stronger       -> call to Intrinsic::ppc_lwsync
//   - anything else             -> no fence, returns nullptr
// \p Inst is not read by the lines visible here.
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
return nullptr;
/// Emits the fence that follows an atomic operation.
///
/// A fence is produced only when \p Inst has an atomic load and \p Ord is
/// acquire or stronger. In that case a LoadInst on a 64-bit subtarget gets a
/// call built around Intrinsic::ppc_cfence (overloaded on the load's type),
/// and every other instruction gets a call to Intrinsic::ppc_lwsync.
/// Otherwise nullptr is returned.
///
/// NOTE(review): the closing braces and part of the ppc_cfence call are not
/// visible in this copy, so the nesting described above is inferred from the
/// statement order — confirm against upstream.
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
// The upstream source cites external references here to justify the cfence
// sequence; the links are not present in this copy.
if (isa<LoadInst>(Inst) && Subtarget.isPPC64())
return Builder.CreateCall(
Intrinsic::ppc_cfence, {Inst->getType()}),
// FIXME: Can use isync for rmw operation.
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
return nullptr;
/// Expand an atomic read-modify-write pseudo instruction into an
/// l[bhwd]arx / st[bhwd]cx. retry loop.
///
/// \param MI          The pseudo. Operand 0 is read as the destination
///                    register, operands 1 and 2 as the two address registers
///                    and operand 3 as the value ("incr") register.
/// \param BB          Block containing MI. The instructions after MI are
///                    spliced into a newly created exit block.
/// \param AtomicSize  Access size in bytes; 1, 2, 4 and 8 select the matching
///                    load/store opcode pair, any other value is unreachable.
/// \param BinOpcode   Opcode applied to (incr, dest) to compute the stored
///                    value, or 0 for ATOMIC_SWAP, where incr itself is
///                    stored.
/// \param CmpOpcode   Non-zero for min/max style operations: an extra block
///                    (loop2MBB) is created and a compare is emitted in the
///                    loop block.
/// \param CmpPred     NOTE(review): presumably the predicate for the
///                    conditional branch after the compare; its use is not
///                    visible in this copy — confirm.
/// \returns the exit block.
MachineBasicBlock *
PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
unsigned AtomicSize,
unsigned BinOpcode,
unsigned CmpOpcode,
unsigned CmpPred) const {
// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
auto LoadMnemonic = PPC::LDARX;
auto StoreMnemonic = PPC::STDCX;
switch (AtomicSize) {
llvm_unreachable("Unexpected size of atomic entity");
case 1:
LoadMnemonic = PPC::LBARX;
StoreMnemonic = PPC::STBCX;
assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
case 2:
LoadMnemonic = PPC::LHARX;
StoreMnemonic = PPC::STHCX;
assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
case 4:
LoadMnemonic = PPC::LWARX;
StoreMnemonic = PPC::STWCX;
case 8:
LoadMnemonic = PPC::LDARX;
StoreMnemonic = PPC::STDCX;
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction *F = BB->getParent();
MachineFunction::iterator It = ++BB->getIterator();
Register dest = MI.getOperand(0).getReg();
Register ptrA = MI.getOperand(1).getReg();
Register ptrB = MI.getOperand(2).getReg();
Register incr = MI.getOperand(3).getReg();
DebugLoc dl = MI.getDebugLoc();
// New blocks are inserted right after BB in layout order:
// loopMBB, (loop2MBB when CmpOpcode is set), exitMBB.
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB =
CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, loopMBB);
if (CmpOpcode)
F->insert(It, loop2MBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
MachineRegisterInfo &RegInfo = F->getRegInfo();
// For a swap (BinOpcode == 0) the stored value is incr itself; otherwise a
// fresh virtual register of the class matching the access width holds the
// result of the binary operation.
Register TmpReg = (!BinOpcode) ? incr :
RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass
: &PPC::GPRCRegClass);
// thisMBB:
// ...
// fallthrough --> loopMBB
// loopMBB:
// l[wd]arx dest, ptr
// add r0, dest, incr
// st[wd]cx. r0, ptr
// bne- loopMBB
// fallthrough --> exitMBB
// For max/min...
// loopMBB:
// l[wd]arx dest, ptr
// cmpl?[wd] incr, dest
// bgt exitMBB
// loop2MBB:
// st[wd]cx. dest, ptr
// bne- loopMBB
// fallthrough --> exitMBB
BB = loopMBB;
BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
if (BinOpcode)
BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
if (CmpOpcode) {
// Signed comparisons of byte or halfword values must be sign-extended.
if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
Register ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH),
BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
} else
BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
BuildMI(BB, dl, TII->get(PPC::BCC))
BB = loop2MBB;
BuildMI(BB, dl, TII->get(StoreMnemonic))
BuildMI(BB, dl, TII->get(PPC::BCC))
// exitMBB:
// ...
BB = exitMBB;
return BB;
/// Expand an 8-bit or 16-bit atomic read-modify-write pseudo instruction.
///
/// When Subtarget.hasPartwordAtomics() is true this forwards to
/// EmitAtomicBinary with a size of 1 (is8bit) or 2. Otherwise the operation is
/// emulated with a word-sized lwarx/stwcx. loop: the address is rounded down
/// to its containing word, the operand and a mask are shifted into the lane of
/// the addressed byte/halfword, and the loop merges the updated lane with the
/// untouched bits before storing. The result is shifted back down into dest
/// at the start of the exit block.
///
/// \param MI         The pseudo. Operand 0 is read as the destination
///                   register, operands 1 and 2 as the two address registers
///                   and operand 3 as the value ("incr") register.
/// \param BB         Block containing MI; the instructions after MI are
///                   spliced into a newly created exit block.
/// \param is8bit     true for a byte access, false for a halfword access.
/// \param BinOpcode  Opcode computing the new value, or 0 for ATOMIC_SWAP.
/// \param CmpOpcode  Non-zero for min/max style operations; adds a compare
///                   and a second loop block.
/// \param CmpPred    NOTE(review): presumably the predicate for the branch
///                   that follows the compare; its use is not visible in this
///                   copy — confirm.
/// \returns the exit block (or whatever EmitAtomicBinary returns on the
///          forwarding path).
MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
MachineInstr &MI, MachineBasicBlock *BB,
bool is8bit, // operation
unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const {
// If we support part-word atomic mnemonics, just use them
if (Subtarget.hasPartwordAtomics())
return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, CmpOpcode,
// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// In 64 bit mode we have to use 64 bits for addresses, even though the
// lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address
// registers without caring whether they're 32 or 64, but here we're
// doing actual arithmetic on the addresses.
bool is64bit = Subtarget.isPPC64();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction *F = BB->getParent();
MachineFunction::iterator It = ++BB->getIterator();
Register dest = MI.getOperand(0).getReg();
Register ptrA = MI.getOperand(1).getReg();
Register ptrB = MI.getOperand(2).getReg();
Register incr = MI.getOperand(3).getReg();
DebugLoc dl = MI.getDebugLoc();
// New blocks are inserted right after BB in layout order:
// loopMBB, (loop2MBB when CmpOpcode is set), exitMBB.
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB =
CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, loopMBB);
if (CmpOpcode)
F->insert(It, loop2MBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
MachineRegisterInfo &RegInfo = F->getRegInfo();
// RC is the pointer-width register class; every data temporary is a 32-bit
// GPRC register.
const TargetRegisterClass *RC =
is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
Register PtrReg = RegInfo.createVirtualRegister(RC);
Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
// On little endian the raw shift is used directly; on big endian a separate
// register receives the XORI-adjusted shift emitted further down.
Register ShiftReg =
isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
Register Incr2Reg = RegInfo.createVirtualRegister(GPRC);
Register MaskReg = RegInfo.createVirtualRegister(GPRC);
Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp3Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
Register Ptr1Reg;
// For a swap (BinOpcode == 0) the shifted operand itself is the new value.
Register TmpReg =
(!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(GPRC);
// thisMBB:
// ...
// fallthrough --> loopMBB
// The 4-byte load must be aligned, while a char or short may be
// anywhere in the word. Hence all this nasty bookkeeping code.
// add ptr1, ptrA, ptrB [copy if ptrA==0]
// rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
// xori shift, shift1, 24 [16]
// rlwinm ptr, ptr1, 0, 0, 29
// slw incr2, incr, shift
// li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
// slw mask, mask2, shift
// loopMBB:
// lwarx tmpDest, ptr
// add tmp, tmpDest, incr2
// andc tmp2, tmpDest, mask
// and tmp3, tmp, mask
// or tmp4, tmp3, tmp2
// stwcx. tmp4, ptr
// bne- loopMBB
// fallthrough --> exitMBB
// srw dest, tmpDest, shift
if (ptrA != ZeroReg) {
Ptr1Reg = RegInfo.createVirtualRegister(RC);
BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
} else {
Ptr1Reg = ptrB;
// We need use 32-bit subregister to avoid mismatch register class in 64-bit
// mode.
BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)
.addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0)
.addImm(is8bit ? 28 : 27);
if (!isLittleEndian)
BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)
.addImm(is8bit ? 24 : 16);
if (is64bit)
BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg).addReg(incr).addReg(ShiftReg);
if (is8bit)
BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
else {
BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
BB = loopMBB;
BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
if (BinOpcode)
BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)
BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)
BuildMI(BB, dl, TII->get(PPC::AND), Tmp3Reg).addReg(TmpReg).addReg(MaskReg);
if (CmpOpcode) {
// For unsigned comparisons, we can directly compare the shifted values.
// For signed comparisons we shift and sign extend.
Register SReg = RegInfo.createVirtualRegister(GPRC);
BuildMI(BB, dl, TII->get(PPC::AND), SReg)
unsigned ValueReg = SReg;
unsigned CmpReg = Incr2Reg;
if (CmpOpcode == PPC::CMPW) {
ValueReg = RegInfo.createVirtualRegister(GPRC);
BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg)
Register ValueSReg = RegInfo.createVirtualRegister(GPRC);
BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg)
ValueReg = ValueSReg;
CmpReg = incr;
BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
BuildMI(BB, dl, TII->get(PPC::BCC))
BB = loop2MBB;
BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg).addReg(Tmp3Reg).addReg(Tmp2Reg);
BuildMI(BB, dl, TII->get(PPC::STWCX))
BuildMI(BB, dl, TII->get(PPC::BCC))
// exitMBB:
// ...
BB = exitMBB;
// The SRW producing dest is inserted at the very beginning of the exit block,
// ahead of the instructions spliced over from the original block.
BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest)
return BB;
/// Expand the EH SjLj setjmp pseudo instruction \p MI located in \p MBB.
///
/// Operand 0 of MI is the destination register and operand 1 is the register
/// holding the buffer address. Two new blocks are created after MBB: mainMBB,
/// which stores the link register into the buffer and sets the result to 0,
/// and sinkMBB, which receives the instructions that followed MI and starts
/// with a PHI defining the destination register. The other incoming value is
/// set to 1 in the original block.
///
/// Buffer slots are measured in units of the pointer store size: the label
/// goes at slot 1, the TOC pointer at slot 3 and the base pointer at slot 4.
///
/// \returns sinkMBB.
llvm::MachineBasicBlock *
PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
// One virtual register per incoming value of the PHI in sinkMBB.
Register mainDstReg = MRI.createVirtualRegister(RC);
Register restoreDstReg = MRI.createVirtualRegister(RC);
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
// For v = setjmp(buf), we generate
// thisMBB:
// SjLjSetup mainMBB
// bl mainMBB
// v_restore = 1
// b sinkMBB
// mainMBB:
// buf[LabelOffset] = LR
// v_main = 0
// sinkMBB:
// v = phi(main, restore)
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
MF->insert(I, sinkMBB);
MachineInstrBuilder MIB;
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
// Note that the structure of the jmp_buf used here is not compatible
// with that used by libc, and is not designed to be. Specifically, it
// stores only those 'reserved' registers that LLVM does not otherwise
// understand how to spill. Also, by convention, by the time this
// intrinsic is called, Clang has already stored the frame address in the
// first slot of the buffer and stack address in the third. Following the
// X86 target code, we'll store the jump address in the second slot. We also
// need to save the TOC pointer (R2) to handle jumps between shared
// libraries, and that will be stored in the fourth slot. The thread
// identifier (R13) is not affected.
// thisMBB:
const int64_t LabelOffset = 1 * PVT.getStoreSize();
const int64_t TOCOffset = 3 * PVT.getStoreSize();
const int64_t BPOffset = 4 * PVT.getStoreSize();
// Prepare IP either in reg.
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
Register LabelReg = MRI.createVirtualRegister(PtrRC);
Register BufReg = MI.getOperand(1).getReg();
if (Subtarget.is64BitELFABI()) {
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
// Naked functions never have a base pointer, and so we use r1. For all
// other functions, this decision must be delayed until during PEI.
unsigned BaseReg;
if (MF->getFunction().hasFnAttribute(Attribute::Naked))
BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
MIB = BuildMI(*thisMBB, MI, DL,
TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup))
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);
// The edge to mainMBB is given probability zero and the edge to sinkMBB
// probability one.
thisMBB->addSuccessor(mainMBB, BranchProbability::getZero());
thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne());
// mainMBB:
// mainDstReg = 0
BuildMI(mainMBB, DL,
TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);
// Store IP
if (Subtarget.isPPC64()) {
MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
} else {
MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))
BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0);
// sinkMBB:
BuildMI(*sinkMBB, sinkMBB->begin(), DL,
TII->get(PPC::PHI), DstReg)
return sinkMBB;
/// Expand the EH SjLj longjmp pseudo instruction \p MI located in \p MBB.
///
/// Operand 0 of MI is the register holding the buffer address. In order, the
/// expansion reloads the frame pointer, the jump target (into a temporary),
/// the stack pointer and the base pointer. On 64-bit SVR4 it also reloads the
/// TOC pointer into X2. It then moves the jump target to CTR and branches
/// through it. Loads use LD for 64-bit pointers and LWZ for 32-bit pointers.
///
/// Buffer slots are measured in units of the pointer store size: label at
/// slot 1, stack pointer at slot 2, TOC at slot 3 and base pointer at slot 4.
///
/// \returns MBB; no new blocks are created.
MachineBasicBlock *
PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
const TargetRegisterClass *RC =
(PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
Register Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
// Base pointer register: X30 for 64-bit pointers; for 32-bit pointers R29
// when the subtarget is SVR4 ABI and the code is position independent,
// otherwise R30.
unsigned BP =
(PVT == MVT::i64)
? PPC::X30
: (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29
: PPC::R30);
MachineInstrBuilder MIB;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
const int64_t SPOffset = 2 * PVT.getStoreSize();
const int64_t TOCOffset = 3 * PVT.getStoreSize();
const int64_t BPOffset = 4 * PVT.getStoreSize();
Register BufReg = MI.getOperand(0).getReg();
// Reload FP (the jumped-to function may not have had a
// frame pointer, and if so, then its r31 will be restored
// as necessary).
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP)
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP)
// Reload IP
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp)
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp)
// Reload SP
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP)
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP)
// Reload BP
if (PVT == MVT::i64) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP)
} else {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP)
// Reload TOC
if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
// Jump
BuildMI(*MBB, MI, DL,
TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp);
BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));
return MBB;
/// Reports whether inline stack probes should be emitted for \p MF.
///
/// Returns false when the function carries no "probe-stack" attribute.
/// Otherwise the result is a comparison of the attribute's string value.
/// NOTE(review): the right-hand side of that comparison is not visible in
/// this copy of the source — confirm the expected value against upstream.
bool PPCTargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
// If the function specifically requests inline stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
return false;
/// Computes the stack probe interval for \p MF.
///
/// Starts from 4096 and, when the function has a "stack-probe-size"
/// attribute, parses an integer into the same variable. The value is then
/// rounded down to a multiple of the stack alignment, which is asserted to be
/// a power of two. If rounding yields 0, the stack alignment itself is
/// returned, so the result is never 0.
unsigned PPCTargetLowering::getStackProbeSize(MachineFunction &MF) const {
const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
unsigned StackAlign = TFI->getStackAlignment();
assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
"Unexpected stack alignment");
// The default stack probe size is 4096 if the function has no
// stack-probe-size attribute.
unsigned StackProbeSize = 4096;
const Function &Fn = MF.getFunction();
// NOTE(review): the start of the expression that reads the attribute is not
// visible in this copy; only the trailing getAsInteger call remains.
if (Fn.hasFnAttribute("stack-probe-size"))
.getAsInteger(0, StackProbeSize);
// Round down to the stack alignment.
StackProbeSize &= ~(StackAlign - 1);
return StackProbeSize ? StackProbeSize : StackAlign;
// Lower dynamic stack allocation with probing. `emitProbedAlloca` is split
// into three phases. In the first phase, it uses the pseudo instruction
// PREPARE_PROBED_ALLOCA to get the future result of the actual FramePointer
// and FinalStackPtr. In the second phase, it generates a loop for probing
// blocks. Finally, it uses the pseudo instruction DYNAREAOFFSET to get the
// future result of MaxCallFrameSize so that it can calculate the correct data
// area pointer.
//
// Operand 0 of MI is the destination register and operand 1 is the register
// holding the negated allocation size. Three blocks (TestMBB, BlockMBB,
// TailMBB) are inserted after MBB; the instructions following MI are spliced
// to the end of TailMBB, which is returned.
MachineBasicBlock *
PPCTargetLowering::emitProbedAlloca(MachineInstr &MI,
MachineBasicBlock *MBB) const {
const bool isPPC64 = Subtarget.isPPC64();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
const unsigned ProbeSize = getStackProbeSize(*MF);
const BasicBlock *ProbedBB = MBB->getBasicBlock();
MachineRegisterInfo &MRI = MF->getRegInfo();
// The CFG of probing stack looks as
// +-----+
// | MBB |
// +--+--+
// |
// +----v----+
// +--->+ TestMBB +---+
// | +----+----+ |
// | | |
// | +-----v----+ |
// +---+ BlockMBB | |
// +----------+ |
// |
// +---------+ |
// | TailMBB +<--+
// +---------+
// In MBB, calculate previous frame pointer and final stack pointer.
// In TestMBB, test if sp is equal to final stack pointer, if so, jump to
// TailMBB. In BlockMBB, update the sp atomically and jump back to TestMBB.
// TailMBB is spliced via \p MI.
MachineBasicBlock *TestMBB = MF->CreateMachineBasicBlock(ProbedBB);
MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(ProbedBB);
MachineBasicBlock *BlockMBB = MF->CreateMachineBasicBlock(ProbedBB);
// Layout order after MBB: TestMBB, BlockMBB, TailMBB.
MachineFunction::iterator MBBIter = ++MBB->getIterator();
MF->insert(MBBIter, TestMBB);
MF->insert(MBBIter, BlockMBB);
MF->insert(MBBIter, TailMBB);
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
Register DstReg = MI.getOperand(0).getReg();
Register NegSizeReg = MI.getOperand(1).getReg();
Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
Register FinalStackPtr = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
Register FramePointer = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
Register ActualNegSizeReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
// Since value of NegSizeReg might be realigned in prologepilog, insert a
// PREPARE_PROBED_ALLOCA pseudo instruction to get actual FramePointer and
// NegSize.
unsigned ProbeOpc;
if (!MRI.hasOneNonDBGUse(NegSizeReg))
ProbeOpc =
// By introducing PREPARE_PROBED_ALLOCA_NEGSIZE_OPT, ActualNegSizeReg
// and NegSizeReg will be allocated in the same phyreg to avoid
// redundant copy when NegSizeReg has only one use which is current MI and
// will be replaced by PREPARE_PROBED_ALLOCA then.
BuildMI(*MBB, {MI}, DL, TII->get(ProbeOpc), FramePointer)
// Calculate final stack pointer, which equals to SP + ActualNegSize.
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4),
// Materialize a scratch register for update.
int64_t NegProbeSize = -(int64_t)ProbeSize;
assert(isInt<32>(NegProbeSize) && "Unhandled probe size!");
Register ScratchReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
// A value that does not fit in 16 signed bits is built with LIS (upper half)
// followed by ORI (lower half); otherwise a single LI is used.
if (!isInt<16>(NegProbeSize)) {
Register TempReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS), TempReg)
.addImm(NegProbeSize >> 16);
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI),
.addImm(NegProbeSize & 0xFFFF);
} else
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LI8 : PPC::LI), ScratchReg)
// Probing leading residual part.
// The residual is computed with a divide, a multiply and a subtract-from, and
// one store-with-update on SPReg is emitted for it.
Register Div = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::DIVD : PPC::DIVW), Div)
Register Mul = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::MULLD : PPC::MULLW), Mul)
Register NegMod = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::SUBF8 : PPC::SUBF), NegMod)
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
// Remaining part should be multiple of ProbeSize.
Register CmpResult = MRI.createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(TestMBB, DL, TII->get(isPPC64 ? PPC::CMPD : PPC::CMPW), CmpResult)
BuildMI(TestMBB, DL, TII->get(PPC::BCC))
// Touch the block.
// |P...|P...|P...
BuildMI(BlockMBB, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
BuildMI(BlockMBB, DL, TII->get(PPC::B)).addMBB(TestMBB);
// Calculation of MaxCallFrameSize is deferred to prologepilog, use
// DYNAREAOFFSET pseudo instruction to get the future result.
Register MaxCallFrameSizeReg =
MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
BuildMI(TailMBB, DL,
BuildMI(TailMBB, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4), DstReg)
// Splice instructions after MI to TailMBB.
TailMBB->splice(TailMBB->end(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
// Delete the pseudo instruction.
return TailMBB;
MachineBasicBlock *
PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
if (MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
if (Subtarget.is64BitELFABI() &&
MI.getOpcode() == TargetOpcode::PATCHPOINT &&
!Subtarget.isUsingPCRelativeCalls()) {
// Call lowering should have added an r2 operand to indicate a dependence
// on the TOC base pointer value. It can't however, because there is no
// way to mark the dependence as implicit there, and so the stackmap code
// will confuse it with a regular operand. Instead, add the dependence
// here.
MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true));
return emitPatchPoint(MI, BB);
if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 ||
MI.getOpcode() == PPC::EH_SjLj_SetJmp64) {
return emitEHSjLjSetJmp(MI, BB);
} else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 ||
MI.getOpcode() == PPC::EH_SjLj_LongJmp64) {
return emitEHSjLjLongJmp(MI, BB);
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// To "insert" these instructions we actually have to insert their
// control-flow patterns.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = ++BB->getIterator();
MachineFunction *F = BB->getParent();
if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_I4 ||
MI.getOpcode() == PPC::SELECT_I8) {
SmallVector<MachineOperand, 2> Cond;
if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
MI.getOpcode() == PPC::SELECT_CC_I8)
DebugLoc dl = MI.getDebugLoc();
TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond,
MI.getOperand(2).getReg(), MI.getOperand(3).getReg());
} else if (MI.getOpcode() == PPC::SELECT_CC_F4 ||
MI.getOpcode() == PPC::SELECT_CC_F8 ||
MI.getOpcode() == PPC::SELECT_CC_F16 ||
MI.getOpcode() == PPC::SELECT_CC_QFRC ||
MI.getOpcode() == PPC::SELECT_CC_QSRC ||
MI.getOpcode() == PPC::SELECT_CC_QBRC ||
MI.getOpcode() == PPC::SELECT_CC_VRRC ||
MI.getOpcode() == PPC::SELECT_CC_VSFRC ||
MI.getOpcode() == PPC::SELECT_CC_VSSRC ||
MI.getOpcode() == PPC::SELECT_CC_VSRC ||
MI.getOpcode() == PPC::SELECT_CC_SPE4 ||
MI.getOpcode() == PPC::SELECT_CC_SPE ||
MI.getOpcode() == PPC::SELECT_F4 ||
MI.getOpcode() == PPC::SELECT_F8 ||
MI.getOpcode() == PPC::SELECT_F16 ||
MI.getOpcode() == PPC::SELECT_QFRC ||
MI.getOpcode() == PPC::SELECT_QSRC ||
MI.getOpcode() == PPC::SELECT_QBRC ||
MI.getOpcode() == PPC::SELECT_SPE ||
MI.getOpcode() == PPC::SELECT_SPE4 ||
MI.getOpcode() == PPC::SELECT_VRRC ||
MI.getOpcode() == PPC::SELECT_VSFRC ||
MI.getOpcode() == PPC::SELECT_VSSRC ||
MI.getOpcode() == PPC::SELECT_VSRC) {
// The incoming instruction knows the destination vreg to set, the
// condition code register to branch on, the true/false values to
// select between, and a branch opcode to use.
// thisMBB:
// ...
// TrueVal = ...
// cmpTY ccX, r1, r2
// bCC copy1MBB
// fallthrough --> copy0MBB
MachineBasicBlock *thisMBB = BB;
MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
DebugLoc dl = MI.getDebugLoc();
F->insert(It, copy0MBB);
F->insert(It, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
// Next, add the true and fallthrough blocks as its successors.
if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 ||
MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 ||
MI.getOpcode() == PPC::SELECT_F16 ||
MI.getOpcode() == PPC::SELECT_SPE4 ||
MI.getOpcode() == PPC::SELECT_SPE ||
MI.getOpcode() == PPC::SELECT_QFRC ||
MI.getOpcode() == PPC::SELECT_QSRC ||
MI.getOpcode() == PPC::SELECT_QBRC ||
MI.getOpcode() == PPC::SELECT_VRRC ||
MI.getOpcode() == PPC::SELECT_VSFRC ||
MI.getOpcode() == PPC::SELECT_VSSRC ||
MI.getOpcode() == PPC::SELECT_VSRC) {
BuildMI(BB, dl, TII->get(PPC::BC))
} else {
unsigned SelectPred = MI.getOperand(4).getImm();
BuildMI(BB, dl, TII->get(PPC::BCC))
// copy0MBB:
// %FalseValue = ...
// # fallthrough to sinkMBB
BB = copy0MBB;
// Update machine-CFG edges
// sinkMBB:
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
BB = sinkMBB;
BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg())
} else if (MI.getOpcode() == PPC::ReadTB) {
// To read the 64-bit time-base register on a 32-bit target, we read the
// two halves. Should the counter have wrapped while it was being read, we
// need to try again.
// ...
// readLoop:
// mfspr Rx,TBU # load from TBU
// mfspr Ry,TB # load from TB
// mfspr Rz,TBU # load from TBU
// cmpw crX,Rx,Rz # check if 'old'='new'
// bne readLoop # branch if they're not equal
// ...
MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
DebugLoc dl = MI.getDebugLoc();
F->insert(It, readMBB);
F->insert(It, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
BB = readMBB;
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
Register LoReg = MI.getOperand(0).getReg();
Register HiReg = MI.getOperand(1).getReg();
BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);
Register CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)
BuildMI(BB, dl, TII->get(PPC::BCC))
} else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::AND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::OR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8)
BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16)
BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32)
BB = EmitAtomicBinary(MI, BB, 4, 0);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0);
else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
(Subtarget.hasPartwordAtomics() &&
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
(Subtarget.hasPartwordAtomics() &&
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
auto LoadMnemonic = PPC::LDARX;
auto StoreMnemonic = PPC::STDCX;
switch (MI.getOpcode()) {
llvm_unreachable("Compare and swap of unknown size");
LoadMnemonic = PPC::LBARX;
StoreMnemonic = PPC::STBCX;
assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
LoadMnemonic = PPC::LHARX;
StoreMnemonic = PPC::STHCX;
assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
LoadMnemonic = PPC::LWARX;
StoreMnemonic = PPC::STWCX;
LoadMnemonic = PPC::LDARX;
StoreMnemonic = PPC::STDCX;
Register dest = MI.getOperand(0).getReg();
Register ptrA = MI.getOperand(1).getReg();
Register ptrB = MI.getOperand(2).getReg();
Register oldval = MI.getOperand(3).getReg();
Register newval = MI.getOperand(4).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, loop1MBB);
F->insert(It, loop2MBB);
F->insert(It, midMBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
// thisMBB:
// ...
// fallthrough --> loopMBB
// loop1MBB:
// l[bhwd]arx dest, ptr
// cmp[wd] dest, oldval
// bne- midMBB
// loop2MBB:
// st[bhwd]cx. newval, ptr
// bne- loopMBB
// b exitBB
// midMBB:
// st[bhwd]cx. dest, ptr
// exitBB:
BB = loop1MBB;
BuildMI(BB, dl, TII->get(LoadMnemonic), dest).addReg(ptrA).addReg(ptrB);
BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0)
BuildMI(BB, dl, TII->get(PPC::BCC))
BB = loop2MBB;
BuildMI(BB, dl, TII->get(StoreMnemonic))
BuildMI(BB, dl, TII->get(PPC::BCC))
BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
BB = midMBB;
BuildMI(BB, dl, TII->get(StoreMnemonic))
// exitMBB:
// ...
BB = exitMBB;
} else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
// We must use 64-bit registers for addresses when targeting 64-bit,
// since we're actually doing arithmetic on them. Other registers
// can be 32-bit.
bool is64bit = Subtarget.isPPC64();
bool isLittleEndian = Subtarget.isLittleEndian();
bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
Register dest = MI.getOperand(0).getReg();
Register ptrA = MI.getOperand(1).getReg();
Register ptrB = MI.getOperand(2).getReg();
Register oldval = MI.getOperand(3).getReg();
Register newval = MI.getOperand(4).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, loop1MBB);
F->insert(It, loop2MBB);
F->insert(It, midMBB);
F->insert(It, exitMBB);
exitMBB->splice(exitMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
MachineRegisterInfo &RegInfo = F->getRegInfo();
const TargetRegisterClass *RC =
is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
Register PtrReg = RegInfo.createVirtualRegister(RC);
Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
Register ShiftReg =
isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
Register NewVal2Reg = RegInfo.createVirtualRegister(GPRC);
Register NewVal3Reg = RegInfo.createVirtualRegister(GPRC);
Register OldVal2Reg = RegInfo.createVirtualRegister(GPRC);
Register OldVal3Reg = RegInfo.createVirtualRegister(GPRC);
Register MaskReg = RegInfo.createVirtualRegister(GPRC);
Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
Register Ptr1Reg;
Register TmpReg = RegInfo.createVirtualRegister(GPRC);
Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
// thisMBB:
// ...
// fallthrough --> loopMBB
// The 4-byte load must be aligned, while a char or short may be
// anywhere in the word. Hence all this nasty bookkeeping code.
// add ptr1, ptrA, ptrB [copy if ptrA==0]
// rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
// xori shift, shift1, 24 [16]
// rlwinm ptr, ptr1, 0, 0, 29
// slw newval2, newval, shift
// slw oldval2, oldval,shift
// li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
// slw mask, mask2, shift
// and newval3, newval2, mask
// and oldval3, oldval2, mask
// loop1MBB:
// lwarx tmpDest, ptr
// and tmp, tmpDest, mask
// cmpw tmp, oldval3
// bne- midMBB
// loop2MBB:
// andc tmp2, tmpDest, mask
// or tmp4, tmp2, newval3
// stwcx. tmp4, ptr
// bne- loop1MBB
// b exitBB
// midMBB:
// stwcx. tmpDest, ptr
// exitBB:
// srw dest, tmpDest, shift
if (ptrA != ZeroReg) {
Ptr1Reg = RegInfo.createVirtualRegister(RC);
BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
} else {
Ptr1Reg = ptrB;
// We need to use the 32-bit subregister to avoid a register class mismatch in
// 64-bit mode.
BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)
.addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0)
.addImm(is8bit ? 28 : 27);
if (!isLittleEndian)
BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)
.addImm(is8bit ? 24 : 16);
if (is64bit)
BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
if (is8bit)
BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
else {
BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
BB = loop1MBB;
BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
BuildMI(BB, dl, TII->get(PPC::AND), TmpReg)
BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0)
BuildMI(BB, dl, TII->get(PPC::BCC))
BB = loop2MBB;
BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)
BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg)
BuildMI(BB, dl, TII->get(PPC::STWCX))
BuildMI(BB, dl, TII->get(PPC::BCC))
BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
BB = midMBB;
BuildMI(BB, dl, TII->get(PPC::STWCX))
// exitMBB:
// ...
BB = exitMBB;
BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest)
} else if (MI.getOpcode() == PPC::FADDrtz) {
// This pseudo performs an FADD with rounding mode temporarily forced
// to round-to-zero. We emit this via custom inserter since the FPSCR
// is not modeled at the SelectionDAG level.
Register Dest = MI.getOperand(0).getReg();
Register Src1 = MI.getOperand(1).getReg();
Register Src2 = MI.getOperand(2).getReg();
DebugLoc dl = MI.getDebugLoc();
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
// Save FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);
// Set rounding mode to round-to-zero.
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31);
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30);
// Perform addition.
BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2);
// Restore FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg);
} else if (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT ||
MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8) {
unsigned Opcode = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8)
? PPC::ANDI8_rec
: PPC::ANDI_rec;
bool IsEQ = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8);
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register Dest = RegInfo.createVirtualRegister(
Opcode == PPC::ANDI_rec ? &PPC::GPRCRegClass : &PPC::G8RCRegClass);
DebugLoc Dl = MI.getDebugLoc();
BuildMI(*BB, MI, Dl, TII->get(Opcode), Dest)
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
.addReg(IsEQ ? PPC::CR0EQ : PPC::CR0GT);
} else if (MI.getOpcode() == PPC::TCHECK_RET) {
DebugLoc Dl = MI.getDebugLoc();
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
} else if (MI.getOpcode() == PPC::TBEGIN_RET) {
DebugLoc Dl = MI.getDebugLoc();
unsigned Imm = MI.getOperand(1).getImm();
BuildMI(*BB, MI, Dl, TII->get(PPC::TBEGIN)).addImm(Imm);
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
} else if (MI.getOpcode() == PPC::SETRNDi) {
DebugLoc dl = MI.getDebugLoc();
Register OldFPSCRReg = MI.getOperand(0).getReg();
// Save FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
// The floating point rounding mode is in the bits 62:63 of FPSCR, and has
// the following settings:
// 00 Round to nearest
// 01 Round to 0
// 10 Round to +inf
// 11 Round to -inf
// When the operand is an immediate, use the two least significant bits of
// the immediate to set bits 62:63 of FPSCR.
unsigned Mode = MI.getOperand(1).getImm();
BuildMI(*BB, MI, dl, TII->get((Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0))
BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0))
} else if (MI.getOpcode() == PPC::SETRND) {
DebugLoc dl = MI.getDebugLoc();
// Copy register from F8RCRegClass::SrcReg to G8RCRegClass::DestReg
// or copy register from G8RCRegClass::SrcReg to F8RCRegClass::DestReg.
// If the target doesn't have DirectMove, we should use stack to do the
// conversion, because the target doesn't have the instructions like mtvsrd
// or mfvsrd to do this conversion directly.
auto copyRegFromG8RCOrF8RC = [&] (unsigned DestReg, unsigned SrcReg) {
if (Subtarget.hasDirectMove()) {
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), DestReg)
} else {
// Use stack to do the register copy.
unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD;
MachineRegisterInfo &RegInfo = F->getRegInfo();
const TargetRegisterClass *RC = RegInfo.getRegClass(SrcReg);
if (RC == &PPC::F8RCRegClass) {
// Copy register from F8RCRegClass to G8RCRegclass.
assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) &&
"Unsupported RegClass.");
StoreOp = PPC::STFD;
LoadOp = PPC::LD;
} else {
// Copy register from G8RCRegClass to F8RCRegclass.
assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) &&
(RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) &&
"Unsupported RegClass.");
MachineFrameInfo &MFI = F->getFrameInfo();
int FrameIdx = MFI.CreateStackObject(8, Align(8), false);
MachineMemOperand *MMOStore = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
// Store the SrcReg into the stack.
BuildMI(*BB, MI, dl, TII->get(StoreOp))
MachineMemOperand *MMOLoad = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
// Load from the stack where SrcReg is stored, and save to DestReg,
// so we have done the RegClass conversion from RegClass::SrcReg to
// RegClass::DestReg.
BuildMI(*BB, MI, dl, TII->get(LoadOp), DestReg)
Register OldFPSCRReg = MI.getOperand(0).getReg();
// Save FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
// When the operand is a GPRC register, use the two least significant bits of
// the register and the mtfsf instruction to set bits 62:63 of FPSCR.
// copy OldFPSCRTmpReg, OldFPSCRReg
// (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1)
// rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 62
// copy NewFPSCRReg, NewFPSCRTmpReg
// mtfsf 255, NewFPSCRReg
MachineOperand SrcOp = MI.getOperand(1);
MachineRegisterInfo &RegInfo = F->getRegInfo();
Register OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);
Register ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
Register ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
// The first operand of INSERT_SUBREG should be a register which has
// subregisters, we only care about its RegClass, so we should use an
// IMPLICIT_DEF register.
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), ImDefReg);
BuildMI(*BB, MI, dl, TII->get(PPC::INSERT_SUBREG), ExtSrcReg)
Register NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg)
Register NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);
// The mask 255 means that bits 32:63 of NewFPSCRReg are copied into bits
// 32:63 of FPSCR.
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF))
} else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 ||
MI.getOpcode() == PPC::PROBED_ALLOCA_64) {
return emitProbedAlloca(MI, BB);
} else {
llvm_unreachable("Unexpected instr type to insert");
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
// Target Optimization Hooks
/// Pick how many refinement iterations to run on a hardware estimate for a
/// value of type \p VT.
///
/// \param VT         Value type being estimated; only its scalar type is
///                   inspected.
/// \param Subtarget  Queried for hasRecipPrec(), which selects the starting
///                   step count (1 when true, 3 otherwise).
/// \returns The step count.
///
/// NOTE(review): in this listing the statement controlled by the f64 check
/// below (and the function's closing brace) is not present, so the `if`
/// appears to guard the `return`. Presumably f64 is meant to receive an
/// additional step before an unconditional return -- confirm against the
/// upstream file before relying on this text.
static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
// For the estimates, convergence is quadratic, so we essentially double the
// number of digits correct after every iteration. For both FRE and FRSQRTE,
// the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
// this is 2^-14. IEEE float has 23 digits and double has 52 digits.
int RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
if (VT.getScalarType() == MVT::f64)
return RefinementSteps;
/// Produce a PPCISD::FRSQRTE node for \p Operand when the operand type is one
/// the current subtarget can estimate, otherwise return an empty SDValue.
///
/// \param Operand          Value whose estimate is requested; its value type
///                         drives the feature checks below.
/// \param DAG              DAG used to create the estimate node.
/// \param Enabled          Not read by the code visible here.
/// \param RefinementSteps  In/out. If it equals ReciprocalEstimate::Unspecified
///                         it is overwritten with the result of
///                         getEstimateRefinementSteps(); otherwise left as is.
/// \param UseOneConstNR    Out. Set to the negation of
///                         Subtarget.needsTwoConstNR() on the success path.
/// \param Reciprocal       Not read by the code visible here.
///
/// NOTE(review): closing braces are absent from this listing; the trailing
/// `return SDValue();` is presumably the fall-through for unsupported types.
SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
int Enabled, int &RefinementSteps,
bool &UseOneConstNR,
bool Reciprocal) const {
EVT VT = Operand.getValueType();
// Each clause pairs a value type with the subtarget feature that must be
// present for that type to be handled.
if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
(VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
(VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
(VT == MVT::v2f64 && Subtarget.hasVSX()) ||
(VT == MVT::v4f32 && Subtarget.hasQPX()) ||
(VT == MVT::v4f64 && Subtarget.hasQPX())) {
// Only fill in a step count when the caller did not supply one.
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
// The Newton-Raphson computation with a single constant does not provide
// enough accuracy on some CPUs.
UseOneConstNR = !Subtarget.needsTwoConstNR();
return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
return SDValue();
/// Produce a PPCISD::FRE node for \p Operand when the operand type is one the
/// current subtarget can estimate, otherwise return an empty SDValue.
///
/// \param Operand          Value whose estimate is requested; its value type
///                         drives the feature checks below.
/// \param DAG              DAG used to create the estimate node.
/// \param Enabled          Not read by the code visible here.
/// \param RefinementSteps  In/out. If it equals ReciprocalEstimate::Unspecified
///                         it is overwritten with the result of
///                         getEstimateRefinementSteps(); otherwise left as is.
///
/// NOTE(review): closing braces are absent from this listing; the trailing
/// `return SDValue();` is presumably the fall-through for unsupported types.
SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
int Enabled,
int &RefinementSteps) const {
EVT VT = Operand.getValueType();
// Each clause pairs a value type with the subtarget feature that must be
// present for that type to be handled.
if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
(VT == MVT::f64 && Subtarget.hasFRE()) ||
(VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
(VT == MVT::v2f64 && Subtarget.hasVSX()) ||
(VT == MVT::v4f32 && Subtarget.hasQPX()) ||
(VT == MVT::v4f64 && Subtarget.hasQPX())) {
// Only fill in a step count when the caller did not supply one.
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
return SDValue();
/// Report the number of FDIVs sharing a divisor at which they should be
/// rewritten as multiplications by the reciprocal. The value depends on the
/// CPU directive of the current subtarget: 2 for the directives listed
/// explicitly below, 3 on the other path.
///
/// NOTE(review): the label that should precede `return 3;` (presumably
/// `default:`) and the closing braces are not present in this listing --
/// confirm against the upstream file.
unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
// Note: This functionality is used only when unsafe-fp-math is enabled, and
// on cores with reciprocal estimates (which are used when unsafe-fp-math is
// enabled for division), this functionality is redundant with the default
// combiner logic (once the division -> reciprocal/multiply transformation
// has taken place). As a result, this matters more for older cores than for
// newer ones.
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal if there are two or more FDIVs (for embedded cores with only
// one FP pipeline) for three or more FDIVs (for generic OOO cores).
switch (Subtarget.getCPUDirective()) {
return 3;
case PPC::DIR_440:
case PPC::DIR_A2:
case PPC::DIR_E500:
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
return 2;
// isConsecutiveLSLoc needs to work even if all adds have not yet been
// collapsed, and so we need to look through chains of them.
//
// Decompose Loc into a base value plus an accumulated constant offset.
//   Loc    - address expression to decompose.
//   Base   - out: set to operand 0 of the innermost base-plus-constant node
//            found; left untouched when Loc is not of that form.
//   Offset - in/out: each constant found is *added* (sign-extended) to the
//            incoming value, so callers must initialise it.
//   DAG    - used for the isBaseWithConstantOffset() query.
static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
int64_t& Offset, SelectionDAG &DAG) {
if (DAG.isBaseWithConstantOffset(Loc)) {
Base = Loc.getOperand(0);
Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
// The base might itself be a base plus an offset, and if so, accumulate
// that as well.
getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG);
// Decide whether the address Loc lies exactly Dist * Bytes bytes after the
// base pointer of the load/store Base.
//   Loc   - address of the candidate access.
//   VT    - memory type of the candidate access; must be Bytes bytes wide.
//   Base  - reference load/store whose base pointer is compared against.
//   Bytes - expected access size in bytes.
//   Dist  - signed distance, in units of Bytes, between the two accesses.
//   DAG   - used for frame info, base+offset queries and target lowering info.
// Three address forms are tried in order: frame indices, base plus constant
// offset, and global address plus offset.
//
// NOTE(review): closing braces are absent from this listing, so the nesting
// shown here is flattened; the frame-index section presumably ends after its
// return statement.
static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
unsigned Bytes, int Dist,
SelectionDAG &DAG) {
// Reject accesses whose size differs from the requested one.
if (VT.getSizeInBits() / 8 != Bytes)
return false;
SDValue BaseLoc = Base->getBasePtr();
// Case 1: both addresses are frame indices; compare stack object offsets.
if (Loc.getOpcode() == ISD::FrameIndex) {
if (BaseLoc.getOpcode() != ISD::FrameIndex)
return false;
const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
int FI = cast<FrameIndexSDNode>(Loc)->getIndex();
int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
int FS = MFI.getObjectSize(FI);
int BFS = MFI.getObjectSize(BFI);
// Both stack objects must have the same size, and that size must be Bytes.
if (FS != BFS || FS != (int)Bytes) return false;
return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes);
// Case 2: same base value with constant offsets that differ by Dist * Bytes.
SDValue Base1 = Loc, Base2 = BaseLoc;
int64_t Offset1 = 0, Offset2 = 0;
getBaseWithConstantOffset(Loc, Base1, Offset1, DAG);
getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG);
if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes))
return true;
// Case 3: both addresses are the same global value plus an offset.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const GlobalValue *GV1 = nullptr;
const GlobalValue *GV2 = nullptr;
// Reset the offsets before reusing them for the global-address query.
Offset1 = 0;
Offset2 = 0;
bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
if (isGA1 && isGA2 && GV1 == GV2)
return Offset1 == (Offset2 + Dist*Bytes);
return false;
// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
// not enforce equality of the chain operands.
//
//   N     - candidate node: an LSBaseSDNode, or an INTRINSIC_W_CHAIN /
//           INTRINSIC_VOID node whose intrinsic ID is one of those listed.
//   Base  - reference load/store.
//   Bytes - expected access size in bytes.
//   Dist  - signed distance, in units of Bytes, from Base.
//   DAG   - forwarded to isConsecutiveLSLoc.
// For intrinsic nodes the memory type is derived from the intrinsic ID and
// the address is taken from operand 2 (chained intrinsics) or operand 3
// (void intrinsics). Any other node yields false.
//
// NOTE(review): this listing lacks the declaration of VT used by the two
// intrinsic sections, the terminators between case groups, and all closing
// braces. As shown, every case would fall through to the last assignment;
// presumably each group ends with a `break;` in the real file -- confirm
// against upstream before editing.
static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
unsigned Bytes, int Dist,
SelectionDAG &DAG) {
// Ordinary loads and stores carry their own memory type and base pointer.
if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) {
EVT VT = LS->getMemoryVT();
SDValue Loc = LS->getBasePtr();
return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
// Load-like intrinsics: operand 1 is the intrinsic ID.
if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default: return false;
case Intrinsic::ppc_qpx_qvlfd:
case Intrinsic::ppc_qpx_qvlfda:
VT = MVT::v4f64;
case Intrinsic::ppc_qpx_qvlfs:
case Intrinsic::ppc_qpx_qvlfsa:
VT = MVT::v4f32;
case Intrinsic::ppc_qpx_qvlfcd:
case Intrinsic::ppc_qpx_qvlfcda:
VT = MVT::v2f64;
case Intrinsic::ppc_qpx_qvlfcs:
case Intrinsic::ppc_qpx_qvlfcsa:
VT = MVT::v2f32;
case Intrinsic::ppc_qpx_qvlfiwa:
case Intrinsic::ppc_qpx_qvlfiwz:
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_vsx_lxvw4x:
case Intrinsic::ppc_vsx_lxvw4x_be:
VT = MVT::v4i32;
case Intrinsic::ppc_vsx_lxvd2x:
case Intrinsic::ppc_vsx_lxvd2x_be:
VT = MVT::v2f64;
case Intrinsic::ppc_altivec_lvebx:
VT = MVT::i8;
case Intrinsic::ppc_altivec_lvehx:
VT = MVT::i16;
case Intrinsic::ppc_altivec_lvewx:
VT = MVT::i32;
// The address is read from operand 2 of the chained intrinsic.
return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG);
// Store-like intrinsics: operand 1 is the intrinsic ID.
if (N->getOpcode() == ISD::INTRINSIC_VOID) {
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default: return false;
case Intrinsic::ppc_qpx_qvstfd:
case Intrinsic::ppc_qpx_qvstfda:
VT = MVT::v4f64;
case Intrinsic::ppc_qpx_qvstfs:
case Intrinsic::ppc_qpx_qvstfsa:
VT = MVT::v4f32;
case Intrinsic::ppc_qpx_qvstfcd:
case Intrinsic::ppc_qpx_qvstfcda:
VT = MVT::v2f64;
case Intrinsic::ppc_qpx_qvstfcs:
case Intrinsic::ppc_qpx_qvstfcsa:
VT = MVT::v2f32;
case Intrinsic::ppc_qpx_qvstfiw:
case Intrinsic::ppc_qpx_qvstfiwa:
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
case Intrinsic::ppc_vsx_stxvw4x:
VT = MVT::v4i32;
case Intrinsic::ppc_vsx_stxvd2x:
VT = MVT::v2f64;
case Intrinsic::ppc_vsx_stxvw4x_be:
VT = MVT::v4i32;
case Intrinsic::ppc_vsx_stxvd2x_be:
VT = MVT::v2f64;
case Intrinsic::ppc_altivec_stvebx:
VT = MVT::i8;
case Intrinsic::ppc_altivec_stvehx:
VT = MVT::i16;
case Intrinsic::ppc_altivec_stvewx:
VT = MVT::i32;
// The address is read from operand 3 of the void intrinsic.
return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
return false;
// Return true if there is a nearby consecutive load to the one provided
// (regardless of alignment). We search up and down the chain, looking through
// token factors and other loads (but nothing else). As a result, a true result
// indicates that it is safe to create a new consecutive load adjacent to the
// load provided.
//
//   LD  - the load to find a neighbour for; its chain is the search start and
//         its memory type's store size is the access size that must match.
//   DAG - forwarded to isConsecutiveLS.
// A candidate matches when isConsecutiveLS reports it at distance 1 from LD.
//
// NOTE(review): this listing has lost the bodies of several branches (the
// statements after the `Visited` checks, the worklist pushes, whatever
// populates LoadRoots and seeds Queue in the second phase) and all closing
// braces. The comments below describe only what is visible; confirm the
// missing statements against the upstream file before changing this code.
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
SDValue Chain = LD->getChain();
EVT VT = LD->getMemoryVT();
SmallSet<SDNode *, 16> LoadRoots;
// Worklist, seeded with the node that produces LD's chain.
SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
SmallSet<SDNode *, 16> Visited;
// First, search up the chain, branching to follow all token-factor operands.
// If we find a consecutive load, then we're done, otherwise, record all
// nodes just above the top-level loads and token factors.
while (!Queue.empty()) {
SDNode *ChainNext = Queue.pop_back_val();
// insert().second is false when the node was already in Visited.
if (!Visited.insert(ChainNext).second)
if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
return true;
if (!Visited.count(ChainLD->getChain().getNode()))
} else if (ChainNext->getOpcode() == ISD::TokenFactor) {
for (const SDUse &O : ChainNext->ops())
if (!Visited.count(O.getNode()))
} else
// Second, search down the chain, starting from the top-level nodes recorded
// in the first phase. These top-level nodes are the nodes just above all
// loads and token factors. Starting with their uses, recursively look through
// all loads (just the chain uses) and token factors to find a consecutive
// load.
for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(),
IE = LoadRoots.end(); I != IE; ++I) {
while (!Queue.empty()) {
SDNode *LoadRoot = Queue.pop_back_val();
if (!Visited.insert(LoadRoot).second)
if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
return true;
// Consider only users that are token factors, or memory nodes whose chain
// operand is LoadRoot itself, and that have not been visited yet.
for (SDNode::use_iterator UI = LoadRoot->use_begin(),
UE = LoadRoot->use_end(); UI != UE; ++UI)
if (((isa<MemSDNode>(*UI) &&
cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
return false;
/// This function is called when we have proved that a SETCC node can be replaced
/// by subtraction (and other supporting instructions) so that the result of
/// comparison is kept in a GPR instead of CR. This function is purely for
/// codegen purposes and has some flags to guide the codegen process.
///
/// \param N           The SETCC node (asserted); operands 0 and 1 are the
///                    values being compared.
/// \param Size        Bit width used for the final shift (Size - 1) and passed
///                    as a constant to the extension nodes.
/// \param Complement  When true, the shifted result is XORed with 1.
/// \param Swap        When true, the two extended operands are exchanged
///                    before subtracting.
/// \param DL          Debug location applied to every node created.
/// \param DAG         DAG in which the nodes are created.
/// \returns An i1 TRUNCATE of the computed bit.
///
/// NOTE(review): the ZERO_EXTEND nodes below are built with an extra constant
/// operand; ZERO_EXTEND is normally a single-operand node, so this looks
/// unintended -- confirm. Also, the arithmetic is hard-coded to MVT::i64 while
/// the shift amount comes from \p Size, so the two only agree when Size is 64
/// -- TODO confirm callers guarantee that.
static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
bool Swap, SDLoc &DL, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
// Zero extend the operands to the largest legal integer. Originally, they
// must be of a strictly smaller size.
auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0),
DAG.getConstant(Size, DL, MVT::i32));
auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1),
DAG.getConstant(Size, DL, MVT::i32));
// Swap if needed. Depends on the condition code.
if (Swap)
std::swap(Op0, Op1);
// Subtract extended integers.
auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1);
// Move the sign bit to the least significant position and zero out the rest.
// Now the least significant bit carries the result of original comparison.
auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode,
DAG.getConstant(Size - 1, DL, MVT::i32));
auto Final = Shifted;
// Complement the result if needed. Based on the condition code.
if (Complement)
Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted,
DAG.getConstant(1, DL, MVT::i64));
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final);
/// Try to replace the SETCC node \p N with the subtraction-based sequence
/// built by generateEquivalentSub().
///
/// The rewrite is attempted only after DAG legalization, only when every user
/// of \p N is a ZERO_EXTEND, and only when the compared operands are narrower
/// than the largest legal integer type reported by the data layout.
///
/// \param N    The SETCC node (asserted); operand 2 holds the condition code.
/// \param DCI  Combiner state; supplies the DAG and the legalization phase.
/// \returns The replacement value, or an empty SDValue when no rewrite applies.
///
/// NOTE(review): the `case` labels that select between the four
/// generateEquivalentSub() calls, and all closing braces, are absent from this
/// listing. Which condition code maps to which (Complement, Swap) pair cannot
/// be determined from here -- confirm against the upstream file.
SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
// Size of integers being compared has a critical role in the following
// analysis, so we prefer to do this when all types are legal.
if (!DCI.isAfterLegalizeDAG())
return SDValue();
// If all users of SETCC extend its value to a legal integer type
// then we replace SETCC with a subtraction
for (SDNode::use_iterator UI = N->use_begin(),
UE = N->use_end(); UI != UE; ++UI) {
if (UI->getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
auto OpSize = N->getOperand(0).getValueSizeInBits();
unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();
// Only operands strictly narrower than the widest legal integer qualify.
if (OpSize < Size) {
switch (CC) {
default: break;
return generateEquivalentSub(N, Size, false, false, DL, DAG);
return generateEquivalentSub(N, Size, true, true, DL, DAG);
return generateEquivalentSub(N, Size, false, true, DL, DAG);
return generateEquivalentSub(N, Size, true, false, DL, DAG);
return SDValue();
/// Combine for a truncation to i1 -- either an actual ISD::TRUNCATE or the
/// effective truncation performed by a SETCC / SELECT_CC -- whose operands are
/// a cluster of bitwise/select operations fed by extensions of i1 values.
/// When the cluster is self-contained, its operations are recreated with an
/// i1 result type so the values do not have to be moved into GPRs.
///
/// Requires Subtarget.useCRBits() (asserted below).
///
/// \param N   A TRUNCATE, SETCC or SELECT_CC node with i32 or i64 operands.
/// \param DCI Combiner state; supplies the DAG.
/// \returns An empty SDValue when the pattern does not match; for a TRUNCATE,
///          its (now promoted) operand; otherwise N itself as SDValue(N, 0).
///          For an unsigned SETCC whose high bits are not known to be zero,
///          the result of ConvertSETCCToSubtract is returned instead.
// NOTE(review): several statements of this function (closing braces,
// `continue`s, the pushes onto Inputs/BinOps/PromOps) are not visible in this
// listing; the comments below describe only what the visible lines establish.
SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
// If we're tracking CR bits, we need to be careful that we don't have:
// trunc(binary-ops(zext(x), zext(y)))
// or
// trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
// such that we're unnecessarily moving things into GPRs when it would be
// better to keep them in CR bits.
// Note that trunc here can be an actual i1 trunc, or can be the effective
// truncation that comes from a setcc or select_cc.
if (N->getOpcode() == ISD::TRUNCATE &&
N->getValueType(0) != MVT::i1)
return SDValue();
// Only i32 and i64 sources are handled.
if (N->getOperand(0).getValueType() != MVT::i32 &&
N->getOperand(0).getValueType() != MVT::i64)
return SDValue();
if (N->getOpcode() == ISD::SETCC ||
N->getOpcode() == ISD::SELECT_CC) {
// If we're looking at a comparison, then we need to make sure that the
// high bits (all except for the first) don't affect the result.
ISD::CondCode CC =
N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
unsigned OpBits = N->getOperand(0).getValueSizeInBits();
if (ISD::isSignedIntSetCC(CC)) {
// Signed compare: every bit of each operand must be a copy of the sign bit.
if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||
DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
return SDValue();
} else if (ISD::isUnsignedIntSetCC(CC)) {
// Unsigned compare: all bits above bit 0 must be known zero; otherwise fall
// back to the subtraction-based lowering (SETCC only).
if (!DAG.MaskedValueIsZero(N->getOperand(0),
APInt::getHighBitsSet(OpBits, OpBits-1)) ||
APInt::getHighBitsSet(OpBits, OpBits-1)))
return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
: SDValue());
} else {
// This is neither a signed nor an unsigned comparison, just make sure
// that the high bits are equal.
KnownBits Op1Known = DAG.computeKnownBits(N->getOperand(0));
KnownBits Op2Known = DAG.computeKnownBits(N->getOperand(1));
// We don't really care about what is known about the first bit (if
// anything), so clear it in all masks prior to comparing them.
Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0);
Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0);
if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One)
return SDValue();
// We now know that the higher-order bits are irrelevant, we just need to
// make sure that all of the intermediate operations are bit operations, and
// all inputs are extensions.
if (N->getOperand(0).getOpcode() != ISD::AND &&
N->getOperand(0).getOpcode() != ISD::OR &&
N->getOperand(0).getOpcode() != ISD::XOR &&
N->getOperand(0).getOpcode() != ISD::SELECT &&
N->getOperand(0).getOpcode() != ISD::SELECT_CC &&
N->getOperand(0).getOpcode() != ISD::TRUNCATE &&
N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND &&
N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
N->getOperand(0).getOpcode() != ISD::ANY_EXTEND)
return SDValue();
// For comparisons the second operand is subject to the same opcode filter.
if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
N->getOperand(1).getOpcode() != ISD::AND &&
N->getOperand(1).getOpcode() != ISD::OR &&
N->getOperand(1).getOpcode() != ISD::XOR &&
N->getOperand(1).getOpcode() != ISD::SELECT &&
N->getOperand(1).getOpcode() != ISD::SELECT_CC &&
N->getOperand(1).getOpcode() != ISD::TRUNCATE &&
N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND &&
N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
N->getOperand(1).getOpcode() != ISD::ANY_EXTEND)
return SDValue();
// Inputs:  leaves of the cluster.
// BinOps:  worklist of operations still to be visited.
// PromOps: operations that will be recreated with an i1 result.
// Visited: nodes already seen, also used for the self-containment check.
SmallVector<SDValue, 4> Inputs;
SmallVector<SDValue, 8> BinOps, PromOps;
SmallPtrSet<SDNode *, 16> Visited;
for (unsigned i = 0; i < 2; ++i) {
if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
N->getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
if (N->getOpcode() == ISD::TRUNCATE)
// Visit all inputs, collect all binary operations (and, or, xor and
// select) that are all fed by extensions.
while (!BinOps.empty()) {
SDValue BinOp = BinOps.back();
if (!Visited.insert(BinOp.getNode()).second)
for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
// The condition of the select is not promoted.
if (BinOp.getOpcode() == ISD::SELECT && i == 0)
if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
isa<ConstantSDNode>(BinOp.getOperand(i))) {
} else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
BinOp.getOperand(i).getOpcode() == ISD::OR ||
BinOp.getOperand(i).getOpcode() == ISD::XOR ||
BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
} else {
// We have an input that is not an extension or another binary
// operation; we'll abort this transformation.
return SDValue();
// Make sure that this is a self-contained cluster of operations (which
// is not quite the same thing as saying that everything has only one
// use).
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
if (isa<ConstantSDNode>(Inputs[i]))
for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
UE = Inputs[i].getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
// Any user outside the visited cluster (other than N) blocks the combine.
if (User != N && !Visited.count(User))
return SDValue();
// Make sure that we're not going to promote the non-output-value
// operand(s) of SELECT or SELECT_CC.
// FIXME: Although we could sometimes handle this, and it does occur in
// practice that one of the condition inputs to the select is also one of
// the outputs, we currently can't deal with this.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == Inputs[i])
return SDValue();
} else if (User->getOpcode() == ISD::SELECT_CC) {
if (User->getOperand(0) == Inputs[i] ||
User->getOperand(1) == Inputs[i])
return SDValue();
// Same self-containment check, applied to the operations being promoted.
for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
UE = PromOps[i].getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
if (User != N && !Visited.count(User))
return SDValue();
// Make sure that we're not going to promote the non-output-value
// operand(s) of SELECT or SELECT_CC.
// FIXME: Although we could sometimes handle this, and it does occur in
// practice that one of the condition inputs to the select is also one of
// the outputs, we currently can't deal with this.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == PromOps[i])
return SDValue();
} else if (User->getOpcode() == ISD::SELECT_CC) {
if (User->getOperand(0) == PromOps[i] ||
User->getOperand(1) == PromOps[i])
return SDValue();
// Replace all inputs with the extension operand.
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
// Constants may have users outside the cluster of to-be-promoted nodes,
// and so we need to replace those as we do the promotions.
if (isa<ConstantSDNode>(Inputs[i]))
DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
// NOTE(review): the operations are held as HandleSDNode entries while
// ReplaceAllUsesOfValueWith runs, presumably so they track replaced nodes --
// confirm.
std::list<HandleSDNode> PromOpHandles;
for (auto &PromOp : PromOps)
// Replace all operations (these are all the same, but have a different
// (i1) return type). DAG.getNode will validate that the types of
// a binary operator match, so go through the list in reverse so that
// we've likely promoted both operands first. Any intermediate truncations or
// extensions disappear.
while (!PromOpHandles.empty()) {
SDValue PromOp = PromOpHandles.back().getValue();
if (PromOp.getOpcode() == ISD::TRUNCATE ||
PromOp.getOpcode() == ISD::SIGN_EXTEND ||
PromOp.getOpcode() == ISD::ZERO_EXTEND ||
PromOp.getOpcode() == ISD::ANY_EXTEND) {
if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&
PromOp.getOperand(0).getValueType() != MVT::i1) {
// The operand is not yet ready (see comment below).
SDValue RepValue = PromOp.getOperand(0);
if (isa<ConstantSDNode>(RepValue))
RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue);
DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue);
// C is the index of the first value operand to promote: 0 for plain binary
// operations, 1 for SELECT, 2 for SELECT_CC.
unsigned C;
switch (PromOp.getOpcode()) {
default: C = 0; break;
case ISD::SELECT: C = 1; break;
case ISD::SELECT_CC: C = 2; break;
if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
PromOp.getOperand(C).getValueType() != MVT::i1) ||
(!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
PromOp.getOperand(C+1).getValueType() != MVT::i1)) {
// The to-be-promoted operands of this node have not yet been
// promoted (this should be rare because we're going through the
// list backward, but if one of the operands has several users in
// this cluster of to-be-promoted nodes, it is possible).
SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
// If there are any constant inputs, make sure they're replaced now.
for (unsigned i = 0; i < 2; ++i)
if (isa<ConstantSDNode>(Ops[C+i]))
Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);
DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops));
// Now we're left with the initial truncation itself.
if (N->getOpcode() == ISD::TRUNCATE)
return N->getOperand(0);
// Otherwise, this is a comparison. The operands to be compared have just
// changed type (to i1), but everything else is the same.
return SDValue(N, 0);
/// Combine for an extension to i32/i64 whose operand is a cluster of
/// bitwise/select operations fed by truncations. The operations are recreated
/// at the extended type so the values can remain in GPRs. A final mask (zero
/// extension) or shift pair (sign extension) is added only when the inputs are
/// not already known to be suitably extended.
///
/// Applies when the operand is i1 and CR bits are in use, or when the operand
/// is i32 on PPC64.
///
/// \param N   The extension node (result type i32 or i64).
/// \param DCI Combiner state; supplies the DAG.
/// \returns An empty SDValue when the pattern does not match; otherwise the
///          promoted operand, optionally wrapped in the AND or SHL/SRA
///          sequence built at the end.
// NOTE(review): several statements of this function (closing braces,
// `continue`s, the pushes onto Inputs/BinOps/PromOps, the SelectTruncOp
// insertions) are not visible in this listing; the comments below describe
// only what the visible lines establish.
SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
// If we're tracking CR bits, we need to be careful that we don't have:
// zext(binary-ops(trunc(x), trunc(y)))
// or
// zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
// such that we're unnecessarily moving things into CR bits that can more
// efficiently stay in GPRs. Note that if we're not certain that the high
// bits are set as required by the final extension, we still may need to do
// some masking to get the proper behavior.
// This same functionality is important on PPC64 when dealing with
// 32-to-64-bit extensions; these occur often when 32-bit values are used as
// the return values of functions. Because it is so similar, it is handled
// here as well.
if (N->getValueType(0) != MVT::i32 &&
N->getValueType(0) != MVT::i64)
return SDValue();
if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||
(N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64())))
return SDValue();
// The extended value must itself be a bitwise operation or a select.
if (N->getOperand(0).getOpcode() != ISD::AND &&
N->getOperand(0).getOpcode() != ISD::OR &&
N->getOperand(0).getOpcode() != ISD::XOR &&
N->getOperand(0).getOpcode() != ISD::SELECT &&
N->getOperand(0).getOpcode() != ISD::SELECT_CC)
return SDValue();
// Inputs:  leaves of the cluster.
// BinOps:  worklist, seeded with the operand of N.
// PromOps: operations that will be recreated at the extended type.
// Visited: nodes already seen, also used for the self-containment check.
SmallVector<SDValue, 4> Inputs;
SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;
SmallPtrSet<SDNode *, 16> Visited;
// Visit all inputs, collect all binary operations (and, or, xor and
// select) that are all fed by truncations.
while (!BinOps.empty()) {
SDValue BinOp = BinOps.back();
if (!Visited.insert(BinOp.getNode()).second)
for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
// The condition of the select is not promoted.
if (BinOp.getOpcode() == ISD::SELECT && i == 0)
if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
isa<ConstantSDNode>(BinOp.getOperand(i))) {
} else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
BinOp.getOperand(i).getOpcode() == ISD::OR ||
BinOp.getOperand(i).getOpcode() == ISD::XOR ||
BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
} else {
// We have an input that is not a truncation or another binary
// operation; we'll abort this transformation.
return SDValue();
// The operands of a select that must be truncated when the select is
// promoted because the operand is actually part of the to-be-promoted set.
DenseMap<SDNode *, EVT> SelectTruncOp[2];
// Make sure that this is a self-contained cluster of operations (which
// is not quite the same thing as saying that everything has only one
// use).
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
if (isa<ConstantSDNode>(Inputs[i]))
for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
UE = Inputs[i].getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
// Any user outside the visited cluster (other than N) blocks the combine.
if (User != N && !Visited.count(User))
return SDValue();
// If we're going to promote the non-output-value operand(s) of SELECT or
// SELECT_CC, record them for truncation.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == Inputs[i])
} else if (User->getOpcode() == ISD::SELECT_CC) {
if (User->getOperand(0) == Inputs[i])
if (User->getOperand(1) == Inputs[i])
// Same self-containment check, applied to the operations being promoted.
for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
UE = PromOps[i].getNode()->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
if (User != N && !Visited.count(User))
return SDValue();
// If we're going to promote the non-output-value operand(s) of SELECT or
// SELECT_CC, record them for truncation.
if (User->getOpcode() == ISD::SELECT) {
if (User->getOperand(0) == PromOps[i])
} else if (User->getOpcode() == ISD::SELECT_CC) {
if (User->getOperand(0) == PromOps[i])
if (User->getOperand(1) == PromOps[i])
// PromBits: width of the value being extended (the pre-promotion type).
unsigned PromBits = N->getOperand(0).getValueSizeInBits();
bool ReallyNeedsExt = false;
if (N->getOpcode() != ISD::ANY_EXTEND) {
// If all of the inputs are not already sign/zero extended, then
// we'll still need to do that at the end.
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
if (isa<ConstantSDNode>(Inputs[i]))
unsigned OpBits =
assert(PromBits < OpBits && "Truncation not to a smaller bit count?");
if ((N->getOpcode() == ISD::ZERO_EXTEND &&
OpBits-PromBits))) ||
(N->getOpcode() == ISD::SIGN_EXTEND &&
DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) <
(OpBits-(PromBits-1)))) {
ReallyNeedsExt = true;
// Replace all inputs, either with the truncation operand, or a
// truncation or extension to the final output type.
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
// Constant inputs need to be replaced with the to-be-promoted nodes that
// use them because they might have users outside of the cluster of
// promoted nodes.
if (isa<ConstantSDNode>(Inputs[i]))
SDValue InSrc = Inputs[i].getOperand(0);
if (Inputs[i].getValueType() == N->getValueType(0))
DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc);
else if (N->getOpcode() == ISD::SIGN_EXTEND)
DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0)));
else if (N->getOpcode() == ISD::ZERO_EXTEND)
DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0)));
DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));
// NOTE(review): the operations are held as HandleSDNode entries while
// ReplaceAllUsesOfValueWith runs, presumably so they track replaced nodes --
// confirm.
std::list<HandleSDNode> PromOpHandles;
for (auto &PromOp : PromOps)
// Replace all operations (these are all the same, but have a different
// (promoted) return type). DAG.getNode will validate that the types of
// a binary operator match, so go through the list in reverse so that
// we've likely promoted both operands first.
while (!PromOpHandles.empty()) {
SDValue PromOp = PromOpHandles.back().getValue();
// C is the index of the first value operand to promote: 0 for plain binary
// operations, 1 for SELECT, 2 for SELECT_CC.
unsigned C;
switch (PromOp.getOpcode()) {
default: C = 0; break;
case ISD::SELECT: C = 1; break;
case ISD::SELECT_CC: C = 2; break;
if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||
(!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {
// The to-be-promoted operands of this node have not yet been
// promoted (this should be rare because we're going through the
// list backward, but if one of the operands has several users in
// this cluster of to-be-promoted nodes, it is possible).
// For SELECT and SELECT_CC nodes, we do a similar check for any
// to-be-promoted comparison inputs.
if (PromOp.getOpcode() == ISD::SELECT ||
PromOp.getOpcode() == ISD::SELECT_CC) {
if ((SelectTruncOp[0].count(PromOp.getNode()) &&
PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
(SelectTruncOp[1].count(PromOp.getNode()) &&
PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
// If this node has constant inputs, then they'll need to be promoted here.
for (unsigned i = 0; i < 2; ++i) {
if (!isa<ConstantSDNode>(Ops[C+i]))
if (Ops[C+i].getValueType() == N->getValueType(0))
if (N->getOpcode() == ISD::SIGN_EXTEND)
Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
else if (N->getOpcode() == ISD::ZERO_EXTEND)
Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
// If we've promoted the comparison inputs of a SELECT or SELECT_CC,
// truncate them again to the original value type.
if (PromOp.getOpcode() == ISD::SELECT ||
PromOp.getOpcode() == ISD::SELECT_CC) {
auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
if (SI0 != SelectTruncOp[0].end())
Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
if (SI1 != SelectTruncOp[1].end())
Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
// Now we're left with the initial extension itself.
if (!ReallyNeedsExt)
return N->getOperand(0);
// To zero extend, just mask off everything except for the first bit (in the
// i1 case).
if (N->getOpcode() == ISD::ZERO_EXTEND)
return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
N->getValueSizeInBits(0), PromBits),
dl, N->getValueType(0)));
assert(N->getOpcode() == ISD::SIGN_EXTEND &&
"Invalid extension type");
// Sign extend by shifting the PromBits-wide value to the top of the register
// (SHL) and arithmetically shifting it back down (SRA).
EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());
SDValue ShiftCst =
DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
return DAG.getNode(
ISD::SRA, dl, N->getValueType(0),
DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst),
/// Target combine for ISD::SETCC.
///
/// For equality comparisons (SETEQ / SETNE) against a single-use negation
/// `0 - y`, the comparison is rewritten as `x + y ==/!= 0`. All other cases
/// are forwarded to DAGCombineTruncBoolExt.
///
/// \param N   The node to combine; asserted to be an ISD::SETCC.
/// \param DCI Combiner state; supplies the DAG.
/// \returns The replacement value, or whatever DAGCombineTruncBoolExt returns.
SDValue PPCTargetLowering::combineSetCC(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::SETCC &&
"Should be called with a SETCC node");
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
if (CC == ISD::SETNE || CC == ISD::SETEQ) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
// If there is a '0 - y' pattern, canonicalize the pattern to the RHS.
// (Swapping is safe because only SETEQ / SETNE reach this point.)
if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
std::swap(LHS, RHS);
// x == 0-y --> x+y == 0
// x != 0-y --> x+y != 0
if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
RHS.hasOneUse()) {
SDLoc DL(N);
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
EVT OpVT = LHS.getValueType();
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
return DAGCombineTruncBoolExt(N, DCI);
// Is this an extending load from an f32 to an f64?
// Returns true when Op is a LoadSDNode whose extension type is ISD::EXTLOAD
// and whose result type is f64; returns false for any other node.
// NOTE(review): the visible check does not inspect the in-memory type, so
// "from an f32" is presumably implied by EXTLOAD-to-f64 on this target --
// confirm.
static bool isFPExtLoad(SDValue Op) {
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()))
return LD->getExtensionType() == ISD::EXTLOAD &&
Op.getValueType() == MVT::f64;
return false;
/// Reduces the number of fp-to-int conversions when building a vector.
/// If this vector is built out of floating to integer conversions,
/// transform it to a vector built out of floating point values followed by a
/// single floating to integer conversion of the vector.
/// Namely (build_vector (fptosi $A), (fptosi $B), ...)
/// becomes (fptosi (build_vector ($A, $B, ...)))
///
/// \param N   The node to combine; asserted to be an ISD::BUILD_VECTOR whose
///            first operand is a PPCISD::MFVSR.
/// \param DCI Combiner state; supplies the DAG.
/// \returns The vector conversion node, or an empty SDValue when the operands
///          do not all use the same conversion, when the vector is a splat,
///          or when a 32-bit conversion is not fed by an extending load.
SDValue PPCTargetLowering::
combineElementTruncationToVectorTruncation(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
"Should be called with a BUILD_VECTOR node");
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue FirstInput = N->getOperand(0);
assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
"The input operand must be an fp-to-int conversion.");
// This combine happens after legalization so the fp_to_[su]i nodes are
// already converted to PPCISD nodes.
unsigned FirstConversion = FirstInput.getOperand(0).getOpcode();
if (FirstConversion == PPCISD::FCTIDZ ||
FirstConversion == PPCISD::FCTIDUZ ||
FirstConversion == PPCISD::FCTIWZ ||
FirstConversion == PPCISD::FCTIWUZ) {
bool IsSplat = true;
// The two FCTIW* opcodes are the 32-bit conversions handled specially below.
bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
FirstConversion == PPCISD::FCTIWUZ;
EVT SrcVT = FirstInput.getOperand(0).getValueType();
SmallVector<SDValue, 4> Ops;
EVT TargetVT = N->getValueType(0);
// First pass: validate that every operand has the same shape as the first.
for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
SDValue NextOp = N->getOperand(i);
if (NextOp.getOpcode() != PPCISD::MFVSR)
return SDValue();
unsigned NextConversion = NextOp.getOperand(0).getOpcode();
if (NextConversion != FirstConversion)
return SDValue();
// If we are converting to 32-bit integers, we need to add an FP_ROUND.
// This is not valid if the input was originally double precision. It is
// also not profitable to do unless this is an extending load in which
// case doing this combine will allow us to combine consecutive loads.
if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0)))
return SDValue();
if (N->getOperand(i) != FirstInput)
IsSplat = false;
// If this is a splat, we leave it as-is since there will be only a single
// fp-to-int conversion followed by a splat of the integer. This is better
// for 32-bit and smaller ints and neutral for 64-bit ints.
if (IsSplat)
return SDValue();
// Now that we know we have the right type of node, get its operands
for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
SDValue In = N->getOperand(i).getOperand(0);
if (Is32Bit) {
// For 32-bit values, we need to add an FP_ROUND node (if we made it
// here, we know that all inputs are extending loads so this is safe).
if (In.isUndef())
else {
SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl,
MVT::f32, In.getOperand(0),
DAG.getIntPtrConstant(1, dl));
} else
Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0));
// FCTIDZ / FCTIWZ select the signed vector conversion; FP_TO_UINT is the
// other assignment below.
unsigned Opcode;
if (FirstConversion == PPCISD::FCTIDZ ||
FirstConversion == PPCISD::FCTIWZ)
Opcode = ISD::FP_TO_SINT;
Opcode = ISD::FP_TO_UINT;
// v2i64 results are built from v2f64; every other result type uses v4f32.
EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;
SDValue BV = DAG.getBuildVector(NewVT, dl, Ops);
return DAG.getNode(Opcode, dl, TargetVT, BV);
return SDValue();
/// Reduce the number of loads when building a vector.
/// Building a vector out of multiple loads can be converted to a load
/// of the vector type if the loads are consecutive. If the loads are
/// consecutive but in descending order, a shuffle is added at the end
/// to reorder the vector.
///
/// The operands may be plain loads or fp_round(extload) nodes, but the two
/// forms cannot be mixed within one build_vector.
///
/// \param N   The node to combine; asserted to be an ISD::BUILD_VECTOR.
/// \param DAG The SelectionDAG in which new nodes are created.
/// \returns The wide load (optionally shuffled), or an empty SDValue when the
///          operands are not consecutive loads in either direction.
static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
"Should be called with a BUILD_VECTOR node");
SDLoc dl(N);
// Return early for non byte-sized type, as they can't be consecutive.
if (!N->getValueType(0).getVectorElementType().isByteSized())
return SDValue();
// Both flags start true and are cleared as soon as one adjacent pair breaks
// the corresponding ordering.
bool InputsAreConsecutiveLoads = true;
bool InputsAreReverseConsecutive = true;
unsigned ElemSize = N->getValueType(0).getScalarType().getStoreSize();
SDValue FirstInput = N->getOperand(0);
bool IsRoundOfExtLoad = false;
if (FirstInput.getOpcode() == ISD::FP_ROUND &&
FirstInput.getOperand(0).getOpcode() == ISD::LOAD) {
// NOTE(review): the dyn_cast result is dereferenced without a null check;
// the ISD::LOAD opcode test above is what guards it.
LoadSDNode *LD = dyn_cast<LoadSDNode>(FirstInput.getOperand(0));
IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD;
// Not a build vector of (possibly fp_rounded) loads.
if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) ||
N->getNumOperands() == 1)
return SDValue();
// Compare each operand with its predecessor, starting at the second one.
for (int i = 1, e = N->getNumOperands(); i < e; ++i) {
// If any inputs are fp_round(extload), they all must be.
if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND)
return SDValue();
SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) :
if (NextInput.getOpcode() != ISD::LOAD)
return SDValue();
SDValue PreviousInput =
IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1);
// NOTE(review): LD2 is dereferenced below without a null check; the
// ISD::LOAD opcode test on NextInput above is what guards it.
LoadSDNode *LD1 = dyn_cast<LoadSDNode>(PreviousInput);
LoadSDNode *LD2 = dyn_cast<LoadSDNode>(NextInput);
// If any inputs are fp_round(extload), they all must be.
if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)
return SDValue();
// The same predicate is queried with the arguments in both orders: once for
// the ascending layout and once for the descending one.
if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG))
InputsAreConsecutiveLoads = false;
if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG))
InputsAreReverseConsecutive = false;
// Exit early if the loads are neither consecutive nor reverse consecutive.
if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)
return SDValue();
assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&
"The loads cannot be both consecutive and reverse consecutive.");
SDValue FirstLoadOp =
IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput;
SDValue LastLoadOp =
IsRoundOfExtLoad ? N->getOperand(N->getNumOperands()-1).getOperand(0) :
LoadSDNode *LD1 = dyn_cast<LoadSDNode>(FirstLoadOp);
LoadSDNode *LDL = dyn_cast<LoadSDNode>(LastLoadOp);
// Ascending order: the wide load uses the first element's chain and pointer.
if (InputsAreConsecutiveLoads) {
assert(LD1 && "Input needs to be a LoadSDNode.");
return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(),
LD1->getBasePtr(), LD1->getPointerInfo(),
// Descending order: the wide load uses the last element's chain and pointer,
// then a shuffle built from a descending index loop reorders the elements.
if (InputsAreReverseConsecutive) {
assert(LDL && "Input needs to be a LoadSDNode.");
SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(),
LDL->getBasePtr(), LDL->getPointerInfo(),
SmallVector<int, 16> Ops;
for (int i = N->getNumOperands() - 1; i >= 0; i--)
return DAG.getVectorShuffle(N->getValueType(0), dl, Load,
DAG.getUNDEF(N->getValueType(0)), Ops);
return SDValue();
// This function adds the required vector_shuffle needed to get
// the elements of the vector extract in the correct position
// as specified by the CorrectElems encoding.
//
// N            - the build_vector being combined; supplies the operand count
//                and the result type.
// DAG          - the SelectionDAG in which new nodes are created.
// Input        - the source vector that the elements are extracted from.
// Elems        - packed element indices actually extracted, one byte per
//                build_vector operand, consumed from the low byte upwards.
// CorrectElems - packed element indices the instruction requires, in the same
//                byte-per-operand layout.
// Within each byte the low nibble is read on little-endian targets; the high
// nibble is used by the other assignment in the loop.
// Returns a SIGN_EXTEND_INREG node over the bitcast shuffle.
static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
SDValue Input, uint64_t Elems,
uint64_t CorrectElems) {
SDLoc dl(N);
unsigned NumElems = Input.getValueType().getVectorNumElements();
// Start with every lane undefined (-1); only the required lanes are filled.
SmallVector<int, 16> ShuffleMask(NumElems, -1);
// Knowing the element indices being extracted from the original
// vector and the order in which they're being inserted, just put
// them at element indices required for the instruction.
for (unsigned i = 0; i < N->getNumOperands(); i++) {
if (DAG.getDataLayout().isLittleEndian())
ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
// Advance both encodings to the next operand's byte.
CorrectElems = CorrectElems >> 8;
Elems = Elems >> 8;
SDValue Shuffle =
DAG.getVectorShuffle(Input.getValueType(), dl, Input,
DAG.getUNDEF(Input.getValueType()), ShuffleMask);
EVT VT = N->getValueType(0);
SDValue Conv = DAG.getBitcast(VT, Shuffle);
EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Conv,
// Look for build vector patterns where input operands come from sign
// extended vector_extract elements of specific indices. If the correct indices
// aren't used, add a vector shuffle to fix up the indices and create
// SIGN_EXTEND_INREG node which selects the vector sign extend instructions
// during instruction selection.
//
// Returns the node built by addShuffleForVecExtend when a shuffle is needed,
// and an empty SDValue otherwise (pattern mismatch, unsupported element-size
// pair, or indices already correct).
static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
// This array encodes the indices that the vector sign extend instructions
// extract from when extending from one type to another for both BE and LE.
// The right nibble of each byte corresponds to the LE indices,
// and the left nibble of each byte corresponds to the BE indices.
// For example: 0x3074B8FC byte->word
// For LE: the allowed indices are: 0x0,0x4,0x8,0xC
// For BE: the allowed indices are: 0x3,0x7,0xB,0xF
// For example: 0x000070F8 byte->double word
// For LE: the allowed indices are: 0x0,0x8
// For BE: the allowed indices are: 0x7,0xF
uint64_t TargetElems[] = {
0x3074B8FC, // b->w
0x000070F8, // b->d
0x10325476, // h->w
0x00003074, // h->d
0x00001032, // w->d
// Elems accumulates the extracted indices, one byte per operand, in the same
// nibble layout as TargetElems. Input is the common source vector, set by the
// first operand that matches.
uint64_t Elems = 0;
int Index;
SDValue Input;
// Returns true if Op is a sign extension of an extract_vector_elt with a
// constant index from the common Input vector, recording the index in Elems.
auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
if (!Op)
return false;
if (Op.getOpcode() != ISD::SIGN_EXTEND &&
Op.getOpcode() != ISD::SIGN_EXTEND_INREG)
return false;
// A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value
// of the right width.
SDValue Extract = Op.getOperand(0);
if (Extract.getOpcode() == ISD::ANY_EXTEND)
Extract = Extract.getOperand(0);
if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return false;
ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
if (!ExtOp)
return false;
Index = ExtOp->getZExtValue();
// Every operand must extract from the same source vector.
if (Input && Input != Extract.getOperand(0))
return false;
if (!Input)
Input = Extract.getOperand(0);
// Append this index as the new low byte: low nibble on little-endian, high
// nibble otherwise.
Elems = Elems << 8;
Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
Elems |= Index;
return true;
// If the build vector operands aren't sign extended vector extracts,
// of the same input vector, then return.
for (unsigned i = 0; i < N->getNumOperands(); i++) {
if (!isSExtOfVecExtract(N->getOperand(i))) {
return SDValue();
// If the vector extract indices are not correct, add the appropriate
// vector_shuffle.
// The sum of the input and output element widths identifies the extension
// kind, e.g. 8 + 32 == 40 selects the b->w entry of TargetElems.
int TgtElemArrayIdx;
int InputSize = Input.getValueType().getScalarSizeInBits();
int OutputSize = N->getValueType(0).getScalarSizeInBits();
if (InputSize + OutputSize == 40)
TgtElemArrayIdx = 0;
else if (InputSize + OutputSize == 72)
TgtElemArrayIdx = 1;
else if (InputSize + OutputSize == 48)
TgtElemArrayIdx = 2;
else if (InputSize + OutputSize == 80)
TgtElemArrayIdx = 3;
else if (InputSize + OutputSize == 96)
TgtElemArrayIdx = 4;
return SDValue();
// Keep only the nibbles that apply to the current endianness so the value can
// be compared directly with Elems.
uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
CorrectElems = DAG.getDataLayout().isLittleEndian()
? CorrectElems & 0x0F0F0F0F0F0F0F0F
: CorrectElems & 0xF0F0F0F0F0F0F0F0;
if (Elems != CorrectElems) {
return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
// Regular lowering will catch cases where a shuffle is not needed.
return SDValue();
/// Target combine for ISD::BUILD_VECTOR. Does nothing without VSX.
///
/// Tries, in order:
///   1. folding per-element fp-to-int conversions into one vector conversion
///      (combineElementTruncationToVectorTruncation);
///   2. replacing consecutive element loads with one vector load
///      (combineBVOfConsecutiveLoads);
///   3. using the P9 vector sign-extend instructions (combineBVOfVecSExt),
///      only with P9 Altivec and not before legalization;
///   4. for v2f64 results, turning two int-to-fp conversions of elements 0/1
///      or 2/3 of one v4i32 vector into a single node on that vector.
///
/// \param N   The node to combine; asserted to be an ISD::BUILD_VECTOR.
/// \param DCI Combiner state; supplies the DAG and the legalization phase.
/// \returns The replacement value, or an empty SDValue if nothing matched.
SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
"Should be called with a BUILD_VECTOR node");
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
if (!Subtarget.hasVSX())
return SDValue();
// The target independent DAG combiner will leave a build_vector of
// float-to-int conversions intact. We can generate MUCH better code for
// a float-to-int conversion of a vector of floats.
SDValue FirstInput = N->getOperand(0);
if (FirstInput.getOpcode() == PPCISD::MFVSR) {
SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI);
if (Reduced)
return Reduced;
// If we're building a vector out of consecutive loads, just load that
// vector type.
SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG);
if (Reduced)
return Reduced;
// If we're building a vector out of extended elements from another vector
// we have P9 vector integer extend instructions. The code assumes legal
// input types (i.e. it can't handle things like v4i16) so do not run before
// legalization.
if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) {
Reduced = combineBVOfVecSExt(N, DAG);
if (Reduced)
return Reduced;
// The remaining pattern only applies to v2f64 results.
if (N->getValueType(0) != MVT::v2f64)
return SDValue();
// Looking for:
// (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
if (FirstInput.getOpcode() != ISD::SINT_TO_FP &&
FirstInput.getOpcode() != ISD::UINT_TO_FP)
return SDValue();
if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP &&
N->getOperand(1).getOpcode() != ISD::UINT_TO_FP)
return SDValue();
// Both conversions must have the same signedness.
if (FirstInput.getOpcode() != N->getOperand(1).getOpcode())
return SDValue();
SDValue Ext1 = FirstInput.getOperand(0);
SDValue Ext2 = N->getOperand(1).getOperand(0);
if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1));
ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));
if (!Ext1Op || !Ext2Op)
return SDValue();
// Both extracts must read from the same v4i32 source vector.
if (Ext1.getOperand(0).getValueType() != MVT::v4i32 ||
Ext1.getOperand(0) != Ext2.getOperand(0))
return SDValue();
int FirstElem = Ext1Op->getZExtValue();
int SecondElem = Ext2Op->getZExtValue();
// Only the element pairs (0,1) and (2,3) are accepted; the index passed on
// for each pair is flipped on little-endian subtargets.
int SubvecIdx;
if (FirstElem == 0 && SecondElem == 1)
SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
else if (FirstElem == 2 && SecondElem == 3)
SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
return SDValue();
SDValue SrcVec = Ext1.getOperand(0);
// NOTE(review): the two node types chosen between here (signed vs. unsigned)
// are not visible in this listing.
auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ?
return DAG.getNode(NodeType, dl, MVT::v2f64,
SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl));
// combineFPToIntToFP - DAG combine for SINT_TO_FP / UINT_TO_FP nodes.
// Gives up (empty SDValue) under soft-float, without 64-bit support, for
// result types other than f32/f64, and for integer source types that are
// not wider than i1 or are wider than i64.
// Two rewrites are attempted:
//   * an i8/i16 LOAD feeding the conversion, on subtargets with both
//     P9Vector and P9Altivec: load with PPCISD::LXSIZX (plus PPCISD::VEXTS
//     for the signed case) and convert that value directly;
//   * an fp -> int -> fp round trip: emit FCTOp followed by FCFOp on the
//     floating-point value, with an FP_ROUND to f32 when FPCVT is absent.
// NOTE(review): several opcode-selection operands (ConvOp, FCFOp, FCTOp) and
// closing braces appear to be cut off in this listing; the code text below
// is left exactly as found.
SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
DAGCombinerInfo &DCI) const {
assert((N->getOpcode() == ISD::SINT_TO_FP ||
N->getOpcode() == ISD::UINT_TO_FP) &&
"Need an int -> FP conversion node here");
if (useSoftFloat() || !Subtarget.has64BitSupport())
return SDValue();
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue Op(N, 0);
// Don't handle ppc_fp128 here or conversions that are out-of-range capable
// from the hardware.
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
return SDValue();
if (Op.getOperand(0).getValueType().getSimpleVT() <= MVT(MVT::i1) ||
Op.getOperand(0).getValueType().getSimpleVT() > MVT(MVT::i64))
return SDValue();
SDValue FirstOperand(Op.getOperand(0));
// True when the integer being converted is itself an i8 or i16 load.
bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
(FirstOperand.getValueType() == MVT::i8 ||
FirstOperand.getValueType() == MVT::i16);
if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) {
bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
bool DstDouble = Op.getValueType() == MVT::f64;
unsigned ConvOp = Signed ?
// Width operand: 1 for an i8 load, 2 for an i16 load.
SDValue WidthConst =
DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 1 : 2,
dl, false);
LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode());
SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl,
DAG.getVTList(MVT::f64, MVT::Other),
Ops, MVT::i8, LDN->getMemOperand());
// For signed conversion, we need to sign-extend the value in the VSR
if (Signed) {
SDValue ExtOps[] = { Ld, WidthConst };
SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps);
return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext);
} else
return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld);
// For i32 intermediate values, unfortunately, the conversion functions
// leave the upper 32 bits of the value undefined. Within the set of
// scalar instructions, we have no method for zero- or sign-extending the
// value. Thus, we cannot handle i32 intermediate values here.
if (Op.getOperand(0).getValueType() == MVT::i32)
return SDValue();
assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
"UINT_TO_FP is supported only with FPCVT");
// If we have FCFIDS, then use it when converting to single-precision.
// Otherwise, convert to double-precision and then round.
unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
: (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
// Result type of the FCFOp node: f32 only when FPCVT is available and the
// final result is f32; otherwise f64.
MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
? MVT::f32
: MVT::f64;
// If we're converting from a float, to an int, and back to a float again,
// then we don't need the store/load pair at all.
if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT &&
Subtarget.hasFPCVT()) ||
(Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) {
SDValue Src = Op.getOperand(0).getOperand(0);
// The conversion below works on f64, so widen an f32 source first.
if (Src.getValueType() == MVT::f32) {
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
} else if (Src.getValueType() != MVT::f64) {
// Make sure that we don't pick up a ppc_fp128 source value.
return SDValue();
unsigned FCTOp =
Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src);
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp);
// Without FPCVT the conversion produced f64; round down to the f32 result.
if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
FP = DAG.getNode(ISD::FP_ROUND, dl,
MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
return FP;
return SDValue();
// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
// builtins) into loads with swaps.
//
// N is either an ISD::LOAD or a memory intrinsic node. The replacement is a
// PPCISD::LXVD2X memory node (typed v2f64) followed by a PPCISD::XXSWAPD, with
// a BITCAST + MERGE_VALUES wrapper when the loaded type is not v2f64.
// Returns an empty SDValue when the node is left alone: a plain load whose
// memory operand is smaller than 16 bytes, or a 16-byte-aligned access of a
// vector with elements of at most 32 bits on a subtarget that needs swaps.
// NOTE(review): the `default:` / intrinsic case labels and closing braces of
// the switch appear to be missing from this listing; the code text below is
// left exactly as found.
SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue Chain;
SDValue Base;
MachineMemOperand *MMO;
switch (N->getOpcode()) {
llvm_unreachable("Unexpected opcode for little endian VSX load");
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(N);
Chain = LD->getChain();
Base = LD->getBasePtr();
MMO = LD->getMemOperand();
// If the MMO suggests this isn't a load of a full vector, leave
// things alone. For a built-in, we have to make the change for
// correctness, so if there is a size problem that will be a bug.
if (MMO->getSize() < 16)
return SDValue();
MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
Chain = Intrin->getChain();
// Similarly to the store case below, Intrin->getBasePtr() doesn't get
// us what we want. Get operand 2 instead.
Base = Intrin->getOperand(2);
MMO = Intrin->getMemOperand();
MVT VecTy = N->getValueType(0).getSimpleVT();
// Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is
// aligned and the type is a vector with elements up to 4 bytes
if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) &&
VecTy.getScalarSizeInBits() <= 32) {
return SDValue();
SDValue LoadOps[] = { Chain, Base };
SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
DAG.getVTList(MVT::v2f64, MVT::Other),
LoadOps, MVT::v2f64, MMO);
// Thread the load's output chain (result 1) through the swap.
Chain = Load.getValue(1);
SDValue Swap = DAG.getNode(
PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load);
// Add a bitcast if the resulting load type doesn't match v2f64.
if (VecTy != MVT::v2f64) {
// Note: this local SDValue N shadows the SDNode *N parameter.
SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap);
// Package {bitcast value, swap's chain} to match Load's shape.
return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other),
N, Swap.getValue(1));
return Swap;
// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
// builtins) into stores with swaps.
//
// N is either an ISD::STORE or a memory intrinsic node; SrcOpnd records which
// operand holds the value being stored (1 for a plain store, 2 for the
// intrinsic form). The value is bitcast to v2f64 if needed, passed through
// PPCISD::XXSWAPD, and stored with a PPCISD::STXVD2X memory node.
// Returns an empty SDValue when the node is left alone: a plain store whose
// memory operand is smaller than 16 bytes, or a 16-byte-aligned access of a
// vector with elements of at most 32 bits on a subtarget that needs swaps.
// NOTE(review): the `default:` / intrinsic case labels, closing braces and one
// argument line of the STXVD2X call appear to be missing from this listing;
// the code text below is left exactly as found.
SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue Chain;
SDValue Base;
unsigned SrcOpnd;
MachineMemOperand *MMO;
switch (N->getOpcode()) {
llvm_unreachable("Unexpected opcode for little endian VSX store");
case ISD::STORE: {
StoreSDNode *ST = cast<StoreSDNode>(N);
Chain = ST->getChain();
Base = ST->getBasePtr();
MMO = ST->getMemOperand();
SrcOpnd = 1;
// If the MMO suggests this isn't a store of a full vector, leave
// things alone. For a built-in, we have to make the change for
// correctness, so if there is a size problem that will be a bug.
if (MMO->getSize() < 16)
return SDValue();
MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
Chain = Intrin->getChain();
// Intrin->getBasePtr() oddly does not get what we want.
Base = Intrin->getOperand(3);
MMO = Intrin->getMemOperand();
SrcOpnd = 2;
SDValue Src = N->getOperand(SrcOpnd);
MVT VecTy = Src.getValueType().getSimpleVT();
// Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the store is
// aligned and the type is a vector with elements up to 4 bytes
if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) &&
VecTy.getScalarSizeInBits() <= 32) {
return SDValue();
// All stores are done as v2f64 and possible bit cast.
if (VecTy != MVT::v2f64) {
Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);
SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src);
// Thread the swap's output chain (result 1) into the store.
Chain = Swap.getValue(1);
SDValue StoreOps[] = { Chain, Swap, Base };
SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
StoreOps, VecTy, MMO);
return Store;
// Handle DAG combine for STORE (FP_TO_INT F).
//
// N is a store whose stored value (operand 1) is an FP_TO_SINT or FP_TO_UINT
// node. When the combine applies, the floating-point source is converted with
// ConvOpcode and stored through a PPCISD::ST_VSR_SCAL_INT memory node that
// also carries the byte width and the integer type being stored.
// The combine is skipped (empty SDValue) when:
//   * the floating-point source is narrower than 32 bits or is ppcf128;
//   * the subtarget lacks P8Vector;
//   * the store is a truncating store;
//   * the integer type is not i32/i64 (or i16/i8 with P9Vector).
// NOTE(review): the operands of the ConvOpcode ternary, the tail of the final
// getMemIntrinsicNode call and closing braces appear to be missing from this
// listing; the code text below is left exactly as found.
SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
unsigned Opcode = N->getOperand(1).getOpcode();
assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT)
&& "Not a FP_TO_INT Instruction!");
// Val is the floating-point input of the conversion; Op1VT is the integer
// type produced by it and ResVT is the floating-point type of Val.
SDValue Val = N->getOperand(1).getOperand(0);
EVT Op1VT = N->getOperand(1).getValueType();
EVT ResVT = Val.getValueType();
// Floating point types smaller than 32 bits are not legal on Power.
if (ResVT.getScalarSizeInBits() < 32)
return SDValue();
// Only perform combine for conversion to i64/i32 or power9 i16/i8.
bool ValidTypeForStoreFltAsInt =
(Op1VT == MVT::i32 || Op1VT == MVT::i64 ||
(Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));
if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Vector() ||
cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
return SDValue();
// Extend f32 values to f64
if (ResVT.getScalarSizeInBits() == 32) {
Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
// Set signed or unsigned conversion opcode.
unsigned ConvOpcode = (Opcode == ISD::FP_TO_SINT) ?
// The converted value keeps type f128 for an f128 source, f64 otherwise.
Val = DAG.getNode(ConvOpcode,
dl, ResVT == MVT::f128 ? MVT::f128 : MVT::f64, Val);
// Set number of bytes being converted.
unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8;
// Operands: chain, converted value, address, byte width, stored integer type.
SDValue Ops[] = { N->getOperand(0), Val, N->getOperand(2),
DAG.getIntPtrConstant(ByteSize, dl, false),
DAG.getValueType(Op1VT) };
Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl,
DAG.getVTList(MVT::Other), Ops,
return Val;
// Return true if the entries of a two-input shuffle mask strictly alternate
// between the first input (indices below NumElts) and the second input
// (indices at or above NumElts). Either input may supply element 0.
//
// Mask must contain at least one entry; NumElts is the element count of one
// shuffle input.
static bool isAlternatingShuffMask(const ArrayRef<int> &Mask, int NumElts) {
  assert(!Mask.empty() && "Expecting a non-empty shuffle mask");
  // Check that the source of the element keeps flipping
  // (i.e. Mask[i] < NumElts -> Mask[i+1] >= NumElts).
  bool PrevElemFromFirstVec = Mask[0] < NumElts;
  for (int i = 1, e = Mask.size(); i < e; i++) {
    if (PrevElemFromFirstVec && Mask[i] < NumElts)
      return false;
    if (!PrevElemFromFirstVec && Mask[i] >= NumElts)
      return false;
    PrevElemFromFirstVec = !PrevElemFromFirstVec;
  }
  return true;
}
// Return true if Op is a BUILD_VECTOR in which every operand is either undef
// or equal to the first non-undef operand. A BUILD_VECTOR consisting only of
// undef operands also qualifies.
static bool isSplatBV(SDValue Op) {
  if (Op.getOpcode() != ISD::BUILD_VECTOR)
    return false;
  SDValue FirstOp;

  // Find first non-undef input.
  for (int i = 0, e = Op.getNumOperands(); i < e; i++) {
    FirstOp = Op.getOperand(i);
    if (!FirstOp.isUndef())
      break;
  }

  // All inputs are undef or the same as the first non-undef input.
  for (int i = 1, e = Op.getNumOperands(); i < e; i++)
    if (Op.getOperand(i) != FirstOp && !Op.getOperand(i).isUndef())
      return false;
  return true;
}
// Look through at most one BITCAST for a SCALAR_TO_VECTOR node.
// Returns the SCALAR_TO_VECTOR value (Op itself, or the operand of the
// BITCAST), or an empty SDValue if there is none.
static SDValue isScalarToVec(SDValue Op) {
  if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return Op;
  if (Op.getOpcode() != ISD::BITCAST)
    return SDValue();
  // Op is a BITCAST: inspect the value being cast.
  Op = Op.getOperand(0);
  if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return Op;
  return SDValue();
}
// Adjust a shuffle mask in place: every entry that lies in [0, LHSMaxIdx) or
// in [RHSMinIdx, RHSMaxIdx) is incremented by HalfVec. Entries outside both
// ranges (including negative, i.e. undef, entries) are left untouched.
// Passing -1 for a range's bounds makes that range empty.
static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl<int> &ShuffV,
                                            int LHSMaxIdx, int RHSMinIdx,
                                            int RHSMaxIdx, int HalfVec) {
  for (int &Idx : ShuffV) {
    const bool InLHSRange = Idx >= 0 && Idx < LHSMaxIdx;
    const bool InRHSRange = Idx >= RHSMinIdx && Idx < RHSMaxIdx;
    if (InLHSRange || InRHSRange)
      Idx += HalfVec;
  }
}
// Produce the permuted form of a SCALAR_TO_VECTOR node. Special case, when
// the original is:
// (<n x Ty> (scalar_to_vector (Ty (extract_elt <n x Ty> %a, C))))
// In such a case, just change the shuffle mask to extract the element
// from the permuted index: the result is a shuffle of %a with itself whose
// mask is undef everywhere except at index NumElts/2, which selects element C.
// NOTE(review): the handling of every other input (the fall-through return)
// and the closing braces are not visible in this listing; the code text
// below is left exactly as found.
static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG) {
SDLoc dl(OrigSToV);
EVT VT = OrigSToV.getValueType();
assert(OrigSToV.getOpcode() == ISD::SCALAR_TO_VECTOR &&
"Expecting a SCALAR_TO_VECTOR here");
SDValue Input = OrigSToV.getOperand(0);
if (Input.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Input.getOperand(1));
SDValue OrigVector = Input.getOperand(0);
// Can't handle non-const element indices or different vector types
// for the input to the extract and the output of the scalar_to_vector.
if (Idx && VT == OrigVector.getValueType()) {
// Start with an all-undef (-1) mask and fill in only the middle element.
SmallVector<int, 16> NewMask(VT.getVectorNumElements(), -1);
NewMask[VT.getVectorNumElements() / 2] = Idx->getZExtValue();
return DAG.getVectorShuffle(VT, dl, OrigVector, OrigVector, NewMask);
// On little endian subtargets, combine shuffles such as:
// vector_shuffle<16,1,17,3,18,5,19,7,20,9,21,11,22,13,23,15>, <zero>, %b
// into:
// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7>, <zero>, %b
// because the latter can be matched to a single instruction merge.
// Furthermore, SCALAR_TO_VECTOR on little endian always involves a permute
// to put the value into element zero. Adjust the shuffle mask so that the
// vector can remain in permuted form (to prevent a swap prior to a shuffle).
//
// Returns the (possibly rewritten) shuffle. On big endian subtargets or
// without VSX the original node is returned unchanged. The steps are:
//   1. commute the shuffle if element 0 comes from the second input and
//      neither input is itself a shuffle;
//   2. if either input is (a bitcast of) a SCALAR_TO_VECTOR, fix up the mask
//      via fixupShuffleMaskForPermutedSToV and rebuild the shuffle;
//   3. if the RHS is a splat BUILD_VECTOR and the mask alternates between the
//      inputs, rewrite the splat-side indices so they pair up with the
//      indices taken from the other input, and rebuild the splat without
//      undefs.
// NOTE(review): closing braces, an `else`, and the tail of the
// fixupShuffleMaskForPermutedSToV call appear to be missing from this
// listing; the code text below is left exactly as found.
SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG) const {
SDValue LHS = SVN->getOperand(0);
SDValue RHS = SVN->getOperand(1);
auto Mask = SVN->getMask();
int NumElts = LHS.getValueType().getVectorNumElements();
SDValue Res(SVN, 0);
SDLoc dl(SVN);
// None of these combines are useful on big endian systems since the ISA
// already has a big endian bias.
if (!Subtarget.isLittleEndian() || !Subtarget.hasVSX())
return Res;
// If this is not a shuffle of a shuffle and the first element comes from
// the second vector, canonicalize to the commuted form. This will make it
// more likely to match one of the single instruction patterns.
if (Mask[0] >= NumElts && LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
RHS.getOpcode() != ISD::VECTOR_SHUFFLE) {
std::swap(LHS, RHS);
Res = DAG.getCommutedVectorShuffle(*SVN);
Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
// Adjust the shuffle mask if either input vector comes from a
// SCALAR_TO_VECTOR and keep the respective input vector in permuted
// form (to prevent the need for a swap).
SmallVector<int, 16> ShuffV(Mask.begin(), Mask.end());
SDValue SToVLHS = isScalarToVec(LHS);
SDValue SToVRHS = isScalarToVec(RHS);
if (SToVLHS || SToVRHS) {
// NumEltsIn: element count of the SCALAR_TO_VECTOR node's own type.
// NumEltsOut: element count of the shuffle mask.
int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements()
: SToVRHS.getValueType().getVectorNumElements();
int NumEltsOut = ShuffV.size();
// Initially assume that neither input is permuted. These will be adjusted
// accordingly if either input is.
int LHSMaxIdx = -1;
int RHSMinIdx = -1;
int RHSMaxIdx = -1;
int HalfVec = LHS.getValueType().getVectorNumElements() / 2;
// Get the permuted scalar to vector nodes for the source(s) that come from
// a SCALAR_TO_VECTOR.
if (SToVLHS) {
// Set up the values for the shuffle vector fixup.
LHSMaxIdx = NumEltsOut / NumEltsIn;
SToVLHS = getSToVPermuted(SToVLHS, DAG);
if (SToVLHS.getValueType() != LHS.getValueType())
SToVLHS = DAG.getBitcast(LHS.getValueType(), SToVLHS);
if (SToVRHS) {
RHSMinIdx = NumEltsOut;
RHSMaxIdx = NumEltsOut / NumEltsIn + RHSMinIdx;
SToVRHS = getSToVPermuted(SToVRHS, DAG);
if (SToVRHS.getValueType() != RHS.getValueType())
SToVRHS = DAG.getBitcast(RHS.getValueType(), SToVRHS);
// Fix up the shuffle mask to reflect where the desired element actually is.
// The minimum and maximum indices that correspond to element zero for both
// the LHS and RHS are computed and will control which shuffle mask entries
// are to be changed. For example, if the RHS is permuted, any shuffle mask
// entries in the range [RHSMinIdx,RHSMaxIdx) will be incremented by
// HalfVec to refer to the corresponding element in the permuted vector.
fixupShuffleMaskForPermutedSToV(ShuffV, LHSMaxIdx, RHSMinIdx, RHSMaxIdx,
Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
// We may have simplified away the shuffle. We won't be able to do anything
// further with it here.
if (!isa<ShuffleVectorSDNode>(Res))
return Res;
Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
// The common case after we commuted the shuffle is that the RHS is a splat
// and we have elements coming in from the splat at indices that are not
// conducive to using a merge.
// Example:
// vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero>
if (!isSplatBV(RHS))
return Res;
// We are looking for a mask such that all even elements are from
// one vector and all odd elements from the other.
if (!isAlternatingShuffMask(Mask, NumElts))
return Res;
// Adjust the mask so we are pulling in the same index from the splat
// as the index from the interesting vector in consecutive elements.
// Example (even elements from first vector):
// vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>
if (Mask[0] < NumElts)
for (int i = 1, e = Mask.size(); i < e; i += 2)
ShuffV[i] = (ShuffV[i - 1] + NumElts);
// Example (odd elements from first vector):
// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>
for (int i = 0, e = Mask.size(); i < e; i += 2)
ShuffV[i] = (ShuffV[i + 1] + NumElts);
// If the RHS has undefs, we need to remove them since we may have created
// a shuffle that adds those instead of the splat value.
SDValue SplatVal = cast<BuildVectorSDNode>(RHS.getNode())->getSplatValue();
RHS = DAG.getSplatBuildVector(RHS.getValueType(), dl, SplatVal);
Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
return Res;
// combineVReverseMemOP - Combine an element-reversing vector shuffle with
// the normal load or store it is attached to (LSBase).
// For a load, the replacement is a PPCISD::LOAD_VEC_BE memory node; for a
// store, a PPCISD::STORE_VEC_BE memory node storing the shuffle's first
// operand. Both reuse LSBase's memory VT and memory operand.
// Returns an empty SDValue when the type is not legal, or the subtarget is
// not little endian, lacks VSX, or lacks P9Vector.
// NOTE(review): the index increment inside IsElementReverse, the condition
// guarding the third `return SDValue();`, the tail of the StoreOps
// initializer and closing braces appear to be missing from this listing; the
// code text below is left exactly as found.
SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
LSBaseSDNode *LSBase,
DAGCombinerInfo &DCI) const {
assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) &&
"Not a reverse memop pattern!");
// Walks the shuffle mask from its last entry to its first, comparing each
// entry against the counter i.
auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool {
auto Mask = SVN->getMask();
int i = 0;
auto I = Mask.rbegin();
auto E = Mask.rend();
for (; I != E; ++I) {
if (*I != i)
return false;
return true;
SelectionDAG &DAG = DCI.DAG;
EVT VT = SVN->getValueType(0);
if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX())
return SDValue();
// Before P9, we have PPCVSXSwapRemoval pass to hack the element order.
// See comment in PPCVSXSwapRemoval.cpp.
// This combine conflicts with the PPCVSXSwapRemoval optimization, so we
// don't do it there.
if (!Subtarget.hasP9Vector())
return SDValue();
return SDValue();
if (LSBase->getOpcode() == ISD::LOAD) {
SDLoc dl(SVN);
SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
return DAG.getMemIntrinsicNode(
PPCISD::LOAD_VEC_BE, dl, DAG.getVTList(VT, MVT::Other), LoadOps,
LSBase->getMemoryVT(), LSBase->getMemOperand());
if (LSBase->getOpcode() == ISD::STORE) {
SDLoc dl(LSBase);
SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(0),
return DAG.getMemIntrinsicNode(
PPCISD::STORE_VEC_BE, dl, DAG.getVTList(MVT::Other), StoreOps,
LSBase->getMemoryVT(), LSBase->getMemOperand());
llvm_unreachable("Expected a load or store node here");
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
switch (N->getOpcode()) {
default: break;
case ISD::ADD:
return combineADD(N, DCI);
case ISD::SHL:
return combineSHL(N, DCI);
case ISD::SRA:
return combineSRA(N, DCI);
case ISD::SRL:
return combineSRL(N, DCI);
case ISD::MUL:
return combineMUL(N, DCI);
case ISD::FMA:
return combineFMALike(N, DCI);
if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
return N->getOperand(0);
if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0.
return N->getOperand(0);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
if (C->isNullValue() || // 0 >>s V -> 0.
C->isAllOnesValue()) // -1 >>s V -> -1.
return N->getOperand(0);
return DAGCombineExtBoolTrunc(N, DCI);
return combineTRUNCATE(N, DCI);
case ISD::SETCC:
if (SDValue CSCC = combineSetCC(N, DCI))
return CSCC;
return DAGCombineTruncBoolExt(N, DCI);
return combineFPToIntToFP(N, DCI);
if (ISD::isNormalLoad(N->getOperand(0).getNode())) {
LSBaseSDNode* LSBase = cast<LSBaseSDNode>(N->getOperand(0));
return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI);
return combineVectorShuffle(cast<ShuffleVectorSDNode>(N), DCI.DAG);
case ISD::STORE: {
EVT Op1VT = N->getOperand(1).getValueType();
unsigned Opcode = N->getOperand(1).getOpcode();
if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) {
SDValue Val= combineStoreFPToInt(N, DCI);
if (Val)
return Val;
if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N->getOperand(1));
SDValue Val= combineVReverseMemOP(SVN, cast<LSBaseSDNode>(N), DCI);
if (Val)
return Val;
// Turn STORE (BSWAP) -> sthbrx/stwbrx.
if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP &&
N->getOperand(1).getNode()->hasOneUse() &&
(Op1VT == MVT::i32 || Op1VT == MVT::i16 ||
(Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) {
// STBRX can only handle simple types and it makes no sense to store less
// than two bytes in byte-reversed order.
EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
if (mVT.isExtended() || mVT.getSizeInBits() < 16)
SDValue BSwapOp = N->getOperand(1).getOperand(0);
// Do an any-extend to 32-bits if this is a half-word input.
if (BSwapOp.getValueType() == MVT::i16)
BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
// If the type of the BSWAP operand is wider than the stored memory width,
// it needs to be shifted to the right before the STBRX.
if (Op1VT.bitsGT(mVT)) {
int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
DAG.getConstant(Shift, dl, MVT::i32));
// Need to truncate if this is a bswap of i64 stored as i32/i16.
if (Op1VT == MVT::i64)
BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);
SDValue Ops[] = {
N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)
DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
Ops, cast<StoreSDNode>(N)->getMemoryVT(),
// STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
// So it can increase the chance of CSE constant construction.
if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
isa<ConstantSDNode>(N->getOperand(1)) && Op1VT == MVT::i32) {
// Need to sign-extend to 64 bits to handle negative values.
EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64);
// DAG.getTruncStore() can't be used here because it doesn't accept
// the general (base + offset) addressing mode.
// So we use UpdateNodeOperands and setTruncatingStore instead.
DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2),
return SDValue(N, 0);
// For little endian, VSX stores require generating xxswapd/lxvd2x.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
if (Op1VT.isSimple()) {
MVT StoreVT = Op1VT.getSimpleVT();
if (Subtarget.needsSwapsForVSXMemOps() &&
(StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
return expandVSXStoreForLE(N, DCI);
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(N);
EVT VT = LD->getValueType(0);
// For little endian, VSX loads require generating lxvd2x/xxswapd.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
if (VT.isSimple()) {
MVT LoadVT = VT.getSimpleVT();
if (Subtarget.needsSwapsForVSXMemOps() &&
(LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
return expandVSXLoadForLE(N, DCI);
// We sometimes end up with a 64-bit integer load, from which we extract
// two single-precision floating-point numbers. This happens with
// std::complex<float>, and other similar structures, because of the way we
// canonicalize structure copies. However, if we lack direct moves,
// then the final bitcasts from the extracted integer values to the
// floating-point numbers turn into store/load pairs. Even with direct moves,
// just loading the two floating-point numbers is likely better.
auto ReplaceTwoFloatLoad = [&]() {
if (VT != MVT::i64)
return false;
if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
return false;
// We're looking for a sequence like this:
// t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
// t16: i64 = srl t13, Constant:i32<32>
// t17: i32 = truncate t16
// t18: f32 = bitcast t17
// t19: i32 = truncate t13
// t20: f32 = bitcast t19
if (!LD->hasNUsesOfValue(2, 0))
return false;
auto UI = LD->use_begin();
while (UI.getUse().getResNo() != 0) ++UI;
SDNode *Trunc = *UI++;
while (UI.getUse().getResNo() != 0) ++UI;
SDNode *RightShift = *UI;
if (Trunc->getOpcode() != ISD::TRUNCATE)
std::swap(Trunc, RightShift);
if (Trunc->getOpcode() != ISD::TRUNCATE ||
Trunc->getValueType(0) != MVT::i32 ||
return false;
if (RightShift->getOpcode() != ISD::SRL ||
!isa<ConstantSDNode>(RightShift->getOperand(1)) ||
RightShift->getConstantOperandVal(1) != 32 ||
return false;
SDNode *Trunc2 = *RightShift->use_begin();
if (Trunc2->getOpcode() != ISD::TRUNCATE ||
Trunc2->getValueType(0) != MVT::i32 ||
return false;
SDNode *Bitcast = *Trunc->use_begin();
SDNode *Bitcast2 = *Trunc2->use_begin();
if (Bitcast->getOpcode() != ISD::BITCAST ||
Bitcast->getValueType(0) != MVT::f32)
return false;
if (Bitcast2->getOpcode() != ISD::BITCAST ||
Bitcast2->getValueType(0) != MVT::f32)
return false;
if (Subtarget.isLittleEndian())
std::swap(Bitcast, Bitcast2);
// Bitcast has the second float (in memory-layout order) and Bitcast2
// has the first one.
SDValue BasePtr = LD->getBasePtr();
if (LD->isIndexed()) {
assert(LD->getAddressingMode() == ISD::PRE_INC &&
"Non-pre-inc AM on PPC?");
BasePtr =
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
auto MMOFlags =
LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
LD->getPointerInfo(), LD->getAlignment(),
MMOFlags, LD->getAAInfo());
SDValue AddPtr =
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
BasePtr, DAG.getIntPtrConstant(4, dl));
SDValue FloatLoad2 = DAG.getLoad(
MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo());
if (LD->isIndexed()) {
// Note that DAGCombine should re-form any pre-increment load(s) from
// what is produced here if that makes sense.
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
DCI.CombineTo(Bitcast2, FloatLoad);
DCI.CombineTo(Bitcast, FloatLoad2);
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
SDValue(FloatLoad2.getNode(), 1));
return true;
if (ReplaceTwoFloatLoad())
return SDValue(N, 0);
EVT MemVT = LD->getMemoryVT();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty);
Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
Align ScalarABIAlignment = DAG.getDataLayout().getABITypeAlign(STy);
if (LD->isUnindexed() && VT.isVector() &&
((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
// P8 and later hardware should just use LOAD.
!Subtarget.hasP8Vector() &&
(VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v4f32)) ||
(Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
LD->getAlign() >= ScalarABIAlignment)) &&
LD->getAlign() < ABIAlignment) {
// This is a type-legal unaligned Altivec or QPX load.
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
bool isLittleEndian = Subtarget.isLittleEndian();
// This implements the loading of unaligned vectors as described in
// the venerable Apple Velocity Engine overview. Specifically:
// The general idea is to expand a sequence of one or more unaligned
// loads into an alignment-based permutation-control instruction (lvsl
// or lvsr), a series of regular vector loads (which always truncate
// their input address to an aligned address), and a series of
// permutations. The results of these permutations are the requested
// loaded values. The trick is that the last "extra" load is not taken
// from the address you might suspect (sizeof(vector) bytes after the
// last requested load), but rather sizeof(vector) - 1 bytes after the
// last requested vector. The point of this is to avoid a page fault if
// the base address happened to be aligned. This works because if the
// base address is aligned, then adding less than a full vector length
// will cause the last vector in the sequence to be (re)loaded.
// Otherwise, the next vector will be fetched as you might suspect was
// necessary.
// We might be able to reuse the permutation generation from
// a different base address offset from this one by an aligned amount.
// The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
// optimization later.
Intrinsic::ID Intr, IntrLD, IntrPerm;
MVT PermCntlTy, PermTy, LDTy;
if (Subtarget.hasAltivec()) {
Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr :
IntrLD = Intrinsic::ppc_altivec_lvx;
IntrPerm = Intrinsic::ppc_altivec_vperm;
PermCntlTy = MVT::v16i8;
PermTy = MVT::v4i32;
LDTy = MVT::v4i32;
} else {
Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld :
IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd :
IntrPerm = Intrinsic::ppc_qpx_qvfperm;
PermCntlTy = MVT::v4f64;
PermTy = MVT::v4f64;
LDTy = MemVT.getSimpleVT();
SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);
// Create the new MMO for the new base load. It is like the original MMO,
// but represents an area in memory almost twice the vector size centered
// on the original address. If the address is unaligned, we might start
// reading up to (sizeof(vector)-1) bytes below the address of the
// original unaligned load.
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *BaseMMO =
// Create the new base load.
SDValue LDXIntID =
DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout()));
SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
SDValue BaseLoad =
DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
DAG.getVTList(PermTy, MVT::Other),
BaseLoadOps, LDTy, BaseMMO);
// Note that the value of IncOffset (which is provided to the next
// load's pointer info offset value, and thus used to calculate the
// alignment), and the value of IncValue (which is actually used to
// increment the pointer value) are different! This is because we
// require the next load to appear to be aligned, even though it
// is actually offset from the base pointer by a lesser amount.
int IncOffset = VT.getSizeInBits() / 8;
int IncValue = IncOffset;
// Walk (both up and down) the chain looking for another load at the real
// (aligned) offset (the alignment of the other load does not matter in
// this case). If found, then do not use the offset reduction trick, as
// that will prevent the loads from being later combined (as they would
// otherwise be duplicates).
if (!findConsecutiveLoad(LD, DAG))
SDValue Increment =
DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout()));
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
MachineMemOperand *ExtraMMO =
1, 2*MemVT.getStoreSize()-1);
SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
SDValue ExtraLoad =
DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
DAG.getVTList(PermTy, MVT::Other),
ExtraLoadOps, LDTy, ExtraMMO);
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
BaseLoad.getValue(1), ExtraLoad.getValue(1));
// Because vperm has a big-endian bias, we must reverse the order
// of the input vectors and complement the permute control vector
// when generating little endian code. We have already handled the
// latter by using lvsr instead of lvsl, so just reverse BaseLoad
// and ExtraLoad here.
SDValue Perm;
if (isLittleEndian)
Perm = BuildIntrinsicOp(IntrPerm,
ExtraLoad, BaseLoad, PermCntl, DAG, dl);
Perm = BuildIntrinsicOp(IntrPerm,
BaseLoad, ExtraLoad, PermCntl, DAG, dl);
if (VT != PermTy)
Perm = Subtarget.hasAltivec() ?
DAG.getNode(ISD::BITCAST, dl, VT, Perm) :
DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX
DAG.getTargetConstant(1, dl, MVT::i64));
// second argument is 1 because this rounding
// is always exact.
// The output of the permutation is our loaded result, the TokenFactor is
// our new chain.
DCI.CombineTo(N, Perm, TF);
return SDValue(N, 0);
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
: Intrinsic::ppc_altivec_lvsl);
if ((IID == Intr ||
IID == Intrinsic::ppc_qpx_qvlpcld ||
IID == Intrinsic::ppc_qpx_qvlpcls) &&
N->getOperand(1)->getOpcode() == ISD::ADD) {
SDValue Add = N->getOperand(1);
int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ?
5 /* 32 byte alignment */ : 4 /* 16 byte alignment */;
if (DAG.MaskedValueIsZero(Add->getOperand(1),
APInt::getAllOnesValue(Bits /* alignment */)
.zext(Add.getScalarValueSizeInBits()))) {
SDNode *BasePtr = Add->getOperand(0).getNode();
for (SDNode::use_iterator UI = BasePtr->use_begin(),
UE = BasePtr->use_end();
UI != UE; ++UI) {
if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) {
// We've found another LVSL/LVSR, and this address is an aligned
// multiple of that one. The results will be the same, so use the
// one we've just found instead.
return SDValue(*UI, 0);
if (isa<ConstantSDNode>(Add->getOperand(1))) {
SDNode *BasePtr = Add->getOperand(0).getNode();
for (SDNode::use_iterator UI = BasePtr->use_begin(),
UE = BasePtr->use_end(); UI != UE; ++UI) {
if (UI->getOpcode() == ISD::ADD &&
isa<ConstantSDNode>(UI->getOperand(1)) &&
(cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() -
cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) %
(1ULL << Bits) == 0) {
SDNode *OtherAdd = *UI;
for (SDNode::use_iterator VI = OtherAdd->use_begin(),
VE = OtherAdd->use_end(); VI != VE; ++VI) {
if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) {
return SDValue(*VI, 0);
// Combine vmaxsw/h/b(a, a's negation) to abs(a)
// Expose the vabsduw/h/b opportunity for down stream
if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&
(IID == Intrinsic::ppc_altivec_vmaxsw ||
IID == Intrinsic::ppc_altivec_vmaxsh ||
IID == Intrinsic::ppc_altivec_vmaxsb)) {
SDValue V1 = N->getOperand(1);
SDValue V2 = N->getOperand(2);
if ((V1.getSimpleValueType() == MVT::v4i32 ||
V1.getSimpleValueType() == MVT::v8i16 ||
V1.getSimpleValueType() == MVT::v16i8) &&
V1.getSimpleValueType() == V2.getSimpleValueType()) {
// (0-a, a)
if (V1.getOpcode() == ISD::SUB &&
ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
V1.getOperand(1) == V2) {
return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);
// (a, 0-a)
if (V2.getOpcode() == ISD::SUB &&
ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
V2.getOperand(1) == V1) {
return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
// (x-y, y-x)
if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&
V1.getOperand(0) == V2.getOperand(1) &&
V1.getOperand(1) == V2.getOperand(0)) {
return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
// For little endian, VSX loads require generating lxvd2x/xxswapd.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
if (Subtarget.needsSwapsForVSXMemOps()) {
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
case Intrinsic::ppc_vsx_lxvw4x:
case Intrinsic::ppc_vsx_lxvd2x:
return expandVSXLoadForLE(N, DCI);
// For little endian, VSX stores require generating xxswapd/stxvd2x.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
if (Subtarget.needsSwapsForVSXMemOps()) {
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
case Intrinsic::ppc_vsx_stxvw4x:
case Intrinsic::ppc_vsx_stxvd2x:
return expandVSXStoreForLE(N, DCI);
case ISD::BSWAP:
// Turn BSWAP (LOAD) -> lhbrx/lwbrx.
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
N->getOperand(0).hasOneUse() &&
(N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
(Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
N->getValueType(0) == MVT::i64))) {
SDValue Load = N->getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(Load);
// Create the byte-swapping load.
SDValue Ops[] = {
LD->getChain(), // Chain
LD->getBasePtr(), // Ptr
DAG.getValueType(N->getValueType(0)) // VT
SDValue BSLoad =
DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
DAG.getVTList(N->getValueType(0) == MVT::i64 ?
MVT::i64 : MVT::i32, MVT::Other),
Ops, LD->getMemoryVT(), LD->getMemOperand());
// If this is an i16 load, insert the truncate.
SDValue ResVal = BSLoad;
if (N->getValueType(0) == MVT::i16)
ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);
// First, combine the bswap away. This makes the value produced by the
// load dead.
DCI.CombineTo(N, ResVal);
// Next, combine the load away, we give it a bogus result value but a real
// chain result. The result value is dead because the bswap is dead.
DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
// Return N so it doesn't get rechecked!
return SDValue(N, 0);
// If a VCMPo node already exists with exactly the same operands as this
// node, use its result instead of this node (VCMPo computes both a CR6 and
// a normal output).
if (!N->getOperand(0).hasOneUse() &&
!N->getOperand(1).hasOneUse() &&
!N->getOperand(2).hasOneUse()) {
// Scan all of the users of the LHS, looking for VCMPo's that match.
SDNode *VCMPoNode = nullptr;
SDNode *LHSN = N->getOperand(0).getNode();
for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
UI != E; ++UI)
if (UI->getOpcode() == PPCISD::VCMPo &&
UI->getOperand(1) == N->getOperand(1) &&
UI->getOperand(2) == N->getOperand(2) &&
UI->getOperand(0) == N->getOperand(0)) {
VCMPoNode = *UI;
// If there is no VCMPo node, or if the flag value has a single use, don't
// transform this.
if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
// Look at the (necessarily single) use of the flag value. If it has a
// chain, this transformation is more complex. Note that multiple things
// could use the value result, which we should ignore.
SDNode *FlagUser = nullptr;
for (SDNode::use_iterator UI = VCMPoNode->use_begin();
FlagUser == nullptr; ++UI) {
assert(UI != VCMPoNode->use_end() && "Didn't find user!");
SDNode *User = *UI;
for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
FlagUser = User;
// If the user is a MFOCRF instruction, we know this is safe.
// Otherwise we give up for right now.
if (FlagUser->getOpcode() == PPCISD::MFOCRF)
return SDValue(VCMPoNode, 0);
case ISD::BRCOND: {
SDValue Cond = N->getOperand(1);
SDValue Target = N->getOperand(2);
if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
Intrinsic::loop_decrement) {
// We now need to make the intrinsic dead (it cannot be instruction
// selected).
DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0));
assert(Cond.getNode()->hasOneUse() &&
"Counter decrement has more than one use");
return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other,
N->getOperand(0), Target);
case ISD::BR_CC: {
// If this is a branch on an altivec predicate comparison, lower this so
// that we don't have to do a MFOCRF: instead, branch directly on CR6. This
// lowering is done pre-legalize, because the legalizer lowers the predicate
// compare down to code that is difficult to reassemble.
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
// Sometimes the promoted value of the intrinsic is ANDed by some non-zero
// value. If so, pass-through the AND to get to the intrinsic.
if (LHS.getOpcode() == ISD::AND &&
LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
Intrinsic::loop_decrement &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
LHS = LHS.getOperand(0);
if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
Intrinsic::loop_decrement &&
isa<ConstantSDNode>(RHS)) {
assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
"Counter decrement comparison is not EQ or NE");
unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
bool isBDNZ = (CC == ISD::SETEQ && Val) ||
(CC == ISD::SETNE && !Val);
// We now need to make the intrinsic dead (it cannot be instruction
// selected).
DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0));
assert(LHS.getNode()->hasOneUse() &&
"Counter decrement has more than one use");
return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other,
N->getOperand(0), N->getOperand(4));
int CompareOpc;
bool isDot;
if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) {
assert(isDot && "Can't compare against a vector result!");
// If this is a comparison against something other than 0/1, then we know
// that the condition is never/always true.
unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
if (Val != 0 && Val != 1) {
if (CC == ISD::SETEQ) // Cond never true, remove branch.
return N->getOperand(0);
// Always !=, turn it into an unconditional branch.
return DAG.getNode(ISD::BR, dl, MVT::Other,
N->getOperand(0), N->getOperand(4));
bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
// Create the PPCISD altivec 'dot' comparison node.
SDValue Ops[] = {
LHS.getOperand(2), // LHS of compare
LHS.getOperand(3), // RHS of compare
DAG.getConstant(CompareOpc, dl, MVT::i32)
EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
// Unpack the result based on how the target uses it.
PPC::Predicate CompOpc;
switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
default: // Can't happen, don't crash on invalid number though.
case 0: // Branch on the value of the EQ bit of CR6.
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
case 1: // Branch on the inverted value of the EQ bit of CR6.
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
case 2: // Branch on the value of the LT bit of CR6.
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
case 3: // Branch on the inverted value of the LT bit of CR6.
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
DAG.getConstant(CompOpc, dl, MVT::i32),
DAG.getRegister(PPC::CR6, MVT::i32),
N->getOperand(4), CompNode.getValue(1));
return DAGCombineBuildVector(N, DCI);
case ISD::ABS:
return combineABS(N, DCI);
return combineVSelect(N, DCI);
return SDValue();
PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const {
// fold (sdiv X, pow2)
EVT VT = N->getValueType(0);
if (VT == MVT::i64 && !Subtarget.isPPC64())
return SDValue();
if ((VT != MVT::i32 && VT != MVT::i64) ||
!(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
bool IsNegPow2 = (-Divisor).isPowerOf2();
unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros();
SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT);
SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt);
if (IsNegPow2) {
Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
return Op;
// Inline Assembly Support
// Report which bits of a PowerPC-specific node's result are known to be zero.
// The findings are written into Known; DemandedElts, DAG and Depth are not
// read by the code visible here.
void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
// Dispatch on the node opcode; opcodes without a case add no information.
switch (Op.getOpcode()) {
default: break;
case PPCISD::LBRX: {
// lhbrx is known to have the top bits cleared out.
// Operand 2 is a VTSDNode naming the loaded type; only the i16 form applies.
if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
Known.Zero = 0xFFFF0000;
// NOTE(review): the switch below treats operand 0 as an intrinsic ID, which
// does not match the LBRX operand layout used above. The case label that
// introduces it, plus the break/closing-brace lines, appear to be missing
// from this copy -- confirm against the upstream file.
switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
default: break;
// Every Altivec predicate-compare intrinsic listed here shares one result.
case Intrinsic::ppc_altivec_vcmpbfp_p:
case Intrinsic::ppc_altivec_vcmpeqfp_p:
case Intrinsic::ppc_altivec_vcmpequb_p:
case Intrinsic::ppc_altivec_vcmpequh_p:
case Intrinsic::ppc_altivec_vcmpequw_p:
case Intrinsic::ppc_altivec_vcmpequd_p:
case Intrinsic::ppc_altivec_vcmpgefp_p:
case Intrinsic::ppc_altivec_vcmpgtfp_p:
case Intrinsic::ppc_altivec_vcmpgtsb_p:
case Intrinsic::ppc_altivec_vcmpgtsh_p:
case Intrinsic::ppc_altivec_vcmpgtsw_p:
case Intrinsic::ppc_altivec_vcmpgtsd_p:
case Intrinsic::ppc_altivec_vcmpgtub_p:
case Intrinsic::ppc_altivec_vcmpgtuh_p:
case Intrinsic::ppc_altivec_vcmpgtuw_p:
case Intrinsic::ppc_altivec_vcmpgtud_p:
Known.Zero = ~1U; // All bits but the low one are known to be zero.
// Choose the preferred alignment for the loop ML. For the CPU directives
// listed below, some loops are given 32-byte alignment; everything else is
// answered by TargetLowering::getPrefLoopAlignment.
Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
switch (Subtarget.getCPUDirective()) {
default: break;
// The 970 and POWER4 through POWER10 directives share the logic below.
case PPC::DIR_970:
case PPC::DIR_PWR4:
case PPC::DIR_PWR5:
case PPC::DIR_PWR5X:
case PPC::DIR_PWR6:
case PPC::DIR_PWR6X:
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
case PPC::DIR_PWR9:
case PPC::DIR_PWR10:
// NOTE(review): the statement guarded by this null check, and the break /
// closing-brace lines further down, appear to be missing from this copy --
// confirm against the upstream file before relying on the control flow.
if (!ML)
// DisableInnermostLoopAlign32 is defined outside this block; when it is
// false the innermost-loop rule below is applied.
if (!DisableInnermostLoopAlign32) {
// If the nested loop is an innermost loop, prefer to a 32-byte alignment,
// so that we can decrease cache misses and branch-prediction misses.
// Actual alignment of the loop will depend on the hotness check and other
// logic in alignBlocks.
if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
return Align(32);
const PPCInstrInfo *TII = Subtarget.getInstrInfo();
// For small loops (between 5 and 8 instructions), align to a 32-byte
// boundary so that the entire loop fits in one instruction-cache line.
// The size is accumulated in bytes over every instruction of every block.
uint64_t LoopSize = 0;
for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) {
LoopSize += TII->getInstSizeInBytes(*J);
// Anything past 32 bytes cannot satisfy the range test below.
if (LoopSize > 32)
// More than 16 and at most 32 bytes of instructions.
if (LoopSize > 16 && LoopSize <= 32)
return Align(32);
// Otherwise defer to the generic implementation.
return TargetLowering::getPrefLoopAlignment(ML);
/// getConstraintType - Given a constraint, return the type of
/// constraint it is for this target.
///
/// The single letters 'b', 'r', 'f', 'd', 'v' and 'y' and the two-letter
/// constraints "wc", "wa", "wd", "wf", "ws", "wi" and "ww" are register-class
/// constraints; 'Z' is a memory constraint. Anything else is deferred to
/// TargetLowering::getConstraintType.
PPCTargetLowering::ConstraintType
PPCTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 'b':
    case 'r':
    case 'f':
    case 'd':
    case 'v':
    case 'y':
      return C_RegisterClass;
    case 'Z':
      // FIXME: While Z does indicate a memory constraint, it specifically
      // indicates an r+r address (used in conjunction with the 'y' modifier
      // in the replacement string). Currently, we're forcing the base
      // register to be r0 in the asm printer (which is interpreted as zero)
      // and forming the complete address in the second register. This is
      // suboptimal.
      return C_Memory;
    }
  } else if (Constraint == "wc") { // individual CR bits.
    return C_RegisterClass;
  } else if (Constraint == "wa" || Constraint == "wd" ||
             Constraint == "wf" || Constraint == "ws" ||
             Constraint == "wi" || Constraint == "ww") {
    return C_RegisterClass; // VSX registers.
  }
  // Unrecognised single letters and all other strings use the generic rules.
  return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
///
/// NOTE(review): the line carrying the return type and function name is
/// missing from this copy; the fallback call below suggests this is the
/// override of TargetLowering::getSingleConstraintMatchWeight -- confirm.
AsmOperandInfo &info, const char *constraint) const {
// Start from "no match"; the switch at the bottom may raise it.
ConstraintWeight weight = CW_Invalid;
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
// The two-letter "w?" constraints are matched by full string comparison.
if (StringRef(constraint) == "wc" && type->isIntegerTy(1))
return CW_Register; // an individual CR bit.
// NOTE(review): the type test that completes this condition is not present
// in this copy -- confirm against the upstream file.
else if ((StringRef(constraint) == "wa" ||
StringRef(constraint) == "wd" ||
StringRef(constraint) == "wf") &&
return CW_Register;
else if (StringRef(constraint) == "wi" && type->isIntegerTy(64))
return CW_Register; // just hold 64-bit integers data.
else if (StringRef(constraint) == "ws" && type->isDoubleTy())
return CW_Register;
else if (StringRef(constraint) == "ww" && type->isFloatTy())
return CW_Register;
// Single-letter constraints are dispatched on the first character.
// NOTE(review): no default label or break statements appear in this copy;
// the generic fallback call directly below reads like a default arm.
switch (*constraint) {
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
case 'b':
if (type->isIntegerTy())
weight = CW_Register;
case 'f':
if (type->isFloatTy())
weight = CW_Register;
case 'd':
if (type->isDoubleTy())
weight = CW_Register;
case 'v':
if (type->isVectorTy())
weight = CW_Register;
// 'y' and 'Z' assign their weight without checking the operand type.
case 'y':
weight = CW_Register;
case 'Z':
weight = CW_Memory;
return weight;
// Map an inline-asm constraint string and value type to a register and/or
// register class. The result pairs a physical register number (0U where only
// a class is chosen) with a register class pointer.
// NOTE(review): closing braces and any break statements between cases are
// absent from this copy, so the nesting shown here is not reliable.
std::pair<unsigned, const TargetRegisterClass *>
PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
if (Constraint.size() == 1) {
// GCC RS6000 Constraint Letters
switch (Constraint[0]) {
// The NOX0/NOR0 classes are used for 'b'; the 64-bit class is picked only
// for i64 values on a PPC64 subtarget.
case 'b': // R1-R31
if (VT == MVT::i64 && Subtarget.isPPC64())
return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
case 'r': // R0-R31
if (VT == MVT::i64 && Subtarget.isPPC64())
return std::make_pair(0U, &PPC::G8RCRegClass);
return std::make_pair(0U, &PPC::GPRCRegClass);
// 'd' and 'f' constraints are both defined to be "the floating point
// registers", where one is for 32-bit and the other for 64-bit. We don't
// really care overly much here so just give them all the same reg classes.
case 'd':
case 'f':
// With SPE the GPRC/SPERC classes are returned instead of the F4RC/F8RC
// classes.
if (Subtarget.hasSPE()) {
if (VT == MVT::f32 || VT == MVT::i32)
return std::make_pair(0U, &PPC::GPRCRegClass);
if (VT == MVT::f64 || VT == MVT::i64)
return std::make_pair(0U, &PPC::SPERCRegClass);
} else {
if (VT == MVT::f32 || VT == MVT::i32)
return std::make_pair(0U, &PPC::F4RCRegClass);
if (VT == MVT::f64 || VT == MVT::i64)
return std::make_pair(0U, &PPC::F8RCRegClass);
if (VT == MVT::v4f64 && Subtarget.hasQPX())
return std::make_pair(0U, &PPC::QFRCRegClass);
if (VT == MVT::v4f32 && Subtarget.hasQPX())
return std::make_pair(0U, &PPC::QSRCRegClass);
// 'v': the QPX classes are tried for the two QPX vector types first, then
// the Altivec class.
case 'v':
if (VT == MVT::v4f64 && Subtarget.hasQPX())
return std::make_pair(0U, &PPC::QFRCRegClass);
if (VT == MVT::v4f32 && Subtarget.hasQPX())
return std::make_pair(0U, &PPC::QSRCRegClass);
if (Subtarget.hasAltivec())
return std::make_pair(0U, &PPC::VRRCRegClass);
case 'y': // crrc
return std::make_pair(0U, &PPC::CRRCRegClass);
} else if (Constraint == "wc" && Subtarget.useCRBits()) {
// An individual CR bit.
return std::make_pair(0U, &PPC::CRBITRCRegClass);
} else if ((Constraint == "wa" || Constraint == "wd" ||
Constraint == "wf" || Constraint == "wi") &&
Subtarget.hasVSX()) {
return std::make_pair(0U, &PPC::VSRCRegClass);
} else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) {
// f32 with P8 vector support gets the VSSRC class; otherwise VSFRC.
if (VT == MVT::f32 && Subtarget.hasP8Vector())
return std::make_pair(0U, &PPC::VSSRCRegClass);
return std::make_pair(0U, &PPC::VSFRCRegClass);
// If we name a VSX register, we can't defer to the base class because it
// will not recognize the correct register (their names will be VSL{0-31}
// and V{0-31} so they won't match). So we match them here.
// The test looks for 'v','s' at indices 1 and 2, i.e. a "{vsN}"-style name.
if (Constraint.size() > 3 && Constraint[1] == 'v' && Constraint[2] == 's') {
// NOTE(review): the pointer operand of atoi is missing in this copy (only
// "+ 3" remains); presumably it is the constraint's character data, so the
// digits after the 3-character prefix are parsed -- confirm upstream.
int VSNum = atoi( + 3);
assert(VSNum >= 0 && VSNum <= 63 &&
"Attempted to access a vsr out of range");
// Numbers 0-31 index from VSL0; 32-63 index from V0.
if (VSNum < 32)
return std::make_pair(PPC::VSL0 + VSNum, &PPC::VSRCRegClass);
return std::make_pair(PPC::V0 + VSNum - 32, &PPC::VSRCRegClass);
// Everything not handled above goes through the generic lookup.
std::pair<unsigned, const TargetRegisterClass *> R =
TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
// (which we call X[0-9]+). If a 64-bit value has been requested, and a
// 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
// register.
// FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
// the AsmName field from *RegisterInfo.td, then this would not be necessary.
// NOTE(review): the last clause of this condition and the second element of
// the returned pair are missing from this copy -- confirm upstream.
if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
return std::make_pair(TRI->getMatchingSuperReg(R.first,
PPC::sub_32, &PPC::G8RCRegClass),
// GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
// Applied only when the generic lookup found no register class; the
// comparison is case-insensitive.
if (!R.second && StringRef("{cc}").equals_lower(Constraint)) {
R.first = PPC::CR0;
R.second = &PPC::CRRCRegClass;
return R;
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
///
/// The immediate constraints 'I' through 'P' are validated here; other
/// single-letter constraints are forwarded to the TargetLowering version.
/// NOTE(review): the declaration of the Ops parameter, the break statements
/// between cases and the statements guarded by the final Result check are
/// missing from this copy -- confirm against the upstream file.
void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
SelectionDAG &DAG) const {
SDValue Result;
// Only support length 1 constraints.
if (Constraint.length() > 1) return;
char Letter = Constraint[0];
switch (Letter) {
default: break;
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P': {
// All of these letters require a constant operand.
ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
if (!CST) return; // Must be an immediate to match.
SDLoc dl(Op);
// The constant is read sign-extended to 64 bits.
int64_t Value = CST->getSExtValue();
EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
// numbers are printed as such.
// Each letter applies its own range test; Result is set only on success.
switch (Letter) {
default: llvm_unreachable("Unknown constraint letter!");
case 'I': // "I" is a signed 16-bit constant.
if (isInt<16>(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
if (isShiftedUInt<16, 16>(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
if (isShiftedInt<16, 16>(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
if (isUInt<16>(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
case 'M': // "M" is a constant that is greater than 31.
if (Value > 31)
Result = DAG.getTargetConstant(Value, dl, TCVT);
case 'N': // "N" is a positive constant that is an exact power of two.
if (Value > 0 && isPowerOf2_64(Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
case 'O': // "O" is the constant zero.
if (Value == 0)
Result = DAG.getTargetConstant(Value, dl, TCVT);
case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
// NOTE(review): -Value overflows when Value is the minimum int64_t.
if (isInt<16>(-Value))
Result = DAG.getTargetConstant(Value, dl, TCVT);
// A non-null Result means one of the immediate tests above succeeded.
if (Result.getNode()) {
// Handle standard constraint letters.
TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
//
// DL, AS and I are not consulted. The decision depends only on Ty (vector or
// not) and on AM's BaseOffs, BaseGV, HasBaseReg and Scale fields.
bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS, Instruction *I) const {
  // PPC does not allow r+i addressing modes for vectors!
  if (Ty->isVectorTy() && AM.BaseOffs != 0)
    return false;

  // PPC allows a sign-extended 16-bit immediate field.
  // NOTE(review): the bounds below are built from 1 << 16, which is wider than
  // a signed 16-bit range; kept as-is to preserve existing behaviour.
  if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
    return false;

  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  // PPC only supports r+r and r+i; decide by the scale on the index register.
  switch (AM.Scale) {
  case 0:  // "r+i" or just "i", depending on HasBaseReg.
    break;
  case 1:
    if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
      return false;
    // Otherwise we have r+r or r+i.
    break;
  case 2:
    if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r or 2*r+i is not allowed.
      return false;
    // Allow 2*r as r+r.
    break;
  default:
    // No other scales are supported.
    return false;
  }

  return true;
}
/// Lower a return-address query. Operand 0 of Op is the constant frame depth.
///
/// For depth 0 the return address is loaded from this function's return
/// address frame index. For deeper frames it is loaded from the frame address
/// produced by LowerFRAMEADDR plus the ABI return-save offset.
///
/// \returns The loaded return address, or an empty SDValue when
///          verifyReturnAddressArgumentIsConstant reports a problem.
SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  // Make sure the function does not optimize away the store of the RA to
  // the stack.
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setLRStoreRequired();
  bool isPPC64 = Subtarget.isPPC64();
  auto PtrVT = getPointerTy(MF.getDataLayout());

  if (Depth > 0) {
    // Outer frame: the return address sits at a fixed offset from that
    // frame's address.
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset =
        DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl,
                        isPPC64 ? MVT::i64 : MVT::i32);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address off the stack.
  SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}
/// Lower a frame-address query. Operand 0 of Op is the constant frame depth.
///
/// The address starts as a copy from the frame register and is then
/// dereferenced once per level of depth.
SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setFrameAddressIsTaken(true);

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  bool isPPC64 = PtrVT == MVT::i64;

  // Naked functions never have a frame pointer, and so we use r1. For all
  // other functions, this decision must be delayed until during PEI.
  unsigned FrameReg;
  if (MF.getFunction().hasFnAttribute(Attribute::Naked))
    FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
  else
    FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;

  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
                                         PtrVT);
  // Each iteration loads through the current frame address to reach the
  // next frame out.
  while (Depth--)
    FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
                            FrameAddr, MachinePointerInfo());
  return FrameAddr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
//
// Resolve the register named by a global register variable. Only "r1", "r2"
// and "r13" are recognised, and "r2" is rejected on PPC64. VT must be a 32-bit
// scalar, or a 64-bit scalar on PPC64. Any other name or type is a fatal
// error.
Register PPCTargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
  bool isPPC64 = Subtarget.isPPC64();

  bool is64Bit = isPPC64 && VT == LLT::scalar(64);
  if (!is64Bit && VT != LLT::scalar(32))
    report_fatal_error("Invalid register global variable type");

  // An invalid (default-constructed) Register marks "no match", both for
  // unknown names and for r2 on PPC64.
  Register Reg = StringSwitch<Register>(RegName)
                     .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
                     .Case("r2", isPPC64 ? Register() : PPC::R2)
                     .Case("r13", (is64Bit ? PPC::X13 : PPC::R13))
                     .Default(Register());

  if (Reg)
    return Reg;
  report_fatal_error("Invalid register name global variable");
}
/// Return true when the address node GA is reached by loading its address
/// from the GOT/TOC rather than being computed directly.
bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
  // 32-bit SVR4 ABI access everything as got-indirect.
  if (Subtarget.is32BitELFABI())
    return true;

  // AIX accesses everything indirectly through the TOC, which is similar to
  // the GOT.
  if (Subtarget.isAIXABI())
    return true;

  CodeModel::Model CModel = getTargetMachine().getCodeModel();
  // If it is small or large code model, module locals are accessed
  // indirectly by loading their address from .toc/.got.
  if (CModel == CodeModel::Small || CModel == CodeModel::Large)
    return true;

  // JumpTable and BlockAddress are accessed as got-indirect.
  if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA))
    return true;

  // For a global address the subtarget decides per global.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA))
    return Subtarget.isGVIndirectSymbol(G->getGlobal());

  return false;
}
PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// The PowerPC target isn't yet aware of offsets.
return false;
// Describe the memory access performed by a PowerPC load/store intrinsic.
// For a recognised intrinsic the fields of Info (memVT, ptrVal, offset, size,
// align, flags) are filled in and true is returned; otherwise false.
// MF is not read by the code visible here.
// NOTE(review): the declaration of VT, the default labels and break
// statements of the inner switches, and all closing braces are missing from
// this copy -- confirm the control flow against the upstream file.
bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
// Group 1: loads whose access is described as a window around the pointer.
case Intrinsic::ppc_qpx_qvlfd:
case Intrinsic::ppc_qpx_qvlfs:
case Intrinsic::ppc_qpx_qvlfcd:
case Intrinsic::ppc_qpx_qvlfcs:
case Intrinsic::ppc_qpx_qvlfiwa:
case Intrinsic::ppc_qpx_qvlfiwz:
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_altivec_lvebx:
case Intrinsic::ppc_altivec_lvehx:
case Intrinsic::ppc_altivec_lvewx:
case Intrinsic::ppc_vsx_lxvd2x:
case Intrinsic::ppc_vsx_lxvw4x: {
// Pick the memory value type for the specific intrinsic.
switch (Intrinsic) {
case Intrinsic::ppc_altivec_lvebx:
VT = MVT::i8;
case Intrinsic::ppc_altivec_lvehx:
VT = MVT::i16;
case Intrinsic::ppc_altivec_lvewx:
VT = MVT::i32;
case Intrinsic::ppc_vsx_lxvd2x:
VT = MVT::v2f64;
case Intrinsic::ppc_qpx_qvlfd:
VT = MVT::v4f64;
case Intrinsic::ppc_qpx_qvlfs:
VT = MVT::v4f32;
case Intrinsic::ppc_qpx_qvlfcd:
VT = MVT::v2f64;
case Intrinsic::ppc_qpx_qvlfcs:
VT = MVT::v2f32;
// Presumably the default arm: remaining intrinsics use v4i32 -- confirm.
VT = MVT::v4i32;
Info.memVT = VT;
// Loads take the address as argument 0.
Info.ptrVal = I.getArgOperand(0);
// The described range begins StoreSize-1 bytes before the pointer and
// covers 2*StoreSize-1 bytes in total.
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
Info.align = Align(1);
Info.flags = MachineMemOperand::MOLoad;
return true;
// Group 2: the "a"-suffixed QPX loads; described as exactly StoreSize bytes
// at the pointer.
case Intrinsic::ppc_qpx_qvlfda:
case Intrinsic::ppc_qpx_qvlfsa:
case Intrinsic::ppc_qpx_qvlfcda:
case Intrinsic::ppc_qpx_qvlfcsa:
case Intrinsic::ppc_qpx_qvlfiwaa:
case Intrinsic::ppc_qpx_qvlfiwza: {
switch (Intrinsic) {
case Intrinsic::ppc_qpx_qvlfda:
VT = MVT::v4f64;
case Intrinsic::ppc_qpx_qvlfsa:
VT = MVT::v4f32;
case Intrinsic::ppc_qpx_qvlfcda:
VT = MVT::v2f64;
case Intrinsic::ppc_qpx_qvlfcsa:
VT = MVT::v2f32;
// Presumably the default arm -- confirm.
VT = MVT::v4i32;
Info.memVT = VT;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.size = VT.getStoreSize();
Info.align = Align(1);
Info.flags = MachineMemOperand::MOLoad;
return true;
// Group 3: stores described with the same window as group 1.
case Intrinsic::ppc_qpx_qvstfd:
case Intrinsic::ppc_qpx_qvstfs:
case Intrinsic::ppc_qpx_qvstfcd:
case Intrinsic::ppc_qpx_qvstfcs:
case Intrinsic::ppc_qpx_qvstfiw:
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
case Intrinsic::ppc_altivec_stvebx:
case Intrinsic::ppc_altivec_stvehx:
case Intrinsic::ppc_altivec_stvewx:
case Intrinsic::ppc_vsx_stxvd2x:
case Intrinsic::ppc_vsx_stxvw4x: {
switch (Intrinsic) {
case Intrinsic::ppc_altivec_stvebx:
VT = MVT::i8;
case Intrinsic::ppc_altivec_stvehx:
VT = MVT::i16;
case Intrinsic::ppc_altivec_stvewx:
VT = MVT::i32;
case Intrinsic::ppc_vsx_stxvd2x:
VT = MVT::v2f64;
case Intrinsic::ppc_qpx_qvstfd:
VT = MVT::v4f64;
case Intrinsic::ppc_qpx_qvstfs:
VT = MVT::v4f32;
case Intrinsic::ppc_qpx_qvstfcd:
VT = MVT::v2f64;
case Intrinsic::ppc_qpx_qvstfcs:
VT = MVT::v2f32;
// Presumably the default arm -- confirm.
VT = MVT::v4i32;
Info.memVT = VT;
// Stores take the address as argument 1.
Info.ptrVal = I.getArgOperand(1);
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
Info.align = Align(1);
Info.flags = MachineMemOperand::MOStore;
return true;
// Group 4: the "a"-suffixed QPX stores; exactly StoreSize bytes at the
// pointer.
case Intrinsic::ppc_qpx_qvstfda:
case Intrinsic::ppc_qpx_qvstfsa:
case Intrinsic::ppc_qpx_qvstfcda:
case Intrinsic::ppc_qpx_qvstfcsa:
case Intrinsic::ppc_qpx_qvstfiwa: {
switch (Intrinsic) {
case Intrinsic::ppc_qpx_qvstfda:
VT = MVT::v4f64;
case Intrinsic::ppc_qpx_qvstfsa:
VT = MVT::v4f32;
case Intrinsic::ppc_qpx_qvstfcda:
VT = MVT::v2f64;
case Intrinsic::ppc_qpx_qvstfcsa:
VT = MVT::v2f32;
// Presumably the default arm -- confirm.
VT = MVT::v4i32;
Info.memVT = VT;
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
Info.size = VT.getStoreSize();
Info.align = Align(1);
Info.flags = MachineMemOperand::MOStore;
return true;
// Not one of the intrinsics described above.
return false;
/// Pick the value type used when a memory operation (memset/memcpy-style, as
/// described by Op) is expanded into loads and stores.
///
/// The generic contract allows returning EVT::Other to request
/// target-independent logic, but this implementation always names a concrete
/// type. When optimizing it may choose v4f64 (QPX) or v4i32 (Altivec/VSX);
/// otherwise it chooses i64 on PPC64 and i32 elsewhere.
EVT PPCTargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
    // When expanding a memset, require at least two QPX instructions to cover
    // the cost of loading the value to be stored from the constant pool.
    if (Subtarget.hasQPX() && Op.size() >= 32 &&
        (Op.isMemcpy() || Op.size() >= 64) && Op.isAligned(Align(32)) &&
        !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
      return MVT::v4f64;
    }

    // We should use Altivec/VSX loads and stores when available. For unaligned
    // addresses, unaligned VSX loads are only fast starting with the P8.
    if (Subtarget.hasAltivec() && Op.size() >= 16 &&
        (Op.isAligned(Align(16)) ||
         ((Op.isMemset() && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
      return MVT::v4i32;
  }

  // Scalar fallback: the widest integer register of the subtarget.
  if (Subtarget.isPPC64()) {
    return MVT::i64;
  }

  return MVT::i32;
}
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
///
/// The answer depends only on Ty: true when its primitive size is between
/// 1 and 64 bits inclusive. Imm is not inspected.
bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // A size of 0 means the type has no primitive size.
  return BitSize != 0 && BitSize <= 64;
}
/// Return true when truncating from Ty1 to Ty2 costs nothing. Only the
/// 64-bit to 32-bit integer truncation qualifies.
bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 == 64 && NumBits2 == 32;
}
/// Return true when truncating from VT1 to VT2 costs nothing. Only the
/// 64-bit to 32-bit integer truncation qualifies.
bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 == 64 && NumBits2 == 32;
}
/// Return true when zero-extending Val to VT2 costs nothing on this target.
///
/// A load of i1, i8 or i16 (or i32 on PPC64) that is either non-extending or
/// already zero-extending qualifies; every other case is left to
/// TargetLowering::isZExtFree.
bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  // Generally speaking, zexts are not free, but they are free when they can be
  // folded with other operations.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
    EVT MemVT = LD->getMemoryVT();
    if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
         (Subtarget.isPPC64() && MemVT == MVT::i32)) &&
        (LD->getExtensionType() == ISD::NON_EXTLOAD ||
         LD->getExtensionType() == ISD::ZEXTLOAD))
      return true;
  }

  // FIXME: Add other cases...
  //  - 32-bit shifts with a zext to i64
  //  - zext after ctlz, bswap, etc.
  //  - zext after and by a constant mask

  return TargetLowering::isZExtFree(Val, VT2);
}
/// Reports whether a floating-point extension from \p SrcVT to \p DestVT is
/// free. Both types must be floating point (asserted).
bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
"invalid fpext types");
// Extending to float128 is not free.
if (DestVT == MVT::f128)
return false;
// Every other destination type is reported as free.
return true;
/// An integer-compare immediate is legal when it fits in 16 bits, either as a
/// signed or as an unsigned value.
bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return isInt<16>(Imm) || isUInt<16>(Imm);
/// An add immediate is legal when it fits in 16 bits, either as a signed or
/// as an unsigned value (same test as isLegalICmpImmediate).
bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
return isInt<16>(Imm) || isUInt<16>(Imm);
/// Reports whether a misaligned access of type \p VT is allowed. When the
/// access is allowed and \p Fast is non-null, *Fast is set to true.
bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
bool *Fast) const {
// The DisablePPCUnaligned switch turns unaligned accesses off entirely.
if (DisablePPCUnaligned)
return false;
// PowerPC supports unaligned memory access for simple non-vector types.
// Although accessing unaligned addresses is not as efficient as accessing
// aligned addresses, it is generally more efficient than manual expansion,
// and generally only traps for software emulation when crossing page
// boundaries.
if (!VT.isSimple())
return false;
// NOTE(review): the condition below ends in '&&' with no further operand
// visible here; its final clause appears to be missing -- confirm.
if (VT.isFloatingPoint() && !VT.isVector() &&
return false;
// Vector types: only the four 128-bit types listed below are accepted, and
// only when VSX is available.
if (VT.getSimpleVT().isVector()) {
if (Subtarget.hasVSX()) {
if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
VT != MVT::v4f32 && VT != MVT::v4i32)
return false;
} else {
return false;
// ppcf128 is never allowed to be accessed misaligned.
if (VT == MVT::ppcf128)
return false;
if (Fast)
*Fast = true;
return true;
/// MachineFunction/EVT convenience overload: converts \p VT to an IR type in
/// the function's context and forwards to the Function/Type overload.
bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
return isFMAFasterThanFMulAndFAdd(
MF.getFunction(), VT.getTypeForEVT(MF.getFunction().getContext()));
/// Function/Type overload: decides on the scalar element type of \p Ty.
/// float and double report true; fp128 reports true only with P9 vector
/// support. \p F is not used in the visible body.
bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
Type *Ty) const {
switch (Ty->getScalarType()->getTypeID()) {
case Type::FloatTyID:
case Type::DoubleTyID:
return true;
case Type::FP128TyID:
return Subtarget.hasP9Vector();
// Result for the remaining type IDs.
return false;
// Currently this is a copy from AArch64TargetLowering::isProfitableToHoist.
// FIXME: add more patterns which are profitable to hoist.
// Returns false only for a single-use FMul whose user is an FAdd/FSub that
// could be fused into an FMA; every other instruction is profitable to hoist.
bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {
if (I->getOpcode() != Instruction::FMul)
return true;
if (!I->hasOneUse())
return true;
Instruction *User = I->user_back();
assert(User && "A single use instruction with no uses.");
if (User->getOpcode() != Instruction::FSub &&
User->getOpcode() != Instruction::FAdd)
return true;
const TargetOptions &Options = getTargetMachine().Options;
const Function *F = I->getFunction();
const DataLayout &DL = F->getParent()->getDataLayout();
Type *Ty = User->getOperand(0)->getType();
// Not profitable when all three hold: FMA is faster for this type, FMA is
// legal or custom for it, and fusion is permitted by the target options.
return !(
isFMAFasterThanFMulAndFAdd(*F, Ty) &&
isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
(Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));
// Returns a pointer to a static, zero-terminated list of scratch registers
// (X12, LR8, CTR8). The calling-convention argument is ignored.
const MCPhysReg *
PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
// LR is a callee-save register, but we must treat it as clobbered by any call
// site. Hence we include LR in the scratch registers, which are in turn added
// as implicit-defs for stackmaps and patchpoints. The same reasoning applies
// to CTR, which is used by any indirect call.
static const MCPhysReg ScratchRegs[] = {
PPC::X12, PPC::LR8, PPC::CTR8, 0
return ScratchRegs;
// Returns the register used for the exception pointer: X3 on 64-bit
// subtargets, R3 otherwise. PersonalityFn is not consulted.
Register PPCTargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
// Returns the register used for the exception selector: X4 on 64-bit
// subtargets, R4 otherwise. PersonalityFn is not consulted.
Register PPCTargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
EVT VT , unsigned DefinedValues) const {
// NOTE(review): the start of this signature is not visible here; the fallback
// call at the end suggests this is the PPC override of
// shouldExpandBuildVectorWithShuffles -- confirm.
// v2i64 is decided solely by the availability of direct moves.
if (VT == MVT::v2i64)
return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves
// With VSX or QPX, always report true.
if (Subtarget.hasVSX() || Subtarget.hasQPX())
return true;
return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
// Returns Sched::ILP unless the ILP preference is disabled (DisableILPPref)
// or the subtarget enables the machine scheduler, in which case the
// TargetLowering default is used.
Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
if (DisableILPPref || Subtarget.enableMachineScheduler())
return TargetLowering::getSchedulingPreference(N);
return Sched::ILP;
// Create a fast isel object.
// Thin wrapper that forwards both arguments to PPC::createFastISel.
FastISel *
PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) const {
return PPC::createFastISel(FuncInfo, LibInfo);
// 'Inverted' means the FMA opcode after negating one multiplicand.
// For example, (fma -a b c) = (fnmsub a b c)
// Maps an FMA-like opcode to its inverted counterpart; reaching the
// llvm_unreachable below means the opcode was not an FMA-like one.
// NOTE(review): the case labels of this switch look incomplete here -- confirm.
static unsigned invertFMAOpcode(unsigned Opc) {
switch (Opc) {
llvm_unreachable("Invalid FMA opcode for PowerPC!");
case ISD::FMA:
return ISD::FMA;
// Builds an expression equal to the negation of Op for the FNMSUB-style
// patterns described in the comments below and stores its cost in Cost.
// Returns an empty SDValue when the recursion limit is exceeded or when the
// addend cannot be negated; otherwise falls back to the TargetLowering
// implementation.
SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
bool LegalOps, bool OptForSize,
NegatibleCost &Cost,
unsigned Depth) const {
// Bound the recursion performed through the nested calls below.
if (Depth > SelectionDAG::MaxRecursionDepth)
return SDValue();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
SDNodeFlags Flags = Op.getNode()->getFlags();
switch (Opc) {
// TODO: QPX subtarget is deprecated. No transformation here.
if (!Op.hasOneUse() || !isTypeLegal(VT) || Subtarget.hasQPX())
const TargetOptions &Options = getTargetMachine().Options;
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
SDLoc Loc(Op);
// Every rewrite below needs the negated addend (third operand).
NegatibleCost N2Cost = NegatibleCost::Expensive;
SDValue NegN2 =
getNegatedExpression(N2, DAG, LegalOps, OptForSize, N2Cost, Depth + 1);
if (!NegN2)
return SDValue();
// (fneg (fnmsub a b c)) => (fnmsub (fneg a) b (fneg c))
// (fneg (fnmsub a b c)) => (fnmsub a (fneg b) (fneg c))
// These transformations may change sign of zeroes. For example,
// -(-ab-(-c))=-0 while -(-(ab-c))=+0 when a=b=c=1.
if (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) {
// Try and choose the cheaper one to negate.
NegatibleCost N0Cost = NegatibleCost::Expensive;
SDValue NegN0 = getNegatedExpression(N0, DAG, LegalOps, OptForSize,
N0Cost, Depth + 1);
NegatibleCost N1Cost = NegatibleCost::Expensive;
SDValue NegN1 = getNegatedExpression(N1, DAG, LegalOps, OptForSize,
N1Cost, Depth + 1);
// Prefer negating the first multiplicand when that is possible and no more
// costly than negating the second.
if (NegN0 && N0Cost <= N1Cost) {
Cost = std::min(N0Cost, N2Cost);
return DAG.getNode(Opc, Loc, VT, NegN0, N1, NegN2, Flags);
} else if (NegN1) {
Cost = std::min(N1Cost, N2Cost);
return DAG.getNode(Opc, Loc, VT, N0, NegN1, NegN2, Flags);
// (fneg (fnmsub a b c)) => (fma a b (fneg c))
if (isOperationLegal(ISD::FMA, VT)) {
Cost = N2Cost;
return DAG.getNode(ISD::FMA, Loc, VT, N0, N1, NegN2, Flags);
// Anything not handled above is left to the generic implementation.
return TargetLowering::getNegatedExpression(Op, DAG, LegalOps, OptForSize,
Cost, Depth);
// Override to enable LOAD_STACK_GUARD lowering on Linux.
// Non-Linux targets keep the TargetLowering default.
bool PPCTargetLowering::useLoadStackGuardNode() const {
if (!Subtarget.isTargetLinux())
return TargetLowering::useLoadStackGuardNode();
return true;
// Override to disable global variable loading on Linux.
// Non-Linux targets forward to the TargetLowering implementation; no other
// statement is visible for the Linux case.
void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
if (!Subtarget.isTargetLinux())
return TargetLowering::insertSSPDeclarations(M);
// Reports whether the floating-point immediate Imm is legal for type VT.
// Always false for non-simple types or when VSX is unavailable.
// ForCodeSize is not consulted in the visible body.
bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
if (!VT.isSimple() || !Subtarget.hasVSX())
return false;
switch(VT.getSimpleVT().SimpleTy) {
// For FP types that are currently not supported by PPC backend, return
// false. Examples: f16, f80.
return false;
case MVT::f32:
case MVT::f64:
if (Subtarget.hasPrefixInstrs()) {
// With prefixed instructions, we can materialize anything that can be
// represented with a 32-bit immediate, not just positive zero.
// Work on a copy because Imm is const and the helper takes the value by
// non-const lvalue.
APFloat APFloatOfImm = Imm;
return convertToNonDenormSingle(APFloatOfImm);
case MVT::ppcf128:
// Only positive zero is accepted on this path.
return Imm.isPosZero();
// For vector shift operation op, fold
// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
// Returns the folded PPCISD shift node, or an empty SDValue when the pattern
// does not match. N must be an ISD::SHL, ISD::SRL or ISD::SRA node.
static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
unsigned Opcode = N->getOpcode();
unsigned TargetOpcode;
// Pick the PPC-specific opcode that corresponds to the generic shift.
switch (Opcode) {
llvm_unreachable("Unexpected shift operation");
case ISD::SHL:
TargetOpcode = PPCISD::SHL;
case ISD::SRL:
TargetOpcode = PPCISD::SRL;
case ISD::SRA:
TargetOpcode = PPCISD::SRA;
// Fold only for vector types where the generic shift is legal and the shift
// amount is masked with a constant (or constant splat) equal to the scalar
// element width minus one.
if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) &&
N1->getOpcode() == ISD::AND)
if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1)))
if (Mask->getZExtValue() == OpSizeInBits - 1)
return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0));
return SDValue();
// DAG combine for ISD::SHL. First tries stripModuloOnShift; otherwise, on
// ISA 3.0, turns an i64 shift-left by a constant of a sign-extended i32 into
// a PPCISD::EXTSWSLI node. Returns an empty SDValue when nothing applies.
SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
return Value;
SDValue N0 = N->getOperand(0);
ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
// Required shape: (shl (sign_extend i32 x), constant) producing i64.
if (!Subtarget.isISA3_0() ||
N0.getOpcode() != ISD::SIGN_EXTEND ||
N0.getOperand(0).getValueType() != MVT::i32 ||
CN1 == nullptr || N->getValueType(0) != MVT::i64)
return SDValue();
// We can't save an operation here if the value is already extended, and
// the existing shift is easier to combine.
SDValue ExtsSrc = N0.getOperand(0);
if (ExtsSrc.getOpcode() == ISD::TRUNCATE &&
ExtsSrc.getOperand(0).getOpcode() == ISD::AssertSext)
return SDValue();
SDLoc DL(N0);
SDValue ShiftBy = SDValue(CN1, 0);
// We want the shift amount to be i32 on the extswli, but the shift could
// have an i64.
if (ShiftBy.getValueType() == MVT::i64)
ShiftBy = DCI.DAG.getConstant(CN1->getZExtValue(), DL, MVT::i32);
// NOTE(review): the argument list of the call below is cut off after the
// source operand; the shift amount is presumably the last argument -- confirm.
return DCI.DAG.getNode(PPCISD::EXTSWSLI, DL, MVT::i64, N0->getOperand(0),
// DAG combine for ISD::SRA: only the stripModuloOnShift fold is attempted.
// Returns an empty SDValue when it does not apply.
SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
return Value;
return SDValue();
// DAG combine for ISD::SRL: only the stripModuloOnShift fold is attempted.
// Returns an empty SDValue when it does not apply.
SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
return Value;
return SDValue();
// Transform (add X, (zext(setne Z, C))) -> (addze X, (addic (addi Z, -C), -1))
// Transform (add X, (zext(sete Z, C))) -> (addze X, (subfic (addi Z, -C), 0))
// When C is zero, the equation (addi Z, -C) can be simplified to Z
// Requirement: -C in [-32768, 32767], X and Z are MVT::i64 types
// Returns the replacement node, or an empty SDValue when the subtarget is not
// 64-bit, neither operand matches, or the condition code is not SETNE/SETEQ.
static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) {
if (!Subtarget.isPPC64())
return SDValue();
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
// Matches a single-use i64 zext of a single-use setcc that compares an i64
// value against a constant whose negation fits in 16 signed bits.
auto isZextOfCompareWithConstant = [](SDValue Op) {
if (Op.getOpcode() != ISD::ZERO_EXTEND || !Op.hasOneUse() ||
Op.getValueType() != MVT::i64)
return false;
SDValue Cmp = Op.getOperand(0);
if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse() ||
Cmp.getOperand(0).getValueType() != MVT::i64)
return false;
if (auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1))) {
int64_t NegConstant = 0 - Constant->getSExtValue();
// Due to the limitations of the addi instruction,
// -C is required to be [-32768, 32767].
return isInt<16>(NegConstant);
return false;
bool LHSHasPattern = isZextOfCompareWithConstant(LHS);
bool RHSHasPattern = isZextOfCompareWithConstant(RHS);
// If there is a pattern, canonicalize a zext operand to the RHS.
if (LHSHasPattern && !RHSHasPattern)
std::swap(LHS, RHS);
else if (!LHSHasPattern && !RHSHasPattern)
return SDValue();
SDLoc DL(N);
// Result list of the final ADDE: the i64 sum plus a glue value.
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue);
SDValue Cmp = RHS.getOperand(0);
SDValue Z = Cmp.getOperand(0);
auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
assert(Constant && "Constant Should not be a null pointer.");
int64_t NegConstant = 0 - Constant->getSExtValue();
switch(cast<CondCodeSDNode>(Cmp.getOperand(2))->get()) {
default: break;
case ISD::SETNE: {
// when C == 0
// --> addze X, (addic Z, -1).carry
// /
// add X, (zext(setne Z, C))--
// \ when -32768 <= -C <= 32767 && C != 0
// --> addze X, (addic (addi Z, -C), -1).carry
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,
DAG.getConstant(NegConstant, DL, MVT::i64));
// Skip the (Z - C) node when C is zero.
SDValue AddOrZ = NegConstant != 0 ? Add : Z;
SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
AddOrZ, DAG.getConstant(-1ULL, DL, MVT::i64));
// Result 1 of the ADDC (its glue output) is passed as the last ADDE operand.
return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
SDValue(Addc.getNode(), 1));
case ISD::SETEQ: {
// when C == 0
// --> addze X, (subfic Z, 0).carry
// /
// add X, (zext(sete Z, C))--
// \ when -32768 <= -C <= 32767 && C != 0
// --> addze X, (subfic (addi Z, -C), 0).carry
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,
DAG.getConstant(NegConstant, DL, MVT::i64));
// Skip the (Z - C) node when C is zero.
SDValue AddOrZ = NegConstant != 0 ? Add : Z;
SDValue Subc = DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
DAG.getConstant(0, DL, MVT::i64), AddOrZ);
// Result 1 of the SUBC (its glue output) is passed as the last ADDE operand.
return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
SDValue(Subc.getNode(), 1));
return SDValue();
// Transform
// (add C1, (MAT_PCREL_ADDR GlobalAddr+C2)) to
// (MAT_PCREL_ADDR GlobalAddr+(C1+C2))
// In this case both C1 and C2 must be known constants.
// C1+C2 must fit into a 34 bit signed integer.
// Returns the new MAT_PCREL_ADDR node, or an empty SDValue when PC-relative
// calls are not in use or the pattern does not match.
static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) {
if (!Subtarget.isUsingPCRelativeCalls())
return SDValue();
// Check both Operand 0 and Operand 1 of the ADD node for the PCRel node.
// If we find that node try to cast the Global Address and the Constant.
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
// Canonicalize so that the MAT_PCREL_ADDR node, if any, ends up in LHS.
if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
std::swap(LHS, RHS);
if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
return SDValue();
// Operand zero of PPCISD::MAT_PCREL_ADDR is the GA node.
GlobalAddressSDNode *GSDN = dyn_cast<GlobalAddressSDNode>(LHS.getOperand(0));
ConstantSDNode* ConstNode = dyn_cast<ConstantSDNode>(RHS);
// Check that both casts succeeded.
if (!GSDN || !ConstNode)
return SDValue();
int64_t NewOffset = GSDN->getOffset() + ConstNode->getSExtValue();
// The signed int offset needs to fit in 34 bits.
if (!isInt<34>(NewOffset))
return SDValue();
// The new global address is a copy of the old global address except
// that it has the updated Offset.
// NOTE(review): DL is used below but no declaration of it is visible in this
// function here -- confirm where it is defined.
SDValue GA =
DAG.getTargetGlobalAddress(GSDN->getGlobal(), DL, GSDN->getValueType(0),
NewOffset, GSDN->getTargetFlags());
SDValue MatPCRel =
DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, GSDN->getValueType(0), GA);
return MatPCRel;
// DAG combine for ISD::ADD: tries the ADDZE rewrite first, then the
// MAT_PCREL_ADDR offset fold. Returns an empty SDValue when neither applies.
SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
return Value;
if (auto Value = combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget))
return Value;
return SDValue();
// Detect TRUNCATE operations on bitcasts of float128 values.
// What we are looking for here is the situation where we extract a subset
// of bits from a 128 bit float.
// This can be of two forms:
// 1) BITCAST of f128 feeding TRUNCATE
// 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE
// The reason this is required is because we do not have a legal i128 type
// and so we want to prevent having to store the f128 and then reload part
// of it.
// Before that, the CR-bit truncate combine and a truncate-of-abs fold to
// PPCISD::VABSD are attempted. Returns an empty SDValue when nothing applies.
SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
DAGCombinerInfo &DCI) const {
// If we are using CRBits then try that first.
if (Subtarget.useCRBits()) {
// Check if CRBits did anything and return that if it did.
if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI))
return CRTruncValue;
SDLoc dl(N);
SDValue Op0 = N->getOperand(0);
// fold (truncate (abs (sub (zext a), (zext b)))) -> (vabsd a, b)
if (Subtarget.hasP9Altivec() && Op0.getOpcode() == ISD::ABS) {
EVT VT = N->getValueType(0);
if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
return SDValue();
SDValue Sub = Op0.getOperand(0);
if (Sub.getOpcode() == ISD::SUB) {
SDValue SubOp0 = Sub.getOperand(0);
SDValue SubOp1 = Sub.getOperand(1);
if ((SubOp0.getOpcode() == ISD::ZERO_EXTEND) &&
(SubOp1.getOpcode() == ISD::ZERO_EXTEND)) {
// NOTE(review): only one value operand is visible in the call below, while
// the fold comment above names two (a and b) -- confirm the operand list.
return DCI.DAG.getNode(PPCISD::VABSD, dl, VT, SubOp0.getOperand(0),
DCI.DAG.getTargetConstant(0, dl, MVT::i32));
// Looking for a truncate of i128 to i64.
if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
return SDValue();
// Which v2i64 element holds the truncated bits depends on the endianness.
int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0;
// SRL feeding TRUNCATE.
if (Op0.getOpcode() == ISD::SRL) {
ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
// The right shift has to be by 64 bits.
if (!ConstNode || ConstNode->getZExtValue() != 64)
return SDValue();
// Switch the element number to extract.
EltToExtract = EltToExtract ? 0 : 1;
// Update Op0 past the SRL.
Op0 = Op0.getOperand(0);
// BITCAST feeding a TRUNCATE possibly via SRL.
if (Op0.getOpcode() == ISD::BITCAST &&
Op0.getValueType() == MVT::i128 &&
Op0.getOperand(0).getValueType() == MVT::f128) {
// Reinterpret the f128 as v2i64 and extract the selected 64-bit element.
SDValue Bitcast = DCI.DAG.getBitcast(MVT::v2i64, Op0.getOperand(0));
return DCI.DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Bitcast,
DCI.DAG.getTargetConstant(EltToExtract, dl, MVT::i32));
return SDValue();
// DAG combine for ISD::MUL by a constant (or constant splat) of the form
// +/-(2^N + 1) or +/-(2^N - 1): rewrites the multiply as a shift combined
// with add/sub when the CPU-specific profitability check allows it.
// Returns an empty SDValue when no rewrite is performed.
SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
ConstantSDNode *ConstOpOrElement = isConstOrConstSplat(N->getOperand(1));
if (!ConstOpOrElement)
return SDValue();
// An imul is usually smaller than the alternative sequence for legal type.
if (DAG.getMachineFunction().getFunction().hasMinSize() &&
isOperationLegal(ISD::MUL, N->getValueType(0)))
return SDValue();
// Decides per CPU directive whether the shift/add expansion pays off.
// IsNeg: the multiplier is negative; IsAddOne: it has the 2^N + 1 form.
auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
switch (this->Subtarget.getCPUDirective()) {
// TODO: enhance the condition for subtarget before pwr8
return false;
case PPC::DIR_PWR8:
// type mul add shl
// scalar 4 1 1
// vector 7 2 2
return true;
case PPC::DIR_PWR9:
case PPC::DIR_PWR10:
// type mul add shl
// scalar 5 2 2
// vector 7 2 2
// The cycle RATIOs of the related operations are shown in the table above.
// Because mul is 5(scalar)/7(vector), add/sub/shl are all 2 for both
// scalar and vector type. For 2 instrs patterns, add/sub + shl
// are 4, it is always profitable; but for 3 instrs patterns
// (mul x, -(2^N + 1)) => -(add (shl x, N), x), sub + add + shl are 6.
// So we should only do it for vector type.
return IsAddOne && IsNeg ? VT.isVector() : true;
EVT VT = N->getValueType(0);
SDLoc DL(N);
const APInt &MulAmt = ConstOpOrElement->getAPIntValue();
bool IsNeg = MulAmt.isNegative();
APInt MulAmtAbs = MulAmt.abs();
if ((MulAmtAbs - 1).isPowerOf2()) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
// (mul x, -(2^N + 1)) => -(add (shl x, N), x)
if (!IsProfitable(IsNeg, true, VT))
return SDValue();
SDValue Op0 = N->getOperand(0);
SDValue Op1 =
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant((MulAmtAbs - 1).logBase2(), DL, VT));
SDValue Res = DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
if (!IsNeg)
return Res;
// Negative multiplier: negate the sum by subtracting it from zero.
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
} else if ((MulAmtAbs + 1).isPowerOf2()) {
// (mul x, 2^N - 1) => (sub (shl x, N), x)
// (mul x, -(2^N - 1)) => (sub x, (shl x, N))
if (!IsProfitable(IsNeg, false, VT))
return SDValue();
SDValue Op0 = N->getOperand(0);
SDValue Op1 =
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant((MulAmtAbs + 1).logBase2(), DL, VT));
if (!IsNeg)
return DAG.getNode(ISD::SUB, DL, VT, Op1, Op0);
return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1);
} else {
return SDValue();
// Combine fma-like op (like fnmsub) with fnegs to appropriate op. Do this
// in combiner since we need to check SD flags and other subtarget features.
// Returns the rewritten node, or an empty SDValue when the transformation is
// not permitted or neither multiplicand has a cheaper negated form.
SDValue PPCTargetLowering::combineFMALike(SDNode *N,
DAGCombinerInfo &DCI) const {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
SDNodeFlags Flags = N->getFlags();
EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
const TargetOptions &Options = getTargetMachine().Options;
unsigned Opc = N->getOpcode();
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOps = !DCI.isBeforeLegalizeOps();
SDLoc Loc(N);
// TODO: QPX subtarget is deprecated. No transformation here.
if (Subtarget.hasQPX() || !isOperationLegal(ISD::FMA, VT))
return SDValue();
// Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0
// since (fnmsub a b c)=-0 while c-ab=+0.
if (!Flags.hasNoSignedZeros() && !Options.NoSignedZerosFPMath)
return SDValue();
// (fma (fneg a) b c) => (fnmsub a b c)
// (fnmsub (fneg a) b c) => (fma a b c)
if (SDValue NegN0 = getCheaperNegatedExpression(N0, DAG, LegalOps, CodeSize))
return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, NegN0, N1, N2, Flags);
// (fma a (fneg b) c) => (fnmsub a b c)
// (fnmsub a (fneg b) c) => (fma a b c)
if (SDValue NegN1 = getCheaperNegatedExpression(N1, DAG, LegalOps, CodeSize))
return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, N0, NegN1, N2, Flags);
return SDValue();
// Reports whether the call CI may end up emitted as a tail call. Each check
// below rejects a case where that is not possible or not worthwhile; the
// final answer is whether the callee can be assumed DSO-local.
bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// Only duplicate to increase tail-calls for the 64bit SysV ABIs.
if (!Subtarget.is64BitELFABI())
return false;
// If not a tail call then no need to proceed.
if (!CI->isTailCall())
return false;
// If sibling calls have been disabled and tail-calls aren't guaranteed
// there is no reason to duplicate.
auto &TM = getTargetMachine();
if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
return false;
// Can't tail call a function called indirectly, or if it has variadic args.
const Function *Callee = CI->getCalledFunction();
if (!Callee || Callee->isVarArg())
return false;
// Make sure the callee and caller calling conventions are eligible for tco.
const Function *Caller = CI->getParent()->getParent();
// NOTE(review): the argument list of the call below is cut off after the
// caller's calling convention -- confirm the remaining argument(s).
if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(),
return false;
// If the function is local then we have a good chance at tail-calling it
return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
// Reports whether VT has bit-preserving FP logic. Requires VSX; true for
// f32, f64, v4f32 and v2f64, and for f128 when P9 vector support is present.
bool PPCTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
if (!Subtarget.hasVSX())
return false;
if (Subtarget.hasP9Vector() && VT == MVT::f128)
return true;
return VT == MVT::f32 || VT == MVT::f64 ||
VT == MVT::v4f32 || VT == MVT::v2f64;
// Reports whether folding the 'and' instruction AndI with a compare against
// zero is beneficial, based on the mask (operand 1 of AndI).
bool PPCTargetLowering::
isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
const Value *Mask = AndI.getOperand(1);
// If the mask is suitable for andi. or andis. we should sink the and.
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Mask)) {
// Can't handle constants wider than 64-bits.
if (CI->getBitWidth() > 64)
return false;
int64_t ConstVal = CI->getZExtValue();
// Accept a mask that fits in the low 16 bits, or one whose low 16 bits are
// zero and whose value shifted right by 16 fits in 16 bits.
return isUInt<16>(ConstVal) ||
(isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF));
// For non-constant masks, we can always use the record-form and.
return true;
// Transform (abs (sub (zext a), (zext b))) to (vabsd a b 0)
// Transform (abs (sub (zext a), (zext_invec b))) to (vabsd a b 0)
// Transform (abs (sub (zext_invec a), (zext_invec b))) to (vabsd a b 0)
// Transform (abs (sub (zext_invec a), (zext b))) to (vabsd a b 0)
// Transform (abs (sub a, b) to (vabsd a b 1)) if a & b of type v4i32
// Requires an ISD::ABS node and P9 Altivec (both asserted). Returns an empty
// SDValue for result types other than v4i32/v8i16/v16i8 or when no pattern
// matches.
SDValue PPCTargetLowering::combineABS(SDNode *N, DAGCombinerInfo &DCI) const {
assert((N->getOpcode() == ISD::ABS) && "Need ABS node here");
assert(Subtarget.hasP9Altivec() &&
"Only combine this when P9 altivec supported!");
EVT VT = N->getValueType(0);
if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
return SDValue();
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
if (N->getOperand(0).getOpcode() == ISD::SUB) {
// Even for signed integers, if it's known to be positive (as signed
// integer) due to zero-extended inputs.
unsigned SubOpcd0 = N->getOperand(0)->getOperand(0).getOpcode();
unsigned SubOpcd1 = N->getOperand(0)->getOperand(1).getOpcode();
// NOTE(review): the condition and the VABSD operand lists below look
// incomplete here (unbalanced parentheses, no value operands) -- confirm.
if ((SubOpcd0 == ISD::ZERO_EXTEND ||
(SubOpcd1 == ISD::ZERO_EXTEND ||
return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
DAG.getTargetConstant(0, dl, MVT::i32));
// For type v4i32, it can be optimized with xvnegsp + vabsduw
if (N->getOperand(0).getValueType() == MVT::v4i32 &&
N->getOperand(0).hasOneUse()) {
return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
DAG.getTargetConstant(1, dl, MVT::i32));
return SDValue();
// For type v4i32/v8i16/v16i8, transform
// from (vselect (setcc a, b, setugt), (sub a, b), (sub b, a)) to (vabsd a, b)
// from (vselect (setcc a, b, setuge), (sub a, b), (sub b, a)) to (vabsd a, b)
// from (vselect (setcc a, b, setult), (sub b, a), (sub a, b)) to (vabsd a, b)
// from (vselect (setcc a, b, setule), (sub b, a), (sub a, b)) to (vabsd a, b)
// Requires an ISD::VSELECT node and P9 Altivec (both asserted). Returns an
// empty SDValue when the pattern does not match.
SDValue PPCTargetLowering::combineVSelect(SDNode *N,
DAGCombinerInfo &DCI) const {
assert((N->getOpcode() == ISD::VSELECT) && "Need VSELECT node here");
assert(Subtarget.hasP9Altivec() &&
"Only combine this when P9 altivec supported!");
SelectionDAG &DAG = DCI.DAG;
SDLoc dl(N);
SDValue Cond = N->getOperand(0);
SDValue TrueOpnd = N->getOperand(1);
SDValue FalseOpnd = N->getOperand(2);
EVT VT = N->getOperand(1).getValueType();
// The condition must be a SETCC and both selected values must be SUB nodes.
if (Cond.getOpcode() != ISD::SETCC || TrueOpnd.getOpcode() != ISD::SUB ||
FalseOpnd.getOpcode() != ISD::SUB)
return SDValue();
// ABSD only available for type v4i32/v8i16/v16i8
if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
return SDValue();
// At least to save one more dependent computation
if (!(Cond.hasOneUse() || TrueOpnd.hasOneUse() || FalseOpnd.hasOneUse()))
return SDValue();
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Can only handle unsigned comparison here
// NOTE(review): the case labels of this switch are not visible here; per the
// header comment the swap presumably applies to setult/setule -- confirm.
switch (CC) {
return SDValue();
std::swap(TrueOpnd, FalseOpnd);
SDValue CmpOpnd1 = Cond.getOperand(0);
SDValue CmpOpnd2 = Cond.getOperand(1);
// SETCC CmpOpnd1 CmpOpnd2 cond
// TrueOpnd = CmpOpnd1 - CmpOpnd2
// FalseOpnd = CmpOpnd2 - CmpOpnd1
if (TrueOpnd.getOperand(0) == CmpOpnd1 &&
TrueOpnd.getOperand(1) == CmpOpnd2 &&
FalseOpnd.getOperand(0) == CmpOpnd2 &&
FalseOpnd.getOperand(1) == CmpOpnd1) {
return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(1).getValueType(),
CmpOpnd1, CmpOpnd2,
DAG.getTargetConstant(0, dl, MVT::i32));
return SDValue();
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index a0ae05081adc..7570385e38e3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1,302 +1,626 @@
//===-- RISCVISelDAGToDAG.cpp - A dag to dag inst selector for RISCV ------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file defines an instruction selector for the RISCV target.
#include "RISCVISelDAGToDAG.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "Utils/RISCVMatInt.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "riscv-isel"
// NOTE(review): only the header of this function is visible here; its body
// statements are not shown, so its behaviour cannot be documented -- confirm.
void RISCVDAGToDAGISel::PostprocessISelDAG() {
// Materializes the immediate Imm as a chain of machine nodes, following the
// instruction sequence produced by RISCVMatInt::generateInstSeq, and returns
// the last node of the chain.
// NOTE(review): the end of the parameter list (XLenVT is used below) is not
// visible here -- confirm.
static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm,
RISCVMatInt::InstSeq Seq;
RISCVMatInt::generateInstSeq(Imm, XLenVT == MVT::i64, Seq);
SDNode *Result = nullptr;
// The chain starts from register X0.
SDValue SrcReg = CurDAG->getRegister(RISCV::X0, XLenVT);
for (RISCVMatInt::Inst &Inst : Seq) {
SDValue SDImm = CurDAG->getTargetConstant(Inst.Imm, DL, XLenVT);
// LUI takes only the immediate; the other opcodes also take a source register.
if (Inst.Opc == RISCV::LUI)
Result = CurDAG->getMachineNode(RISCV::LUI, DL, XLenVT, SDImm);
Result = CurDAG->getMachineNode(Inst.Opc, DL, XLenVT, SrcReg, SDImm);
// Only the first instruction has X0 as its source.
SrcReg = SDValue(Result, 0);
return Result;
// Returns true if the Node is an ISD::AND with a constant argument. If so,
// set Mask to that constant value.
// Only operand 1 of the AND is checked for being a constant; Mask is left
// untouched when false is returned.
static bool isConstantMask(SDNode *Node, uint64_t &Mask) {
if (Node->getOpcode() == ISD::AND &&
Node->getOperand(1).getOpcode() == ISD::Constant) {
Mask = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
return true;
return false;
// Custom instruction selection entry point for a single DAG node. Handles
// the opcodes in the switch below by hand (ADD with a split immediate,
// Constant, FrameIndex, SRL of a masked value on RV64, and the wide cycle
// counter read).
// NOTE(review): several statements of this function (breaks, returns, some
// call arguments and case labels) are not visible here -- confirm against the
// full source.
void RISCVDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we have already selected.
if (Node->isMachineOpcode()) {
LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
// Instruction Selection not handled by the auto-generated tablegen selection
// should be handled here.
unsigned Opcode = Node->getOpcode();
MVT XLenVT = Subtarget->getXLenVT();
SDLoc DL(Node);
EVT VT = Node->getValueType(0);
switch (Opcode) {
case ISD::ADD: {
// Optimize (add r, imm) to (addi (addi r, imm0) imm1) if applicable. The
// immediate must be in specific ranges and have a single use.
if (auto *ConstOp = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
if (!(ConstOp->hasOneUse()))
// The imm must be in range [-4096,-2049] or [2048,4094].
int64_t Imm = ConstOp->getSExtValue();
if (!(-4096 <= Imm && Imm <= -2049) && !(2048 <= Imm && Imm <= 4094))
// Break the imm to imm0+imm1.
SDLoc DL(Node);
EVT VT = Node->getValueType(0);
// imm0 = Imm - Imm / 2 and imm1 = Imm / 2, so that imm0 + imm1 == Imm.
const SDValue ImmOp0 = CurDAG->getTargetConstant(Imm - Imm / 2, DL, VT);
const SDValue ImmOp1 = CurDAG->getTargetConstant(Imm / 2, DL, VT);
auto *NodeAddi0 = CurDAG->getMachineNode(RISCV::ADDI, DL, VT,
Node->getOperand(0), ImmOp0);
auto *NodeAddi1 = CurDAG->getMachineNode(RISCV::ADDI, DL, VT,
SDValue(NodeAddi0, 0), ImmOp1);
ReplaceNode(Node, NodeAddi1);
case ISD::Constant: {
auto ConstNode = cast<ConstantSDNode>(Node);
// A zero constant of XLEN type is replaced by a copy from a register.
if (VT == XLenVT && ConstNode->isNullValue()) {
SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
ReplaceNode(Node, New.getNode());
int64_t Imm = ConstNode->getSExtValue();
if (XLenVT == MVT::i64) {
ReplaceNode(Node, selectImm(CurDAG, SDLoc(Node), Imm, XLenVT));
case ISD::FrameIndex: {
// A frame index is selected as ADDI of the target frame index and 0.
SDValue Imm = CurDAG->getTargetConstant(0, DL, XLenVT);
int FI = cast<FrameIndexSDNode>(Node)->getIndex();
SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ADDI, DL, VT, TFI, Imm));
case ISD::SRL: {
if (!Subtarget->is64Bit())
SDValue Op0 = Node->getOperand(0);
SDValue Op1 = Node->getOperand(1);
uint64_t Mask;
// Match (srl (and val, mask), imm) where the result would be a
// zero-extended 32-bit integer. i.e. the mask is 0xffffffff or the result
// is equivalent to this (SimplifyDemandedBits may have removed lower bits
// from the mask that aren't necessary due to the right-shifting).
if (Op1.getOpcode() == ISD::Constant &&
isConstantMask(Op0.getNode(), Mask)) {
uint64_t ShAmt = cast<ConstantSDNode>(Op1.getNode())->getZExtValue();
// Restore the low mask bits that the shift discards before comparing.
if ((Mask | maskTrailingOnes<uint64_t>(ShAmt)) == 0xffffffff) {
SDValue ShAmtVal =
CurDAG->getTargetConstant(ShAmt, SDLoc(Node), XLenVT);
CurDAG->SelectNodeTo(Node, RISCV::SRLIW, XLenVT, Op0.getOperand(0),
assert(!Subtarget->is64Bit() && "READ_CYCLE_WIDE is only used on riscv32");
ReplaceNode(Node, CurDAG->getMachineNode(RISCV::ReadCycleWide, DL, MVT::i32,
MVT::i32, MVT::Other,
// Select the default instruction.
// Selects the address operand for an inline-asm memory constraint. The
// visible cases are Constraint_m and Constraint_A, which return false; the
// trailing return yields true.
// NOTE(review): the statements that fill OutOps are not visible here, and the
// meaning of the boolean result is not shown -- confirm.
bool RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand(
const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
switch (ConstraintID) {
case InlineAsm::Constraint_m:
// We just support simple memory operands that have a single address
// operand and need no special handling.
return false;
case InlineAsm::Constraint_A:
return false;
return true;
// Address selector for frame indices. When Addr is a FrameIndexSDNode, Base
// is set to the corresponding target frame index, typed with the subtarget's
// XLen value type, and the match succeeds. Any other node is rejected.
bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
if (auto FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), Subtarget->getXLenVT());
return true;
return false;
+// Match a SLOI (Shift Left Ones Immediate).
+//
+// The node has to look like
+//
+//   (OR (SHL RS1, VC2), VC1)
+//
+// with VC1 and VC2 both constants, and the constant VC1 that fills the
+// shifted-in bits with ones has to agree with the shift amount VC2:
+//
+//   VC1 == maskTrailingOnes<uint64_t>(VC2)   (uint32_t when XLen is 32)
+//
+// On success RS1 is the shifted operand and Shamt is VC2 rebuilt as a target
+// constant of the shift-amount operand's type. On failure both outputs are
+// left untouched.
+bool RISCVDAGToDAGISel::SelectSLOI(SDValue N, SDValue &RS1, SDValue &Shamt) {
+  if (N.getOpcode() != ISD::OR)
+    return false;
+
+  SDValue ShlNode = N.getOperand(0);
+  if (ShlNode.getOpcode() != ISD::SHL)
+    return false;
+
+  SDValue AmtOp = ShlNode.getOperand(1);
+  if (!isa<ConstantSDNode>(AmtOp) || !isa<ConstantSDNode>(N.getOperand(1)))
+    return false;
+
+  const MVT XLenVT = Subtarget->getXLenVT();
+  if (XLenVT == MVT::i64) {
+    const uint64_t OnesMask = N.getConstantOperandVal(1);
+    const uint64_t ShiftAmt = ShlNode.getConstantOperandVal(1);
+    if (OnesMask != maskTrailingOnes<uint64_t>(ShiftAmt))
+      return false;
+    RS1 = ShlNode.getOperand(0);
+    Shamt =
+        CurDAG->getTargetConstant(ShiftAmt, SDLoc(N), AmtOp.getValueType());
+    return true;
+  }
+  if (XLenVT == MVT::i32) {
+    // Both constants are compared after truncation to 32 bits.
+    const uint32_t OnesMask = N.getConstantOperandVal(1);
+    const uint32_t ShiftAmt = ShlNode.getConstantOperandVal(1);
+    if (OnesMask != maskTrailingOnes<uint32_t>(ShiftAmt))
+      return false;
+    RS1 = ShlNode.getOperand(0);
+    Shamt =
+        CurDAG->getTargetConstant(ShiftAmt, SDLoc(N), AmtOp.getValueType());
+    return true;
+  }
+  return false;
+}
+// Match a SROI (Shift Right Ones Immediate).
+//
+// The node has to look like
+//
+//   (OR (SRL RS1, VC2), VC1)
+//
+// with VC1 and VC2 both constants, and the constant VC1 that fills the
+// shifted-in bits with ones has to agree with the shift amount VC2:
+//
+//   VC1 == maskLeadingOnes<uint64_t>(VC2)   (uint32_t when XLen is 32)
+//
+// On success RS1 is the shifted operand and Shamt is VC2 rebuilt as a target
+// constant of the shift-amount operand's type. On failure both outputs are
+// left untouched.
+bool RISCVDAGToDAGISel::SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt) {
+  if (N.getOpcode() != ISD::OR)
+    return false;
+
+  SDValue SrlNode = N.getOperand(0);
+  if (SrlNode.getOpcode() != ISD::SRL)
+    return false;
+
+  SDValue AmtOp = SrlNode.getOperand(1);
+  if (!isa<ConstantSDNode>(AmtOp) || !isa<ConstantSDNode>(N.getOperand(1)))
+    return false;
+
+  const MVT XLenVT = Subtarget->getXLenVT();
+  if (XLenVT == MVT::i64) {
+    const uint64_t OnesMask = N.getConstantOperandVal(1);
+    const uint64_t ShiftAmt = SrlNode.getConstantOperandVal(1);
+    if (OnesMask != maskLeadingOnes<uint64_t>(ShiftAmt))
+      return false;
+    RS1 = SrlNode.getOperand(0);
+    Shamt =
+        CurDAG->getTargetConstant(ShiftAmt, SDLoc(N), AmtOp.getValueType());
+    return true;
+  }
+  if (XLenVT == MVT::i32) {
+    // Both constants are compared after truncation to 32 bits.
+    const uint32_t OnesMask = N.getConstantOperandVal(1);
+    const uint32_t ShiftAmt = SrlNode.getConstantOperandVal(1);
+    if (OnesMask != maskLeadingOnes<uint32_t>(ShiftAmt))
+      return false;
+    RS1 = SrlNode.getOperand(0);
+    Shamt =
+        CurDAG->getTargetConstant(ShiftAmt, SDLoc(N), AmtOp.getValueType());
+    return true;
+  }
+  return false;
+}
+// Check that it is a RORI (Rotate Right Immediate). We first check that
+// it is the right node tree:
+//
+//   (ROTL RS1, VC)
+//
+// The compiler translates immediate rotations to the right given by the call
+// to the rotateright32/rotateright64 intrinsics as rotations to the left.
+// Since a rotation to the left can be emulated as a rotation to the right by
+// the complementary amount, there is no encoding for ROLI. We therefore
+// select immediate left rotations as RORI with:
+//
+//   Shamt == (XLen - VC) mod XLen
+//
+// The reduction modulo XLen keeps Shamt inside [0, XLen) even when VC is zero
+// or a multiple of XLen; a plain XLen - VC would yield XLen in that case,
+// which is not a valid rotate amount.
+//
+// On success RS1 is the rotated operand and Shamt is a target constant of the
+// rotate-amount operand's type. On failure both outputs are left untouched.
+bool RISCVDAGToDAGISel::SelectRORI(SDValue N, SDValue &RS1, SDValue &Shamt) {
+  if (N.getOpcode() != ISD::ROTL || !isa<ConstantSDNode>(N.getOperand(1)))
+    return false;
+
+  MVT XLenVT = Subtarget->getXLenVT();
+  uint64_t XLen;
+  if (XLenVT == MVT::i64)
+    XLen = 64;
+  else if (XLenVT == MVT::i32)
+    XLen = 32;
+  else
+    return false;
+
+  // rotl(x, VC) == rotr(x, (XLen - VC) mod XLen).
+  uint64_t VC = N.getConstantOperandVal(1) % XLen;
+  uint64_t RotR = (XLen - VC) % XLen;
+  Shamt = CurDAG->getTargetConstant(RotR, SDLoc(N),
+                                    N.getOperand(1).getValueType());
+  RS1 = N.getOperand(0);
+  return true;
+}
+// Match a SLLIUW (Shift Logical Left Immediate Unsigned i32 on RV64).
+//
+// SLLIUW is the same as SLLI except for the fact that it clears the bits
+// XLEN-1:32 of the input RS1 before shifting.
+//
+// The node has to look like
+//
+//   (AND (SHL RS1, VC2), VC1)
+//
+// with VC1 and VC2 both constants. The shift amount VC2 has to be less than
+// 32, otherwise the pattern is exactly the same as SLLI and we give priority
+// to that. Finally, VC1, the mask used to clear the upper 32 bits of RS1,
+// has to be the shifted 32-bit all-ones value:
+//
+//   VC1 == (0xFFFFFFFF << VC2)
+//
+// On success RS1 is the shifted operand and Shamt is VC2 rebuilt as a target
+// constant of the shift-amount operand's type. On failure both outputs are
+// left untouched.
+bool RISCVDAGToDAGISel::SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt) {
+  if (N.getOpcode() != ISD::AND || Subtarget->getXLenVT() != MVT::i64)
+    return false;
+
+  SDValue ShlNode = N.getOperand(0);
+  if (ShlNode.getOpcode() != ISD::SHL)
+    return false;
+
+  SDValue AmtOp = ShlNode.getOperand(1);
+  if (!isa<ConstantSDNode>(AmtOp) || !isa<ConstantSDNode>(N.getOperand(1)))
+    return false;
+
+  const uint64_t ClearMask = N.getConstantOperandVal(1);
+  const uint64_t ShiftAmt = ShlNode.getConstantOperandVal(1);
+  if (ShiftAmt >= 32)
+    return false;
+  if (ClearMask != ((uint64_t)0xFFFFFFFF << ShiftAmt))
+    return false;
+
+  RS1 = ShlNode.getOperand(0);
+  Shamt = CurDAG->getTargetConstant(ShiftAmt, SDLoc(N), AmtOp.getValueType());
+  return true;
+}
+// Check that it is a SLOIW (Shift Left Ones Immediate i32 on RV64).
+// We first check that it is the right node tree:
+//
+//   (SIGN_EXTEND_INREG (OR (SHL RS1, VC2), VC1), i32)
+//
+// and then we check that VC2, the shamt, is a valid 32-bit shift amount and
+// that VC1, the mask used to fill with ones, is compatible with it:
+//
+//   VC2 < 32
+//   VC1 == maskTrailingOnes<uint32_t>(VC2)
+//
+// On success RS1 is the shifted operand and Shamt is VC2 rebuilt as a target
+// constant of the shift-amount operand's type. On failure both outputs are
+// left untouched.
+bool RISCVDAGToDAGISel::SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
+  // The opcode test must precede the cast<VTSDNode>, which is only valid on
+  // a SIGN_EXTEND_INREG node.
+  if (Subtarget->getXLenVT() != MVT::i64 ||
+      N.getOpcode() != ISD::SIGN_EXTEND_INREG ||
+      cast<VTSDNode>(N.getOperand(1))->getVT() != MVT::i32)
+    return false;
+
+  SDValue Or = N.getOperand(0);
+  if (Or.getOpcode() != ISD::OR)
+    return false;
+
+  SDValue Shl = Or.getOperand(0);
+  if (Shl.getOpcode() != ISD::SHL)
+    return false;
+
+  if (!isa<ConstantSDNode>(Shl.getOperand(1)) ||
+      !isa<ConstantSDNode>(Or.getOperand(1)))
+    return false;
+
+  // Only the low 32 bits of the OR constant survive the enclosing
+  // sign_extend_inreg, so the mask is compared after truncation.
+  uint32_t VC1 = Or.getConstantOperandVal(1);
+  uint64_t VC2 = Shl.getConstantOperandVal(1);
+
+  // Reject shift amounts that do not fit a 32-bit shift. Testing this first
+  // also keeps the argument of maskTrailingOnes<uint32_t> no larger than the
+  // width of its result type.
+  if (VC2 >= 32 || VC1 != maskTrailingOnes<uint32_t>(VC2))
+    return false;
+
+  RS1 = Shl.getOperand(0);
+  Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
+                                    Shl.getOperand(1).getValueType());
+  return true;
+}
+// Check that it is a SROIW (Shift Right Ones Immediate i32 on RV64).
+// We first check that it is the right node tree:
+//
+//   (OR (SRL RS1, VC2), VC1)
+//
+// and then we check that VC2, the shamt, is a non-zero 32-bit shift amount
+// and that VC1, the mask used to fill with ones, is compatible with it on
+// all 64 bits:
+//
+//   0 < VC2 < 32
+//   VC1 == maskLeadingOnes<uint64_t>(32 + VC2)
+//
+// The node is a full 64-bit OR with no enclosing sign_extend_inreg, while
+// SROIW produces a sign-extended 32-bit result whose bit 31 is one for any
+// non-zero shamt. The upper 32 bits of the mask therefore have to be all
+// ones as well; comparing only the low 32 bits of VC1 would accept nodes
+// whose upper half differs from what SROIW computes.
+//
+// On success RS1 is the shifted operand and Shamt is VC2 rebuilt as a target
+// constant of the shift-amount operand's type. On failure both outputs are
+// left untouched.
+bool RISCVDAGToDAGISel::SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
+  if (N.getOpcode() != ISD::OR || Subtarget->getXLenVT() != MVT::i64)
+    return false;
+
+  SDValue Or = N;
+  SDValue Srl = Or.getOperand(0);
+  if (Srl.getOpcode() != ISD::SRL)
+    return false;
+
+  if (!isa<ConstantSDNode>(Srl.getOperand(1)) ||
+      !isa<ConstantSDNode>(Or.getOperand(1)))
+    return false;
+
+  uint64_t VC1 = Or.getConstantOperandVal(1);
+  uint64_t VC2 = Srl.getConstantOperandVal(1);
+
+  // The range check comes first so that 32 + VC2 never exceeds 64 when it is
+  // handed to maskLeadingOnes<uint64_t>.
+  if (VC2 == 0 || VC2 >= 32 || VC1 != maskLeadingOnes<uint64_t>(32 + VC2))
+    return false;
+
+  RS1 = Srl.getOperand(0);
+  Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N),
+                                    Srl.getOperand(1).getValueType());
+  return true;
+}
+// Check that it is a RORIW (i32 Right Rotate Immediate on RV64).
+// We first check that it is the right node tree:
+//
+//   (SIGN_EXTEND_INREG (OR (SHL RS1, VC2),
+//                          (SRL (AND RS1, VC3), VC1)), i32)
+//
+// where the SHL and the AND read the very same value RS1 (otherwise the node
+// is a funnel shift of two different inputs, not a rotate). Then we check
+// that the constant operands respect these constraints:
+//
+//   VC1 < 32
+//   VC1 + VC2 == 32
+//   (VC3 | maskTrailingOnes<uint64_t>(VC1)) == 0xFFFFFFFF
+//
+// being VC1 the Shamt we need, VC2 the complementary of Shamt over 32 and
+// VC3 a mask that keeps bits 31:VC1 and clears bits 63:32 of RS1. The low
+// VC1 bits of VC3 are shifted out by the SRL and are therefore ignored.
+//
+// On success RS1 is the rotated operand and Shamt is VC1 rebuilt as a target
+// constant of the SRL amount operand's type. On failure both outputs are
+// left untouched.
+bool RISCVDAGToDAGISel::SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt) {
+  // The opcode test must precede the cast<VTSDNode>, which is only valid on
+  // a SIGN_EXTEND_INREG node.
+  if (N.getOpcode() != ISD::SIGN_EXTEND_INREG ||
+      Subtarget->getXLenVT() != MVT::i64 ||
+      cast<VTSDNode>(N.getOperand(1))->getVT() != MVT::i32)
+    return false;
+
+  SDValue Or = N.getOperand(0);
+  if (Or.getOpcode() != ISD::OR)
+    return false;
+
+  SDValue Shl = Or.getOperand(0);
+  SDValue Srl = Or.getOperand(1);
+  if (Shl.getOpcode() != ISD::SHL || Srl.getOpcode() != ISD::SRL)
+    return false;
+
+  SDValue And = Srl.getOperand(0);
+  if (And.getOpcode() != ISD::AND)
+    return false;
+
+  // A rotate has a single input: both halves must come from the same value.
+  if (Shl.getOperand(0) != And.getOperand(0))
+    return false;
+
+  if (!isa<ConstantSDNode>(Srl.getOperand(1)) ||
+      !isa<ConstantSDNode>(Shl.getOperand(1)) ||
+      !isa<ConstantSDNode>(And.getOperand(1)))
+    return false;
+
+  uint64_t VC1 = Srl.getConstantOperandVal(1);
+  uint64_t VC2 = Shl.getConstantOperandVal(1);
+  // Keep all 64 bits of the mask: set bits above bit 31 would let the upper
+  // half of RS1 leak into the result, which RORIW does not do.
+  uint64_t VC3 = And.getConstantOperandVal(1);
+
+  if (VC1 >= 32 || VC1 + VC2 != 32 ||
+      (VC3 | maskTrailingOnes<uint64_t>(VC1)) != 0xFFFFFFFF)
+    return false;
+
+  RS1 = Shl.getOperand(0);
+  Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N),
+                                    Srl.getOperand(1).getValueType());
+  return true;
+}
+// Check that it is a FSRIW (i32 Funnel Shift Right Immediate on RV64).
+// We first check that it is the right node tree:
+//
+//   (SIGN_EXTEND_INREG (OR (SHL RS1, VC2),
+//                          (SRL (AND RS2, VC3), VC1)), i32)
+//
+// Then we check that the constant operands respect these constraints:
+//
+//   VC1 < 32
+//   VC1 + VC2 == 32
+//   (VC3 | maskTrailingOnes<uint64_t>(VC1)) == 0xFFFFFFFF
+//
+// being VC1 the Shamt we need, VC2 the complementary of Shamt over 32 and
+// VC3 a mask that keeps bits 31:VC1 and clears bits 63:32 of RS2. The low
+// VC1 bits of VC3 are shifted out by the SRL and are therefore ignored.
+//
+// On success RS1 is the SHL input, RS2 is the AND input and Shamt is VC1
+// rebuilt as a target constant of the SRL amount operand's type. On failure
+// all outputs are left untouched.
+bool RISCVDAGToDAGISel::SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2,
+                                    SDValue &Shamt) {
+  // The opcode test must precede the cast<VTSDNode>, which is only valid on
+  // a SIGN_EXTEND_INREG node.
+  if (N.getOpcode() != ISD::SIGN_EXTEND_INREG ||
+      Subtarget->getXLenVT() != MVT::i64 ||
+      cast<VTSDNode>(N.getOperand(1))->getVT() != MVT::i32)
+    return false;
+
+  SDValue Or = N.getOperand(0);
+  if (Or.getOpcode() != ISD::OR)
+    return false;
+
+  SDValue Shl = Or.getOperand(0);
+  SDValue Srl = Or.getOperand(1);
+  if (Shl.getOpcode() != ISD::SHL || Srl.getOpcode() != ISD::SRL)
+    return false;
+
+  SDValue And = Srl.getOperand(0);
+  if (And.getOpcode() != ISD::AND)
+    return false;
+
+  if (!isa<ConstantSDNode>(Srl.getOperand(1)) ||
+      !isa<ConstantSDNode>(Shl.getOperand(1)) ||
+      !isa<ConstantSDNode>(And.getOperand(1)))
+    return false;
+
+  uint64_t VC1 = Srl.getConstantOperandVal(1);
+  uint64_t VC2 = Shl.getConstantOperandVal(1);
+  // Keep all 64 bits of the mask: set bits above bit 31 would let the upper
+  // half of RS2 leak into the result, which a 32-bit funnel shift does not
+  // do.
+  uint64_t VC3 = And.getConstantOperandVal(1);
+
+  if (VC1 >= 32 || VC1 + VC2 != 32 ||
+      (VC3 | maskTrailingOnes<uint64_t>(VC1)) != 0xFFFFFFFF)
+    return false;
+
+  RS1 = Shl.getOperand(0);
+  RS2 = And.getOperand(0);
+  Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N),
+                                    Srl.getOperand(1).getValueType());
+  return true;
+}
// Merge an ADDI into the offset of a load/store instruction where possible.
// (load (addi base, off1), off2) -> (load base, off1+off2)
// (store val, (addi base, off1), off2) -> (store val, base, off1+off2)
// This is possible when off1+off2 fits a 12-bit immediate.
void RISCVDAGToDAGISel::doPeepholeLoadStoreADDI() {
SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
while (Position != CurDAG->allnodes_begin()) {
SDNode *N = &*--Position;
// Skip dead nodes and any non-machine opcodes.
if (N->use_empty() || !N->isMachineOpcode())
int OffsetOpIdx;
int BaseOpIdx;
// Only attempt this optimisation for I-type loads and S-type stores.
switch (N->getMachineOpcode()) {
case RISCV::LB:
case RISCV::LH:
case RISCV::LW:
case RISCV::LBU:
case RISCV::LHU:
case RISCV::LWU:
case RISCV::LD:
case RISCV::FLW:
case RISCV::FLD:
BaseOpIdx = 0;
OffsetOpIdx = 1;
case RISCV::SB:
case RISCV::SH:
case RISCV::SW:
case RISCV::SD:
case RISCV::FSW:
case RISCV::FSD:
BaseOpIdx = 1;
OffsetOpIdx = 2;
if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)))
SDValue Base = N->getOperand(BaseOpIdx);
// If the base is an ADDI, we can merge it in to the load/store.
if (!Base.isMachineOpcode() || Base.getMachineOpcode() != RISCV::ADDI)
SDValue ImmOperand = Base.getOperand(1);
uint64_t Offset2 = N->getConstantOperandVal(OffsetOpIdx);
if (auto Const = dyn_cast<ConstantSDNode>(ImmOperand)) {
int64_t Offset1 = Const->getSExtValue();
int64_t CombinedOffset = Offset1 + Offset2;
if (!isInt<12>(CombinedOffset))
ImmOperand = CurDAG->getTargetConstant(CombinedOffset, SDLoc(ImmOperand),
} else if (auto GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) {
// If the off1 in (addi base, off1) is a global variable's address (its
// low part, really), then we can rely on the alignment of that variable
// to provide a margin of safety before off1 can overflow the 12 bits.
// Check if off2 falls within that margin; if so off1+off2 can't overflow.
const DataLayout &DL = CurDAG->getDataLayout();
Align Alignment = GA->getGlobal()->getPointerAlignment(DL);
if (Offset2 != 0 && Alignment <= Offset2)
int64_t Offset1 = GA->getOffset();
int64_t CombinedOffset = Offset1 + Offset2;
ImmOperand = CurDAG->getTargetGlobalAddress(
GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(),
CombinedOffset, GA->getTargetFlags());
} else if (auto CP = dyn_cast<ConstantPoolSDNode>(ImmOperand)) {
// Ditto.
Align Alignment = CP->getAlign();
if (Offset2 != 0 && Alignment <= Offset2)
int64_t Offset1 = CP->getOffset();
int64_t CombinedOffset = Offset1 + Offset2;
ImmOperand = CurDAG->getTargetConstantPool(
CP->getConstVal(), ImmOperand.getValueType(), CP->getAlign(),
CombinedOffset, CP->getTargetFlags());
} else {
LLVM_DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
LLVM_DEBUG(dbgs() << "\nN: ");
LLVM_DEBUG(dbgs() << "\n");
// Modify the offset operand of the load/store.
if (BaseOpIdx == 0) // Load
CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
else // Store
CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
ImmOperand, N->getOperand(3));
// The add-immediate may now be dead, in which case remove it.
if (Base.getNode()->use_empty())
// This pass converts a legalized DAG into a RISCV-specific DAG, ready
// for instruction scheduling.
FunctionPass *llvm::createRISCVISelDag(RISCVTargetMachine &TM) {
return new RISCVDAGToDAGISel(TM);
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index dcf733ec3675..0ca12510a230 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -1,56 +1,65 @@
//===---- RISCVISelDAGToDAG.h - A dag to dag inst selector for RISCV ------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file defines an instruction selector for the RISCV target.
#include "RISCV.h"
#include "RISCVTargetMachine.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
// RISCV-specific code to select RISCV machine instructions for
// SelectionDAG operations.
namespace llvm {
class RISCVDAGToDAGISel : public SelectionDAGISel {
const RISCVSubtarget *Subtarget = nullptr;
explicit RISCVDAGToDAGISel(RISCVTargetMachine &TargetMachine)
: SelectionDAGISel(TargetMachine) {}
StringRef getPassName() const override {
return "RISCV DAG->DAG Pattern Instruction Selection";
bool runOnMachineFunction(MachineFunction &MF) override {
Subtarget = &MF.getSubtarget<RISCVSubtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
void PostprocessISelDAG() override;
void Select(SDNode *Node) override;
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
bool SelectAddrFI(SDValue Addr, SDValue &Base);
+ bool SelectSLOI(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectRORI(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt);
+ bool SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2, SDValue &Shamt);
// Include the pieces autogenerated from the target description.
#include ""
void doPeepholeLoadStoreADDI();
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 91fc69b5bc10..03d9eefd59d0 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1,3019 +1,3034 @@
//===-- RISCVISelLowering.cpp - RISCV DAG Lowering Implementation --------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file defines the interfaces that RISCV uses to lower LLVM code into a
// selection DAG.
#include "RISCVISelLowering.h"
#include "RISCV.h"
#include "RISCVMachineFunctionInfo.h"
#include "RISCVRegisterInfo.h"
#include "RISCVSubtarget.h"
#include "RISCVTargetMachine.h"
#include "Utils/RISCVMatInt.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "riscv-lower"
STATISTIC(NumTailCalls, "Number of tail calls");
RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
const RISCVSubtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
if (Subtarget.isRV32E())
report_fatal_error("Codegen not yet implemented for RV32E");
RISCVABI::ABI ABI = Subtarget.getTargetABI();
assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI");
!Subtarget.hasStdExtF()) {
errs() << "Hard-float 'f' ABI can't be used for a target that "
"doesn't support the F instruction set extension (ignoring "
ABI = Subtarget.is64Bit() ? RISCVABI::ABI_LP64 : RISCVABI::ABI_ILP32;
} else if ((ABI == RISCVABI::ABI_ILP32D || ABI == RISCVABI::ABI_LP64D) &&
!Subtarget.hasStdExtD()) {
errs() << "Hard-float 'd' ABI can't be used for a target that "
"doesn't support the D instruction set extension (ignoring "
ABI = Subtarget.is64Bit() ? RISCVABI::ABI_LP64 : RISCVABI::ABI_ILP32;
switch (ABI) {
report_fatal_error("Don't know how to lower this ABI");
MVT XLenVT = Subtarget.getXLenVT();
// Set up the register classes.
addRegisterClass(XLenVT, &RISCV::GPRRegClass);
if (Subtarget.hasStdExtF())
addRegisterClass(MVT::f32, &RISCV::FPR32RegClass);
if (Subtarget.hasStdExtD())
addRegisterClass(MVT::f64, &RISCV::FPR64RegClass);
// Compute derived properties from the register classes.
setLoadExtAction(N, XLenVT, MVT::i1, Promote);
// TODO: add all necessary setOperationAction calls.
setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, XLenVT, Expand);
setOperationAction(ISD::SELECT, XLenVT, Custom);
setOperationAction(ISD::SELECT_CC, XLenVT, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
for (auto VT : {MVT::i1, MVT::i8, MVT::i16})
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::ADD, MVT::i32, Custom);
setOperationAction(ISD::SUB, MVT::i32, Custom);
setOperationAction(ISD::SHL, MVT::i32, Custom);
setOperationAction(ISD::SRA, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i32, Custom);
if (!Subtarget.hasStdExtM()) {
setOperationAction(ISD::MUL, XLenVT, Expand);
setOperationAction(ISD::MULHS, XLenVT, Expand);
setOperationAction(ISD::MULHU, XLenVT, Expand);
setOperationAction(ISD::SDIV, XLenVT, Expand);
setOperationAction(ISD::UDIV, XLenVT, Expand);
setOperationAction(ISD::SREM, XLenVT, Expand);
setOperationAction(ISD::UREM, XLenVT, Expand);
if (Subtarget.is64Bit() && Subtarget.hasStdExtM()) {
setOperationAction(ISD::MUL, MVT::i32, Custom);
setOperationAction(ISD::SDIV, MVT::i32, Custom);
setOperationAction(ISD::UDIV, MVT::i32, Custom);
setOperationAction(ISD::UREM, MVT::i32, Custom);
setOperationAction(ISD::SDIVREM, XLenVT, Expand);
setOperationAction(ISD::UDIVREM, XLenVT, Expand);
setOperationAction(ISD::SMUL_LOHI, XLenVT, Expand);
setOperationAction(ISD::UMUL_LOHI, XLenVT, Expand);
setOperationAction(ISD::SHL_PARTS, XLenVT, Custom);
setOperationAction(ISD::SRL_PARTS, XLenVT, Custom);
setOperationAction(ISD::SRA_PARTS, XLenVT, Custom);
- setOperationAction(ISD::ROTL, XLenVT, Expand);
- setOperationAction(ISD::ROTR, XLenVT, Expand);
- setOperationAction(ISD::BSWAP, XLenVT, Expand);
- setOperationAction(ISD::CTTZ, XLenVT, Expand);
- setOperationAction(ISD::CTLZ, XLenVT, Expand);
- setOperationAction(ISD::CTPOP, XLenVT, Expand);
+ if (!(Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp())) {
+ setOperationAction(ISD::ROTL, XLenVT, Expand);
+ setOperationAction(ISD::ROTR, XLenVT, Expand);
+ }
+ if (!Subtarget.hasStdExtZbp())
+ setOperationAction(ISD::BSWAP, XLenVT, Expand);
+ if (!Subtarget.hasStdExtZbb()) {
+ setOperationAction(ISD::CTTZ, XLenVT, Expand);
+ setOperationAction(ISD::CTLZ, XLenVT, Expand);
+ setOperationAction(ISD::CTPOP, XLenVT, Expand);
+ }
+ if (Subtarget.hasStdExtZbp())
+ setOperationAction(ISD::BITREVERSE, XLenVT, Legal);
+ if (Subtarget.hasStdExtZbt()) {
+ setOperationAction(ISD::FSHL, XLenVT, Legal);
+ setOperationAction(ISD::FSHR, XLenVT, Legal);
+ }
ISD::CondCode FPCCToExtend[] = {
ISD::NodeType FPOpToExtend[] = {
if (Subtarget.hasStdExtF()) {
setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
for (auto CC : FPCCToExtend)
setCondCodeAction(CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
for (auto Op : FPOpToExtend)
setOperationAction(Op, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
if (Subtarget.hasStdExtF() && Subtarget.is64Bit())
setOperationAction(ISD::BITCAST, MVT::i32, Custom);
if (Subtarget.hasStdExtD()) {
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
for (auto CC : FPCCToExtend)
setCondCodeAction(CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT, MVT::f64, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
for (auto Op : FPOpToExtend)
setOperationAction(Op, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
if (Subtarget.is64Bit() &&
!(Subtarget.hasStdExtD() || Subtarget.hasStdExtF())) {
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
setOperationAction(ISD::BlockAddress, XLenVT, Custom);
setOperationAction(ISD::ConstantPool, XLenVT, Custom);
setOperationAction(ISD::GlobalTLSAddress, XLenVT, Custom);
// TODO: On M-mode only targets, the cycle[h] CSR may not be present.
// Unfortunately this can't be determined just from the ISA naming string.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64,
Subtarget.is64Bit() ? Legal : Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
if (Subtarget.hasStdExtA()) {
} else {
// Function alignments.
const Align FunctionAlignment(Subtarget.hasStdExtC() ? 2 : 4);
// Effectively disable jump table generation.
// Jumps are expensive, compared to logic
// We can use any register for comparisons
EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT VT) const {
if (!VT.isVector())
return getPointerTy(DL);
return VT.changeVectorElementTypeToInteger();
bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
switch (Intrinsic) {
return false;
case Intrinsic::riscv_masked_atomicrmw_xchg_i32:
case Intrinsic::riscv_masked_atomicrmw_add_i32:
case Intrinsic::riscv_masked_atomicrmw_sub_i32:
case Intrinsic::riscv_masked_atomicrmw_nand_i32:
case Intrinsic::riscv_masked_atomicrmw_max_i32:
case Intrinsic::riscv_masked_atomicrmw_min_i32:
case Intrinsic::riscv_masked_atomicrmw_umax_i32:
case Intrinsic::riscv_masked_atomicrmw_umin_i32:
case Intrinsic::riscv_masked_cmpxchg_i32:
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = Align(4);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
return true;
bool RISCVTargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS,
Instruction *I) const {
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
// Require a 12-bit signed offset.
if (!isInt<12>(AM.BaseOffs))
return false;
switch (AM.Scale) {
case 0: // "r+i" or just "i", depending on HasBaseReg.
case 1:
if (!AM.HasBaseReg) // allow "r+i".
return false; // disallow "r+r" or "r+r+i".
return false;
return true;
// An integer-compare immediate is reported as legal exactly when it fits a
// signed 12-bit field.
bool RISCVTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return isInt<12>(Imm);
// An add immediate is reported as legal exactly when it fits a signed 12-bit
// field.
bool RISCVTargetLowering::isLegalAddImmediate(int64_t Imm) const {
return isInt<12>(Imm);
// On RV32, 64-bit integers are split into their high and low parts and held
// in two different registers, so the trunc is free since the low register can
// just be used.
// Consequently this IR-type overload answers true only for a 64-bit to 32-bit
// integer truncation on a subtarget that is not 64-bit; everything else,
// including any non-integer type, is reported as not free.
bool RISCVTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
// Never free on RV64 or when either side is not an integer type.
if (Subtarget.is64Bit() || !SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
return false;
unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
unsigned DestBits = DstTy->getPrimitiveSizeInBits();
return (SrcBits == 64 && DestBits == 32);
// Value-type overload of isTruncateFree: answers true only for a scalar
// 64-bit to 32-bit integer truncation on a subtarget that is not 64-bit.
bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
// Never free on RV64, for vectors, or when either side is not an integer.
if (Subtarget.is64Bit() || SrcVT.isVector() || DstVT.isVector() ||
!SrcVT.isInteger() || !DstVT.isInteger())
return false;
unsigned SrcBits = SrcVT.getSizeInBits();
unsigned DestBits = DstVT.getSizeInBits();
return (SrcBits == 64 && DestBits == 32);
// Reports whether zero-extending Val to VT2 is free. A load of an i8 or i16
// (or an i32 on a 64-bit subtarget) that is either non-extending or already
// zero-extending qualifies; every other value is deferred to the generic
// TargetLowering answer.
bool RISCVTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
// Zexts are free if they can be combined with a load.
if (auto *LD = dyn_cast<LoadSDNode>(Val)) {
EVT MemVT = LD->getMemoryVT();
if ((MemVT == MVT::i8 || MemVT == MVT::i16 ||
(Subtarget.is64Bit() && MemVT == MVT::i32)) &&
(LD->getExtensionType() == ISD::NON_EXTLOAD ||
LD->getExtensionType() == ISD::ZEXTLOAD))
return true;
return TargetLowering::isZExtFree(Val, VT2);
// Sign extension is reported as cheaper than zero extension only for the
// i32 -> i64 case on a 64-bit subtarget.
bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const {
return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64;
// Reports whether the floating-point immediate Imm of type VT is legal.
// The answer is false when the matching extension is missing (F for f32,
// D for f64) and for negative zero; otherwise only a zero value is accepted.
// ForCodeSize is not consulted in the visible body.
bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
if (VT == MVT::f32 && !Subtarget.hasStdExtF())
return false;
if (VT == MVT::f64 && !Subtarget.hasStdExtD())
return false;
// Negative zero is rejected explicitly before the isZero() test below.
if (Imm.isNegZero())
return false;
return Imm.isZero();
// True for f32 when the F extension is available and for f64 when the D
// extension is available; false for every other type.
bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
return (VT == MVT::f32 && Subtarget.hasStdExtF()) ||
(VT == MVT::f64 && Subtarget.hasStdExtD());
// Changes the condition code and swaps operands if necessary, so the SetCC
// operation matches one of the comparisons supported directly in the RISC-V
// ISA.
static void normaliseSetCC(SDValue &LHS, SDValue &RHS, ISD::CondCode &CC) {
switch (CC) {
case ISD::SETGT:
case ISD::SETLE:
CC = ISD::getSetCCSwappedOperands(CC);
std::swap(LHS, RHS);
// Return the RISC-V branch opcode that matches the given DAG integer
// condition code. The CondCode must be one of those supported by the RISC-V
// ISA (see normaliseSetCC).
static unsigned getBranchOpcodeForIntCondCode(ISD::CondCode CC) {
switch (CC) {
llvm_unreachable("Unsupported CondCode");
case ISD::SETEQ:
return RISCV::BEQ;
case ISD::SETNE:
return RISCV::BNE;
case ISD::SETLT:
return RISCV::BLT;
case ISD::SETGE:
return RISCV::BGE;
return RISCV::BLTU;
return RISCV::BGEU;
SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
report_fatal_error("unimplemented operand");
case ISD::GlobalAddress:
return lowerGlobalAddress(Op, DAG);
case ISD::BlockAddress:
return lowerBlockAddress(Op, DAG);
case ISD::ConstantPool:
return lowerConstantPool(Op, DAG);
case ISD::GlobalTLSAddress:
return lowerGlobalTLSAddress(Op, DAG);
return lowerSELECT(Op, DAG);
return lowerVASTART(Op, DAG);
return lowerFRAMEADDR(Op, DAG);
return lowerRETURNADDR(Op, DAG);
return lowerShiftLeftParts(Op, DAG);
return lowerShiftRightParts(Op, DAG, true);
return lowerShiftRightParts(Op, DAG, false);
case ISD::BITCAST: {
assert(Subtarget.is64Bit() && Subtarget.hasStdExtF() &&
"Unexpected custom legalisation");
SDLoc DL(Op);
SDValue Op0 = Op.getOperand(0);
if (Op.getValueType() != MVT::f32 || Op0.getValueType() != MVT::i32)
return SDValue();
SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
SDValue FPConv = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0);
return FPConv;
static SDValue getTargetNode(GlobalAddressSDNode *N, SDLoc DL, EVT Ty,
SelectionDAG &DAG, unsigned Flags) {
return DAG.getTargetGlobalAddress(N->getGlobal(), DL, Ty, 0, Flags);
static SDValue getTargetNode(BlockAddressSDNode *N, SDLoc DL, EVT Ty,
SelectionDAG &DAG, unsigned Flags) {
return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, N->getOffset(),
static SDValue getTargetNode(ConstantPoolSDNode *N, SDLoc DL, EVT Ty,
SelectionDAG &DAG, unsigned Flags) {
return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
N->getOffset(), Flags);
template <class NodeTy>
SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
bool IsLocal) const {
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
if (isPositionIndependent()) {
SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
if (IsLocal)
// Use PC-relative addressing to access the symbol. This generates the
// pattern (PseudoLLA sym), which expands to (addi (auipc %pcrel_hi(sym))
// %pcrel_lo(auipc)).
return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0);
// Use PC-relative addressing to access the GOT for this symbol, then load
// the address from the GOT. This generates the pattern (PseudoLA sym),
// which expands to (ld (addi (auipc %got_pcrel_hi(sym)) %pcrel_lo(auipc))).
return SDValue(DAG.getMachineNode(RISCV::PseudoLA, DL, Ty, Addr), 0);
switch (getTargetMachine().getCodeModel()) {
report_fatal_error("Unsupported code model for lowering");
case CodeModel::Small: {
// Generate a sequence for accessing addresses within the first 2 GiB of
// address space. This generates the pattern (addi (lui %hi(sym)) %lo(sym)).
SDValue AddrHi = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_HI);
SDValue AddrLo = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_LO);
SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0);
return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, AddrLo), 0);
case CodeModel::Medium: {
// Generate a sequence for accessing addresses within any 2GiB range within
// the address space. This generates the pattern (PseudoLLA sym), which
// expands to (addi (auipc %pcrel_hi(sym)) %pcrel_lo(auipc)).
SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0);
// Lowers a GlobalAddress node. The address of the global itself is produced
// by getAddr(), using shouldAssumeDSOLocal() to decide whether the symbol is
// treated as local; a non-zero offset carried by the node is then applied
// with a separate ADD of an XLen-typed constant.
SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT Ty = Op.getValueType();
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
int64_t Offset = N->getOffset();
MVT XLenVT = Subtarget.getXLenVT();
const GlobalValue *GV = N->getGlobal();
bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
SDValue Addr = getAddr(N, DAG, IsLocal);
// In order to maximise the opportunity for common subexpression elimination,
// emit a separate ADD node for the global address offset instead of folding
// it in the global address node. Later peephole optimisations may choose to
// fold it back in when profitable.
if (Offset != 0)
return DAG.getNode(ISD::ADD, DL, Ty, Addr,
DAG.getConstant(Offset, DL, XLenVT));
return Addr;
// Lowers a BlockAddress node by delegating to getAddr(). IsLocal is not
// passed here, so whatever default the declaration supplies is used.
SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
return getAddr(N, DAG);
// Lowers a ConstantPool node by delegating to getAddr(). IsLocal is not
// passed here, so whatever default the declaration supplies is used.
SDValue RISCVTargetLowering::lowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
return getAddr(N, DAG);
// Builds the address of a thread-local global for the static TLS models.
//   * UseGOT == true:  PseudoLA_TLS_IE loads the value from the GOT, which is
//                      then added to the thread pointer (X4).
//   * UseGOT == false: LUI %tprel_hi + PseudoAddTPRel (with tp) + ADDI
//                      %tprel_lo.
// The node's own offset is not applied here; all target global addresses are
// created with offset 0.
// NOTE(review): the closing brace of the `if (UseGOT)` block and the tail of
// the MNAdd expression are not visible here - confirm against the full file.
SDValue RISCVTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N,
SelectionDAG &DAG,
bool UseGOT) const {
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
const GlobalValue *GV = N->getGlobal();
MVT XLenVT = Subtarget.getXLenVT();
if (UseGOT) {
// Use PC-relative addressing to access the GOT for this TLS symbol, then
// load the address from the GOT and add the thread pointer. This generates
// the pattern (PseudoLA_TLS_IE sym), which expands to
// (ld (auipc %tls_ie_pcrel_hi(sym)) %pcrel_lo(auipc)).
SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
SDValue Load =
SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_IE, DL, Ty, Addr), 0);
// Add the thread pointer.
SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT);
return DAG.getNode(ISD::ADD, DL, Ty, Load, TPReg);
// Generate a sequence for accessing the address relative to the thread
// pointer, with the appropriate adjustment for the thread pointer offset.
// This generates the pattern
// (add (add_tprel (lui %tprel_hi(sym)) tp %tprel_add(sym)) %tprel_lo(sym))
SDValue AddrHi =
DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_HI);
SDValue AddrAdd =
DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_ADD);
SDValue AddrLo =
DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_LO);
SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0);
SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT);
SDValue MNAdd = SDValue(
DAG.getMachineNode(RISCV::PseudoAddTPRel, DL, Ty, MNHi, TPReg, AddrAdd),
return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNAdd, AddrLo), 0);
// Builds the address of a thread-local global for the dynamic TLS models.
// PseudoLA_TLS_GD materialises the GOT entry address, which is passed as the
// single argument of a call to __tls_get_addr. The first result of
// LowerCallTo() is returned.
// NOTE(review): the statements that append Entry to Args and start the
// CLI.set...() chain are not visible here, so the call setup below is
// incomplete as shown - confirm against the full file.
SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N,
SelectionDAG &DAG) const {
SDLoc DL(N);
EVT Ty = getPointerTy(DAG.getDataLayout());
// Integer type as wide as a pointer; used for both the argument and the
// return type of the call.
IntegerType *CallTy = Type::getIntNTy(*DAG.getContext(), Ty.getSizeInBits());
const GlobalValue *GV = N->getGlobal();
// Use a PC-relative addressing mode to access the global dynamic GOT address.
// This generates the pattern (PseudoLA_TLS_GD sym), which expands to
// (addi (auipc %tls_gd_pcrel_hi(sym)) %pcrel_lo(auipc)).
SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
SDValue Load =
SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_GD, DL, Ty, Addr), 0);
// Prepare argument list to generate call.
ArgListTy Args;
ArgListEntry Entry;
Entry.Node = Load;
Entry.Ty = CallTy;
// Setup call to __tls_get_addr.
TargetLowering::CallLoweringInfo CLI(DAG);
.setLibCallee(CallingConv::C, CallTy,
DAG.getExternalSymbol("__tls_get_addr", Ty),
return LowerCallTo(CLI).first;
// Lowers a GlobalTLSAddress node. The TLS model reported by the target
// machine selects the helper:
//   * LocalExec:                    getStaticTLSAddr(UseGOT=false)
//   * InitialExec:                  getStaticTLSAddr(UseGOT=true)
//   * LocalDynamic, GeneralDynamic: getDynamicTLSAddr()
// A non-zero offset is then applied with a separate ADD node.
// NOTE(review): no `break` statements are visible between the case arms here;
// as shown each arm would fall through into the next - confirm against the
// full file.
SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT Ty = Op.getValueType();
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
int64_t Offset = N->getOffset();
MVT XLenVT = Subtarget.getXLenVT();
TLSModel::Model Model = getTargetMachine().getTLSModel(N->getGlobal());
SDValue Addr;
switch (Model) {
case TLSModel::LocalExec:
Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/false);
case TLSModel::InitialExec:
Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/true);
case TLSModel::LocalDynamic:
case TLSModel::GeneralDynamic:
Addr = getDynamicTLSAddr(N, DAG);
// In order to maximise the opportunity for common subexpression elimination,
// emit a separate ADD node for the global address offset instead of folding
// it in the global address node. Later peephole optimisations may choose to
// fold it back in when profitable.
if (Offset != 0)
return DAG.getNode(ISD::ADD, DL, Ty, Addr,
DAG.getConstant(Offset, DL, XLenVT));
return Addr;
// Lowers ISD::SELECT to RISCVISD::SELECT_CC. When the result is XLenVT and
// the condition is a SETCC on XLenVT operands, the comparison is merged into
// the SELECT_CC node. Otherwise the condition value is compared against zero
// with SETNE. Both forms produce the value type of Op plus MVT::Glue.
// NOTE(review): the closing brace of the `if` block is not visible here.
SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue CondV = Op.getOperand(0);
SDValue TrueV = Op.getOperand(1);
SDValue FalseV = Op.getOperand(2);
SDLoc DL(Op);
MVT XLenVT = Subtarget.getXLenVT();
// If the result type is XLenVT and CondV is the output of a SETCC node
// which also operated on XLenVT inputs, then merge the SETCC node into the
// lowered RISCVISD::SELECT_CC to take advantage of the integer
// compare+branch instructions. i.e.:
// (select (setcc lhs, rhs, cc), truev, falsev)
// -> (riscvisd::select_cc lhs, rhs, cc, truev, falsev)
if (Op.getSimpleValueType() == XLenVT && CondV.getOpcode() == ISD::SETCC &&
CondV.getOperand(0).getSimpleValueType() == XLenVT) {
SDValue LHS = CondV.getOperand(0);
SDValue RHS = CondV.getOperand(1);
auto CC = cast<CondCodeSDNode>(CondV.getOperand(2));
ISD::CondCode CCVal = CC->get();
// normaliseSetCC receives LHS, RHS and CCVal and may update them before the
// condition code is turned into a constant operand.
normaliseSetCC(LHS, RHS, CCVal);
SDValue TargetCC = DAG.getConstant(CCVal, DL, XLenVT);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = {LHS, RHS, TargetCC, TrueV, FalseV};
return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops);
// Otherwise:
// (select condv, truev, falsev)
// -> (riscvisd::select_cc condv, zero, setne, truev, falsev)
SDValue Zero = DAG.getConstant(0, DL, XLenVT);
SDValue SetNE = DAG.getConstant(ISD::SETNE, DL, XLenVT);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = {CondV, Zero, SetNE, TrueV, FalseV};
return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops);
// Lowers ISD::VASTART: stores the frame index of the varargs save area
// (from RISCVMachineFunctionInfo) to the address in operand 1, chained on
// operand 0.
// NOTE(review): the trailing arguments of the getFrameIndex and getStore
// calls are not visible here - confirm against the full file.
SDValue RISCVTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
RISCVMachineFunctionInfo *FuncInfo = MF.getInfo<RISCVMachineFunctionInfo>();
SDLoc DL(Op);
SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
// Lowers ISD::FRAMEADDR. Starts from the current frame register and, once
// per requested depth level, loads the next frame address from
// FrameAddr - 2 * (XLEN / 8).
// NOTE(review): MFI is declared but no use of it is visible here, and the
// closing brace of the loop is not visible either; a statement may have been
// dropped - confirm against the full file.
SDValue RISCVTargetLowering::lowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
Register FrameReg = RI.getFrameRegister(MF);
int XLenInBytes = Subtarget.getXLen() / 8;
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, VT);
// Operand 0 is the constant number of frames to walk up.
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
while (Depth--) {
// The loaded slot sits two XLEN-sized words below the frame address.
int Offset = -(XLenInBytes * 2);
SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
DAG.getIntPtrConstant(Offset, DL));
FrameAddr =
DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
return FrameAddr;
// Lowers ISD::RETURNADDR. Returns an empty SDValue when
// verifyReturnAddressArgumentIsConstant() reports a problem. For a non-zero
// depth the return address is loaded from (frame address - XLEN/8), where
// the frame address comes from lowerFRAMEADDR(). For depth zero the return
// address register is added as a live-in and copied.
// NOTE(review): MFI is declared but not used in the visible lines, and the
// tail of the getLoad call plus the closing brace of `if (Depth)` are not
// visible - confirm against the full file.
SDValue RISCVTargetLowering::lowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MVT XLenVT = Subtarget.getXLenVT();
int XLenInBytes = Subtarget.getXLen() / 8;
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
EVT VT = Op.getValueType();
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
if (Depth) {
// The loaded slot sits one XLEN-sized word below the frame address.
int Off = -XLenInBytes;
SDValue FrameAddr = lowerFRAMEADDR(Op, DAG);
SDValue Offset = DAG.getConstant(Off, DL, VT);
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
// Return the value of the return address register, marking it an implicit
// live-in.
Register Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(XLenVT));
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, XLenVT);
// Expands a double-width left shift whose value is given as (Lo, Hi) parts
// in operands 0 and 1, with the shift amount in operand 2. Both the
// "Shamt < XLEN" and "Shamt >= XLEN" results are computed, and a SELECT on
// (Shamt - XLEN < 0) chooses between them. The new Lo and Hi are returned as
// merged values.
SDValue RISCVTargetLowering::lowerShiftLeftParts(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Shamt = Op.getOperand(2);
EVT VT = Lo.getValueType();
// if Shamt-XLEN < 0: // Shamt < XLEN
// Lo = Lo << Shamt
// Hi = (Hi << Shamt) | ((Lo >>u 1) >>u (XLEN-1 - Shamt))
// else:
// Lo = 0
// Hi = Lo << (Shamt-XLEN)
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue One = DAG.getConstant(1, DL, VT);
SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT);
SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT);
SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen);
SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt);
SDValue LoTrue = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt);
// The bits carried from Lo into Hi are produced as (Lo >>u 1) >>u
// (XLEN-1 - Shamt), i.e. in two steps, matching the formula above.
SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo, One);
SDValue ShiftRightLo =
DAG.getNode(ISD::SRL, DL, VT, ShiftRight1Lo, XLenMinus1Shamt);
SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, Hi, Shamt);
SDValue HiTrue = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo);
SDValue HiFalse = DAG.getNode(ISD::SHL, DL, VT, Lo, ShamtMinusXLen);
// CC is true when Shamt < XLEN.
SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT);
Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, Zero);
Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse);
SDValue Parts[2] = {Lo, Hi};
return DAG.getMergeValues(Parts, DL);
// Expands a double-width right shift whose value is given as (Lo, Hi) parts
// in operands 0 and 1, with the shift amount in operand 2. IsSRA selects an
// arithmetic (SRA) versus a logical (SRL) shift for the high part. Both the
// "Shamt < XLEN" and "Shamt >= XLEN" results are computed, and a SELECT on
// (Shamt - XLEN < 0) chooses between them. The new Lo and Hi are returned as
// merged values.
SDValue RISCVTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
bool IsSRA) const {
SDLoc DL(Op);
SDValue Lo = Op.getOperand(0);
SDValue Hi = Op.getOperand(1);
SDValue Shamt = Op.getOperand(2);
EVT VT = Lo.getValueType();
// SRA expansion:
// if Shamt-XLEN < 0: // Shamt < XLEN
// Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt))
// Hi = Hi >>s Shamt
// else:
// Lo = Hi >>s (Shamt-XLEN);
// Hi = Hi >>s (XLEN-1)
// SRL expansion:
// if Shamt-XLEN < 0: // Shamt < XLEN
// Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt))
// Hi = Hi >>u Shamt
// else:
// Lo = Hi >>u (Shamt-XLEN);
// Hi = 0;
unsigned ShiftRightOp = IsSRA ? ISD::SRA : ISD::SRL;
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue One = DAG.getConstant(1, DL, VT);
SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT);
SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT);
SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen);
SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt);
SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, Lo, Shamt);
// The bits carried from Hi into Lo are produced as (Hi << 1) <<
// (XLEN-1 - Shamt), i.e. in two steps, matching the formula above.
SDValue ShiftLeftHi1 = DAG.getNode(ISD::SHL, DL, VT, Hi, One);
SDValue ShiftLeftHi =
DAG.getNode(ISD::SHL, DL, VT, ShiftLeftHi1, XLenMinus1Shamt);
SDValue LoTrue = DAG.getNode(ISD::OR, DL, VT, ShiftRightLo, ShiftLeftHi);
SDValue HiTrue = DAG.getNode(ShiftRightOp, DL, VT, Hi, Shamt);
SDValue LoFalse = DAG.getNode(ShiftRightOp, DL, VT, Hi, ShamtMinusXLen);
SDValue HiFalse =
IsSRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, XLenMinus1) : Zero;
// CC is true when Shamt < XLEN.
SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT);
Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, LoFalse);
Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse);
SDValue Parts[2] = {Lo, Hi};
return DAG.getMergeValues(Parts, DL);
// Custom-lowers chainless intrinsics. Operand 0 holds the intrinsic ID.
// Intrinsic::thread_pointer becomes a read of register X4 with pointer type;
// the `return SDValue()` path leaves an intrinsic to default handling.
// NOTE(review): the `return SDValue()` directly after `switch` has no visible
// case/default label here; presumably it is the default arm - confirm.
SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc DL(Op);
switch (IntNo) {
return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getRegister(RISCV::X4, PtrVT);
// Returns the opcode of the target-specific SDNode that implements the 32-bit
// form of the given Opcode.
// The visible case labels are SHL, SRA, SRL, SDIV, UDIV and UREM; any other
// opcode is expected to reach llvm_unreachable.
// NOTE(review): the per-case return statements and the label guarding
// llvm_unreachable are not visible here, so the mapping itself cannot be
// read from this extract - confirm against the full file.
static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
switch (Opcode) {
llvm_unreachable("Unexpected opcode");
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::SDIV:
case ISD::UDIV:
case ISD::UREM:
// Converts the given 32-bit operation to a target-specific SelectionDAG node.
// Because i32 isn't a legal type for RV64, these operations would otherwise
// be promoted to i64, making it difficult to select the SLLW/DIVUW/.../*W
// instructions later on, because the fact that the operation was originally
// of type i32 is lost.
// Both operands are any-extended to i64, the node returned by
// getRISCVWOpcode() is built on them, and the result is truncated to i32.
static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
RISCVISD::NodeType WOpcode = getRISCVWOpcode(N->getOpcode());
SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, NewOp1);
// ReplaceNodeResults requires we maintain the same type for the return value.
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes);
// Converts the given 32-bit operation to an i64 operation with sign-extension
// semantics, to reduce the number of sign-extension instructions.
// Both operands are any-extended to i64, the original opcode is re-issued at
// i64, the result goes through SIGN_EXTEND_INREG and is truncated to i32.
// NOTE(review): the final argument of the SIGN_EXTEND_INREG node is not
// visible here - confirm against the full file.
static SDValue customLegalizeToWOpWithSExt(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
SDValue NewWOp = DAG.getNode(N->getOpcode(), DL, MVT::i64, NewOp0, NewOp1);
SDValue NewRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, NewWOp,
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes);
// Custom type legalisation hook: pushes replacement values for the results
// of N onto Results. The visible arms cover:
//   * FP-to-integer conversions (i32 result on RV64), lowered to a libcall;
//   * a 64-bit cycle-counter read on RV32 via RISCVISD::READ_CYCLE_WIDE;
//   * i32 ADD/SUB/MUL on RV64 via customLegalizeToWOpWithSExt();
//   * i32 shifts and SDIV/UDIV/UREM on RV64 via customLegalizeToWOp();
//   * an i32 BITCAST on RV64 with the F extension.
// NOTE(review): many lines are missing from this extract (several case
// labels, `else`, `return`/`break` statements, closing braces and parts of
// expressions), so the visible control flow is not the real control flow -
// confirm every arm against the full file.
void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
SDLoc DL(N);
switch (N->getOpcode()) {
llvm_unreachable("Don't know how to custom type legalize this operation!");
case ISD::FP_TO_UINT: {
bool IsStrict = N->isStrictFPOpcode();
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
// Strict FP nodes carry the chain in operand 0 and the value in operand 1.
SDValue Op0 = IsStrict ? N->getOperand(1) : N->getOperand(0);
RTLIB::Libcall LC;
if (N->getOpcode() == ISD::FP_TO_SINT ||
N->getOpcode() == ISD::STRICT_FP_TO_SINT)
LC = RTLIB::getFPTOSINT(Op0.getValueType(), N->getValueType(0));
LC = RTLIB::getFPTOUINT(Op0.getValueType(), N->getValueType(0));
MakeLibCallOptions CallOptions;
EVT OpVT = Op0.getValueType();
CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
SDValue Result;
std::tie(Result, Chain) =
makeLibCall(DAG, LC, N->getValueType(0), Op0, CallOptions, DL, Chain);
if (IsStrict)
assert(!Subtarget.is64Bit() &&
"READCYCLECOUNTER only has custom type legalization on riscv32");
// READ_CYCLE_WIDE yields two i32 values plus a chain.
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue RCW =
DAG.getNode(RISCVISD::READ_CYCLE_WIDE, DL, VTs, N->getOperand(0));
DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, RCW, RCW.getValue(1)));
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
if (N->getOperand(1).getOpcode() == ISD::Constant)
Results.push_back(customLegalizeToWOpWithSExt(N, DAG));
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
if (N->getOperand(1).getOpcode() == ISD::Constant)
Results.push_back(customLegalizeToWOp(N, DAG));
case ISD::SDIV:
case ISD::UDIV:
case ISD::UREM:
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
Subtarget.hasStdExtM() && "Unexpected custom legalisation");
if (N->getOperand(0).getOpcode() == ISD::Constant ||
N->getOperand(1).getOpcode() == ISD::Constant)
Results.push_back(customLegalizeToWOp(N, DAG));
case ISD::BITCAST: {
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
Subtarget.hasStdExtF() && "Unexpected custom legalisation");
SDLoc DL(N);
SDValue Op0 = N->getOperand(0);
if (Op0.getValueType() != MVT::f32)
SDValue FPConv =
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPConv));
// Target-specific DAG combines. The visible arms cover:
//   * RISCVISD::SplitF64: forwards BuildPairF64 operands, splits an FP
//     constant into two i32 constants, and folds FNEG/FABS into an XOR/AND
//     on the high half;
//   * a node that reads only the low 32 bits of its LHS and the low 5 bits of
//     its RHS, simplified through SimplifyDemandedBits;
//   * a node whose FMV_W_X_RV64 input is replaced by an ANY_EXTEND, and whose
//     FNEG/FABS input is folded into a 64-bit XOR/AND.
// Returns an empty SDValue when no combine applies.
// NOTE(review): several case labels, the second half of two `if` conditions,
// the definition of NewFMV, and closing braces/breaks are missing from this
// extract - confirm against the full file.
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
case RISCVISD::SplitF64: {
SDValue Op0 = N->getOperand(0);
// If the input to SplitF64 is just BuildPairF64 then the operation is
// redundant. Instead, use BuildPairF64's operands directly.
if (Op0->getOpcode() == RISCVISD::BuildPairF64)
return DCI.CombineTo(N, Op0.getOperand(0), Op0.getOperand(1));
SDLoc DL(N);
// It's cheaper to materialise two 32-bit integers than to load a double
// from the constant pool and transfer it to integer registers through the
// stack.
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op0)) {
APInt V = C->getValueAPF().bitcastToAPInt();
SDValue Lo = DAG.getConstant(V.trunc(32), DL, MVT::i32);
SDValue Hi = DAG.getConstant(V.lshr(32).trunc(32), DL, MVT::i32);
return DCI.CombineTo(N, Lo, Hi);
// This is a target-specific version of a DAGCombine performed in
// DAGCombiner::visitBITCAST. It performs the equivalent of:
// fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
// fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) ||
SDValue NewSplitF64 =
DAG.getNode(RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32),
SDValue Lo = NewSplitF64.getValue(0);
SDValue Hi = NewSplitF64.getValue(1);
// Bit 31 of the high half is the sign bit of the f64 value.
APInt SignBit = APInt::getSignMask(32);
if (Op0.getOpcode() == ISD::FNEG) {
SDValue NewHi = DAG.getNode(ISD::XOR, DL, MVT::i32, Hi,
DAG.getConstant(SignBit, DL, MVT::i32));
return DCI.CombineTo(N, Lo, NewHi);
assert(Op0.getOpcode() == ISD::FABS);
SDValue NewHi = DAG.getNode(ISD::AND, DL, MVT::i32, Hi,
DAG.getConstant(~SignBit, DL, MVT::i32));
return DCI.CombineTo(N, Lo, NewHi);
// Only the lower 32 bits of LHS and lower 5 bits of RHS are read.
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 5);
if ((SimplifyDemandedBits(N->getOperand(0), LHSMask, DCI)) ||
(SimplifyDemandedBits(N->getOperand(1), RHSMask, DCI)))
return SDValue();
SDLoc DL(N);
SDValue Op0 = N->getOperand(0);
// If the input to FMV_X_ANYEXTW_RV64 is just FMV_W_X_RV64 then the
// conversion is unnecessary and can be replaced with an ANY_EXTEND
// of the FMV_W_X_RV64 operand.
if (Op0->getOpcode() == RISCVISD::FMV_W_X_RV64) {
SDValue AExtOp =
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0.getOperand(0));
return DCI.CombineTo(N, AExtOp);
// This is a target-specific version of a DAGCombine performed in
// DAGCombiner::visitBITCAST. It performs the equivalent of:
// fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
// fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) ||
// The 32-bit sign mask is sign-extended to 64 bits, so the mask also covers
// bits 63..32 of the i64 value.
APInt SignBit = APInt::getSignMask(32).sext(64);
if (Op0.getOpcode() == ISD::FNEG) {
return DCI.CombineTo(N,
DAG.getNode(ISD::XOR, DL, MVT::i64, NewFMV,
DAG.getConstant(SignBit, DL, MVT::i64)));
assert(Op0.getOpcode() == ISD::FABS);
return DCI.CombineTo(N,
DAG.getNode(ISD::AND, DL, MVT::i64, NewFMV,
DAG.getConstant(~SignBit, DL, MVT::i64)));
return SDValue();
// Decides whether a shift N should be commuted with its ADD/OR operand. For
// a scalar integer (shl (add|or x, c1), c2) with constant c1 and c2 it
// compares the cost of materialising c1 against c1 << c2:
//   * returns true when the shifted constant is accepted by the first check;
//   * returns false when c1 itself is accepted by the second check;
//   * otherwise returns false only if c1 is strictly cheaper to materialise
//     according to RISCVMatInt::getIntMatCost.
// All other inputs return true.
// NOTE(review): the second operand of both `&&` conditions and the tail of
// the C1Cost initialiser are not visible here; per the adjacent comments they
// test whether the value fits an add immediate - confirm against the full
// file.
bool RISCVTargetLowering::isDesirableToCommuteWithShift(
const SDNode *N, CombineLevel Level) const {
// The following folds are only desirable if `(OP _, c1 << c2)` can be
// materialised in fewer instructions than `(OP _, c1)`:
// (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
// (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
SDValue N0 = N->getOperand(0);
EVT Ty = N0.getValueType();
if (Ty.isScalarInteger() &&
(N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR)) {
auto *C1 = dyn_cast<ConstantSDNode>(N0->getOperand(1));
auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (C1 && C2) {
APInt C1Int = C1->getAPIntValue();
APInt ShiftedC1Int = C1Int << C2->getAPIntValue();
// We can materialise `c1 << c2` into an add immediate, so it's "free",
// and the combine should happen, to potentially allow further combines
// later.
if (ShiftedC1Int.getMinSignedBits() <= 64 &&
return true;
// We can materialise `c1` in an add immediate, so it's "free", and the
// combine should be prevented.
if (C1Int.getMinSignedBits() <= 64 &&
return false;
// Neither constant will fit into an immediate, so find materialisation
// costs.
int C1Cost = RISCVMatInt::getIntMatCost(C1Int, Ty.getSizeInBits(),
int ShiftedC1Cost = RISCVMatInt::getIntMatCost(
ShiftedC1Int, Ty.getSizeInBits(), Subtarget.is64Bit());
// Materialising `c1` is cheaper than materialising `c1 << c2`, so the
// combine should be prevented.
if (C1Cost < ShiftedC1Cost)
return false;
return true;
// Reports the number of known sign bits for target-specific nodes. Nodes
// handled by the switch report 33 sign bits; everything else reports the
// conservative answer of 1.
// NOTE(review): the case labels of the switch are not visible here, so the
// set of opcodes that return 33 cannot be read from this extract; the TODO
// mentions SRAW - confirm against the full file.
unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
unsigned Depth) const {
switch (Op.getOpcode()) {
// TODO: As the result is sign-extended, this is conservatively correct. A
// more precise answer could be calculated for SRAW depending on known
// bits in the shift amount.
return 33;
return 1;
// Expands the ReadCycleWide pseudo into a retry loop. Two new blocks are
// created after BB: LoopMBB, which receives three CSRRS reads (into HiReg,
// LoReg and a fresh virtual register) followed by a BNE, and DoneMBB, which
// receives everything that followed MI in BB. Returns DoneMBB.
// NOTE(review): the operand lists of the BuildMI calls, the successor/CFG
// updates and the erasure of MI are not visible here - confirm against the
// full file.
static MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
assert(MI.getOpcode() == RISCV::ReadCycleWide && "Unexpected instruction");
// To read the 64-bit cycle CSR on a 32-bit target, we read the two halves.
// Should the count have wrapped while it was being read, we need to try
// again.
// ...
// read:
// rdcycleh x3 # load high word of cycle
// rdcycle x2 # load low word of cycle
// rdcycleh x4 # load high word of cycle
// bne x3, x4, read # check if high word reads match, otherwise try again
// ...
MachineFunction &MF = *BB->getParent();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
// Insert both new blocks immediately after BB in function order.
MachineFunction::iterator It = ++BB->getIterator();
MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
MF.insert(It, LoopMBB);
MachineBasicBlock *DoneMBB = MF.CreateMachineBasicBlock(LLVM_BB);
MF.insert(It, DoneMBB);
// Transfer the remainder of BB and its successor edges to DoneMBB.
DoneMBB->splice(DoneMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
MachineRegisterInfo &RegInfo = MF.getRegInfo();
// Scratch register for the second read of the high word.
Register ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
Register LoReg = MI.getOperand(0).getReg();
Register HiReg = MI.getOperand(1).getReg();
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), HiReg)
BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), LoReg)
BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), ReadAgainReg)
BuildMI(LoopMBB, DL, TII->get(RISCV::BNE))
return DoneMBB;
// Expands SplitF64Pseudo: the FPR64 source register (operand 2) is stored to
// the function's MoveF64 stack slot, and the two 32-bit halves are reloaded
// with LW into LoReg (operand 0) and HiReg (operand 1). The pseudo is then
// erased and BB is returned unchanged.
// NOTE(review): the final argument of storeRegToStackSlot and the operand
// lists of the two LW instructions are not visible here - confirm against
// the full file.
static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
assert(MI.getOpcode() == RISCV::SplitF64Pseudo && "Unexpected instruction");
MachineFunction &MF = *BB->getParent();
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
Register LoReg = MI.getOperand(0).getReg();
Register HiReg = MI.getOperand(1).getReg();
Register SrcReg = MI.getOperand(2).getReg();
const TargetRegisterClass *SrcRC = &RISCV::FPR64RegClass;
// Stack slot used for moving f64 values between register files.
int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(MF);
TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC,
// Memory operand describing a load from the fixed stack slot (size 8,
// alignment 8).
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
MachineMemOperand::MOLoad, 8, Align(8));
BuildMI(*BB, MI, DL, TII.get(RISCV::LW), LoReg)
BuildMI(*BB, MI, DL, TII.get(RISCV::LW), HiReg)
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
// Expands BuildPairF64Pseudo: LoReg (operand 1) and HiReg (operand 2) are
// stored with SW to the function's MoveF64 stack slot, and the combined
// value is reloaded into the FPR64 destination register (operand 0). The
// pseudo is then erased and BB is returned unchanged.
// NOTE(review): the address/offset/memory operands of the two SW
// instructions are not visible here - confirm against the full file.
static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
assert(MI.getOpcode() == RISCV::BuildPairF64Pseudo &&
"Unexpected instruction");
MachineFunction &MF = *BB->getParent();
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
Register DstReg = MI.getOperand(0).getReg();
Register LoReg = MI.getOperand(1).getReg();
Register HiReg = MI.getOperand(2).getReg();
const TargetRegisterClass *DstRC = &RISCV::FPR64RegClass;
// Stack slot used for moving f64 values between register files.
int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(MF);
// Memory operand describing a store to the fixed stack slot (size 8,
// alignment 8).
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
MachineMemOperand::MOStore, 8, Align(8));
// The kill flags of the pseudo's source operands are carried over to the
// stores.
BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
.addReg(LoReg, getKillRegState(MI.getOperand(1).isKill()))
BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
.addReg(HiReg, getKillRegState(MI.getOperand(2).isKill()))
TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
// Returns true when MI is one of the Select_*_Using_CC_GPR pseudo
// instructions (GPR, FPR32 or FPR64 result), and false otherwise.
// NOTE(review): the `return false` directly after `switch` has no visible
// case/default label here; presumably it is the default arm - confirm.
static bool isSelectPseudo(MachineInstr &MI) {
switch (MI.getOpcode()) {
return false;
case RISCV::Select_GPR_Using_CC_GPR:
case RISCV::Select_FPR32_Using_CC_GPR:
case RISCV::Select_FPR64_Using_CC_GPR:
return true;
// Expands one or more Select_* pseudo instructions into a triangle of basic
// blocks (HeadMBB -> IfFalseMBB -> TailMBB, plus HeadMBB -> TailMBB). A
// conditional branch is emitted at the end of HeadMBB and each select becomes
// a PHI at the start of TailMBB. Consecutive selects sharing the same LHS,
// RHS and condition code are handled together. Returns TailMBB.
// NOTE(review): many statements are missing from this extract (the bodies of
// the scan loop's branches, the successor updates, the branch and PHI
// operand lists, and closing braces) - confirm against the full file.
static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
MachineBasicBlock *BB) {
// To "insert" Select_* instructions, we actually have to insert the triangle
// control-flow pattern. The incoming instructions know the destination vreg
// to set, the condition code register to branch on, the true/false values to
// select between, and the condcode to use to select the appropriate branch.
// We produce the following control flow:
// HeadMBB
// | \
// | IfFalseMBB
// | /
// TailMBB
// When we find a sequence of selects we attempt to optimize their emission
// by sharing the control flow. Currently we only handle cases where we have
// multiple selects with the exact same condition (same LHS, RHS and CC).
// The selects may be interleaved with other instructions if the other
// instructions meet some requirements we deem safe:
// - They are debug instructions. Otherwise,
// - They do not have side-effects, do not access memory and their inputs do
// not depend on the results of the select pseudo-instructions.
// The TrueV/FalseV operands of the selects cannot depend on the result of
// previous selects in the sequence.
// These conditions could be further relaxed. See the X86 target for a
// related approach and more information.
// Operands 1-3 of the select pseudo are LHS, RHS and the condition code.
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
auto CC = static_cast<ISD::CondCode>(MI.getOperand(3).getImm());
SmallVector<MachineInstr *, 4> SelectDebugValues;
SmallSet<Register, 4> SelectDests;
MachineInstr *LastSelectPseudo = &MI;
// Scan forward from MI to find the last select that can share the control
// flow.
for (auto E = BB->end(), SequenceMBBI = MachineBasicBlock::iterator(MI);
SequenceMBBI != E; ++SequenceMBBI) {
if (SequenceMBBI->isDebugInstr())
else if (isSelectPseudo(*SequenceMBBI)) {
if (SequenceMBBI->getOperand(1).getReg() != LHS ||
SequenceMBBI->getOperand(2).getReg() != RHS ||
SequenceMBBI->getOperand(3).getImm() != CC ||
SelectDests.count(SequenceMBBI->getOperand(4).getReg()) ||
LastSelectPseudo = &*SequenceMBBI;
} else {
if (SequenceMBBI->hasUnmodeledSideEffects() ||
if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) {
return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg());
const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator I = ++BB->getIterator();
MachineBasicBlock *HeadMBB = BB;
MachineFunction *F = BB->getParent();
MachineBasicBlock *TailMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *IfFalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
// Function order after insertion: HeadMBB, IfFalseMBB, TailMBB.
F->insert(I, IfFalseMBB);
F->insert(I, TailMBB);
// Transfer debug instructions associated with the selects to TailMBB.
for (MachineInstr *DebugInstr : SelectDebugValues) {
// Move all instructions after the sequence to TailMBB.
TailMBB->splice(TailMBB->end(), HeadMBB,
std::next(LastSelectPseudo->getIterator()), HeadMBB->end());
// Update machine-CFG edges by transferring all successors of the current
// block to the new block which will contain the Phi nodes for the selects.
// Set the successors for HeadMBB.
// Insert appropriate branch.
unsigned Opcode = getBranchOpcodeForIntCondCode(CC);
BuildMI(HeadMBB, DL, TII.get(Opcode))
// IfFalseMBB just falls through to TailMBB.
// Create PHIs for all of the select pseudo-instructions.
auto SelectMBBI = MI.getIterator();
auto SelectEnd = std::next(LastSelectPseudo->getIterator());
auto InsertionPoint = TailMBB->begin();
while (SelectMBBI != SelectEnd) {
// The successor is captured before the current instruction is processed.
auto Next = std::next(SelectMBBI);
if (isSelectPseudo(*SelectMBBI)) {
// %Result = phi [ %TrueValue, HeadMBB ], [ %FalseValue, IfFalseMBB ]
BuildMI(*TailMBB, InsertionPoint, SelectMBBI->getDebugLoc(),
TII.get(RISCV::PHI), SelectMBBI->getOperand(0).getReg())
SelectMBBI = Next;
return TailMBB;
// Dispatches pseudo instructions that need custom insertion to their
// expansion helper: ReadCycleWide, the Select_*_Using_CC_GPR family,
// BuildPairF64Pseudo and SplitF64Pseudo. Returns the block in which
// insertion should continue, as reported by the helper.
// NOTE(review): the llvm_unreachable directly after `switch` has no visible
// case/default label here; presumably it is the default arm - confirm.
// NOTE(review): the assertion message below says "ReadCycleWrite" while the
// opcode is ReadCycleWide; this looks like a typo in the message.
MachineBasicBlock *
RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
llvm_unreachable("Unexpected instr type to insert");
case RISCV::ReadCycleWide:
assert(!Subtarget.is64Bit() &&
"ReadCycleWrite is only to be used on riscv32");
return emitReadCycleWidePseudo(MI, BB);
case RISCV::Select_GPR_Using_CC_GPR:
case RISCV::Select_FPR32_Using_CC_GPR:
case RISCV::Select_FPR64_Using_CC_GPR:
return emitSelectPseudo(MI, BB);
case RISCV::BuildPairF64Pseudo:
return emitBuildPairF64Pseudo(MI, BB);
case RISCV::SplitF64Pseudo:
return emitSplitF64Pseudo(MI, BB);
// Calling Convention Implementation.
// The expectations for frontend ABI lowering vary from target to target.
// Ideally, an LLVM frontend would be able to avoid worrying about many ABI
// details, but this is a longer term goal. For now, we simply try to keep the
// role of the frontend as simple and well-defined as possible. The rules can
// be summarised as:
// * Never split up large scalar arguments. We handle them here.
// * If a hardfloat calling convention is being used, and the struct may be
// passed in a pair of registers (fp+fp, int+fp), and both registers are
// available, then pass as two separate arguments. If either the GPRs or FPRs
// are exhausted, then pass according to the rule below.
// * If a struct could never be passed in registers or directly in a stack
// slot (as it is larger than 2*XLEN and the floating point rules don't
// apply), then pass it using a pointer with the byval attribute.
// * If a struct is less than 2*XLEN, then coerce to either a two-element
// word-sized array or a 2*XLEN scalar (depending on alignment).
// * The frontend can determine whether a struct is returned by reference or
// not based on its size and fields. If it will be returned by reference, the
// frontend must modify the prototype so a pointer with the sret annotation is
// passed as the first argument. This is not necessary for large scalar
// returns.
// * Struct return values and varargs should be coerced to structs containing
// register-size fields in the same situations they would be for fixed
// arguments.
// Register tables used by the calling-convention code in this file:
// general-purpose argument registers, then 32-bit and 64-bit floating-point
// argument registers.
// NOTE(review): the initialiser lists and closing braces of all three arrays
// are not visible here - confirm against the full file.
static const MCPhysReg ArgGPRs[] = {
static const MCPhysReg ArgFPR32s[] = {
static const MCPhysReg ArgFPR64s[] = {
// Pass a 2*XLEN argument that has been split into two XLEN values through
// registers or the stack as necessary.
// The first half (VA1) takes a GPR from ArgGPRs when one is free. Otherwise
// both halves go on the stack, the first with alignment
// max(XLEN/8, original alignment) and the second with XLEN/8 alignment. If
// the first half got a register, the second half takes the next free GPR or,
// failing that, an XLEN/8-aligned stack slot. Every visible return path
// returns false.
// NOTE(review): ValVT2 and LocVT2 are used below but the signature line that
// declares them is not visible here, and several `State.addLoc(` openers and
// closing braces are missing as well - confirm against the full file.
static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
ISD::ArgFlagsTy ArgFlags1, unsigned ValNo2,
ISD::ArgFlagsTy ArgFlags2) {
unsigned XLenInBytes = XLen / 8;
if (Register Reg = State.AllocateReg(ArgGPRs)) {
// At least one half can be passed via register.
State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg,
VA1.getLocVT(), CCValAssign::Full));
} else {
// Both halves must be passed on the stack, with proper alignment.
Align StackAlign =
std::max(Align(XLenInBytes), ArgFlags1.getNonZeroOrigAlign());
CCValAssign::getMem(VA1.getValNo(), VA1.getValVT(),
State.AllocateStack(XLenInBytes, StackAlign),
VA1.getLocVT(), CCValAssign::Full));
ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)),
LocVT2, CCValAssign::Full));
return false;
if (Register Reg = State.AllocateReg(ArgGPRs)) {
// The second half can also be passed via register.
CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full));
} else {
// The second half is passed via the stack, without additional alignment.
ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)),
LocVT2, CCValAssign::Full));
return false;
// Implements the RISC-V calling convention. Returns true upon failure.
//
// Assigns a location (register, stack slot, or pending/indirect) for value
// number ValNo of type ValVT and records it in State. IsFixed is false for
// variadic arguments, IsRet is true when assigning return values, and OrigTy
// is the original IR type (only dereferenced on the variadic-alignment path).
// NOTE(review): case labels, closing braces and some call openers appear to be
// missing from this excerpt; the comments below describe only what is visible.
static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed,
bool IsRet, Type *OrigTy) {
// XLEN is derived from the data layout's largest legal integer width.
unsigned XLen = DL.getLargestLegalIntTypeSizeInBits();
assert(XLen == 32 || XLen == 64);
MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64;
// Any return value split into more than two values can't be returned
// directly.
if (IsRet && ValNo > 1)
return true;
// UseGPRForF32 if targeting one of the soft-float ABIs, if passing a
// variadic argument, or if no F32 argument registers are available.
bool UseGPRForF32 = true;
// UseGPRForF64 if targeting soft-float ABIs or an FLEN=32 ABI, if passing a
// variadic argument, or if no F64 argument registers are available.
bool UseGPRForF64 = true;
// NOTE(review): the ABI case labels for this switch are not visible here.
switch (ABI) {
llvm_unreachable("Unexpected ABI");
UseGPRForF32 = !IsFixed;
UseGPRForF32 = !IsFixed;
UseGPRForF64 = !IsFixed;
// Fall back to GPRs once every FP argument register has been allocated.
if (State.getFirstUnallocated(ArgFPR32s) == array_lengthof(ArgFPR32s))
UseGPRForF32 = true;
if (State.getFirstUnallocated(ArgFPR64s) == array_lengthof(ArgFPR64s))
UseGPRForF64 = true;
// From this point on, rely on UseGPRForF32, UseGPRForF64 and similar local
// variables rather than directly checking against the target ABI.
if (UseGPRForF32 && ValVT == MVT::f32) {
LocVT = XLenVT;
LocInfo = CCValAssign::BCvt;
} else if (UseGPRForF64 && XLen == 64 && ValVT == MVT::f64) {
LocVT = MVT::i64;
LocInfo = CCValAssign::BCvt;
// If this is a variadic argument, the RISC-V calling convention requires
// that it is assigned an 'even' or 'aligned' register if it has 8-byte
// alignment (RV32) or 16-byte alignment (RV64). An aligned register should
// be used regardless of whether the original argument was split during
// legalisation or not. The argument will not be passed by registers if the
// original type is larger than 2*XLEN, so the register alignment rule does
// not apply.
unsigned TwoXLenInBytes = (2 * XLen) / 8;
if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes &&
DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes) {
unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
// Skip 'odd' register if necessary.
if (RegIdx != array_lengthof(ArgGPRs) && RegIdx % 2 == 1)
SmallVectorImpl<CCValAssign> &PendingLocs = State.getPendingLocs();
SmallVectorImpl<ISD::ArgFlagsTy> &PendingArgFlags =
assert(PendingLocs.size() == PendingArgFlags.size() &&
"PendingLocs and PendingArgFlags out of sync");
// Handle passing f64 on RV32D with a soft float ABI or when floating point
// registers are exhausted.
if (UseGPRForF64 && XLen == 32 && ValVT == MVT::f64) {
assert(!ArgFlags.isSplit() && PendingLocs.empty() &&
"Can't lower f64 if it is split");
// Depending on available argument GPRS, f64 may be passed in a pair of
// GPRs, split between a GPR and the stack, or passed completely on the
// stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these
// cases.
Register Reg = State.AllocateReg(ArgGPRs);
LocVT = MVT::i32;
if (!Reg) {
// No GPR left: the whole f64 goes in an 8-byte, 8-aligned stack slot.
unsigned StackOffset = State.AllocateStack(8, Align(8));
CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
return false;
// Try to claim a second GPR for the other half; if none is left, reserve a
// 4-byte stack slot for it instead. Only the first register is recorded.
if (!State.AllocateReg(ArgGPRs))
State.AllocateStack(4, Align(4));
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
// Split arguments might be passed indirectly, so keep track of the pending
// values.
if (ArgFlags.isSplit() || !PendingLocs.empty()) {
LocVT = XLenVT;
LocInfo = CCValAssign::Indirect;
CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
if (!ArgFlags.isSplitEnd()) {
return false;
// If the split argument only had two elements, it should be passed directly
// in registers or on the stack.
if (ArgFlags.isSplitEnd() && PendingLocs.size() <= 2) {
assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()");
// Apply the normal calling convention rules to the first half of the
// split argument.
CCValAssign VA = PendingLocs[0];
ISD::ArgFlagsTy AF = PendingArgFlags[0];
return CC_RISCVAssign2XLen(XLen, State, VA, AF, ValNo, ValVT, LocVT,
// Allocate to a register if possible, or else a stack slot.
Register Reg;
if (ValVT == MVT::f32 && !UseGPRForF32)
Reg = State.AllocateReg(ArgFPR32s, ArgFPR64s);
else if (ValVT == MVT::f64 && !UseGPRForF64)
Reg = State.AllocateReg(ArgFPR64s, ArgFPR32s);
Reg = State.AllocateReg(ArgGPRs);
// A stack slot of XLEN bytes is only allocated when no register was found.
unsigned StackOffset =
Reg ? 0 : State.AllocateStack(XLen / 8, Align(XLen / 8));
// If we reach this point and PendingLocs is non-empty, we must be at the
// end of a split argument that must be passed indirectly.
if (!PendingLocs.empty()) {
assert(ArgFlags.isSplitEnd() && "Expected ArgFlags.isSplitEnd()");
assert(PendingLocs.size() > 2 && "Unexpected PendingLocs.size()");
for (auto &It : PendingLocs) {
if (Reg)
return false;
assert((!UseGPRForF32 || !UseGPRForF64 || LocVT == XLenVT) &&
"Expected an XLenVT at this stage");
if (Reg) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
// When an f32 or f64 is passed on the stack, no bit-conversion is needed.
if (ValVT == MVT::f32 || ValVT == MVT::f64) {
LocVT = ValVT;
LocInfo = CCValAssign::Full;
State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
return false;
// Runs CC_RISCV over every entry of Ins, recording the assignments in CCInfo.
// All values are treated as fixed (non-variadic). When IsRet is true the
// function's return type is passed as the original type; otherwise the
// parameter type is looked up for entries that correspond to an original
// argument, and nullptr is passed for the rest.
// NOTE(review): whatever follows the debug print on failure is not visible in
// this excerpt.
void RISCVTargetLowering::analyzeInputArgs(
MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet) const {
unsigned NumArgs = Ins.size();
FunctionType *FType = MF.getFunction().getFunctionType();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Ins[i].VT;
ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
Type *ArgTy = nullptr;
if (IsRet)
ArgTy = FType->getReturnType();
else if (Ins[i].isOrigArg())
ArgTy = FType->getParamType(Ins[i].getOrigArgIndex());
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
// CC_RISCV returns true on failure.
if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) {
LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << '\n');
// Runs CC_RISCV over every entry of Outs, recording the assignments in CCInfo.
// Each value's fixed/variadic status comes from Outs[i].IsFixed. CLI may be
// null; when it is non-null the original IR type of each argument is taken
// from its argument list, otherwise nullptr is passed as the original type.
// NOTE(review): whatever follows the debug print on failure is not visible in
// this excerpt.
void RISCVTargetLowering::analyzeOutputArgs(
MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsRet,
CallLoweringInfo *CLI) const {
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; i++) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
Type *OrigTy = CLI ? CLI->getArgs()[Outs[i].OrigArgIndex].Ty : nullptr;
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
// CC_RISCV returns true on failure.
if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) {
LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << "\n");
// Convert Val to a ValVT. Should not be called for CCValAssign::Indirect
// values.
//
// Dispatches on VA.getLocInfo(). For BCvt, an f32 value held in an i64
// location is converted with RISCVISD::FMV_W_X_RV64; an ISD::BITCAST to the
// value type is also present. Returns the (possibly converted) value.
// NOTE(review): the default label and the break/else lines separating the
// cases are not visible in this excerpt.
static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
const CCValAssign &VA, const SDLoc &DL) {
switch (VA.getLocInfo()) {
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
case CCValAssign::BCvt:
if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) {
Val = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val);
Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
return Val;
// Materialises an incoming register-assigned argument as an SDValue: a
// register class is picked from the location type, a virtual register is
// created and marked live-in for VA's physical register, and the value is
// copied out of it.
// The caller is responsible for loading the full value if the argument is
// passed with CCValAssign::Indirect.
// NOTE(review): the default label and the break statements between the cases
// are not visible in this excerpt.
static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
const CCValAssign &VA, const SDLoc &DL) {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
EVT LocVT = VA.getLocVT();
SDValue Val;
const TargetRegisterClass *RC;
switch (LocVT.getSimpleVT().SimpleTy) {
llvm_unreachable("Unexpected register type");
case MVT::i32:
case MVT::i64:
RC = &RISCV::GPRRegClass;
case MVT::f32:
RC = &RISCV::FPR32RegClass;
case MVT::f64:
RC = &RISCV::FPR64RegClass;
Register VReg = RegInfo.createVirtualRegister(RC);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
Val = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
// Indirect values are returned unconverted.
if (VA.getLocInfo() == CCValAssign::Indirect)
return Val;
return convertLocVTToValVT(DAG, Val, VA, DL);
// Convert Val from its value type to the location type recorded in VA.
//
// Dispatches on VA.getLocInfo(). For BCvt, an f32 value destined for an i64
// location is converted with RISCVISD::FMV_X_ANYEXTW_RV64; an ISD::BITCAST to
// the location type is also present. Returns the (possibly converted) value.
// NOTE(review): the default label and the break/else lines separating the
// cases are not visible in this excerpt.
static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val,
const CCValAssign &VA, const SDLoc &DL) {
EVT LocVT = VA.getLocVT();
switch (VA.getLocInfo()) {
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
case CCValAssign::BCvt:
if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) {
Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val);
Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val);
return Val;
// Materialises an incoming stack-assigned argument as an SDValue: an immutable
// fixed frame object of the value type's size is created at VA's memory
// offset and the value is loaded from it with getExtLoad.
// The caller is responsible for loading the full value if the argument is
// passed with CCValAssign::Indirect.
// NOTE(review): the default label and the assignment to ExtType are not
// visible in this excerpt.
static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
const CCValAssign &VA, const SDLoc &DL) {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT LocVT = VA.getLocVT();
EVT ValVT = VA.getValVT();
// Pointer type sized from the data layout's address space 0.
EVT PtrVT = MVT::getIntegerVT(DAG.getDataLayout().getPointerSizeInBits(0));
int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
VA.getLocMemOffset(), /*Immutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val;
ISD::LoadExtType ExtType;
switch (VA.getLocInfo()) {
llvm_unreachable("Unexpected CCValAssign::LocInfo");
case CCValAssign::Full:
case CCValAssign::Indirect:
case CCValAssign::BCvt:
Val = DAG.getExtLoad(
ExtType, DL, LocVT, Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), ValVT);
return Val;
// Reassembles an incoming f64 whose location type is i32 (asserted below).
// Three layouts are handled: entirely on the stack (one 8-byte load), low half
// in X17 with the high half in a 4-byte fixed stack object at offset 0, or
// both halves in consecutive GPRs. In the register cases the two i32 halves
// are combined with RISCVISD::BuildPairF64.
static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain,
const CCValAssign &VA, const SDLoc &DL) {
assert(VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64 &&
"Unexpected VA");
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
if (VA.isMemLoc()) {
// f64 is passed on the stack.
int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), /*Immutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
return DAG.getLoad(MVT::f64, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
assert(VA.isRegLoc() && "Expected register VA assignment");
// The low half always comes from the assigned register.
Register LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
RegInfo.addLiveIn(VA.getLocReg(), LoVReg);
SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32);
SDValue Hi;
if (VA.getLocReg() == RISCV::X17) {
// Second half of f64 is passed on the stack.
int FI = MFI.CreateFixedObject(4, 0, /*Immutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
} else {
// Second half of f64 is passed in another GPR.
Register HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
RegInfo.addLiveIn(VA.getLocReg() + 1, HiVReg);
Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32);
return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
// FastCC yields less than a 1% performance improvement on some particular
// benchmarks, but in theory it may be beneficial in other cases.
//
// Assignment order: a register from the list matching LocVT (GPRs for
// i32/i64, FPR32s for f32, FPR64s for f64); failing that, a 4-byte stack slot
// for i32/f32 or an 8-byte slot for i64/f64. Returns false once a location
// has been recorded and true if no rule matched.
// NOTE(review): most entries of the three register lists and the closing
// braces are not visible in this excerpt.
static bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
if (LocVT == MVT::i32 || LocVT == MVT::i64) {
// X5 and X6 might be used for save-restore libcall.
static const MCPhysReg GPRList[] = {
RISCV::X29, RISCV::X30, RISCV::X31};
if (unsigned Reg = State.AllocateReg(GPRList)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
if (LocVT == MVT::f32) {
static const MCPhysReg FPR32List[] = {
if (unsigned Reg = State.AllocateReg(FPR32List)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
if (LocVT == MVT::f64) {
static const MCPhysReg FPR64List[] = {
if (unsigned Reg = State.AllocateReg(FPR64List)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
// No register was assigned above: fall back to the stack.
if (LocVT == MVT::i32 || LocVT == MVT::f32) {
unsigned Offset4 = State.AllocateStack(4, Align(4));
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset4, LocVT, LocInfo));
return false;
if (LocVT == MVT::i64 || LocVT == MVT::f64) {
unsigned Offset5 = State.AllocateStack(8, Align(8));
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset5, LocVT, LocInfo));
return false;
return true; // CC didn't match.
// Transform physical registers into virtual registers.
//
// Lowers the incoming formal arguments described by Ins: locations are
// assigned (CC_RISCV_FastCC for CallingConv::Fast, analyzeInputArgs
// otherwise), each argument is unpacked from its register or stack location,
// and indirectly-passed arguments are loaded through the passed address. For
// variadic functions the unallocated argument GPRs are also spilled to a
// vararg save area. Returns the updated chain.
// NOTE(review): several lines (error-reporting calls, else keywords, closing
// braces, push_back openers) are not visible in this excerpt.
SDValue RISCVTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
switch (CallConv) {
report_fatal_error("Unsupported calling convention");
case CallingConv::C:
case CallingConv::Fast:
MachineFunction &MF = DAG.getMachineFunction();
const Function &Func = MF.getFunction();
// Functions carrying the "interrupt" attribute must take no arguments and
// the attribute value must be "user", "supervisor" or "machine".
if (Func.hasFnAttribute("interrupt")) {
if (!Func.arg_empty())
"Functions with the interrupt attribute cannot have arguments!");
StringRef Kind =
if (!(Kind == "user" || Kind == "supervisor" || Kind == "machine"))
"Function interrupt attribute argument not supported!");
EVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT XLenVT = Subtarget.getXLenVT();
unsigned XLenInBytes = Subtarget.getXLen() / 8;
// Used with varargs to accumulate store chains.
std::vector<SDValue> OutChains;
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
if (CallConv == CallingConv::Fast)
CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_FastCC);
analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue ArgValue;
// Passing f64 on RV32D with a soft float ABI must be handled as a special
// case.
if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64)
ArgValue = unpackF64OnRV32DSoftABI(DAG, Chain, VA, DL);
else if (VA.isRegLoc())
ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL);
ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL);
if (VA.getLocInfo() == CCValAssign::Indirect) {
// If the original argument was split and passed by reference (e.g. i128
// on RV32), we need to load all parts of it here (using the same
// address).
InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
unsigned ArgIndex = Ins[i].OrigArgIndex;
assert(Ins[i].PartOffset == 0);
// Remaining parts of the same original argument are loaded from the base
// address plus each part's PartOffset.
while (i + 1 != e && Ins[i + 1].OrigArgIndex == ArgIndex) {
CCValAssign &PartVA = ArgLocs[i + 1];
unsigned PartOffset = Ins[i + 1].PartOffset;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
DAG.getIntPtrConstant(PartOffset, DL));
InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
if (IsVarArg) {
ArrayRef<MCPhysReg> ArgRegs = makeArrayRef(ArgGPRs);
// Index of the first argument GPR not consumed by fixed arguments.
unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs);
const TargetRegisterClass *RC = &RISCV::GPRRegClass;
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MF.getRegInfo();
RISCVMachineFunctionInfo *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
// Offset of the first variable argument from stack pointer, and size of
// the vararg save area. For now, the varargs save area is either zero or
// large enough to hold a0-a7.
int VaArgOffset, VarArgsSaveSize;
// If all registers are allocated, then all varargs must be passed on the
// stack and we don't need to save any argregs.
if (ArgRegs.size() == Idx) {
VaArgOffset = CCInfo.getNextStackOffset();
VarArgsSaveSize = 0;
} else {
VarArgsSaveSize = XLenInBytes * (ArgRegs.size() - Idx);
VaArgOffset = -VarArgsSaveSize;
// Record the frame index of the first variable argument
// which is a value necessary to VASTART.
int FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
// If saving an odd number of registers then create an extra stack slot to
// ensure that the frame pointer is 2*XLEN-aligned, which in turn ensures
// offsets to even-numbered registered remain 2*XLEN-aligned.
if (Idx % 2) {
MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes, true);
VarArgsSaveSize += XLenInBytes;
// Copy the integer registers that may have been used for passing varargs
// to the vararg save area.
for (unsigned I = Idx; I < ArgRegs.size();
++I, VaArgOffset += XLenInBytes) {
const Register Reg = RegInfo.createVirtualRegister(RC);
RegInfo.addLiveIn(ArgRegs[I], Reg);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT);
FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
MachinePointerInfo::getFixedStack(MF, FI));
->setValue((Value *)nullptr);
// All stores are grouped in one node to allow the matching between
// the size of Ins and InVals. This only happens for vararg functions.
if (!OutChains.empty()) {
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
return Chain;
/// isEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization.
/// Note: This is modelled after ARM's IsEligibleForTailCallOptimization.
///
/// Returns false if any of the checks below rejects the call (interrupt
/// caller, stack-passed or indirect arguments, struct return, external weak
/// callee, incompatible preserved-register masks, byval arguments) and true
/// otherwise.
bool RISCVTargetLowering::isEligibleForTailCallOptimization(
CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
const SmallVector<CCValAssign, 16> &ArgLocs) const {
auto &Callee = CLI.Callee;
auto CalleeCC = CLI.CallConv;
auto &Outs = CLI.Outs;
auto &Caller = MF.getFunction();
auto CallerCC = Caller.getCallingConv();
// Exception-handling functions need a special set of instructions to
// indicate a return to the hardware. Tail-calling another function would
// probably break this.
// TODO: The "interrupt" attribute isn't currently defined by RISC-V. This
// should be expanded as new function attributes are introduced.
if (Caller.hasFnAttribute("interrupt"))
return false;
// Do not tail call opt if the stack is used to pass parameters.
if (CCInfo.getNextStackOffset() != 0)
return false;
// Do not tail call opt if any parameters need to be passed indirectly.
// Since long doubles (fp128) and i128 are larger than 2*XLEN, they are
// passed indirectly. So the address of the value will be passed in a
// register, or if not available, then the address is put on the stack. In
// order to pass indirectly, space on the stack often needs to be allocated
// in order to store the value. In this case the CCInfo.getNextStackOffset()
// != 0 check is not enough and we need to check if any CCValAssign ArgsLocs
// are passed CCValAssign::Indirect.
for (auto &VA : ArgLocs)
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
// Do not tail call opt if either caller or callee uses struct return
// semantics.
auto IsCallerStructRet = Caller.hasStructRetAttr();
auto IsCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
if (IsCallerStructRet || IsCalleeStructRet)
return false;
// Externally-defined functions with weak linkage should not be
// tail-called. The behaviour of branch instructions in this situation (as
// used for tail calls) is implementation-defined, so we cannot rely on the
// linker replacing the tail call with a return.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
if (GV->hasExternalWeakLinkage())
return false;
// The callee has to preserve all registers the caller needs to preserve.
// The masks are only compared when the two calling conventions differ.
const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (CalleeCC != CallerCC) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
// Byval parameters hand the function a pointer directly into the stack area
// we want to reuse during a tail call. Working around this *is* possible
// but less efficient and uglier in LowerCall.
for (auto &Arg : Outs)
if (Arg.Flags.isByVal())
return false;
return true;
// Lower a call to a callseq_start + CALL + callseq_end chain, and add input
// and output parameter nodes.
//
// Visible stages: assign argument locations, decide whether a requested tail
// call is actually possible (CLI.IsTailCall is updated through a reference),
// copy byval arguments into local stack objects, move each argument to its
// register or outgoing stack slot, emit the CALL or TAIL node, and for
// non-tail calls copy the returned values out of their physical registers.
// Returns the resulting chain (or the TAIL node for tail calls).
// NOTE(review): many lines (else keywords, closing braces, push_back openers,
// the declaration of DL, continue statements) are not visible in this
// excerpt.
SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
MVT XLenVT = Subtarget.getXLenVT();
MachineFunction &MF = DAG.getMachineFunction();
// Analyze the operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
if (CallConv == CallingConv::Fast)
ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_FastCC);
analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI);
// Check if it's really possible to do a tail call.
if (IsTailCall)
IsTailCall = isEligibleForTailCallOptimization(ArgCCInfo, CLI, MF, ArgLocs);
// A musttail call site that cannot be tail-called is a fatal error.
if (IsTailCall)
else if (CLI.CB && CLI.CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = ArgCCInfo.getNextStackOffset();
// Create local copies for byval args
SmallVector<SDValue, 8> ByValArgs;
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
ISD::ArgFlagsTy Flags = Outs[i].Flags;
if (!Flags.isByVal())
SDValue Arg = OutVals[i];
unsigned Size = Flags.getByValSize();
Align Alignment = Flags.getNonZeroByValAlign();
int FI =
MF.getFrameInfo().CreateStackObject(Size, Alignment, /*isSS=*/false);
SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue SizeNode = DAG.getConstant(Size, DL, XLenVT);
Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment,
/*AlwaysInline=*/false, IsTailCall,
MachinePointerInfo(), MachinePointerInfo());
if (!IsTailCall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
// Copy argument values to their designated locations.
SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
// j indexes ByValArgs and only advances for byval arguments.
for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue ArgValue = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// Handle passing f64 on RV32D with a soft float ABI as a special case.
bool IsF64OnRV32DSoftABI =
VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64;
if (IsF64OnRV32DSoftABI && VA.isRegLoc()) {
SDValue SplitF64 = DAG.getNode(
RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32), ArgValue);
SDValue Lo = SplitF64.getValue(0);
SDValue Hi = SplitF64.getValue(1);
Register RegLo = VA.getLocReg();
RegsToPass.push_back(std::make_pair(RegLo, Lo));
if (RegLo == RISCV::X17) {
// Second half of f64 is passed on the stack.
// Work out the address of the stack slot.
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT);
// Emit the store.
DAG.getStore(Chain, DL, Hi, StackPtr, MachinePointerInfo()));
} else {
// Second half of f64 is passed in another GPR.
assert(RegLo < RISCV::X31 && "Invalid register pair");
Register RegHigh = RegLo + 1;
RegsToPass.push_back(std::make_pair(RegHigh, Hi));
// IsF64OnRV32DSoftABI && VA.isMemLoc() is handled below in the same way
// as any other MemLoc.
// Promote the value if needed.
// For now, only handle fully promoted and indirect arguments.
if (VA.getLocInfo() == CCValAssign::Indirect) {
// Store the argument in a stack slot and pass its address.
SDValue SpillSlot = DAG.CreateStackTemporary(Outs[i].ArgVT);
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
DAG.getStore(Chain, DL, ArgValue, SpillSlot,
MachinePointerInfo::getFixedStack(MF, FI)));
// If the original argument was split (e.g. i128), we need
// to store all parts of it here (and pass just one address).
unsigned ArgIndex = Outs[i].OrigArgIndex;
assert(Outs[i].PartOffset == 0);
while (i + 1 != e && Outs[i + 1].OrigArgIndex == ArgIndex) {
SDValue PartValue = OutVals[i + 1];
unsigned PartOffset = Outs[i + 1].PartOffset;
SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
DAG.getIntPtrConstant(PartOffset, DL));
DAG.getStore(Chain, DL, PartValue, Address,
MachinePointerInfo::getFixedStack(MF, FI)));
ArgValue = SpillSlot;
} else {
ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL);
// Use local copy if it is a byval arg.
if (Flags.isByVal())
ArgValue = ByValArgs[j++];
if (VA.isRegLoc()) {
// Queue up the argument copies and emit them at the end.
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
} else {
assert(VA.isMemLoc() && "Argument not register or memory");
assert(!IsTailCall && "Tail call not allowed if stack is used "
"for passing parameters");
// Work out the address of the stack slot.
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT);
SDValue Address =
DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
// Emit the store.
DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
// Join the stores, which are independent of one another.
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
SDValue Glue;
// Build a sequence of copy-to-reg nodes, chained and glued together.
for (auto &Reg : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, Glue);
Glue = Chain.getValue(1);
// Validate that none of the argument registers have been marked as
// reserved, if so report an error. Do the same for the return address if this
// is not a tailcall.
validateCCReservedRegs(RegsToPass, MF);
if (!IsTailCall &&
"Return address register required, but has been reserved."});
// If the callee is a GlobalAddress/ExternalSymbol node, turn it into a
// TargetGlobalAddress/TargetExternalSymbol node so that legalize won't
// split it and then direct call can be matched by PseudoCALL.
if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = S->getGlobal();
unsigned OpFlags = RISCVII::MO_CALL;
if (!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV))
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
unsigned OpFlags = RISCVII::MO_CALL;
if (!getTargetMachine().shouldAssumeDSOLocal(*MF.getFunction().getParent(),
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, OpFlags);
// The first call operand is the chain and the second is the target address.
SmallVector<SDValue, 8> Ops;
// Add argument registers to the end of the list so that they are
// known live into the call.
for (auto &Reg : RegsToPass)
Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
if (!IsTailCall) {
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
// Glue the call to the argument copies, if any.
if (Glue.getNode())
// Emit the call.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
if (IsTailCall) {
return DAG.getNode(RISCVISD::TAIL, DL, NodeTys, Ops);
Chain = DAG.getNode(RISCVISD::CALL, DL, NodeTys, Ops);
DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
Glue = Chain.getValue(1);
// Mark the end of the call, which is glued to the call itself.
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getConstant(NumBytes, DL, PtrVT, true),
DAG.getConstant(0, DL, PtrVT, true),
Glue, DL);
Glue = Chain.getValue(1);
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true);
// Copy all of the result registers out of their specified physreg.
for (auto &VA : RVLocs) {
// Copy the value out
SDValue RetValue =
DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue);
// Glue the RetValue to the end of the call sequence
Chain = RetValue.getValue(1);
Glue = RetValue.getValue(2);
// An f64 returned with an i32 location type is rebuilt from ArgGPRs[0] and
// ArgGPRs[1] with BuildPairF64.
if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) {
assert(VA.getLocReg() == ArgGPRs[0] && "Unexpected reg assignment");
SDValue RetValue2 =
DAG.getCopyFromReg(Chain, DL, ArgGPRs[1], MVT::i32, Glue);
Chain = RetValue2.getValue(1);
Glue = RetValue2.getValue(2);
RetValue = DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, RetValue,
RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL);
return Chain;
// Reports whether every value in Outs can be assigned a return location.
// Each value is run through CC_RISCV as a fixed return value with no original
// type; the first failure (CC_RISCV returning true) yields false, and true is
// returned if all values are assigned.
bool RISCVTargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
MVT VT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
if (CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full,
ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr))
return false;
return true;
// Lowers a function return: return-value locations are assigned, each value
// is copied into its return register(s) with the copies glued together, and a
// return node is emitted (RISCVISD::RET_FLAG for ordinary functions; functions
// with the "interrupt" attribute select a different opcode based on the
// attribute value).
// NOTE(review): the return-type line, several call openers, the RetOpc
// assignments and the closing braces are not visible in this excerpt.
RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
const MachineFunction &MF = DAG.getMachineFunction();
const RISCVSubtarget &STI = MF.getSubtarget<RISCVSubtarget>();
// Stores the assignment of the return value to a location.
SmallVector<CCValAssign, 16> RVLocs;
// Info about the registers and stack slot.
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true,
SDValue Glue;
// RetOps[0] is the chain; it is refreshed after the copies are emitted.
SmallVector<SDValue, 4> RetOps(1, Chain);
// Copy the result values into the output registers.
for (unsigned i = 0, e = RVLocs.size(); i < e; ++i) {
SDValue Val = OutVals[i];
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) {
// Handle returning f64 on RV32D with a soft float ABI.
assert(VA.isRegLoc() && "Expected return via registers");
SDValue SplitF64 = DAG.getNode(RISCVISD::SplitF64, DL,
DAG.getVTList(MVT::i32, MVT::i32), Val);
SDValue Lo = SplitF64.getValue(0);
SDValue Hi = SplitF64.getValue(1);
// The high half goes in the register following the assigned one.
Register RegLo = VA.getLocReg();
assert(RegLo < RISCV::X31 && "Invalid register pair");
Register RegHi = RegLo + 1;
if (STI.isRegisterReservedByUser(RegLo) ||
"Return value register required, but has been reserved."});
Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue);
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(RegLo, MVT::i32));
Chain = DAG.getCopyToReg(Chain, DL, RegHi, Hi, Glue);
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(RegHi, MVT::i32));
} else {
// Handle a 'normal' return.
Val = convertValVTToLocVT(DAG, Val, VA, DL);
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue);
if (STI.isRegisterReservedByUser(VA.getLocReg()))
"Return value register required, but has been reserved."});
// Guarantee that all emitted copies are stuck together.
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
RetOps[0] = Chain; // Update chain.
// Add the glue node if we have it.
if (Glue.getNode()) {
// Interrupt service routines use different return instructions.
const Function &Func = DAG.getMachineFunction().getFunction();
if (Func.hasFnAttribute("interrupt")) {
if (!Func.getReturnType()->isVoidTy())
"Functions with the interrupt attribute must have void return type!");
MachineFunction &MF = DAG.getMachineFunction();
StringRef Kind =
unsigned RetOpc;
if (Kind == "user")
else if (Kind == "supervisor")
return DAG.getNode(RetOpc, DL, MVT::Other, RetOps);
return DAG.getNode(RISCVISD::RET_FLAG, DL, MVT::Other, RetOps);
// Checks the registers in Regs (the first element of each pair) against the
// subtarget's user-reserved register set. The message below is associated with
// the function F when at least one of them is reserved.
// NOTE(review): the call that emits the message is not visible in this
// excerpt -- presumably a diagnostic raised on F; confirm against the full file.
void RISCVTargetLowering::validateCCReservedRegs(
const SmallVectorImpl<std::pair<llvm::Register, llvm::SDValue>> &Regs,
MachineFunction &MF) const {
const Function &F = MF.getFunction();
const RISCVSubtarget &STI = MF.getSubtarget<RISCVSubtarget>();
// True as soon as one entry's register is reported as reserved by the user.
if (std::any_of(std::begin(Regs), std::end(Regs), [&STI](auto Reg) {
return STI.isRegisterReservedByUser(Reg.first);
F, "Argument register required, but has been reserved."});
// Reports whether CI may be emitted as a tail call: the answer is exactly
// whether the call instruction is already marked as a tail call.
bool RISCVTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
return CI->isTailCall();
// Maps a RISCVISD node opcode to its printable "RISCVISD::<Name>" string.
// The statement after the switch returns nullptr.
// NOTE(review): most `case` labels are not visible in this excerpt; only the
// BuildPairF64 and SplitF64 labels can be matched to their strings from here.
const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((RISCVISD::NodeType)Opcode) {
return "RISCVISD::CALL";
case RISCVISD::BuildPairF64:
return "RISCVISD::BuildPairF64";
case RISCVISD::SplitF64:
return "RISCVISD::SplitF64";
return "RISCVISD::TAIL";
return "RISCVISD::SLLW";
return "RISCVISD::SRAW";
return "RISCVISD::SRLW";
return "RISCVISD::DIVW";
return "RISCVISD::FMV_W_X_RV64";
return nullptr;
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
///
/// Single-letter constraints handled here:
///   'f'           -> C_RegisterClass
///   'I', 'J', 'K' -> C_Immediate
///   'A'           -> C_Memory
/// Everything else is forwarded to TargetLowering::getConstraintType.
RISCVTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'f':
return C_RegisterClass;
case 'I':
case 'J':
case 'K':
return C_Immediate;
case 'A':
return C_Memory;
return TargetLowering::getConstraintType(Constraint);
// Resolves an inline-asm register constraint to a (register, register class)
// pair.
//
// The lookup proceeds in three stages:
//  1. Single-letter constraints ('r', 'f') select a whole register class and
//     return register number 0 together with that class.
//  2. Braced names are matched (after lower-casing) against the integer ABI
//     names and then against the floating-point architectural/ABI names.
//  3. Anything still unresolved is handed to
//     TargetLowering::getRegForInlineAsmConstraint.
std::pair<unsigned, const TargetRegisterClass *>
RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to a
// RISCV register class.
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
return std::make_pair(0U, &RISCV::GPRRegClass);
case 'f':
// The class depends on both the available extension and the value type:
// f32 with F selects FPR32, f64 with D selects FPR64.
if (Subtarget.hasStdExtF() && VT == MVT::f32)
return std::make_pair(0U, &RISCV::FPR32RegClass);
if (Subtarget.hasStdExtD() && VT == MVT::f64)
return std::make_pair(0U, &RISCV::FPR64RegClass);
// Clang will correctly decode the usage of register name aliases into their
// official names. However, other frontends like `rustc` do not. This allows
// users of these frontends to use the ABI names for registers in LLVM-style
// register constraints.
Register XRegFromAlias = StringSwitch<Register>(Constraint.lower())
.Case("{zero}", RISCV::X0)
.Case("{ra}", RISCV::X1)
.Case("{sp}", RISCV::X2)
.Case("{gp}", RISCV::X3)
.Case("{tp}", RISCV::X4)
.Case("{t0}", RISCV::X5)
.Case("{t1}", RISCV::X6)
.Case("{t2}", RISCV::X7)
.Cases("{s0}", "{fp}", RISCV::X8)
.Case("{s1}", RISCV::X9)
.Case("{a0}", RISCV::X10)
.Case("{a1}", RISCV::X11)
.Case("{a2}", RISCV::X12)
.Case("{a3}", RISCV::X13)
.Case("{a4}", RISCV::X14)
.Case("{a5}", RISCV::X15)
.Case("{a6}", RISCV::X16)
.Case("{a7}", RISCV::X17)
.Case("{s2}", RISCV::X18)
.Case("{s3}", RISCV::X19)
.Case("{s4}", RISCV::X20)
.Case("{s5}", RISCV::X21)
.Case("{s6}", RISCV::X22)
.Case("{s7}", RISCV::X23)
.Case("{s8}", RISCV::X24)
.Case("{s9}", RISCV::X25)
.Case("{s10}", RISCV::X26)
.Case("{s11}", RISCV::X27)
.Case("{t3}", RISCV::X28)
.Case("{t4}", RISCV::X29)
.Case("{t5}", RISCV::X30)
.Case("{t6}", RISCV::X31)
// A matched integer ABI name always maps into the GPR class.
if (XRegFromAlias != RISCV::NoRegister)
return std::make_pair(XRegFromAlias, &RISCV::GPRRegClass);
// Since TargetLowering::getRegForInlineAsmConstraint uses the name of the
// TableGen record rather than the AsmName to choose registers for InlineAsm
// constraints, plus we want to match those names to the widest floating point
// register type available, manually select floating point registers here.
// The second case is the ABI name of the register, so that frontends can also
// use the ABI names in register constraint lists.
if (Subtarget.hasStdExtF() || Subtarget.hasStdExtD()) {
// Each entry carries the 32-bit (_F) and the 64-bit (_D) register for the
// same name; which one is returned is decided below.
std::pair<Register, Register> FReg =
StringSwitch<std::pair<Register, Register>>(Constraint.lower())
.Cases("{f0}", "{ft0}", {RISCV::F0_F, RISCV::F0_D})
.Cases("{f1}", "{ft1}", {RISCV::F1_F, RISCV::F1_D})
.Cases("{f2}", "{ft2}", {RISCV::F2_F, RISCV::F2_D})
.Cases("{f3}", "{ft3}", {RISCV::F3_F, RISCV::F3_D})
.Cases("{f4}", "{ft4}", {RISCV::F4_F, RISCV::F4_D})
.Cases("{f5}", "{ft5}", {RISCV::F5_F, RISCV::F5_D})
.Cases("{f6}", "{ft6}", {RISCV::F6_F, RISCV::F6_D})
.Cases("{f7}", "{ft7}", {RISCV::F7_F, RISCV::F7_D})
.Cases("{f8}", "{fs0}", {RISCV::F8_F, RISCV::F8_D})
.Cases("{f9}", "{fs1}", {RISCV::F9_F, RISCV::F9_D})
.Cases("{f10}", "{fa0}", {RISCV::F10_F, RISCV::F10_D})
.Cases("{f11}", "{fa1}", {RISCV::F11_F, RISCV::F11_D})
.Cases("{f12}", "{fa2}", {RISCV::F12_F, RISCV::F12_D})
.Cases("{f13}", "{fa3}", {RISCV::F13_F, RISCV::F13_D})
.Cases("{f14}", "{fa4}", {RISCV::F14_F, RISCV::F14_D})
.Cases("{f15}", "{fa5}", {RISCV::F15_F, RISCV::F15_D})
.Cases("{f16}", "{fa6}", {RISCV::F16_F, RISCV::F16_D})
.Cases("{f17}", "{fa7}", {RISCV::F17_F, RISCV::F17_D})
.Cases("{f18}", "{fs2}", {RISCV::F18_F, RISCV::F18_D})
.Cases("{f19}", "{fs3}", {RISCV::F19_F, RISCV::F19_D})
.Cases("{f20}", "{fs4}", {RISCV::F20_F, RISCV::F20_D})
.Cases("{f21}", "{fs5}", {RISCV::F21_F, RISCV::F21_D})
.Cases("{f22}", "{fs6}", {RISCV::F22_F, RISCV::F22_D})
.Cases("{f23}", "{fs7}", {RISCV::F23_F, RISCV::F23_D})
.Cases("{f24}", "{fs8}", {RISCV::F24_F, RISCV::F24_D})
.Cases("{f25}", "{fs9}", {RISCV::F25_F, RISCV::F25_D})
.Cases("{f26}", "{fs10}", {RISCV::F26_F, RISCV::F26_D})
.Cases("{f27}", "{fs11}", {RISCV::F27_F, RISCV::F27_D})
.Cases("{f28}", "{ft8}", {RISCV::F28_F, RISCV::F28_D})
.Cases("{f29}", "{ft9}", {RISCV::F29_F, RISCV::F29_D})
.Cases("{f30}", "{ft10}", {RISCV::F30_F, RISCV::F30_D})
.Cases("{f31}", "{ft11}", {RISCV::F31_F, RISCV::F31_D})
.Default({RISCV::NoRegister, RISCV::NoRegister});
// With D present the 64-bit register and FPR64 are used; otherwise the
// 32-bit register and FPR32.
if (FReg.first != RISCV::NoRegister)
return Subtarget.hasStdExtD()
? std::make_pair(FReg.second, &RISCV::FPR64RegClass)
: std::make_pair(FReg.first, &RISCV::FPR32RegClass);
// Not a name handled above: defer to the generic implementation.
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Translates an inline-asm memory constraint code: the single letter 'A' maps
// to InlineAsm::Constraint_A, and every other code is forwarded to
// TargetLowering::getInlineAsmMemConstraint.
RISCVTargetLowering::getInlineAsmMemConstraint(StringRef ConstraintCode) const {
// Currently only support length 1 constraints.
if (ConstraintCode.size() == 1) {
switch (ConstraintCode[0]) {
case 'A':
return InlineAsm::Constraint_A;
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
// Lowers the inline-asm operand Op for the single-letter immediate constraints
// 'I', 'J' and 'K'. Each case only accepts a ConstantSDNode whose value passes
// the range check, and builds a target constant of the subtarget's XLen type.
// The final statement forwards to the generic
// TargetLowering::LowerAsmOperandForConstraint.
// NOTE(review): the statements that consume the built constants and end each
// case are not visible in this excerpt.
void RISCVTargetLowering::LowerAsmOperandForConstraint(
SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
// Currently only support length 1 constraints.
if (Constraint.length() == 1) {
switch (Constraint[0]) {
case 'I':
// Validate & create a 12-bit signed immediate operand.
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
// The sign-extended value is held in an unsigned variable and then passed
// to the signed range check isInt<12>.
uint64_t CVal = C->getSExtValue();
if (isInt<12>(CVal))
DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT()));
case 'J':
// Validate & create an integer zero operand.
if (auto *C = dyn_cast<ConstantSDNode>(Op))
if (C->getZExtValue() == 0)
DAG.getTargetConstant(0, SDLoc(Op), Subtarget.getXLenVT()));
case 'K':
// Validate & create a 5-bit unsigned immediate operand.
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
uint64_t CVal = C->getZExtValue();
if (isUInt<5>(CVal))
DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT()));
TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
// Creates the fence, if any, for the leading side of the atomic instruction
// Inst with ordering Ord:
//  - a sequentially consistent load gets a fence with ordering Ord;
//  - a store with release-or-stronger ordering gets a Release fence;
//  - otherwise no fence is created and nullptr is returned.
Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (isa<LoadInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
return Builder.CreateFence(Ord);
if (isa<StoreInst>(Inst) && isReleaseOrStronger(Ord))
return Builder.CreateFence(AtomicOrdering::Release);
return nullptr;
// Creates the fence, if any, for the trailing side of the atomic instruction
// Inst with ordering Ord: a load with acquire-or-stronger ordering gets an
// Acquire fence; every other case returns nullptr.
Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
if (isa<LoadInst>(Inst) && isAcquireOrStronger(Ord))
return Builder.CreateFence(AtomicOrdering::Acquire);
return nullptr;
// Chooses the IR-level expansion strategy for the atomicrmw instruction AI:
//  - floating-point operations  -> CmpXChg
//  - 8-bit and 16-bit operations -> MaskedIntrinsic
//  - everything else             -> None
RISCVTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
// atomicrmw {fadd,fsub} must be expanded to use compare-exchange, as floating
// point operations can't be used in an lr/sc sequence without breaking the
// forward-progress guarantee.
if (AI->isFloatingPointOperation())
return AtomicExpansionKind::CmpXChg;
// Size is the primitive bit width of the instruction's result type.
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
if (Size == 8 || Size == 16)
return AtomicExpansionKind::MaskedIntrinsic;
return AtomicExpansionKind::None;
// Returns the riscv_masked_atomicrmw_* intrinsic ID for the given XLen (32 or
// 64) and atomicrmw operation. Xchg, Add, Sub, Nand, Max, Min, UMax and UMin
// are mapped; the llvm_unreachable calls mark an unexpected BinOp or XLen.
static Intrinsic::ID
getIntrinsicForMaskedAtomicRMWBinOp(unsigned XLen, AtomicRMWInst::BinOp BinOp) {
// 32-bit variants (the *_i32 intrinsics).
if (XLen == 32) {
switch (BinOp) {
llvm_unreachable("Unexpected AtomicRMW BinOp");
case AtomicRMWInst::Xchg:
return Intrinsic::riscv_masked_atomicrmw_xchg_i32;
case AtomicRMWInst::Add:
return Intrinsic::riscv_masked_atomicrmw_add_i32;
case AtomicRMWInst::Sub:
return Intrinsic::riscv_masked_atomicrmw_sub_i32;
case AtomicRMWInst::Nand:
return Intrinsic::riscv_masked_atomicrmw_nand_i32;
case AtomicRMWInst::Max:
return Intrinsic::riscv_masked_atomicrmw_max_i32;
case AtomicRMWInst::Min:
return Intrinsic::riscv_masked_atomicrmw_min_i32;
case AtomicRMWInst::UMax:
return Intrinsic::riscv_masked_atomicrmw_umax_i32;
case AtomicRMWInst::UMin:
return Intrinsic::riscv_masked_atomicrmw_umin_i32;
// 64-bit variants (the *_i64 intrinsics).
if (XLen == 64) {
switch (BinOp) {
llvm_unreachable("Unexpected AtomicRMW BinOp");
case AtomicRMWInst::Xchg:
return Intrinsic::riscv_masked_atomicrmw_xchg_i64;
case AtomicRMWInst::Add:
return Intrinsic::riscv_masked_atomicrmw_add_i64;
case AtomicRMWInst::Sub:
return Intrinsic::riscv_masked_atomicrmw_sub_i64;
case AtomicRMWInst::Nand:
return Intrinsic::riscv_masked_atomicrmw_nand_i64;
case AtomicRMWInst::Max:
return Intrinsic::riscv_masked_atomicrmw_max_i64;
case AtomicRMWInst::Min:
return Intrinsic::riscv_masked_atomicrmw_min_i64;
case AtomicRMWInst::UMax:
return Intrinsic::riscv_masked_atomicrmw_umax_i64;
case AtomicRMWInst::UMin:
return Intrinsic::riscv_masked_atomicrmw_umin_i64;
llvm_unreachable("Unexpected XLen\n");
// Emits a call to the masked atomicrmw intrinsic that matches AI's operation
// and the subtarget's XLen, and returns the call's result.
//
// The ordering operand is built from AI->getOrdering(); the Ord parameter is
// not read in the visible code. On XLen 64 the value operands are
// sign-extended to i64 before the call and the result is truncated to i32.
// NOTE(review): the module argument of Intrinsic::getDeclaration and the
// initializer of ValWidth are not visible in this excerpt.
Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic(
IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
unsigned XLen = Subtarget.getXLen();
// The ordering is passed to the intrinsic as an XLen-wide integer constant.
Value *Ordering =
Builder.getIntN(XLen, static_cast<uint64_t>(AI->getOrdering()));
// The intrinsic is overloaded on the type of the aligned address.
Type *Tys[] = {AlignedAddr->getType()};
Function *LrwOpScwLoop = Intrinsic::getDeclaration(
getIntrinsicForMaskedAtomicRMWBinOp(XLen, AI->getOperation()), Tys);
if (XLen == 64) {
Incr = Builder.CreateSExt(Incr, Builder.getInt64Ty());
Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty());
ShiftAmt = Builder.CreateSExt(ShiftAmt, Builder.getInt64Ty());
Value *Result;
// Must pass the shift amount needed to sign extend the loaded value prior
// to performing a signed comparison for min/max. ShiftAmt is the number of
// bits to shift the value into position. Pass XLen-ShiftAmt-ValWidth, which
// is the number of bits to left+right shift the value in order to
// sign-extend.
if (AI->getOperation() == AtomicRMWInst::Min ||
AI->getOperation() == AtomicRMWInst::Max) {
const DataLayout &DL = AI->getModule()->getDataLayout();
unsigned ValWidth =
Value *SextShamt =
Builder.CreateSub(Builder.getIntN(XLen, XLen - ValWidth), ShiftAmt);
// Signed min/max take the extra sign-extension shift amount operand.
Result = Builder.CreateCall(LrwOpScwLoop,
{AlignedAddr, Incr, Mask, SextShamt, Ordering});
} else {
Result =
Builder.CreateCall(LrwOpScwLoop, {AlignedAddr, Incr, Mask, Ordering});
if (XLen == 64)
Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
return Result;
// Chooses the expansion strategy for the cmpxchg instruction CI based on the
// bit width of its compare operand: 8-bit and 16-bit operations use
// MaskedIntrinsic, all other widths use None.
// NOTE(review): the line carrying this function's name and return type is not
// visible in this excerpt -- presumably the cmpxchg counterpart of
// shouldExpandAtomicRMWInIR; confirm against the full file.
AtomicCmpXchgInst *CI) const {
unsigned Size = CI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
if (Size == 8 || Size == 16)
return AtomicExpansionKind::MaskedIntrinsic;
return AtomicExpansionKind::None;
// Emits a call to the masked cmpxchg intrinsic and returns its result.
//
// The i32 intrinsic is the default. On XLen 64 the compare value, new value
// and mask are sign-extended to i64, the i64 intrinsic is selected, and the
// call result is truncated to i32. The ordering Ord is passed to the intrinsic
// as an XLen-wide integer constant.
Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
IRBuilder<> &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
unsigned XLen = Subtarget.getXLen();
Value *Ordering = Builder.getIntN(XLen, static_cast<uint64_t>(Ord));
Intrinsic::ID CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i32;
if (XLen == 64) {
CmpVal = Builder.CreateSExt(CmpVal, Builder.getInt64Ty());
NewVal = Builder.CreateSExt(NewVal, Builder.getInt64Ty());
Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty());
CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i64;
// The intrinsic is overloaded on the type of the aligned address and is
// declared in the module that contains CI.
Type *Tys[] = {AlignedAddr->getType()};
Function *MaskedCmpXchg =
Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys);
Value *Result = Builder.CreateCall(
MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering});
if (XLen == 64)
Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
return Result;
// Returns the register used for the exception pointer. The answer is always
// RISCV::X10; PersonalityFn is not consulted.
Register RISCVTargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
return RISCV::X10;
// Returns the register used for the exception selector. The answer is always
// RISCV::X11; PersonalityFn is not consulted.
Register RISCVTargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
return RISCV::X11;
// Reports whether a libcall argument or return value of the given type should
// be extended. Only the combination of the LP64 ABI with f32 answers false;
// every other ABI/type combination answers true.
bool RISCVTargetLowering::shouldExtendTypeInLibCall(EVT Type) const {
// Return false to suppress the unnecessary extensions if the LibCall
// arguments or return value is f32 type for LP64 ABI.
RISCVABI::ABI ABI = Subtarget.getTargetABI();
if (ABI == RISCVABI::ABI_LP64 && (Type == MVT::f32))
return false;
return true;
// Reports whether a multiplication of type VT by the constant C should be
// decomposed.
//
// Only scalar integer types are considered, and never on riscv32 with the M
// extension. A constant qualifies when Imm + 1, Imm - 1, 1 - Imm or -1 - Imm
// is a power of two. Constants wider than 64 bits are rejected before the
// value is read. Context is not used in the visible code.
bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const {
// Check integral scalar types.
if (VT.isScalarInteger()) {
// Do not perform the transformation on riscv32 with the M extension.
if (!Subtarget.is64Bit() && Subtarget.hasStdExtM())
return false;
if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
// getSExtValue below needs the constant to fit in an int64_t.
if (ConstNode->getAPIntValue().getBitWidth() > 8 * sizeof(int64_t))
return false;
int64_t Imm = ConstNode->getSExtValue();
if (isPowerOf2_64(Imm + 1) || isPowerOf2_64(Imm - 1) ||
isPowerOf2_64(1 - Imm) || isPowerOf2_64(-1 - Imm))
return true;
return false;
#include ""
// Looks up a register by name and returns it.
//
// The alternative-name matcher is tried first and the plain register-name
// matcher second. The register is only returned when it is in the reserved
// set for MF or reserved by the user; otherwise a fatal error is reported.
// VT is not used in the visible code.
// NOTE(review): the return-type line and the call that reports the
// "Invalid register name" message are not visible in this excerpt.
RISCVTargetLowering::getRegisterByName(const char *RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = MatchRegisterAltName(RegName);
if (Reg == RISCV::NoRegister)
Reg = MatchRegisterName(RegName);
if (Reg == RISCV::NoRegister)
Twine("Invalid register name \"" + StringRef(RegName) + "\"."));
BitVector ReservedRegs = Subtarget.getRegisterInfo()->getReservedRegs(MF);
// Registers that are neither target-reserved nor user-reserved are refused.
if (!ReservedRegs.test(Reg) && !Subtarget.isRegisterReservedByUser(Reg))
report_fatal_error(Twine("Trying to obtain non-reserved register \"" +
StringRef(RegName) + "\"."));
return Reg;
diff --git a/llvm/lib/Target/RISCV/ b/llvm/lib/Target/RISCV/
index 34a463626e29..afac509f743d 100644
--- a/llvm/lib/Target/RISCV/
+++ b/llvm/lib/Target/RISCV/
@@ -1,634 +1,1063 @@
//===-- RISCVInstrInfoB.td - RISC-V 'B' instructions -------*- tablegen -*-===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file describes the RISC-V instructions from the standard 'B' Bitmanip
// extension, version 0.92.
// This version is still experimental as the 'B' extension hasn't been
// ratified yet.
// Operand definitions.
// Assembler operand class named "UImmLog2XLenHalf". It is rendered with
// addImmOperands and uses the InvalidUImmLog2XLenHalf diagnostic type.
def UImmLog2XLenHalfAsmOperand : AsmOperandClass {
let Name = "UImmLog2XLenHalf";
let RenderMethod = "addImmOperands";
let DiagnosticType = "InvalidUImmLog2XLenHalf";
// Immediate operand used by the RVBShfl_ri instructions. The accepted range is
// an unsigned 5-bit value on 64-bit targets and an unsigned 4-bit value
// otherwise. The same rule is written twice: once as the ImmLeaf predicate
// (keyed on the subtarget) and once as the MCOperandPredicate (keyed on the
// target triple). It is parsed as UImmLog2XLenHalfAsmOperand and decoded with
// decodeUImmOperand<5>.
def shfl_uimm : Operand<XLenVT>, ImmLeaf<XLenVT, [{
if (Subtarget->is64Bit())
return isUInt<5>(Imm);
return isUInt<4>(Imm);
}]> {
let ParserMatchClass = UImmLog2XLenHalfAsmOperand;
let DecoderMethod = "decodeUImmOperand<5>";
let MCOperandPredicate = [{
int64_t Imm;
if (!MCOp.evaluateAsConstantImm(Imm))
return false;
if (STI.getTargetTriple().isArch64Bit())
return isUInt<5>(Imm);
return isUInt<4>(Imm);
// Instruction class templates
// Some of these templates should be moved to RISCVInstrFormats.td once the B
// extension has been ratified.
// Format for instructions with one source register: built on RVInstR with
// $rd as the output and $rs1 as the only input. Bits 24-20 are set from
// funct5. Declared with no side effects, loads or stores.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBUnary<bits<7> funct7, bits<5> funct5, bits<3> funct3,
RISCVOpcode opcode, string opcodestr>
: RVInstR<funct7, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1),
opcodestr, "$rd, $rs1"> {
let Inst{24-20} = funct5;
// Format for register/immediate instructions on the OPC_OP_IMM_32 opcode:
// built on RVInstI with $rd as the output and $rs1 plus a simm12 immediate as
// inputs. Declared with no side effects, loads or stores.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBALUW_ri<bits<3> funct3, string opcodestr>
: RVInstI<funct3, OPC_OP_IMM_32, (outs GPR:$rd),
(ins GPR:$rs1, simm12:$imm12), opcodestr, "$rd, $rs1, $imm12">;
// Format for shift-style instructions with a uimmlog2xlen shift amount.
// Encoding: bits 31-27 = funct5, bit 26 = 0, bits 25-20 = the 6-bit shamt.
// Declared with no side effects, loads or stores.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBShift_ri<bits<5> funct5, bits<3> funct3, RISCVOpcode opcode,
string opcodestr>
: RVInstI<funct3, opcode, (outs GPR:$rd),
(ins GPR:$rs1, uimmlog2xlen:$shamt), opcodestr,
"$rd, $rs1, $shamt"> {
bits<6> shamt;
let Inst{31-27} = funct5;
// NOTE: the bit op(26)=1 is used to select funnel shifts. All other
// shift operations and operations that live in the encoding space
// of the shifts (single bit operations, grev, gorc) use op(26) = 0
let Inst{26} = 0;
let Inst{25-20} = shamt;
// Format for shift-style instructions with a uimm5 shift amount.
// Encoding: bits 31-25 = funct7, bits 24-20 = the 5-bit shamt.
// Declared with no side effects, loads or stores.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBShiftW_ri<bits<7> funct7, bits<3> funct3, RISCVOpcode opcode,
string opcodestr>
: RVInstI<funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, uimm5:$shamt),
opcodestr, "$rd, $rs1, $shamt"> {
bits<5> shamt;
let Inst{31-25} = funct7;
let Inst{24-20} = shamt;
// Format for instructions whose immediate is a shfl_uimm operand.
// Encoding: bits 31-26 = funct6, bits 25-20 = the 6-bit shamt field.
// Declared with no side effects, loads or stores.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBShfl_ri<bits<6> funct6, bits<3> funct3, RISCVOpcode opcode,
string opcodestr>
: RVInstI<funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, shfl_uimm:$shamt),
opcodestr, "$rd, $rs1, $shamt"> {
bits<6> shamt;
let Inst{31-26} = funct6;
let Inst{25-20} = shamt;
// Format for instructions with three source registers: built on RVInstR4 with
// $rd as the output and $rs1, $rs2, $rs3 as inputs. Bits 14-12 are set from
// funct3_b; the assembly operand order is supplied by the caller via argstr.
// Declared with no side effects, loads or stores.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBTernaryR<bits<2> funct2, bits<3> funct3_b, RISCVOpcode opcode,
string opcodestr, string argstr>
: RVInstR4<funct2, opcode, (outs GPR:$rd),
(ins GPR:$rs1, GPR:$rs2, GPR:$rs3), opcodestr, argstr> {
let Inst{14-12} = funct3_b;
// Currently used by FSRI only
// Format with two source registers ($rs1, $rs3) and a uimmlog2xlen shift
// amount. Encoding: bits 25-20 = the 6-bit shamt, bits 14-12 = funct3_b.
// Declared with no side effects, loads or stores.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBTernaryImm6<bits<3> funct3_b, RISCVOpcode opcode,
string opcodestr, string argstr>
: RVInstR4<0b10, opcode, (outs GPR:$rd),
(ins GPR:$rs1, GPR:$rs3, uimmlog2xlen:$shamt),
opcodestr, argstr> {
bits<6> shamt;
// NOTE: the first argument of RVInstR4 is hardcoded to 0b10 like the other
// funnel shift instructions. The second bit of the argument though is
// overwritten by the shamt as the encoding of this particular instruction
// requires. This is to obtain op(26) = 1 as required by funnel shift
// instructions without the need of a confusing argument in the definition
// of the instruction.
let Inst{25-20} = shamt;
let Inst{14-12} = funct3_b;
// Currently used by FSRIW only
// Format with two source registers ($rs1, $rs3) and a uimm5 shift amount.
// Encoding: bits 24-20 = the 5-bit shamt, bits 14-12 = funct3_b; funct2 is
// passed through to RVInstR4. Declared with no side effects, loads or stores.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBTernaryImm5<bits<2> funct2, bits<3> funct3_b, RISCVOpcode opcode,
string opcodestr, string argstr>
: RVInstR4<funct2, opcode, (outs GPR:$rd),
(ins GPR:$rs1, GPR:$rs3, uimm5:$shamt), opcodestr, argstr> {
bits<5> shamt;
let Inst{24-20} = shamt;
let Inst{14-12} = funct3_b;
// Instructions
// Register-register instructions defined through ALU_rr. Each group is gated
// by the predicate list on its `let`; every def uses an empty scheduling list.

// andn / orn / xnor: available with Zbb or Zbp.
let Predicates = [HasStdExtZbbOrZbp] in {
def ANDN : ALU_rr<0b0100000, 0b111, "andn">, Sched<[]>;
def ORN : ALU_rr<0b0100000, 0b110, "orn">, Sched<[]>;
def XNOR : ALU_rr<0b0100000, 0b100, "xnor">, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp]
// slo / sro: Zbb only.
let Predicates = [HasStdExtZbb] in {
def SLO : ALU_rr<0b0010000, 0b001, "slo">, Sched<[]>;
def SRO : ALU_rr<0b0010000, 0b101, "sro">, Sched<[]>;
} // Predicates = [HasStdExtZbb]
// rol / ror: available with Zbb or Zbp.
let Predicates = [HasStdExtZbbOrZbp] in {
def ROL : ALU_rr<0b0110000, 0b001, "rol">, Sched<[]>;
def ROR : ALU_rr<0b0110000, 0b101, "ror">, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp]
// sbclr / sbset / sbinv / sbext: Zbs.
let Predicates = [HasStdExtZbs] in {
def SBCLR : ALU_rr<0b0100100, 0b001, "sbclr">, Sched<[]>;
def SBSET : ALU_rr<0b0010100, 0b001, "sbset">, Sched<[]>;
def SBINV : ALU_rr<0b0110100, 0b001, "sbinv">, Sched<[]>;
def SBEXT : ALU_rr<0b0100100, 0b101, "sbext">, Sched<[]>;
} // Predicates = [HasStdExtZbs]
// gorc / grev: Zbp.
let Predicates = [HasStdExtZbp] in {
def GORC : ALU_rr<0b0010100, 0b101, "gorc">, Sched<[]>;
def GREV : ALU_rr<0b0110100, 0b101, "grev">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
// Immediate forms defined through RVBShift_ri on the OPC_OP_IMM opcode. They
// reuse the funct3 values of the register-register defs with the same base
// mnemonic.

// sloi / sroi: Zbb.
let Predicates = [HasStdExtZbb] in {
def SLOI : RVBShift_ri<0b00100, 0b001, OPC_OP_IMM, "sloi">, Sched<[]>;
def SROI : RVBShift_ri<0b00100, 0b101, OPC_OP_IMM, "sroi">, Sched<[]>;
} // Predicates = [HasStdExtZbb]
// rori: available with Zbb or Zbp.
let Predicates = [HasStdExtZbbOrZbp] in
def RORI : RVBShift_ri<0b01100, 0b101, OPC_OP_IMM, "rori">, Sched<[]>;
// sbclri / sbseti / sbinvi / sbexti: Zbs.
let Predicates = [HasStdExtZbs] in {
def SBCLRI : RVBShift_ri<0b01001, 0b001, OPC_OP_IMM, "sbclri">, Sched<[]>;
def SBSETI : RVBShift_ri<0b00101, 0b001, OPC_OP_IMM, "sbseti">, Sched<[]>;
def SBINVI : RVBShift_ri<0b01101, 0b001, OPC_OP_IMM, "sbinvi">, Sched<[]>;
def SBEXTI : RVBShift_ri<0b01001, 0b101, OPC_OP_IMM, "sbexti">, Sched<[]>;
} // Predicates = [HasStdExtZbs]
// grevi / gorci: Zbp.
let Predicates = [HasStdExtZbp] in {
def GREVI : RVBShift_ri<0b01101, 0b101, OPC_OP_IMM, "grevi">, Sched<[]>;
def GORCI : RVBShift_ri<0b00101, 0b101, OPC_OP_IMM, "gorci">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
// Three-operand Zbt instructions. cmix and cmov print their operands as
// "$rd, $rs2, $rs1, $rs3", while fsl and fsr use "$rd, $rs1, $rs3, $rs2".
// fsri is the immediate form, built on RVBTernaryImm6.
// NOTE(review): the comma-terminated defs continue on lines that are not
// visible in this excerpt (presumably `Sched<[]>;` as on FSRI).
let Predicates = [HasStdExtZbt] in {
def CMIX : RVBTernaryR<0b11, 0b001, OPC_OP, "cmix", "$rd, $rs2, $rs1, $rs3">,
def CMOV : RVBTernaryR<0b11, 0b101, OPC_OP, "cmov", "$rd, $rs2, $rs1, $rs3">,
def FSL : RVBTernaryR<0b10, 0b001, OPC_OP, "fsl", "$rd, $rs1, $rs3, $rs2">,
def FSR : RVBTernaryR<0b10, 0b101, OPC_OP, "fsr", "$rd, $rs1, $rs3, $rs2">,
def FSRI : RVBTernaryImm6<0b101, OPC_OP_IMM, "fsri",
"$rd, $rs1, $rs3, $shamt">, Sched<[]>;
} // Predicates = [HasStdExtZbt]
// Single-source instructions defined through RVBUnary. All of them share
// funct7 0b0110000, funct3 0b001 and opcode 0b0010011; the funct5 argument
// tells them apart.
// NOTE(review): the comma-terminated defs continue on lines that are not
// visible in this excerpt (presumably `Sched<[]>;`).

// clz / ctz / pcnt: Zbb.
let Predicates = [HasStdExtZbb] in {
def CLZ : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0010011>, "clz">,
def CTZ : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0010011>, "ctz">,
def PCNT : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0010011>, "pcnt">,
} // Predicates = [HasStdExtZbb]
// bmatflip: Zbm, RV64 only.
let Predicates = [HasStdExtZbm, IsRV64] in
def BMATFLIP : RVBUnary<0b0110000, 0b00011, 0b001, RISCVOpcode<0b0010011>,
"bmatflip">, Sched<[]>;
// sext.b / sext.h: Zbb.
let Predicates = [HasStdExtZbb] in {
def SEXTB : RVBUnary<0b0110000, 0b00100, 0b001, RISCVOpcode<0b0010011>,
"sext.b">, Sched<[]>;
def SEXTH : RVBUnary<0b0110000, 0b00101, 0b001, RISCVOpcode<0b0010011>,
"sext.h">, Sched<[]>;
} // Predicates = [HasStdExtZbb]
// crc32.{b,h,w}: Zbr.
let Predicates = [HasStdExtZbr] in {
def CRC32B : RVBUnary<0b0110000, 0b10000, 0b001, RISCVOpcode<0b0010011>,
"crc32.b">, Sched<[]>;
def CRC32H : RVBUnary<0b0110000, 0b10001, 0b001, RISCVOpcode<0b0010011>,
"crc32.h">, Sched<[]>;
def CRC32W : RVBUnary<0b0110000, 0b10010, 0b001, RISCVOpcode<0b0010011>,
"crc32.w">, Sched<[]>;
} // Predicates = [HasStdExtZbr]
// crc32.d: Zbr, RV64 only.
let Predicates = [HasStdExtZbr, IsRV64] in
def CRC32D : RVBUnary<0b0110000, 0b10011, 0b001, RISCVOpcode<0b0010011>,
"crc32.d">, Sched<[]>;
// crc32c.{b,h,w}: Zbr.
let Predicates = [HasStdExtZbr] in {
def CRC32CB : RVBUnary<0b0110000, 0b11000, 0b001, RISCVOpcode<0b0010011>,
"crc32c.b">, Sched<[]>;
def CRC32CH : RVBUnary<0b0110000, 0b11001, 0b001, RISCVOpcode<0b0010011>,
"crc32c.h">, Sched<[]>;
def CRC32CW : RVBUnary<0b0110000, 0b11010, 0b001, RISCVOpcode<0b0010011>,
"crc32c.w">, Sched<[]>;
} // Predicates = [HasStdExtZbr]
// crc32c.d: Zbr, RV64 only.
let Predicates = [HasStdExtZbr, IsRV64] in
def CRC32CD : RVBUnary<0b0110000, 0b11011, 0b001, RISCVOpcode<0b0010011>,
"crc32c.d">, Sched<[]>;
// Further register-register instructions defined through ALU_rr, followed by
// the shfli/unshfli immediate forms. Each group is gated by the predicate list
// on its `let`.

// clmul / clmulr / clmulh: Zbc.
let Predicates = [HasStdExtZbc] in {
def CLMUL : ALU_rr<0b0000101, 0b001, "clmul">, Sched<[]>;
def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr">, Sched<[]>;
def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh">, Sched<[]>;
} // Predicates = [HasStdExtZbc]
// min / max / minu / maxu: Zbb.
let Predicates = [HasStdExtZbb] in {
def MIN : ALU_rr<0b0000101, 0b100, "min">, Sched<[]>;
def MAX : ALU_rr<0b0000101, 0b101, "max">, Sched<[]>;
def MINU : ALU_rr<0b0000101, 0b110, "minu">, Sched<[]>;
def MAXU : ALU_rr<0b0000101, 0b111, "maxu">, Sched<[]>;
} // Predicates = [HasStdExtZbb]
// shfl / unshfl: Zbp.
let Predicates = [HasStdExtZbp] in {
def SHFL : ALU_rr<0b0000100, 0b001, "shfl">, Sched<[]>;
def UNSHFL : ALU_rr<0b0000100, 0b101, "unshfl">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
// bdep / bext: Zbe.
let Predicates = [HasStdExtZbe] in {
def BDEP : ALU_rr<0b0100100, 0b110, "bdep">, Sched<[]>;
def BEXT : ALU_rr<0b0000100, 0b110, "bext">, Sched<[]>;
} // Predicates = [HasStdExtZbe]
// pack / packu: available with Zbb or Zbp.
let Predicates = [HasStdExtZbbOrZbp] in {
def PACK : ALU_rr<0b0000100, 0b100, "pack">, Sched<[]>;
def PACKU : ALU_rr<0b0100100, 0b100, "packu">, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp]
// bmator / bmatxor: Zbm, RV64 only.
let Predicates = [HasStdExtZbm, IsRV64] in {
def BMATOR : ALU_rr<0b0000100, 0b011, "bmator">, Sched<[]>;
def BMATXOR : ALU_rr<0b0100100, 0b011, "bmatxor">, Sched<[]>;
} // Predicates = [HasStdExtZbm, IsRV64]
// packh: available with Zbb or Zbp.
let Predicates = [HasStdExtZbbOrZbp] in
def PACKH : ALU_rr<0b0000100, 0b111, "packh">, Sched<[]>;
// bfp: Zbf.
let Predicates = [HasStdExtZbf] in
def BFP : ALU_rr<0b0100100, 0b111, "bfp">, Sched<[]>;
// shfli / unshfli: Zbp, immediate forms built on RVBShfl_ri.
let Predicates = [HasStdExtZbp] in {
def SHFLI : RVBShfl_ri<0b000010, 0b001, OPC_OP_IMM, "shfli">, Sched<[]>;
def UNSHFLI : RVBShfl_ri<0b000010, 0b101, OPC_OP_IMM, "unshfli">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
// RV64-only instructions (every group below also requires IsRV64).
// Register-register forms use ALUW_rr; immediate forms use RVBALUW_ri,
// RVBShift_ri or RVBShiftW_ri on the OPC_OP_IMM_32 opcode.
// NOTE(review): the comma-terminated defs continue on lines that are not
// visible in this excerpt (presumably `Sched<[]>;`).

// addiwu, slliu.w, addwu, subwu, addu.w, subu.w: Zbb.
let Predicates = [HasStdExtZbb, IsRV64] in {
def ADDIWU : RVBALUW_ri<0b100, "addiwu">, Sched<[]>;
def SLLIUW : RVBShift_ri<0b00001, 0b001, OPC_OP_IMM_32, "slliu.w">, Sched<[]>;
def ADDWU : ALUW_rr<0b0000101, 0b000, "addwu">, Sched<[]>;
def SUBWU : ALUW_rr<0b0100101, 0b000, "subwu">, Sched<[]>;
def ADDUW : ALUW_rr<0b0000100, 0b000, "addu.w">, Sched<[]>;
def SUBUW : ALUW_rr<0b0100100, 0b000, "subu.w">, Sched<[]>;
} // Predicates = [HasStdExtZbb, IsRV64]
// slow / srow: Zbb.
let Predicates = [HasStdExtZbb, IsRV64] in {
def SLOW : ALUW_rr<0b0010000, 0b001, "slow">, Sched<[]>;
def SROW : ALUW_rr<0b0010000, 0b101, "srow">, Sched<[]>;
} // Predicates = [HasStdExtZbb, IsRV64]
// rolw / rorw: Zbb or Zbp.
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
def ROLW : ALUW_rr<0b0110000, 0b001, "rolw">, Sched<[]>;
def RORW : ALUW_rr<0b0110000, 0b101, "rorw">, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
// sbclrw / sbsetw / sbinvw / sbextw: Zbs.
let Predicates = [HasStdExtZbs, IsRV64] in {
def SBCLRW : ALUW_rr<0b0100100, 0b001, "sbclrw">, Sched<[]>;
def SBSETW : ALUW_rr<0b0010100, 0b001, "sbsetw">, Sched<[]>;
def SBINVW : ALUW_rr<0b0110100, 0b001, "sbinvw">, Sched<[]>;
def SBEXTW : ALUW_rr<0b0100100, 0b101, "sbextw">, Sched<[]>;
} // Predicates = [HasStdExtZbs, IsRV64]
// gorcw / grevw: Zbp.
let Predicates = [HasStdExtZbp, IsRV64] in {
def GORCW : ALUW_rr<0b0010100, 0b101, "gorcw">, Sched<[]>;
def GREVW : ALUW_rr<0b0110100, 0b101, "grevw">, Sched<[]>;
} // Predicates = [HasStdExtZbp, IsRV64]
// sloiw / sroiw: Zbb.
let Predicates = [HasStdExtZbb, IsRV64] in {
def SLOIW : RVBShiftW_ri<0b0010000, 0b001, OPC_OP_IMM_32, "sloiw">, Sched<[]>;
def SROIW : RVBShiftW_ri<0b0010000, 0b101, OPC_OP_IMM_32, "sroiw">, Sched<[]>;
} // Predicates = [HasStdExtZbb, IsRV64]
// roriw: Zbb or Zbp.
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">, Sched<[]>;
// sbclriw / sbsetiw / sbinviw: Zbs.
let Predicates = [HasStdExtZbs, IsRV64] in {
def SBCLRIW : RVBShiftW_ri<0b0100100, 0b001, OPC_OP_IMM_32, "sbclriw">,
def SBSETIW : RVBShiftW_ri<0b0010100, 0b001, OPC_OP_IMM_32, "sbsetiw">,
def SBINVIW : RVBShiftW_ri<0b0110100, 0b001, OPC_OP_IMM_32, "sbinviw">,
} // Predicates = [HasStdExtZbs, IsRV64]
// gorciw / greviw: Zbp.
let Predicates = [HasStdExtZbp, IsRV64] in {
def GORCIW : RVBShiftW_ri<0b0010100, 0b101, OPC_OP_IMM_32, "gorciw">, Sched<[]>;
def GREVIW : RVBShiftW_ri<0b0110100, 0b101, OPC_OP_IMM_32, "greviw">, Sched<[]>;
} // Predicates = [HasStdExtZbp, IsRV64]
// Remaining RV64-only instructions (every group below also requires IsRV64).

// fslw / fsrw / fsriw: Zbt. fsriw is the immediate form, built on
// RVBTernaryImm5.
let Predicates = [HasStdExtZbt, IsRV64] in {
def FSLW : RVBTernaryR<0b10, 0b001, OPC_OP_32,
"fslw", "$rd, $rs1, $rs3, $rs2">, Sched<[]>;
def FSRW : RVBTernaryR<0b10, 0b101, OPC_OP_32, "fsrw",
"$rd, $rs1, $rs3, $rs2">, Sched<[]>;
def FSRIW : RVBTernaryImm5<0b10, 0b101, OPC_OP_IMM_32,
"fsriw", "$rd, $rs1, $rs3, $shamt">, Sched<[]>;
} // Predicates = [HasStdExtZbt, IsRV64]
// clzw / ctzw / pcntw: Zbb. Same funct7/funct5/funct3 as clz/ctz/pcnt but on
// opcode 0b0011011.
let Predicates = [HasStdExtZbb, IsRV64] in {
def CLZW : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0011011>,
"clzw">, Sched<[]>;
def CTZW : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0011011>,
"ctzw">, Sched<[]>;
def PCNTW : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0011011>,
"pcntw">, Sched<[]>;
} // Predicates = [HasStdExtZbb, IsRV64]
// clmulw / clmulrw / clmulhw: Zbc.
let Predicates = [HasStdExtZbc, IsRV64] in {
def CLMULW : ALUW_rr<0b0000101, 0b001, "clmulw">, Sched<[]>;
def CLMULRW : ALUW_rr<0b0000101, 0b010, "clmulrw">, Sched<[]>;
def CLMULHW : ALUW_rr<0b0000101, 0b011, "clmulhw">, Sched<[]>;
} // Predicates = [HasStdExtZbc, IsRV64]
// shflw / unshflw: Zbp.
let Predicates = [HasStdExtZbp, IsRV64] in {
def SHFLW : ALUW_rr<0b0000100, 0b001, "shflw">, Sched<[]>;
def UNSHFLW : ALUW_rr<0b0000100, 0b101, "unshflw">, Sched<[]>;
} // Predicates = [HasStdExtZbp, IsRV64]
// bdepw / bextw: Zbe.
let Predicates = [HasStdExtZbe, IsRV64] in {
def BDEPW : ALUW_rr<0b0100100, 0b110, "bdepw">, Sched<[]>;
def BEXTW : ALUW_rr<0b0000100, 0b110, "bextw">, Sched<[]>;
} // Predicates = [HasStdExtZbe, IsRV64]
// packw / packuw: Zbb or Zbp.
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
def PACKW : ALUW_rr<0b0000100, 0b100, "packw">, Sched<[]>;
def PACKUW : ALUW_rr<0b0100100, 0b100, "packuw">, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
// bfpw: Zbf.
let Predicates = [HasStdExtZbf, IsRV64] in
def BFPW : ALUW_rr<0b0100100, 0b111, "bfpw">, Sched<[]>;
// Future compressed instructions
// The presence of these instructions in the B extension is purely experimental
// and they should be moved to the C extension as soon as they are ratified.
// 16-bit instruction format with a single GPRC register that is both read and
// written: the input $rs is tied to the output $rs_wb.
// Encoding: bits 15-12 = 0b0110, bits 11-10 = funct2, bits 9-7 = the 3-bit rs
// field, bits 6-0 = 0b0000001. Declared with no side effects, loads or stores.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class RVBInstC<bits<2> funct2, string opcodestr>
: RVInst16<(outs GPRC:$rs_wb), (ins GPRC:$rs), opcodestr, "$rs", [],
InstFormatCR> {
bits<3> rs;
let Constraints = "$rs = $rs_wb";
let Inst{15-12} = 0b0110;
let Inst{11-10} = funct2;
let Inst{9-7} = rs;
let Inst{6-0} = 0b0000001;
// The namespace RVBC exists to avoid encoding conflicts with the compressed
// instructions c.addi16sp and c.lui already implemented in the C extension.
// c.not and c.neg need both Zbproposedc and C; c.zext.w additionally needs
// Zbb-or-Zbp and RV64. All three are built on RVBInstC and differ only in
// funct2.
let DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtC] in {
def C_NOT : RVBInstC<0b00, "c.not">, Sched<[]>;
def C_NEG : RVBInstC<0b01, "c.neg">, Sched<[]>;
} // DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtC]
let DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64] in
def C_ZEXTW : RVBInstC<0b10, "c.zext.w">, Sched<[]>;
// Pseudo Instructions
// zext.* assembler aliases (Zbb). zext.b is ANDI with 0xFF on both RV32 and
// RV64. zext.h is PACK with X0 on RV32 but PACKW with X0 on RV64, where PACK
// with X0 is used for zext.w instead.
let Predicates = [HasStdExtZbb, IsRV32] in {
def : InstAlias<"zext.b $rd, $rs", (ANDI GPR:$rd, GPR:$rs, 0xFF)>;
def : InstAlias<"zext.h $rd, $rs", (PACK GPR:$rd, GPR:$rs, X0)>;
} // Predicates = [HasStdExtZbb, IsRV32]
let Predicates = [HasStdExtZbb, IsRV64] in {
def : InstAlias<"zext.b $rd, $rs", (ANDI GPR:$rd, GPR:$rs, 0xFF)>;
def : InstAlias<"zext.h $rd, $rs", (PACKW GPR:$rd, GPR:$rs, X0)>;
def : InstAlias<"zext.w $rd, $rs", (PACK GPR:$rd, GPR:$rs, X0)>;
} // Predicates = [HasStdExtZbb, IsRV64]
// Assembler aliases that expand to GREVI (rev*), SHFLI/UNSHFLI (zip*/unzip*)
// and GORCI (orc*) with a fixed immediate. The rev* and orc* aliases with the
// same suffix share the same immediate value; each zip*/unzip* pair does too.
// NOTE(review): every def below is comma-terminated and continues on a line
// that is not visible in this excerpt (presumably `Sched<[]>;`).
let Predicates = [HasStdExtZbbOrZbp] in {
// rev*: GREVI with a 5-bit immediate.
def : InstAlias<"rev.p $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00001)>,
def : InstAlias<"rev2.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00010)>,
def : InstAlias<"rev.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00011)>,
def : InstAlias<"rev4.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00100)>,
def : InstAlias<"rev2.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00110)>,
def : InstAlias<"rev.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00111)>,
def : InstAlias<"rev8.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01000)>,
def : InstAlias<"rev4.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01100)>,
def : InstAlias<"rev2.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01110)>,
def : InstAlias<"rev.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01111)>,
// zip*/unzip*: SHFLI/UNSHFLI with a 4-bit immediate.
def : InstAlias<"zip.n $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0001)>,
def : InstAlias<"unzip.n $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0001)>,
def : InstAlias<"zip2.b $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0010)>,
def : InstAlias<"unzip2.b $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0010)>,
def : InstAlias<"zip.b $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0011)>,
def : InstAlias<"unzip.b $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0011)>,
def : InstAlias<"zip4.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0100)>,
def : InstAlias<"unzip4.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0100)>,
def : InstAlias<"zip2.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0110)>,
def : InstAlias<"unzip2.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0110)>,
def : InstAlias<"zip.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0111)>,
def : InstAlias<"unzip.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0111)>,
// orc*: GORCI with a 5-bit immediate.
def : InstAlias<"orc.p $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00001)>,
def : InstAlias<"orc2.n $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00010)>,
def : InstAlias<"orc.n $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00011)>,
def : InstAlias<"orc4.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00100)>,
def : InstAlias<"orc2.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00110)>,
def : InstAlias<"orc.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00111)>,
def : InstAlias<"orc8.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01000)>,
def : InstAlias<"orc4.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01100)>,
def : InstAlias<"orc2.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01110)>,
def : InstAlias<"orc.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01111)>,
} // Predicates = [HasStdExtZbbOrZbp]
// RV32-only pseudo-mnemonics for GREVI / SHFLI / UNSHFLI / GORCI. The control
// immediates here are 5 bits wide for GREVI/GORCI and 4 bits wide for
// SHFLI/UNSHFLI. All aliases use an empty scheduling list (Sched<[]>).
let Predicates = [HasStdExtZbbOrZbp, IsRV32] in {
def : InstAlias<"rev16 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b10000)>, Sched<[]>;
def : InstAlias<"rev8 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11000)>, Sched<[]>;
def : InstAlias<"rev4 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11100)>, Sched<[]>;
def : InstAlias<"rev2 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11110)>, Sched<[]>;
def : InstAlias<"rev $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11111)>, Sched<[]>;
def : InstAlias<"zip8 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1000)>, Sched<[]>;
def : InstAlias<"unzip8 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1000)>, Sched<[]>;
def : InstAlias<"zip4 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1100)>, Sched<[]>;
def : InstAlias<"unzip4 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1100)>, Sched<[]>;
def : InstAlias<"zip2 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1110)>, Sched<[]>;
def : InstAlias<"unzip2 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1110)>, Sched<[]>;
def : InstAlias<"zip $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1111)>, Sched<[]>;
def : InstAlias<"unzip $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1111)>, Sched<[]>;
def : InstAlias<"orc16 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b10000)>, Sched<[]>;
def : InstAlias<"orc8 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11000)>, Sched<[]>;
def : InstAlias<"orc4 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11100)>, Sched<[]>;
def : InstAlias<"orc2 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11110)>, Sched<[]>;
def : InstAlias<"orc $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11111)>, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV32]
// RV64-only pseudo-mnemonics for GREVI / SHFLI / UNSHFLI / GORCI. The ".w"
// names and the unsuffixed names (rev8, zip4, orc16, ...) are given the
// 6-bit (GREVI/GORCI) or 5-bit (SHFLI/UNSHFLI) control immediates listed below.
// All aliases use an empty scheduling list (Sched<[]>), as the RV32 group does.
let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
def : InstAlias<"rev16.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b010000)>, Sched<[]>;
def : InstAlias<"rev8.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011000)>, Sched<[]>;
def : InstAlias<"rev4.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011100)>, Sched<[]>;
def : InstAlias<"rev2.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011110)>, Sched<[]>;
def : InstAlias<"rev.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011111)>, Sched<[]>;
def : InstAlias<"rev32 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b100000)>, Sched<[]>;
def : InstAlias<"rev16 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b110000)>, Sched<[]>;
def : InstAlias<"rev8 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111000)>, Sched<[]>;
def : InstAlias<"rev4 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111100)>, Sched<[]>;
def : InstAlias<"rev2 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111110)>, Sched<[]>;
def : InstAlias<"rev $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111111)>, Sched<[]>;
def : InstAlias<"zip8.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01000)>, Sched<[]>;
def : InstAlias<"unzip8.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01000)>, Sched<[]>;
def : InstAlias<"zip4.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01100)>, Sched<[]>;
def : InstAlias<"unzip4.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01100)>, Sched<[]>;
def : InstAlias<"zip2.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01110)>, Sched<[]>;
def : InstAlias<"unzip2.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01110)>, Sched<[]>;
def : InstAlias<"zip.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01111)>, Sched<[]>;
def : InstAlias<"unzip.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01111)>, Sched<[]>;
def : InstAlias<"zip16 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b10000)>, Sched<[]>;
def : InstAlias<"unzip16 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b10000)>, Sched<[]>;
def : InstAlias<"zip8 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11000)>, Sched<[]>;
def : InstAlias<"unzip8 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11000)>, Sched<[]>;
def : InstAlias<"zip4 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11100)>, Sched<[]>;
def : InstAlias<"unzip4 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11100)>, Sched<[]>;
def : InstAlias<"zip2 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11110)>, Sched<[]>;
def : InstAlias<"unzip2 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11110)>, Sched<[]>;
def : InstAlias<"zip $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11111)>, Sched<[]>;
def : InstAlias<"unzip $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11111)>, Sched<[]>;
def : InstAlias<"orc16.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b010000)>, Sched<[]>;
def : InstAlias<"orc8.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011000)>, Sched<[]>;
def : InstAlias<"orc4.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011100)>, Sched<[]>;
def : InstAlias<"orc2.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011110)>, Sched<[]>;
def : InstAlias<"orc.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011111)>, Sched<[]>;
def : InstAlias<"orc32 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b100000)>, Sched<[]>;
def : InstAlias<"orc16 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b110000)>, Sched<[]>;
def : InstAlias<"orc8 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111000)>, Sched<[]>;
def : InstAlias<"orc4 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111100)>, Sched<[]>;
def : InstAlias<"orc2 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111110)>, Sched<[]>;
def : InstAlias<"orc $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111111)>, Sched<[]>;
} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
// Compressed Instruction patterns
// Each CompressPat pairs an uncompressed instruction form (first dag) with the
// compressed instruction that replaces it (second dag). All three require the
// source and destination to be the same GPRC register:
//   XORI rs1, rs1, -1 -> C_NOT;  SUB rs1, X0, rs1 -> C_NEG;
//   PACK rs1, rs1, X0 -> C_ZEXTW (RV64 only).
let Predicates = [HasStdExtZbproposedc, HasStdExtC] in {
def : CompressPat<(XORI GPRC:$rs1, GPRC:$rs1, -1),
(C_NOT GPRC:$rs1)>;
def : CompressPat<(SUB GPRC:$rs1, X0, GPRC:$rs1),
(C_NEG GPRC:$rs1)>;
} // Predicates = [HasStdExtZbproposedc, HasStdExtC]
let Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64] in {
def : CompressPat<(PACK GPRC:$rs1, GPRC:$rs1, X0),
(C_ZEXTW GPRC:$rs1)>;
} // Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64]
+// Codegen patterns. ComplexPattern<type, operand count, C++ selector name, root nodes>.
+def SLOIPat : ComplexPattern<XLenVT, 2, "SelectSLOI", [or]>; // XLenVT, rooted at 'or'
+def SROIPat : ComplexPattern<XLenVT, 2, "SelectSROI", [or]>; // XLenVT, rooted at 'or'
+def RORIPat : ComplexPattern<XLenVT, 2, "SelectRORI", [rotl]>; // XLenVT, rooted at 'rotl'
+def SLLIUWPat : ComplexPattern<i64, 2, "SelectSLLIUW", [and]>; // i64 only, rooted at 'and'
+def SLOIWPat : ComplexPattern<i64, 2, "SelectSLOIW", [sext_inreg]>; // i64 only
+def SROIWPat : ComplexPattern<i64, 2, "SelectSROIW", [or]>; // i64 only
+def RORIWPat : ComplexPattern<i64, 2, "SelectRORIW", [sext_inreg]>; // i64 only
+def FSRIWPat : ComplexPattern<i64, 3, "SelectFSRIW", [sext_inreg]>; // i64 only, three operands
+let Predicates = [HasStdExtZbbOrZbp] in { // Logic ops whose second operand is inverted.
+def : Pat<(and GPR:$rs1, (not GPR:$rs2)), (ANDN GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or GPR:$rs1, (not GPR:$rs2)), (ORN GPR:$rs1, GPR:$rs2)>;
+def : Pat<(xor GPR:$rs1, (not GPR:$rs2)), (XNOR GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbbOrZbp]
+let Predicates = [HasStdExtZbb] in { // SLO/SRO: invert rs1, shift by rs2, invert again.
+def : Pat<(xor (shl (xor GPR:$rs1, -1), GPR:$rs2), -1),
+ (SLO GPR:$rs1, GPR:$rs2)>;
+def : Pat<(xor (srl (xor GPR:$rs1, -1), GPR:$rs2), -1),
+ (SRO GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbb]
+let Predicates = [HasStdExtZbbOrZbp] in { // Rotates, incl. fshl/fshr with both inputs equal.
+def : Pat<(rotl GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>;
+def : Pat<(fshl GPR:$rs1, GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>;
+def : Pat<(rotr GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>;
+def : Pat<(fshr GPR:$rs1, GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbbOrZbp]
+let Predicates = [HasStdExtZbs, IsRV32] in // SBCLR: rs1 & ~(1 << (rs2 & 31)).
+def : Pat<(and (xor (shl 1, (and GPR:$rs2, 31)), -1), GPR:$rs1),
+ (SBCLR GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbs, IsRV64] in // SBCLR on RV64: bit index masked with 63.
+def : Pat<(and (xor (shl 1, (and GPR:$rs2, 63)), -1), GPR:$rs1),
+ (SBCLR GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbs] in // SBCLR with the mask expressed as rotl(-2, rs2).
+def : Pat<(and (rotl -2, GPR:$rs2), GPR:$rs1), (SBCLR GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbs, IsRV32] in // SBSET: rs1 | (1 << (rs2 & 31)).
+def : Pat<(or (shl 1, (and GPR:$rs2, 31)), GPR:$rs1),
+ (SBSET GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbs, IsRV64] in // SBSET on RV64: bit index masked with 63.
+def : Pat<(or (shl 1, (and GPR:$rs2, 63)), GPR:$rs1),
+ (SBSET GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbs, IsRV32] in // SBINV: rs1 ^ (1 << (rs2 & 31)).
+def : Pat<(xor (shl 1, (and GPR:$rs2, 31)), GPR:$rs1),
+ (SBINV GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbs, IsRV64] in // SBINV on RV64: bit index masked with 63.
+def : Pat<(xor (shl 1, (and GPR:$rs2, 63)), GPR:$rs1),
+ (SBINV GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbs, IsRV32] in // SBEXT: (rs1 >> (rs2 & 31)) & 1.
+def : Pat<(and (srl GPR:$rs1, (and GPR:$rs2, 31)), 1),
+ (SBEXT GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbs, IsRV64] in // SBEXT on RV64: bit index masked with 63.
+def : Pat<(and (srl GPR:$rs1, (and GPR:$rs2, 63)), 1),
+ (SBEXT GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbb] in { // Immediate forms; matching is done by SLOIPat / SROIPat.
+def : Pat<(SLOIPat GPR:$rs1, uimmlog2xlen:$shamt),
+ (SLOI GPR:$rs1, uimmlog2xlen:$shamt)>;
+def : Pat<(SROIPat GPR:$rs1, uimmlog2xlen:$shamt),
+ (SROI GPR:$rs1, uimmlog2xlen:$shamt)>;
+} // Predicates = [HasStdExtZbb]
+// There's no encoding for roli in the current version of the 'B' extension
+// (v0.92) as it can be implemented with rori by negating the immediate.
+// For this reason we pattern-match only against rori[w].
+let Predicates = [HasStdExtZbbOrZbp] in // RORIPat itself is rooted at rotl.
+def : Pat<(RORIPat GPR:$rs1, uimmlog2xlen:$shamt),
+ (RORI GPR:$rs1, uimmlog2xlen:$shamt)>;
+// We don't pattern-match sbclri[w], sbseti[w], sbinvi[w] because they are
+// pattern-matched by simple andi, ori, and xori.
+let Predicates = [HasStdExtZbs] in // SBEXTI: (rs1 >> shamt) & 1.
+def : Pat<(and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1)),
+ (SBEXTI GPR:$rs1, uimmlog2xlen:$shamt)>;
+let Predicates = [HasStdExtZbp, IsRV32] in { // GORCI k: x | masked (x >> k) | masked (x << k).
+def : Pat<(or (or (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555)), GPR:$rs1),
+ (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA))),
+ (GORCI GPR:$rs1, (i32 1))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i32 2)), (i32 0x33333333)), GPR:$rs1),
+ (and (shl GPR:$rs1, (i32 2)), (i32 0xCCCCCCCC))),
+ (GORCI GPR:$rs1, (i32 2))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i32 4)), (i32 0x0F0F0F0F)), GPR:$rs1),
+ (and (shl GPR:$rs1, (i32 4)), (i32 0xF0F0F0F0))),
+ (GORCI GPR:$rs1, (i32 4))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i32 8)), (i32 0x00FF00FF)), GPR:$rs1),
+ (and (shl GPR:$rs1, (i32 8)), (i32 0xFF00FF00))),
+ (GORCI GPR:$rs1, (i32 8))>;
+def : Pat<(or (or (srl GPR:$rs1, (i32 16)), GPR:$rs1), // k = 16 needs no masks
+ (shl GPR:$rs1, (i32 16))),
+ (GORCI GPR:$rs1, (i32 16))>;
+} // Predicates = [HasStdExtZbp, IsRV32]
+let Predicates = [HasStdExtZbp, IsRV64] in { // 64-bit GORCI k for k = 1, 2, 4, 8, 16, 32.
+def : Pat<(or (or (and (srl GPR:$rs1, (i64 1)), (i64 0x5555555555555555)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAAAAAAAAAA))),
+ (GORCI GPR:$rs1, (i64 1))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i64 2)), (i64 0x3333333333333333)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCCCCCCCCCC))),
+ (GORCI GPR:$rs1, (i64 2))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F0F0F0F0F)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0F0F0F0F0))),
+ (GORCI GPR:$rs1, (i64 4))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF00FF00FF)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00FF00FF00))),
+ (GORCI GPR:$rs1, (i64 8))>;
+def : Pat<(or (or (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF0000FFFF)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000FFFF0000))),
+ (GORCI GPR:$rs1, (i64 16))>;
+def : Pat<(or (or (srl GPR:$rs1, (i64 32)), GPR:$rs1), // k = 32 needs no masks
+ (shl GPR:$rs1, (i64 32))),
+ (GORCI GPR:$rs1, (i64 32))>;
+} // Predicates = [HasStdExtZbp, IsRV64]
+let Predicates = [HasStdExtZbp, IsRV32] in { // GREVI k: masked (x << k) | masked (x >> k).
+def : Pat<(or (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA)),
+ (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555))),
+ (GREVI GPR:$rs1, (i32 1))>;
+def : Pat<(or (and (shl GPR:$rs1, (i32 2)), (i32 0xCCCCCCCC)),
+ (and (srl GPR:$rs1, (i32 2)), (i32 0x33333333))),
+ (GREVI GPR:$rs1, (i32 2))>;
+def : Pat<(or (and (shl GPR:$rs1, (i32 4)), (i32 0xF0F0F0F0)),
+ (and (srl GPR:$rs1, (i32 4)), (i32 0x0F0F0F0F))),
+ (GREVI GPR:$rs1, (i32 4))>;
+def : Pat<(or (and (shl GPR:$rs1, (i32 8)), (i32 0xFF00FF00)),
+ (and (srl GPR:$rs1, (i32 8)), (i32 0x00FF00FF))),
+ (GREVI GPR:$rs1, (i32 8))>;
+def : Pat<(rotr (bswap GPR:$rs1), (i32 16)), (GREVI GPR:$rs1, (i32 8))>; // bswap + rotate by 16
+def : Pat<(or (shl GPR:$rs1, (i32 16)), (srl GPR:$rs1, (i32 16))),
+ (GREVI GPR:$rs1, (i32 16))>;
+def : Pat<(rotl GPR:$rs1, (i32 16)), (GREVI GPR:$rs1, (i32 16))>; // rotate by half the width
+def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i32 24))>; // byte swap
+def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i32 31))>; // full bit reversal
+} // Predicates = [HasStdExtZbp, IsRV32]
+let Predicates = [HasStdExtZbp, IsRV64] in { // 64-bit GREVI k: masked (x << k) | masked (x >> k).
+def : Pat<(or (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAAAAAAAAAA)),
+ (and (srl GPR:$rs1, (i64 1)), (i64 0x5555555555555555))),
+ (GREVI GPR:$rs1, (i64 1))>;
+def : Pat<(or (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCCCCCCCCCC)),
+ (and (srl GPR:$rs1, (i64 2)), (i64 0x3333333333333333))),
+ (GREVI GPR:$rs1, (i64 2))>;
+def : Pat<(or (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0F0F0F0F0)),
+ (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F0F0F0F0F))),
+ (GREVI GPR:$rs1, (i64 4))>;
+def : Pat<(or (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00FF00FF00)),
+ (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF00FF00FF))),
+ (GREVI GPR:$rs1, (i64 8))>;
+def : Pat<(or (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000FFFF0000)),
+ (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF0000FFFF))),
+ (GREVI GPR:$rs1, (i64 16))>;
+def : Pat<(or (shl GPR:$rs1, (i64 32)), (srl GPR:$rs1, (i64 32))),
+ (GREVI GPR:$rs1, (i64 32))>;
+def : Pat<(rotl GPR:$rs1, (i64 32)), (GREVI GPR:$rs1, (i64 32))>; // rotate by half the width
+def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i64 56))>; // byte swap
+def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i64 63))>; // full bit reversal
+} // Predicates = [HasStdExtZbp, IsRV64]
+let Predicates = [HasStdExtZbt] in { // Ternary ops: bit-mix, conditional move, funnel shifts.
+def : Pat<(or (and (xor GPR:$rs2, -1), GPR:$rs3), (and GPR:$rs2, GPR:$rs1)), // (~rs2 & rs3) | (rs2 & rs1)
+ (CMIX GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(riscv_selectcc GPR:$rs2, (XLenVT 0), (XLenVT 17), GPR:$rs3, GPR:$rs1), // NOTE(review): 17 is presumably the "equal" condition code - confirm
+ (CMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(fshl GPR:$rs1, GPR:$rs2, GPR:$rs3),
+ (FSL GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(fshr GPR:$rs1, GPR:$rs2, GPR:$rs3),
+ (FSR GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(fshr GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt), // immediate shift amount
+ (FSRI GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt)>;
+} // Predicates = [HasStdExtZbt]
+let Predicates = [HasStdExtZbb] in { // Bit counting: ctlz -> CLZ, cttz -> CTZ, ctpop -> PCNT.
+def : Pat<(ctlz GPR:$rs1), (CLZ GPR:$rs1)>;
+def : Pat<(cttz GPR:$rs1), (CTZ GPR:$rs1)>;
+def : Pat<(ctpop GPR:$rs1), (PCNT GPR:$rs1)>;
+} // Predicates = [HasStdExtZbb]
+let Predicates = [HasStdExtZbb, IsRV32] in // SEXTB: shl by 24 then sra by 24.
+def : Pat<(sra (shl GPR:$rs1, (i32 24)), (i32 24)), (SEXTB GPR:$rs1)>;
+let Predicates = [HasStdExtZbb, IsRV64] in // SEXTB on RV64: shift amounts are 56.
+def : Pat<(sra (shl GPR:$rs1, (i64 56)), (i64 56)), (SEXTB GPR:$rs1)>;
+let Predicates = [HasStdExtZbb, IsRV32] in // SEXTH: shl by 16 then sra by 16.
+def : Pat<(sra (shl GPR:$rs1, (i32 16)), (i32 16)), (SEXTH GPR:$rs1)>;
+let Predicates = [HasStdExtZbb, IsRV64] in // SEXTH on RV64: shift amounts are 48.
+def : Pat<(sra (shl GPR:$rs1, (i64 48)), (i64 48)), (SEXTH GPR:$rs1)>;
+let Predicates = [HasStdExtZbb] in { // NOTE(review): codes 20 / 12 presumably mean signed / unsigned less-than - confirm.
+def : Pat<(smin GPR:$rs1, GPR:$rs2), (MIN GPR:$rs1, GPR:$rs2)>;
+def : Pat<(riscv_selectcc GPR:$rs1, GPR:$rs2, (XLenVT 20), GPR:$rs1, GPR:$rs2), // select form of MIN
+ (MIN GPR:$rs1, GPR:$rs2)>;
+def : Pat<(smax GPR:$rs1, GPR:$rs2), (MAX GPR:$rs1, GPR:$rs2)>;
+def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 20), GPR:$rs1, GPR:$rs2), // compared operands swapped
+ (MAX GPR:$rs1, GPR:$rs2)>;
+def : Pat<(umin GPR:$rs1, GPR:$rs2), (MINU GPR:$rs1, GPR:$rs2)>;
+def : Pat<(riscv_selectcc GPR:$rs1, GPR:$rs2, (XLenVT 12), GPR:$rs1, GPR:$rs2), // select form of MINU
+ (MINU GPR:$rs1, GPR:$rs2)>;
+def : Pat<(umax GPR:$rs1, GPR:$rs2), (MAXU GPR:$rs1, GPR:$rs2)>;
+def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 12), GPR:$rs1, GPR:$rs2), // compared operands swapped
+ (MAXU GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbb]
+let Predicates = [HasStdExtZbbOrZbp, IsRV32] in // PACK: low 16 bits of rs1 | (rs2 << 16).
+def : Pat<(or (and GPR:$rs1, 0x0000FFFF), (shl GPR:$rs2, (i32 16))),
+ (PACK GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in // PACK on RV64: low 32 bits of rs1 | (rs2 << 32).
+def : Pat<(or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32))),
+ (PACK GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbbOrZbp, IsRV32] in // PACKU: high 16 bits of rs2 | (rs1 >> 16).
+def : Pat<(or (and GPR:$rs2, 0xFFFF0000), (srl GPR:$rs1, (i32 16))),
+ (PACKU GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in // PACKU on RV64: high 32 bits of rs2 | (rs1 >> 32).
+def : Pat<(or (and GPR:$rs2, 0xFFFFFFFF00000000), (srl GPR:$rs1, (i64 32))),
+ (PACKU GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbbOrZbp] in // PACKH: ((rs2 << 8) & 0xFF00) | (rs1 & 0xFF).
+def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFF00),
+ (and GPR:$rs1, 0x00FF)),
+ (PACKH GPR:$rs1, GPR:$rs2)>;
+let Predicates = [HasStdExtZbp, IsRV32] in { // SHFLI k: masked (x << k) | masked x | masked (x >> k).
+def : Pat<(or (or (and (shl GPR:$rs1, (i32 8)), (i32 0x00FF0000)),
+ (and GPR:$rs1, (i32 0xFF0000FF))),
+ (and (srl GPR:$rs1, (i32 8)), (i32 0x0000FF00))),
+ (SHFLI GPR:$rs1, (i32 8))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i32 4)), (i32 0x0F000F00)),
+ (and GPR:$rs1, (i32 0xF00FF00F))),
+ (and (srl GPR:$rs1, (i32 4)), (i32 0x00F000F0))),
+ (SHFLI GPR:$rs1, (i32 4))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i32 2)), (i32 0x30303030)),
+ (and GPR:$rs1, (i32 0xC3C3C3C3))),
+ (and (srl GPR:$rs1, (i32 2)), (i32 0x0C0C0C0C))),
+ (SHFLI GPR:$rs1, (i32 2))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i32 1)), (i32 0x44444444)),
+ (and GPR:$rs1, (i32 0x99999999))),
+ (and (srl GPR:$rs1, (i32 1)), (i32 0x22222222))),
+ (SHFLI GPR:$rs1, (i32 1))>;
+} // Predicates = [HasStdExtZbp, IsRV32]
+let Predicates = [HasStdExtZbp, IsRV64] in { // 64-bit SHFLI k for k = 16, 8, 4, 2, 1.
+def : Pat<(or (or (and (shl GPR:$rs1, (i64 16)), (i64 0x0000FFFF00000000)),
+ (and GPR:$rs1, (i64 0xFFFF00000000FFFF))),
+ (and (srl GPR:$rs1, (i64 16)), (i64 0x00000000FFFF0000))),
+ (SHFLI GPR:$rs1, (i64 16))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i64 8)), (i64 0x00FF000000FF0000)),
+ (and GPR:$rs1, (i64 0xFF0000FFFF0000FF))),
+ (and (srl GPR:$rs1, (i64 8)), (i64 0x0000FF000000FF00))),
+ (SHFLI GPR:$rs1, (i64 8))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i64 4)), (i64 0x0F000F000F000F00)),
+ (and GPR:$rs1, (i64 0xF00FF00FF00FF00F))),
+ (and (srl GPR:$rs1, (i64 4)), (i64 0x00F000F000F000F0))),
+ (SHFLI GPR:$rs1, (i64 4))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i64 2)), (i64 0x3030303030303030)),
+ (and GPR:$rs1, (i64 0xC3C3C3C3C3C3C3C3))),
+ (and (srl GPR:$rs1, (i64 2)), (i64 0x0C0C0C0C0C0C0C0C))),
+ (SHFLI GPR:$rs1, (i64 2))>;
+def : Pat<(or (or (and (shl GPR:$rs1, (i64 1)), (i64 0x4444444444444444)),
+ (and GPR:$rs1, (i64 0x9999999999999999))),
+ (and (srl GPR:$rs1, (i64 1)), (i64 0x2222222222222222))),
+ (SHFLI GPR:$rs1, (i64 1))>;
+} // Predicates = [HasStdExtZbp, IsRV64]
+let Predicates = [HasStdExtZbb, IsRV64] in { // RV64 word ops; 0xFFFFFFFF masks keep the low 32 bits.
+def : Pat<(and (add GPR:$rs, simm12:$simm12), (i64 0xFFFFFFFF)), // (rs + imm) & 0xFFFFFFFF
+ (ADDIWU GPR:$rs, simm12:$simm12)>;
+def : Pat<(SLLIUWPat GPR:$rs1, uimmlog2xlen:$shamt), // matching is done by SLLIUWPat
+ (SLLIUW GPR:$rs1, uimmlog2xlen:$shamt)>;
+def : Pat<(and (add GPR:$rs1, GPR:$rs2), (i64 0xFFFFFFFF)), // (rs1 + rs2) & 0xFFFFFFFF
+ (ADDWU GPR:$rs1, GPR:$rs2)>;
+def : Pat<(and (sub GPR:$rs1, GPR:$rs2), (i64 0xFFFFFFFF)), // (rs1 - rs2) & 0xFFFFFFFF
+ (SUBWU GPR:$rs1, GPR:$rs2)>;
+def : Pat<(add GPR:$rs1, (and GPR:$rs2, (i64 0xFFFFFFFF))), // rs1 + (rs2 & 0xFFFFFFFF)
+ (ADDUW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(sub GPR:$rs1, (and GPR:$rs2, (i64 0xFFFFFFFF))), // rs1 - (rs2 & 0xFFFFFFFF)
+ (SUBUW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(xor (riscv_sllw (xor GPR:$rs1, -1), GPR:$rs2), -1), // SLO shape built on riscv_sllw
+ (SLOW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(xor (riscv_srlw (xor GPR:$rs1, -1), GPR:$rs2), -1), // SRO shape built on riscv_srlw
+ (SROW GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbb, IsRV64]
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in { // Word rotates: sllw by n or'ed with srlw by (0 - n), or vice versa.
+def : Pat<(or (riscv_sllw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2)),
+ (riscv_srlw (assertsexti32 GPR:$rs1),
+ (sub (i64 0), (assertsexti32 GPR:$rs2)))),
+ (ROLW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (riscv_sllw (assertsexti32 GPR:$rs1),
+ (sub (i64 0), (assertsexti32 GPR:$rs2))),
+ (riscv_srlw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2))),
+ (RORW GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
+let Predicates = [HasStdExtZbs, IsRV64] in { // Word single-bit ops; operands are wrapped in assertsexti32.
+def : Pat<(and (xor (riscv_sllw 1, (assertsexti32 GPR:$rs2)), -1), // rs1 & ~sllw(1, rs2)
+ (assertsexti32 GPR:$rs1)),
+ (SBCLRW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (riscv_sllw 1, (assertsexti32 GPR:$rs2)), // rs1 | sllw(1, rs2)
+ (assertsexti32 GPR:$rs1)),
+ (SBSETW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(xor (riscv_sllw 1, (assertsexti32 GPR:$rs2)), // rs1 ^ sllw(1, rs2)
+ (assertsexti32 GPR:$rs1)),
+ (SBINVW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(and (riscv_srlw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2)), // srlw(rs1, rs2) & 1
+ 1),
+ (SBEXTW GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbs, IsRV64]
+let Predicates = [HasStdExtZbb, IsRV64] in { // Word immediate forms; matching is done by SLOIWPat / SROIWPat.
+def : Pat<(SLOIWPat GPR:$rs1, uimmlog2xlen:$shamt),
+ (SLOIW GPR:$rs1, uimmlog2xlen:$shamt)>;
+def : Pat<(SROIWPat GPR:$rs1, uimmlog2xlen:$shamt),
+ (SROIW GPR:$rs1, uimmlog2xlen:$shamt)>;
+} // Predicates = [HasStdExtZbb, IsRV64]
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in // Word rotate-immediate; matching is done by RORIWPat.
+def : Pat<(RORIWPat GPR:$rs1, uimmlog2xlen:$shamt),
+ (RORIW GPR:$rs1, uimmlog2xlen:$shamt)>;
+let Predicates = [HasStdExtZbp, IsRV64] in { // Word GORCI/GREVI: same shapes with 32-bit masks, result sign-extended from i32.
+def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 1)), (i64 0x55555555)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAA))),
+ i32),
+ (GORCIW GPR:$rs1, (i64 1))>;
+def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 2)), (i64 0x33333333)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCC))),
+ i32),
+ (GORCIW GPR:$rs1, (i64 2))>;
+def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0))),
+ i32),
+ (GORCIW GPR:$rs1, (i64 4))>;
+def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00))),
+ i32),
+ (GORCIW GPR:$rs1, (i64 8))>;
+def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF)),
+ GPR:$rs1),
+ (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000))),
+ i32),
+ (GORCIW GPR:$rs1, (i64 16))>;
+def : Pat<(sext_inreg (or (or (srl (and GPR:$rs1, (i64 0xFFFF0000)), (i64 16)), // second shape for k = 16: mask applied before the shift
+ GPR:$rs1),
+ (shl GPR:$rs1, (i64 16))), i32),
+ (GORCIW GPR:$rs1, (i64 16))>;
+def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAA)),
+ (and (srl GPR:$rs1, (i64 1)), (i64 0x55555555))),
+ i32),
+ (GREVIW GPR:$rs1, (i64 1))>;
+def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCC)),
+ (and (srl GPR:$rs1, (i64 2)), (i64 0x33333333))),
+ i32),
+ (GREVIW GPR:$rs1, (i64 2))>;
+def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0)),
+ (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F))),
+ i32),
+ (GREVIW GPR:$rs1, (i64 4))>;
+def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00)),
+ (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF))),
+ i32),
+ (GREVIW GPR:$rs1, (i64 8))>;
+def : Pat<(sext_inreg (or (shl GPR:$rs1, (i64 16)),
+ (srl (and GPR:$rs1, 0xFFFF0000), (i64 16))), i32),
+ (GREVIW GPR:$rs1, (i64 16))>;
+def : Pat<(sra (bswap GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 24))>; // 64-bit bswap, then sra by 32
+def : Pat<(sra (bitreverse GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 31))>; // 64-bit bitreverse, then sra by 32
+} // Predicates = [HasStdExtZbp, IsRV64]
+let Predicates = [HasStdExtZbt, IsRV64] in { // Word funnel shifts. NOTE(review): code 17 is presumably "equal" - confirm.
+def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31), // tests (rs3 & 31) against 0
+ (i64 0),
+ (i64 17),
+ (assertsexti32 GPR:$rs1), // first value operand: rs1 unchanged
+ (or (riscv_sllw (assertsexti32 GPR:$rs1),
+ (and (assertsexti32 GPR:$rs3), 31)),
+ (riscv_srlw (assertsexti32 GPR:$rs2),
+ (sub (i64 32),
+ (assertsexti32 GPR:$rs3))))),
+ (FSLW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31), // tests (rs3 & 31) against 0
+ (i64 0),
+ (i64 17),
+ (assertsexti32 GPR:$rs2), // first value operand: rs2 unchanged
+ (or (riscv_sllw (assertsexti32 GPR:$rs1),
+ (sub (i64 32),
+ (assertsexti32 GPR:$rs3))),
+ (riscv_srlw (assertsexti32 GPR:$rs2),
+ (and (assertsexti32 GPR:$rs3), 31)))),
+ (FSRW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
+def : Pat<(FSRIWPat GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt), // matching is done by FSRIWPat
+ (FSRIW GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt)>;
+} // Predicates = [HasStdExtZbt, IsRV64]
+let Predicates = [HasStdExtZbb, IsRV64] in { // Word counts on the low 32 bits of rs1.
+def : Pat<(add (ctlz (and GPR:$rs1, (i64 0xFFFFFFFF))), (i64 -32)), // 64-bit ctlz of the masked value, minus 32
+ (CLZW GPR:$rs1)>;
+// We don't pattern-match CTZW here as it has the same pattern and result as
+// RV64 CTZ
+def : Pat<(ctpop (and GPR:$rs1, (i64 0xFFFFFFFF))), (PCNTW GPR:$rs1)>;
+} // Predicates = [HasStdExtZbb, IsRV64]
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in { // Word pack ops; operands are wrapped in assertsexti32.
+def : Pat<(sext_inreg (or (shl (assertsexti32 GPR:$rs2), (i64 16)), // (rs2 << 16) | (rs1 & 0xFFFF), sign-extended from i32
+ (and (assertsexti32 GPR:$rs1), 0x000000000000FFFF)),
+ i32),
+ (PACKW GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000), // rs2 with low 16 bits cleared | bits 31:16 of rs1 shifted down
+ (srl (and (assertsexti32 GPR:$rs1), 0x00000000FFFF0000),
+ (i64 16))),
+ (PACKUW GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f8b6b7eb3aff..86aa85e965f6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1,50203 +1,50254 @@
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
#include "X86ISelLowering.h"
#include "MCTargetDesc/X86ShuffleDecode.h"
#include "X86.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;
#define DEBUG_TYPE "x86-isel"
STATISTIC(NumTailCalls, "Number of tail calls");
/// Experimental override for the preferred loop alignment, given as log2 of
/// the alignment in bytes (default 4).
static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc(
        "Sets the preferable loop alignment for experiments (as log2 bytes)"
        "(the last x86-experimental-pref-loop-alignment bits"
        " of the loop header PC will be 0)."),
    cl::Hidden);

/// Enables rewriting of multiplications by a constant (on by default).
static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

/// Experimental switch for how unordered atomic loads/stores are represented
/// during instruction selection (off by default).
static cl::opt<bool> ExperimentalUnorderedISEL(
    "x86-experimental-unordered-atomic-isel", cl::init(false),
    cl::desc("Use LoadSDNode and StoreSDNode instead of "
             "AtomicSDNode for unordered atomic loads and "
             "stores respectively."),
    cl::Hidden);
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
///
/// \param DAG  Selection DAG whose machine function and context are used.
/// \param dl   Location attached to the diagnostic (its debug location).
/// \param Msg  Text of the diagnostic.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  // Report through the context's diagnostic handler rather than aborting.
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
X86ScalarSSEf64 = Subtarget.hasSSE2();
X86ScalarSSEf32 = Subtarget.hasSSE1();
MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
// Set up the TargetLowering object.
// X86 is weird. It always uses i8 for shift amounts and setcc results.
// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
// For 64-bit, since we have so many registers, use the ILP scheduler.
// For 32-bit, use the register pressure specific scheduling.
// For Atom, always use ILP scheduling.
if (Subtarget.isAtom())
else if (Subtarget.is64Bit())
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// Bypass expensive divides and use cheaper ones.
if (TM.getOptLevel() >= CodeGenOpt::Default) {
if (Subtarget.hasSlowDivide32())
addBypassSlowDiv(32, 8);
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
addBypassSlowDiv(64, 32);
if (Subtarget.isTargetWindowsMSVC() ||
Subtarget.isTargetWindowsItanium()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
setLibcallName(RTLIB::SREM_I64, "_allrem");
setLibcallName(RTLIB::UREM_I64, "_aullrem");
setLibcallName(RTLIB::MUL_I64, "_allmul");
setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
if (Subtarget.getTargetTriple().isOSMSVCRT()) {
// MSVCRT doesn't have powi; fall back to pow
setLibcallName(RTLIB::POWI_F32, nullptr);
setLibcallName(RTLIB::POWI_F64, nullptr);
// If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
// 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
// FIXME: Should we be limiting the atomic size on other configs? Default is
// 1024.
if (!Subtarget.hasCmpxchg8b())
// Set up the register classes.
addRegisterClass(MVT::i8, &X86::GR8RegClass);
addRegisterClass(MVT::i16, &X86::GR16RegClass);
addRegisterClass(MVT::i32, &X86::GR32RegClass);
if (Subtarget.is64Bit())
addRegisterClass(MVT::i64, &X86::GR64RegClass);
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
// We don't accept any truncstore of integer registers.
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
setTruncStoreAction(MVT::i32, MVT::i16, Expand);
setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
setTruncStoreAction(MVT::i16, MVT::i8, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// SETOEQ and SETUNE require checking two conditions.
for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
setCondCodeAction(ISD::SETOEQ, VT, Expand);
setCondCodeAction(ISD::SETUNE, VT, Expand);
// Integer absolute.
if (Subtarget.hasCMov()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
setOperationAction(ISD::ABS , MVT::i32 , Custom);
setOperationAction(ISD::ABS , MVT::i64 , Custom);
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
// For slow shld targets we only lower for code size.
LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
setOperationAction(ShiftOp , MVT::i8 , Custom);
setOperationAction(ShiftOp , MVT::i16 , Custom);
setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
if (Subtarget.is64Bit())
setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
if (!Subtarget.useSoftFloat()) {
// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
// operation.
setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD or VCVTUSI2SS/SD for other targets.
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
// Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
// this operation.
setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
// SSE has no i16 to fp conversion, only i32. We promote in the handler
// to allow f80 to use i16 and f64 to use i16 with sse1 only
setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
// f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
// Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
// this operation.
setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
// Handle FP_TO_UINT by promoting the destination to a larger signed
// conversion.
setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::LRINT, MVT::f32, Custom);
setOperationAction(ISD::LRINT, MVT::f64, Custom);
setOperationAction(ISD::LLRINT, MVT::f32, Custom);
setOperationAction(ISD::LLRINT, MVT::f64, Custom);
if (!Subtarget.is64Bit()) {
setOperationAction(ISD::LRINT, MVT::i64, Custom);
setOperationAction(ISD::LLRINT, MVT::i64, Custom);
// Handle address space casts between mixed sized pointers.
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!X86ScalarSSEf64) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
// Without SSE, i64->f64 goes through memory.
setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
} else if (!Subtarget.is64Bit())
setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
// Scalar integer divide and remainder are lowered to use operations that
// produce two results, to match the available instructions. This exposes
// the two-result form to trivial CSE, which is able to combine x/y and x%y
// into a single instruction.
// Scalar integer multiply-high is also lowered to use two-result
// operations, to match the available instructions. However, plain multiply
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::BR_JT , MVT::Other, Expand);
setOperationAction(ISD::BRCOND , MVT::Other, Custom);
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::BR_CC, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
setOperationAction(ISD::FREM , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FREM , MVT::f128 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
// Promote the i8 variants and force them on up to i32 which has a shorter
// encoding.
setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
if (!Subtarget.hasBMI()) {
setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
if (Subtarget.hasLZCNT()) {
// When promoting the i8 variants, force them to i32 for a shorter
// encoding.
setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
} else {
for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
setOperationAction(ISD::CTLZ , VT, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
// Special handling for half-precision floating point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
Op, MVT::f32,
(!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
// There's never any support for operations beyond MVT::f32.
setOperationAction(Op, MVT::f64, Expand);
setOperationAction(Op, MVT::f80, Expand);
setOperationAction(Op, MVT::f128, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
if (Subtarget.hasPOPCNT()) {
setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
} else {
setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
if (!Subtarget.hasMOVBE())
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
// Custom action for SELECT MMX and expand action for SELECT_CC MMX
setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
// LLVM/Clang supports zero-cost DWARF and SEH exception handling.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
// Darwin ABI issue.
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
setOperationAction(ISD::ConstantPool , VT, Custom);
setOperationAction(ISD::JumpTable , VT, Custom);
setOperationAction(ISD::GlobalAddress , VT, Custom);
setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
setOperationAction(ISD::ExternalSymbol , VT, Custom);
setOperationAction(ISD::BlockAddress , VT, Custom);
// 64-bit shl, sra, srl (iff 32-bit x86)
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
setOperationAction(ISD::SHL_PARTS, VT, Custom);
setOperationAction(ISD::SRA_PARTS, VT, Custom);
setOperationAction(ISD::SRL_PARTS, VT, Custom);
if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
// Expand certain atomics
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
if (!Subtarget.is64Bit())
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
if (Subtarget.hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
// FIXME - use subtarget debug flags
if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
bool Is64Bit = Subtarget.is64Bit();
setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
: &X86::FR32RegClass);
addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
: &X86::FR64RegClass);
// Disable f32->f64 extload as we can only generate this in one instruction
// under optsize. So it's easier to pattern match (fpext (load)) for that
// case instead of needing to emit 2 instructions for extload in the
// non-optsize case.
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
for (auto VT : { MVT::f32, MVT::f64 }) {
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS, VT, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG, VT, Custom);
// Use ANDPD and ORPD to simulate FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
// These might be better off as horizontal vector ops.
setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::FSUB, VT, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
// Lower this to MOVMSK plus an AND.
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
} else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
(UseX87 || Is64Bit)) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
if (UseX87)
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
// Use ANDPS to simulate FABS.
setOperationAction(ISD::FABS , MVT::f32, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG , MVT::f32, Custom);
if (UseX87)
setOperationAction(ISD::UNDEF, MVT::f64, Expand);
// Use ANDPS and ORPS to simulate FCOPYSIGN.
if (UseX87)
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
if (UseX87) {
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
} else if (UseX87) {
// f32 and f64 in x87.
// Set up the FP register classes.
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
addRegisterClass(MVT::f32, &X86::RFP32RegClass);
for (auto VT : { MVT::f32, MVT::f64 }) {
setOperationAction(ISD::UNDEF, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
// Expand FP32 immediates into loads from the stack, save special cases.
if (isTypeLegal(MVT::f32)) {
if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
addLegalFPImmediate(APFloat(+0.0f)); // FLD0
addLegalFPImmediate(APFloat(+1.0f)); // FLD1
addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0f)); // xorps
// Expand FP64 immediates into loads from the stack, save special cases.
if (isTypeLegal(MVT::f64)) {
if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
// Handle constrained floating-point operations of scalar.
setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
// We don't support FMA.
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
// f80 always uses X87.
if (UseX87) {
addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
addLegalFPImmediate(TmpFlt); // FLD0
addLegalFPImmediate(TmpFlt); // FLD0/FCHS
bool ignored;
APFloat TmpFlt2(+1.0);
TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
addLegalFPImmediate(TmpFlt2); // FLD1
addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
// Always expand sin/cos functions even though x87 has an instruction.
setOperationAction(ISD::FSIN , MVT::f80, Expand);
setOperationAction(ISD::FCOS , MVT::f80, Expand);
setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
setOperationAction(ISD::FCEIL, MVT::f80, Expand);
setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
setOperationAction(ISD::FRINT, MVT::f80, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
setOperationAction(ISD::FMA, MVT::f80, Expand);
setOperationAction(ISD::LROUND, MVT::f80, Expand);
setOperationAction(ISD::LLROUND, MVT::f80, Expand);
setOperationAction(ISD::LRINT, MVT::f80, Custom);
setOperationAction(ISD::LLRINT, MVT::f80, Custom);
// Handle constrained floating-point operations of scalar.
setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
// FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
// as Custom.
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
// f128 uses xmm registers, but most operations require libcalls.
if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
setOperationAction(ISD::FADD, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
setOperationAction(ISD::FSUB, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
setOperationAction(ISD::FDIV, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
setOperationAction(ISD::FMUL, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
setOperationAction(ISD::FMA, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
setOperationAction(ISD::FABS, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
setOperationAction(ISD::FSIN, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
setOperationAction(ISD::FCOS, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
// We need to custom handle any FP_ROUND with an f128 input, but
// LegalizeDAG uses the result type to know when to run a custom handler.
// So we have to list all legal floating point result types here.
if (isTypeLegal(MVT::f32)) {
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
if (isTypeLegal(MVT::f64)) {
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
if (isTypeLegal(MVT::f80)) {
setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
setOperationAction(ISD::SETCC, MVT::f128, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f80, Expand);
// Always use a library call for pow.
setOperationAction(ISD::FPOW , MVT::f32 , Expand);
setOperationAction(ISD::FPOW , MVT::f64 , Expand);
setOperationAction(ISD::FPOW , MVT::f80 , Expand);
setOperationAction(ISD::FPOW , MVT::f128 , Expand);
setOperationAction(ISD::FLOG, MVT::f80, Expand);
setOperationAction(ISD::FLOG2, MVT::f80, Expand);
setOperationAction(ISD::FLOG10, MVT::f80, Expand);
setOperationAction(ISD::FEXP, MVT::f80, Expand);
setOperationAction(ISD::FEXP2, MVT::f80, Expand);
setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
// Some FP actions are always expanded for vector types.
for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::FMA, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::SETCC, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
setOperationAction(ISD::TRUNCATE, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(InnerVT, VT, Expand);
setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
// types, we have to deal with them whether we ask for Expansion or not.
// Setting Expand causes its own optimisation problems though, so leave
// them legal.
if (VT.getVectorElementType() == MVT::i1)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
// split/scalarized right now.
if (VT.getVectorElementType() == MVT::f16)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
// No operations on x86mmx supported, everything uses intrinsics.
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
// registers cannot be used even for integer operations.
addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::SREM, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);
setOperationAction(ISD::UREM, VT, Custom);
setOperationAction(ISD::MUL, MVT::v2i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i8, Custom);
setOperationAction(ISD::MUL, MVT::v8i8, Custom);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ABS, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
if (VT == MVT::v2i64 && !Subtarget.is64Bit())
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
// Custom lower v2i64 and v2f64 selects.
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
// store.
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i16, Custom);
setOperationAction(ISD::STORE, MVT::v8i8, Custom);
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
// In the customized shift lowering, the legal v4i32/v2i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
// With 512-bit registers or AVX512VL+BW, expanding (and promoting the
// shifts) is better.
if (!Subtarget.useAVX512Regs() &&
!(Subtarget.hasBWI() && Subtarget.hasVLX()))
setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
setOperationAction(ISD::ABS, MVT::v16i8, Legal);
setOperationAction(ISD::ABS, MVT::v8i16, Legal);
setOperationAction(ISD::ABS, MVT::v4i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
// These might be better off as horizontal vector ops.
setOperationAction(ISD::ADD, MVT::i16, Custom);
setOperationAction(ISD::ADD, MVT::i32, Custom);
setOperationAction(ISD::SUB, MVT::i16, Custom);
setOperationAction(ISD::SUB, MVT::i32, Custom);
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
setOperationAction(ISD::FCEIL, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
setOperationAction(ISD::FRINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
setOperationAction(ISD::FROUND, RoundedTy, Custom);
setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
// We directly match byte blends in the backend as they match the VSELECT
// condition form.
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
// i8 vectors are custom because the source register and source
// memory operand types are not the same width.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
// We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
// do the pre and post work in the vector domain.
setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
// We need to mark SINT_TO_FP as Custom even though we want to expand it
// so that DAG combine doesn't try to turn it into uint_to_fp.
setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
setOperationAction(ISD::ROTL, VT, Custom);
// XOP can efficiently perform BITREVERSE with VPPERM.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
bool HasInt256 = Subtarget.hasInt256();
addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::FROUND, VT, Custom);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
// In the customized shift lowering, the legal v8i32/v4i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
// These types need custom splitting if their input is a 128-bit vector.
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
// With BWI, expanding (and promoting the shifts) is better.
if (!Subtarget.useBWIRegs())
setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::STRICT_FMA, VT, Legal);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v4i64, Custom);
setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v32i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
setOperationAction(ISD::ABS, MVT::v4i64, Custom);
setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
if (HasInt256) {
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
// Extract subvector is special because the value type
// (result) is 128-bit but the source is 256-bit wide.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
// Custom lower several nodes for 256-bit types.
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
if (HasInt256) {
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MGATHER, VT, Custom);
// This block controls legalization of the mask vector sizes that are
// available with AVX512. 512-bit vectors are in a separate block controlled
// by useAVX512Regs.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
// There is no byte sized k-register load or store without AVX512DQ.
if (!Subtarget.hasDQI()) {
setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
setOperationAction(ISD::STORE, MVT::v1i1, Custom);
setOperationAction(ISD::STORE, MVT::v2i1, Custom);
setOperationAction(ISD::STORE, MVT::v4i1, Custom);
setOperationAction(ISD::STORE, MVT::v8i1, Custom);
// Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::UADDSAT, VT, Custom);
setOperationAction(ISD::SADDSAT, VT, Custom);
setOperationAction(ISD::USUBSAT, VT, Custom);
setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
// This block controls legalization for 512-bit operations with 32/64 bit
// elements. 512-bits can be disabled based on prefer-vector-width and
// required-vector-width function attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
bool HasBWI = Subtarget.hasBWI();
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
if (HasBWI)
setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
if (HasBWI)
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
// to 512-bit rather than use the AVX2 instructions so that we can use
// k-masks.
if (!Subtarget.hasVLX()) {
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
if (HasBWI) {
// Extends from v64i1 masks to 512-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::FROUND, VT, Custom);
for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v64i8, Custom);
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
setCondCodeAction(ISD::SETLT, VT, Custom);
setCondCodeAction(ISD::SETLE, VT, Custom);
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
if (Subtarget.hasCDI()) {
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
setOperationAction(ISD::CTLZ, VT, Legal);
} // Subtarget.hasCDI()
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
// Extract subvector is special because the value type
// (result) is 256-bit but the source is 512-bit wide.
// 128-bit was made Legal under AVX1.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
MVT::v8f32, MVT::v4f64 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
if (HasBWI) {
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
} else {
setOperationAction(ISD::STORE, MVT::v32i16, Custom);
setOperationAction(ISD::STORE, MVT::v64i8, Custom);
if (Subtarget.hasVBMI2()) {
for (auto VT : { MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}// useAVX512Regs
// This block controls legalization for operations that don't have
// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
// narrower widths.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
// v2f32 UINT_TO_FP is already custom under SSE2.
assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
"Unexpected operation action!");
// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MSCATTER, VT, Custom);
if (Subtarget.hasDQI()) {
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::UINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_SINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MUL, VT, Legal);
if (Subtarget.hasCDI()) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::CTLZ, VT, Legal);
} // Subtarget.hasCDI()
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
// This block controls legalization of v32i1/v64i1 which are available with
// AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
// useBWIRegs.
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::UADDSAT, VT, Custom);
setOperationAction(ISD::SADDSAT, VT, Custom);
setOperationAction(ISD::USUBSAT, VT, Custom);
setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
for (auto VT : { MVT::v16i1, MVT::v32i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
// Extends from v32i1 masks to 256-bit vectors.
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
if (Subtarget.hasBITALG()) {
for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
setOperationAction(ISD::CTPOP, VT, Legal);
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
if (Subtarget.hasBWI()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
if (Subtarget.hasVBMI2()) {
// TODO: Make these legal even without VLX?
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
if (!Subtarget.is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
// FIXME: We really should do custom legalization for addition and
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
// Add/Sub/Mul with overflow operations are custom lowered.
setOperationAction(ISD::SADDO, VT, Custom);
setOperationAction(ISD::UADDO, VT, Custom);
setOperationAction(ISD::SSUBO, VT, Custom);
setOperationAction(ISD::USUBO, VT, Custom);
setOperationAction(ISD::SMULO, VT, Custom);
setOperationAction(ISD::UMULO, VT, Custom);
// Support carry in as value rather than glue.
setOperationAction(ISD::ADDCARRY, VT, Custom);
setOperationAction(ISD::SUBCARRY, VT, Custom);
setOperationAction(ISD::SETCCCARRY, VT, Custom);
if (!Subtarget.is64Bit()) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
setLibcallName(RTLIB::MUL_I128, nullptr);
// Combine sin / cos into _sincos_stret if it is available.
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
setOperationAction(ISD::UDIV, MVT::i128, Custom);
setOperationAction(ISD::SREM, MVT::i128, Custom);
setOperationAction(ISD::UREM, MVT::i128, Custom);
setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
// is. We should promote the value to 64-bits to solve this.
// This is what the CRT headers do - `fmodf` is an inline header
// function casting to f64 and calling `fmod`.
if (Subtarget.is32Bit() &&
(Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
if (isOperationExpand(Op, MVT::f32))
setOperationAction(Op, MVT::f32, Promote);
// We have target-specific dag combine patterns for the following nodes:
MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;
// TODO: These control memcmp expansion in CGP and could be raised higher, but
// that needs to be benchmarked and balanced with the potential use of vector
// load/store types (PR33329, PR33914).
MaxLoadsPerMemcmp = 2;
MaxLoadsPerMemcmpOptSize = 2;
// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
EnableExtLdPromotion = true;
// Default to having -disable-strictnode-mutation on
IsStrictFPEnabled = true;
/// \returns true only when the subtarget is both MachO and 64-bit.
// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
return Subtarget.isTargetMachO() && Subtarget.is64Bit();
/// \returns true when the target triple's OS uses the MSVC CRT and the
/// subtarget is not MachO.
bool X86TargetLowering::useStackGuardXorFP() const {
// Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
/// Wraps \p Val in a machine node using opcode X86::XOR64_FP on 64-bit
/// subtargets and X86::XOR32_FP otherwise.
/// \param DAG DAG in which the machine node is created.
/// \param Val Operand passed to the machine node.
/// \param DL  Source location attached to the new node.
/// \returns result 0 of the new node, typed as the data layout's pointer type.
SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
const SDLoc &DL) const {
EVT PtrTy = getPointerTy(DAG.getDataLayout());
unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
return SDValue(Node, 0);
/// Selects the type-legalization action for vector type \p VT.
/// NOTE(review): the return-type line and the tail of the first condition are
/// not visible in this listing -- confirm against the full source.
X86TargetLowering::getPreferredVectorAction(MVT VT) const {
// v32i1/v64i1 on AVX512 subtargets (subject to the rest of this condition)
// are split.
if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
return TypeSplitVector;
// Multi-element vectors whose element type is not i1 are widened.
if (VT.getVectorNumElements() != 1 &&
VT.getVectorElementType() != MVT::i1)
return TypeWidenVector;
// Anything else uses the target-independent default.
return TargetLoweringBase::getPreferredVectorAction(VT);
/// Maps a vXi1 mask vector with \p NumElts elements to the register type and
/// the number of registers used to pass it under calling convention \p CC.
/// \param NumElts   Number of i1 elements in the mask vector.
/// \param CC        Calling convention; X86_RegCall and Intel_OCL_BI are
///                  excluded from the v8i1/v16i1 vector mappings below.
/// \param Subtarget Queried for BWI and 512-bit register availability.
/// \returns a {register type, register count} pair.
/// NOTE(review): the result for element counts that match none of the cases
/// below is not visible in this listing -- confirm against the full source.
static std::pair<MVT, unsigned>
handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
const X86Subtarget &Subtarget) {
// v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
// convention is one that uses k registers.
if (NumElts == 2)
return {MVT::v2i64, 1};
if (NumElts == 4)
return {MVT::v4i32, 1};
if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
CC != CallingConv::Intel_OCL_BI)
return {MVT::v8i16, 1};
if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
CC != CallingConv::Intel_OCL_BI)
return {MVT::v16i8, 1};
// v32i1 passes in ymm unless we have BWI and the calling convention is
// regcall.
if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
return {MVT::v32i8, 1};
// Split v64i1 vectors if we don't have v64i8 available.
if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
// With 512-bit registers in use a single v64i8 suffices; otherwise the
// mask is passed as two v32i8 halves.
if (Subtarget.useAVX512Regs())
return {MVT::v64i8, 1};
return {MVT::v32i8, 2};
// Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
NumElts > 64)
return {MVT::i8, NumElts};
/// Returns the register type used to pass a value of type \p VT under calling
/// convention \p CC. vXi1 vectors on AVX512 subtargets are routed through
/// handleMaskRegisterForCallingConv; all other types defer to TargetLowering.
/// NOTE(review): any validity check on the helper's result before it is
/// returned is not visible in this listing -- confirm against the full source.
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512()) {
unsigned NumElts = VT.getVectorNumElements();
MVT RegisterVT;
unsigned NumRegisters;
// Only the type half of the helper's pair is used here.
std::tie(RegisterVT, NumRegisters) =
handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
return RegisterVT;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
/// Returns how many registers are used to pass a value of type \p VT under
/// calling convention \p CC. vXi1 vectors on AVX512 subtargets are routed
/// through handleMaskRegisterForCallingConv; all other types defer to
/// TargetLowering.
/// NOTE(review): any validity check on the helper's result before it is
/// returned is not visible in this listing -- confirm against the full source.
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512()) {
unsigned NumElts = VT.getVectorNumElements();
MVT RegisterVT;
unsigned NumRegisters;
// Only the count half of the helper's pair is used here.
std::tie(RegisterVT, NumRegisters) =
handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
return NumRegisters;
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
/// Describes how vector type \p VT is broken down for calling convention
/// \p CC.
/// \param[out] IntermediateVT   Type of each intermediate piece.
/// \param[out] NumIntermediates Number of intermediate pieces.
/// \param[out] RegisterVT       Register type holding each piece.
/// \returns the number of registers needed. Two vXi1 cases are handled here on
/// AVX512 subtargets; everything else defers to TargetLowering.
unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
unsigned &NumIntermediates, MVT &RegisterVT) const {
// Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
(!isPowerOf2_32(VT.getVectorNumElements()) ||
(VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
VT.getVectorNumElements() > 64)) {
// One i8 register per i1 element.
RegisterVT = MVT::i8;
IntermediateVT = MVT::i1;
NumIntermediates = VT.getVectorNumElements();
return NumIntermediates;
// Split v64i1 vectors if we don't have v64i8 available.
if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
CC != CallingConv::X86_RegCall) {
// Two v32i1 halves, each carried in a v32i8 register.
RegisterVT = MVT::v32i8;
IntermediateVT = MVT::v32i1;
NumIntermediates = 2;
return 2;
return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
NumIntermediates, RegisterVT);
/// Returns the result type of a comparison on operands of type \p VT.
/// Scalars yield i8. On AVX512 subtargets a vector yields a vXi1 mask type
/// (same element count as \p VT) in the cases detailed below; any other vector
/// yields \p VT with its element type changed to integer.
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
if (!VT.isVector())
return MVT::i8;
if (Subtarget.hasAVX512()) {
const unsigned NumElts = VT.getVectorNumElements();
// Figure out what this type will be legalized to.
EVT LegalVT = VT;
// Follow the legalizer's transformations until a legal type is reached.
while (getTypeAction(Context, LegalVT) != TypeLegal)
LegalVT = getTypeToTransformTo(Context, LegalVT);
// If we got a 512-bit vector then we'll definitely have a vXi1 compare.
if (LegalVT.getSimpleVT().is512BitVector())
return EVT::getVectorVT(Context, MVT::i1, NumElts);
if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
// If we legalized to less than a 512-bit vector, then we will use a vXi1
// compare for vXi32/vXi64 for sure. If we have BWI we will also support
// vXi16/vXi8.
MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
return EVT::getVectorVT(Context, MVT::i1, NumElts);
return VT.changeVectorElementTypeToInteger();
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
/// \param Ty       Type to inspect. Only vector, array and struct types are
///                 examined by the branches below.
/// \param MaxAlign In/out running maximum. Set to 16 for a 128-bit vector, or
///                 raised to the largest alignment computed for an array's
///                 element type or a struct's elements.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
// NOTE(review): the statement guarded by this check is not visible in this
// listing; 16 is the largest value this helper assigns.
if (MaxAlign == 16)
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
MaxAlign = Align(16);
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
// Recurse into the element type with a fresh default-constructed Align.
Align EltAlign;
getMaxByValAlign(ATy->getElementType(), EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
// Take the maximum over all struct members.
for (auto *EltTy : STy->elements()) {
Align EltAlign;
getMaxByValAlign(EltTy, EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
// NOTE(review): the statement guarded by this check is not visible in this
// listing.
if (MaxAlign == 16)
/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
/// \param Ty Type of the ByVal argument.
/// \param DL Data layout; consulted for the ABI alignment on 64-bit subtargets.
/// \returns the alignment in bytes.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
if (Subtarget.is64Bit()) {
// Max of 8 and alignment of type.
Align TyAlign = DL.getABITypeAlign(Ty);
if (TyAlign > 8)
return TyAlign.value();
return 8;
// Non-64-bit: start from 4 bytes and let getMaxByValAlign raise it when SSE1
// is available.
Align Alignment(4);
if (Subtarget.hasSSE1())
getMaxByValAlign(Ty, Alignment);
return Alignment.value();
/// Picks the value type used to lower the memory operation described by
/// \p Op.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
/// \param Op             Describes the operation (size, alignment, kind).
/// \param FuncAttributes Function attributes; NoImplicitFloat disables every
///                       vector/FP choice below.
EVT X86TargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {
if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
// Vector types are considered for operations of at least 16 bytes, provided
// unaligned 16-byte accesses are not slow or the operation is 16-byte
// aligned.
if (Op.size() >= 16 &&
(!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
// FIXME: Check if unaligned 64-byte accesses are slow.
if (Op.size() >= 64 && Subtarget.hasAVX512() &&
(Subtarget.getPreferVectorWidth() >= 512)) {
return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
// FIXME: Check if unaligned 32-byte accesses are slow.
if (Op.size() >= 32 && Subtarget.hasAVX() &&
(Subtarget.getPreferVectorWidth() >= 256)) {
// Although this isn't a well-supported type for AVX1, we'll let
// legalization and shuffle lowering produce the optimal codegen. If we
// choose an optimal type with a vector element larger than a byte,
// getMemsetStores() may create an intermediate splat (using an integer
// multiply) before we splat as a vector.
return MVT::v32i8;
if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
return MVT::v16i8;
// TODO: Can SSE1 handle a byte vector?
// If we have SSE1 registers we should be able to use them.
if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
(Subtarget.getPreferVectorWidth() >= 128))
return MVT::v4f32;
} else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
// Do not use f64 to lower memcpy if source is string constant. It's
// better to use i32 to avoid the loads.
// Also, do not use f64 to lower memset unless this is a memset of zeros.
// The gymnastics of splatting a byte value into an XMM register and then
// only using 8-byte stores (because this is a CPU with slow unaligned
// 16-byte accesses) makes that a loser.
return MVT::f64;
// This is a compromise. If we reach here, unaligned accesses may be slow on
// this target. However, creating smaller, aligned accesses could be even
// slower and would certainly be a lot more code.
if (Subtarget.is64Bit() && Op.size() >= 8)
return MVT::i64;
return MVT::i32;
/// Reports whether \p VT is safe to use for memory operations. f32 and f64
/// are safe only when X86ScalarSSEf32 / X86ScalarSSEf64 respectively is set;
/// every other type is reported as safe.
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
if (VT == MVT::f32)
return X86ScalarSSEf32;
else if (VT == MVT::f64)
return X86ScalarSSEf64;
return true;
/// Reports whether a misaligned access of type \p VT is allowed.
/// \param VT    Type of the access.
/// \param Align Alignment of the access; only consulted for non-temporal
///              vector loads.
/// \param Flags Memory-operand flags (MONonTemporal and MOLoad are inspected).
/// \param Fast  Optional out-parameter; when non-null it receives whether the
///              access is expected to be fast, keyed on the bit width of VT.
/// \returns false for non-temporal vector accesses that are not loads; for
/// non-temporal vector loads, true only when Align < 16 or SSE4.1 is
/// unavailable; true for everything else.
/// NOTE(review): the switch's default label and break statements are not
/// visible in this listing -- confirm against the full source.
bool X86TargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Fast) {
switch (VT.getSizeInBits()) {
// 8-byte and under are always assumed to be fast.
*Fast = true;
case 128:
*Fast = !Subtarget.isUnalignedMem16Slow();
case 256:
*Fast = !Subtarget.isUnalignedMem32Slow();
// TODO: What about AVX-512 (512-bit) accesses?
// NonTemporal vector memory ops must be aligned.
if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
// NT loads can only be vector aligned, so if it's less aligned than the
// minimum vector size (which we can split the vector down to), we might as
// well use a regular unaligned vector load.
// We don't have any NT loads pre-SSE41.
if (!!(Flags & MachineMemOperand::MOLoad))
return (Align < 16 || !Subtarget.hasSSE41());
return false;
// Misaligned accesses of any size are always allowed.
return true;
/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
/// \returns EK_Custom32 for position-independent code with GOT-style PIC;
/// otherwise whatever TargetLowering::getJumpTableEncoding() selects.
unsigned X86TargetLowering::getJumpTableEncoding() const {
// In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
// symbol.
if (isPositionIndependent() && Subtarget.isPICStyleGOT())
return MachineJumpTableInfo::EK_Custom32;
// Otherwise, use the normal jump table encoding heuristics.
return TargetLowering::getJumpTableEncoding();
/// Forwards to the subtarget's soft-float setting.
bool X86TargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
/// Marks leading integer/pointer libcall arguments in \p Args as IsInReg,
/// drawing on the register-parameter budget reported by the enclosing module.
/// \param MF   Function being compiled; supplies the module and data layout.
/// \param CC   Calling convention of the libcall.
/// \param Args Argument list whose IsInReg flags are updated in place.
/// NOTE(review): the statements guarded by the three bare `if` checks below
/// (the two calling-convention guards and the budget check) are not visible
/// in this listing -- confirm against the full source.
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
ArgListTy &Args) const {
// Only relabel X86-32 for C / Stdcall CCs.
if (Subtarget.is64Bit())
if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
// Budget of register parameters; stays 0 when the function has no parent
// module.
unsigned ParamRegs = 0;
if (auto *M = MF->getFunction().getParent())
ParamRegs = M->getNumberRegisterParameters();
// Mark the first N int arguments as having reg
for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
Type *T = Args[Idx].Ty;
if (T->isIntOrPtrTy())
if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
// Arguments with an alloc size above 4 bytes consume two registers.
unsigned numRegs = 1;
if (MF->getDataLayout().getTypeAllocSize(T) > 4)
numRegs = 2;
if (ParamRegs < numRegs)
ParamRegs -= numRegs;
Args[Idx].IsInReg = true;
/// Builds the expression for one custom jump table entry: a reference to the
/// symbol of \p MBB with the VK_GOTOFF variant kind. Asserts that the code is
/// position independent with GOT-style PIC.
/// \param MJTI Jump table info (not referenced by the visible code).
/// \param MBB  Destination block of the entry.
/// \param uid  Jump table id (not referenced by the visible code).
/// \param Ctx  MC context used to create the expression.
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
unsigned uid,MCContext &Ctx) const{
assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
// In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
// entries.
return MCSymbolRefExpr::create(MBB->getSymbol(),
MCSymbolRefExpr::VK_GOTOFF, Ctx);
/// Returns relocation base for the given PIC jumptable.
/// On non-64-bit subtargets this is an X86ISD::GlobalBaseReg node; otherwise
/// \p Table itself is returned.
/// NOTE(review): the trailing arguments of the getNode call are not visible in
/// this listing.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (!Subtarget.is64Bit())
// This doesn't have SDLoc associated with it, but is not really the
// same as a Register.
return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
return Table;
/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
/// \param MF  Function owning the jump table; supplies the PIC base symbol.
/// \param JTI Jump table index, forwarded to the generic implementation.
/// \param Ctx MC context used to create the expression.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
MCContext &Ctx) const {
// X86-64 uses RIP relative addressing based on the jump table label.
if (Subtarget.isPICStyleRIPRel())
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
// Otherwise, the reference is relative to the PIC base.
return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
/// Returns the representative register class for \p VT together with a cost.
/// \param TRI Register info, forwarded to the generic implementation.
/// \param VT  Value type to classify.
/// \returns {class, cost}; the cost is left at its initial value of 1 by the
/// visible code. Types not listed in the switch defer to TargetLowering.
/// NOTE(review): the switch's default label and break statements are not
/// visible in this listing -- confirm against the full source.
std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const {
const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
return TargetLowering::findRepresentativeClass(TRI, VT);
// Scalar integers: GR64 on 64-bit subtargets, GR32 otherwise.
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
case MVT::x86mmx:
RRC = &X86::VR64RegClass;
// Scalar FP and the listed 128/256/512-bit vector types all map to VR128X.
case MVT::f32: case MVT::f64:
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
case MVT::v8f32: case MVT::v4f64:
case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
case MVT::v16f32: case MVT::v8f64:
RRC = &X86::VR128XRegClass;
return std::make_pair(RRC, Cost);
/// Returns the address space number used for segment-relative slots: on 64-bit
/// subtargets 256 under the Kernel code model and 257 otherwise; 256 on
/// non-64-bit subtargets.
/// NOTE(review): presumably 256 selects %gs and 257 selects %fs -- confirm.
unsigned X86TargetLowering::getAddressSpace() const {
if (Subtarget.is64Bit())
return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
return 256;
/// \returns true when \p TargetTriple is a glibc or Fuchsia target, or an
/// Android target whose version is not lower than 17.
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
(TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
/// Builds a constant pointer by converting the 32-bit integer \p Offset with
/// an inttoptr constant expression.
/// \param IRB          Builder that supplies the LLVM context.
/// \param Offset       Integer value converted to a pointer.
/// \param AddressSpace NOTE(review): presumably the address space of the
///                     resulting pointer type; the destination-type argument
///                     is not visible in this listing -- confirm.
static Constant* SegmentOffset(IRBuilder<> &IRB,
unsigned Offset, unsigned AddressSpace) {
return ConstantExpr::getIntToPtr(
ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
/// Returns the IR value that locates the stack guard. Targets accepted by
/// hasStackGuardSlotTLS use a fixed segment-relative slot (offset 0x10 on
/// Fuchsia; otherwise 0x28 on 64-bit and 0x14 on non-64-bit); all other
/// targets defer to TargetLowering::getIRStackGuard.
Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
// glibc, bionic, and Fuchsia have a special slot for the stack guard in
// tcbhead_t; use it instead of the usual global variable (see
// sysdeps/{i386,x86_64}/nptl/tls.h)
if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
if (Subtarget.isTargetFuchsia()) {
// <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
return SegmentOffset(IRB, 0x10, getAddressSpace());
} else {
// %fs:0x28, unless we're using a Kernel code model, in which case
// it's %gs:0x28. gs:0x14 on i386.
unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
return SegmentOffset(IRB, Offset, getAddressSpace());
return TargetLowering::getIRStackGuard(IRB);
/// Inserts the declarations needed for stack protection into module \p M.
/// For Windows MSVC / Windows Itanium environments the visible code declares
/// `__security_check_cookie` (void return) and marks its first parameter InReg.
/// NOTE(review): several lines of this function (the cookie global's
/// declaration, the remaining getOrInsertFunction arguments, and the bodies of
/// the trailing checks) are not visible in this listing -- confirm against the
/// full source.
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
// MSVC CRT has a global variable holding security cookie.
// MSVC CRT has a function to validate security cookie.
FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
"__security_check_cookie", Type::getVoidTy(M.getContext()),
// The callee may not be a plain Function; only annotate it when it is.
if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
F->addAttribute(1, Attribute::AttrKind::InReg);
// glibc, bionic, and Fuchsia have a special slot for the stack guard.
if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
/// Returns the stack guard value used by SelectionDAG for module \p M: the
/// `__security_cookie` global on Windows MSVC / Windows Itanium environments,
/// otherwise whatever TargetLowering::getSDagStackGuard returns.
Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
// MSVC CRT has a global variable holding security cookie.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
return M.getGlobalVariable("__security_cookie");
return TargetLowering::getSDagStackGuard(M);
/// Returns the function that validates the stack guard for module \p M: the
/// `__security_check_cookie` function on Windows MSVC / Windows Itanium
/// environments, otherwise whatever TargetLowering::getSSPStackGuardCheck
/// returns.
Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
return M.getFunction("__security_check_cookie");
return TargetLowering::getSSPStackGuardCheck(M);
/// Returns the IR value that locates the SafeStack pointer. Contiki uses
/// getDefaultSafeStackPointerLocation; Android and Fuchsia use fixed
/// segment-relative slots; all other targets defer to TargetLowering.
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
if (Subtarget.getTargetTriple().isOSContiki())
return getDefaultSafeStackPointerLocation(IRB, false);
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
if (Subtarget.isTargetAndroid()) {
// %fs:0x48, unless we're using a Kernel code model, in which case it's
// %gs:0x48. %gs:0x24 on i386
unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
return SegmentOffset(IRB, Offset, getAddressSpace());
// Fuchsia is similar.
if (Subtarget.isTargetFuchsia()) {
// <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
return SegmentOffset(IRB, 0x18, getAddressSpace());
return TargetLowering::getSafeStackPointerLocation(IRB);
/// Reports whether a cast from address space \p SrcAS to \p DestAS is a no-op.
/// The two spaces must differ (asserted). The cast is a no-op only when both
/// have the same pointer size and both are numbered below 256.
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
assert(SrcAS != DestAS && "Expected different address spaces!");
const TargetMachine &TM = getTargetMachine();
// A change in pointer width can never be a no-op.
if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS))
return false;
return SrcAS < 256 && DestAS < 256;
// Return Value Calling Convention Implementation
/// Reports whether the return values described by \p Outs pass the RetCC_X86
/// return-convention check.
/// \param CallConv Calling convention of the function.
/// \param MF       Function being lowered.
/// \param isVarArg Whether the function is variadic.
/// \param Outs     Description of the values being returned.
/// \param Context  LLVM context used to build the CCState.
bool X86TargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
// RVLocs only backs the CCState; its contents are not read afterwards.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC_X86);
/// Returns a zero-terminated static list of scratch registers; the list holds
/// only X86::R11 and does not depend on the calling convention argument.
const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
return ScratchRegs;
/// Lowers masks values (v*i1) to the local register values
/// \param ValArg Mask value to lower.
/// \param ValLoc Type of the location the value is lowered to.
/// \param Dl     Source location for the created nodes.
/// \param DAG    DAG in which the nodes are created.
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
const SDLoc &Dl, SelectionDAG &DAG) {
EVT ValVT = ValArg.getValueType();
// A single-element mask is lowered by extracting element 0.
if (ValVT == MVT::v1i1)
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
DAG.getIntPtrConstant(0, Dl));
if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
(ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
// Two stage lowering might be required
// bitcast: v8i1 -> i8 / v16i1 -> i16
// anyextend: i8 -> i32 / i16 -> i32
EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
if (ValLoc == MVT::i32)
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
return ValToCopy;
if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
(ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
// One stage lowering is required
// bitcast: v32i1 -> i32 / v64i1 -> i64
return DAG.getBitcast(ValLoc, ValArg);
// Any other combination is any-extended to the location type.
return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
/// Breaks v64i1 value into two registers and adds the new node to the DAG
/// \param Dl         Source location for the created nodes.
/// \param DAG        DAG in which the nodes are created.
/// \param Arg        In/out value to split; asserted to have type i64 and
///                   overwritten with its i64 bitcast.
/// \param RegsToPass Receives the two (register, value) pairs.
/// \param VA         Location whose register receives the low i32 half.
/// \param NextVA     Location whose register receives the high i32 half.
/// \param Subtarget  Asserted to have BWI and to be a 32-bit target.
static void Passv64i1ArgInRegs(
const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
CCValAssign &NextVA, const X86Subtarget &Subtarget) {
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
"The value should reside in two registers");
// Before splitting the value we cast it to i64
Arg = DAG.getBitcast(MVT::i64, Arg);
// Splitting the value into two i32 types
SDValue Lo, Hi;
// EXTRACT_ELEMENT index 0 yields Lo, index 1 yields Hi.
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(0, Dl, MVT::i32));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(1, Dl, MVT::i32));
// Attach the two i32 types into corresponding registers
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
// In some cases we need to disable registers from the default CSR list.
// For example, when they are used for argument passing.
bool ShouldDisableCalleeSavedRegister =
CallConv == CallingConv::X86_RegCall ||
if (CallConv == CallingConv::X86_INTR && !Outs.empty())
report_fatal_error("X86 interrupts may not return any value");
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
SmallVector<std::pair<Register, SDValue>, 4> RetVals;
for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
// Add the register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
SDValue ValToCopy = OutVals[OutsIndex];
EVT ValVT = ValToCopy.getValueType();
// Promote values to the appropriate types.
if (VA.getLocInfo() == CCValAssign::SExt)
ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::ZExt)
ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::AExt) {
if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::BCvt)
ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
assert(VA.getLocInfo() != CCValAssign::FPExt &&
"Unexpected FP-extend for return value.");
// Report an error if we have attempted to return a value via an XMM
// register and SSE was disabled.
if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
} else if (!Subtarget.hasSSE2() &&
X86::FR64XRegClass.contains(VA.getLocReg()) &&
ValVT == MVT::f64) {
// When returning a double via an XMM register, report an error if SSE2 is
// not enabled.
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
if (VA.getLocReg() == X86::FP0 ||
VA.getLocReg() == X86::FP1) {
// If this is a copy from an xmm register to ST(0), use an FPExtend to
// change the value to the FP stack register class.
if (isScalarFPTypeInSSEReg(VA.getValVT()))
ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
// Don't emit a copytoreg.
// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
// which is returned in RAX / RDX.
if (Subtarget.is64Bit()) {
if (ValVT == MVT::x86mmx) {
if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
if (!Subtarget.hasSSE2())
ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
// Add the second register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
} else {
RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
SDValue Flag;
SmallVector<SDValue, 6> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
// Operand #1 = Bytes To Pop
RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
// Copy the result values into the output registers.
for (auto &RetVal : RetVals) {
if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
continue; // Don't emit a copytoreg.
Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
Flag = Chain.getValue(1);
DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
// Swift calling convention does not require we copy the sret argument
// into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
// All x86 ABIs require that for returning structs by value we copy
// the sret argument into %rax/%eax (depending on ABI) for the return.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
// Checking Function.hasStructRetAttr() here is insufficient because the IR
// may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
// When we have both sret and another return value, we should use the
// original Chain stored in RetOps[0], instead of the current Chain updated
// in the above loop. If we only have sret, RetOps[0] equals to Chain.
// For the case of sret and another return value, we have
// Chain_0 at the function entry
// Chain_1 = getCopyToReg(Chain_0) in the above loop
// If we use Chain_1 in getCopyFromReg, we will have
// Val = getCopyFromReg(Chain_1)
// Chain_2 = getCopyToReg(Chain_1, Val) from below
// getCopyToReg(Chain_0) will be glued together with
// getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
// in Unit B, and we will have cyclic dependency between Unit A and Unit B:
// Data dependency from Unit B to Unit A due to usage of Val in
// getCopyToReg(Chain_1, Val)
// Chain dependency from Unit A to Unit B
// So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
Register RetValReg
= (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
X86::RAX : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
// RAX/EAX now acts like a return value.
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
// Add the returned register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *I =
if (I) {
for (; *I; ++I) {
if (X86::GR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
X86ISD::NodeType opcode = X86ISD::RET_FLAG;
if (CallConv == CallingConv::X86_INTR)
opcode = X86ISD::IRET;
return DAG.getNode(opcode, dl, MVT::Other, RetOps);
/// Return true when the single result of \p N is consumed only by a return
/// node, possibly through one intervening CopyToReg or FP_EXTEND. On success
/// \p Chain is replaced with the chain to use for a tail call; on failure
/// \p Chain is left unmodified.
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
// N must define exactly one value, and that value must have exactly one use.
if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
return false;
// Work on a local copy so Chain is only written once all checks have passed.
SDValue TCChain = Chain;
SDNode *Copy = *N->use_begin();
if (Copy->getOpcode() == ISD::CopyToReg) {
// If the copy has a glue operand, we conservatively assume it isn't safe to
// perform a tail call.
if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
return false;
// Continue with the chain feeding the copy (its operand 0).
TCChain = Copy->getOperand(0);
} else if (Copy->getOpcode() != ISD::FP_EXTEND)
return false;
// Every user of the copy/extend has to be an X86ISD::RET_FLAG node.
bool HasRet = false;
for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
UI != UE; ++UI) {
if (UI->getOpcode() != X86ISD::RET_FLAG)
return false;
// If we are returning more than one value, we can definitely
// not make a tail call see PR19530
if (UI->getNumOperands() > 4)
return false;
// With exactly four operands the last one must be glue.
if (UI->getNumOperands() == 4 &&
UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
return false;
HasRet = true;
// Reject the case where no return user was seen at all.
if (!HasRet)
return false;
Chain = TCChain;
return true;
/// Choose the type an integer return value of type \p VT is widened to.
/// The minimum is the register type for i32, or for i8 when the condition
/// below holds; \p VT itself is returned when it is not narrower than that
/// minimum.
/// NOTE(review): \p ExtendKind is not referenced in the lines visible here.
EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
ISD::NodeType ExtendKind) const {
MVT ReturnMVT = MVT::i32;
bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
// The ABI does not require i1, i8 or i16 to be extended.
// On Darwin, there is code in the wild relying on Clang's old behaviour of
// always extending i8/i16 return values, so keep doing that for now.
// (PR26665).
ReturnMVT = MVT::i8;
// Only widen: a VT at least as wide as the minimum is returned unchanged.
EVT MinVT = getRegisterType(Context, ReturnMVT);
return VT.bitsLT(MinVT) ? MinVT : VT;
/// Reads two 32-bit registers and creates a 64-bit mask value (v64i1).
/// \param VA The location of the first 32-bit half of the value.
/// \param NextVA The location of the second 32-bit half of the value.
/// \param Root The parent DAG node, used as the chain for the register reads.
/// \param [in,out] InFlag Represents an SDValue in the parent DAG node for
/// glue purposes. When it is non-null the physical
/// registers are read directly and each read is glued to
/// the previous one through InFlag; when it is null the
/// registers are first made live-in virtual registers.
/// \return a new SDValue of type v64i1.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG,
const SDLoc &Dl, const X86Subtarget &Subtarget,
SDValue *InFlag = nullptr) {
// Preconditions: AVX512BW, 32-bit target, and both halves of a v64i1 value
// assigned to registers.
assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
assert(VA.getValVT() == MVT::v64i1 &&
"Expecting first location of 64 bit width type");
assert(NextVA.getValVT() == VA.getValVT() &&
"The locations should have the same type");
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
"The values should reside in two registers");
SDValue Lo, Hi;
SDValue ArgValueLo, ArgValueHi;
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterClass *RC = &X86::GR32RegClass;
// Read a 32 bit value from the registers.
if (nullptr == InFlag) {
// When no physical register is present,
// create an intermediate virtual register.
Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
} else {
// When a physical register is available read the value from it and glue
// the reads together.
ArgValueLo =
DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
// Result #2 of the first read becomes the glue input of the second read.
*InFlag = ArgValueLo.getValue(2);
ArgValueHi =
DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
// Hand the final glue back to the caller through InFlag.
*InFlag = ArgValueHi.getValue(2);
// Convert the low i32 into a v32i1.
Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
// Convert the high i32 into a v32i1.
Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
// Concatenate the two values together.
return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
/// The function will lower a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
/// \param ValArg The value read from the register.
/// \param ValVT The mask type to produce.
/// \param ValLoc The type of the location the value was read from; only
/// checked (by assertion) in the v64i1 case.
/// \returns a DAG node containing the operand after lowering to mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
const EVT &ValLoc, const SDLoc &Dl,
SelectionDAG &DAG) {
SDValue ValReturned = ValArg;
// A single-element mask is built directly from the scalar.
if (ValVT == MVT::v1i1)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
if (ValVT == MVT::v64i1) {
// In 32 bit machine, this case is handled by getv64i1Argument
assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
// In 64 bit machine, There is no need to truncate the value only bitcast
} else {
// Pick the integer type whose width matches the mask's element count.
// NOTE(review): no break/default lines are visible between the cases in
// this extract; presumably each case ends in a break and the
// llvm_unreachable belongs to a default label - confirm against the
// complete file.
MVT maskLen;
switch (ValVT.getSimpleVT().SimpleTy) {
case MVT::v8i1:
maskLen = MVT::i8;
case MVT::v16i1:
maskLen = MVT::i16;
case MVT::v32i1:
maskLen = MVT::i32;
llvm_unreachable("Expecting a vector of i1 types");
// Narrow the register value to that integer type before the bitcast.
ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
return DAG.getBitcast(ValVT, ValReturned);
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
/// The return locations are computed with RetCC_X86 and each one is copied
/// out of its register, threading \p Chain and \p InFlag through the copies.
/// When \p RegMask is non-null, the bits of every register used for a result
/// (and of its sub-registers) are cleared in it.
/// \returns the chain after the last copy.
SDValue X86TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
uint32_t *RegMask) const {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
// I indexes RVLocs and InsIndex indexes Ins; I is advanced an extra step
// below when one value occupies two locations.
for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
++I, ++InsIndex) {
CCValAssign &VA = RVLocs[I];
EVT CopyVT = VA.getLocVT();
// In some calling conventions we need to remove the used registers
// from the register mask.
if (RegMask) {
// The mask holds one bit per register, 32 registers per word.
for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
// Report an error if there was an attempt to return FP values via XMM
// registers.
if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
if (VA.getLocReg() == X86::XMM1)
VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
} else if (!Subtarget.hasSSE2() &&
X86::FR64XRegClass.contains(VA.getLocReg()) &&
CopyVT == MVT::f64) {
// Same diagnosis for an f64 result in an XMM register without SSE2.
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
if (VA.getLocReg() == X86::XMM1)
VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
bool RoundAfterCopy = false;
if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
if (!Subtarget.hasX87())
report_fatal_error("X87 register return with X87 disabled");
CopyVT = MVT::f80;
RoundAfterCopy = (CopyVT != VA.getLocVT());
SDValue Val;
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// The value spans two locations; ++I consumes the second one here.
Val =
getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
} else {
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
Val = Chain.getValue(0);
InFlag = Chain.getValue(2);
// Round the f80 copy back down to the value type.
if (RoundAfterCopy)
Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
// This truncation won't change the value.
DAG.getIntPtrConstant(1, dl));
// Values with an i1 scalar type that were extended in their location are
// converted back to the value type here.
if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
if (VA.getValVT().isVector() &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
} else
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
// A bit-converted location is cast back to the value type.
if (VA.getLocInfo() == CCValAssign::BCvt)
Val = DAG.getBitcast(VA.getValVT(), Val);
return Chain;
// C & StdCall & Fast Calling Convention implementation
// The StdCall calling convention is the standard for many Windows API
// routines. It differs from the C calling convention only slightly:
// the callee, not the caller, cleans up the stack. Symbols are also
// decorated in a special way. It doesn't support any vector arguments.
// For info on fast calling convention see Fast Calling Convention (tail call)
// implementation LowerX86_32FastCCCallTo.
/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
/// NOTE(review): the enumerators of StructReturnType are not visible in this
/// extract; the functions below use NotStructReturn, RegStructReturn and
/// StackStructReturn.
enum StructReturnType {
/// Classify a call by the flags of its first outgoing argument: no sret flag
/// (or no arguments at all) means NotStructReturn; an sret argument that is
/// also inreg, or any sret when \p IsMCU is set, means RegStructReturn;
/// every other sret means StackStructReturn.
static StructReturnType
callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
if (Outs.empty())
return NotStructReturn;
// Only the first argument is examined for the sret flag.
const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
if (!Flags.isSRet())
return NotStructReturn;
if (Flags.isInReg() || IsMCU)
return RegStructReturn;
return StackStructReturn;
/// Determines whether a function uses struct return semantics.
/// The classification is based on the flags of the first incoming argument:
/// no sret flag (or no arguments) yields NotStructReturn; sret combined with
/// inreg, or any sret when \p IsMCU is set, yields RegStructReturn; every
/// other sret yields StackStructReturn.
static StructReturnType
argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
if (Ins.empty())
return NotStructReturn;
// Only the first argument is examined for the sret flag.
const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
if (!Flags.isSRet())
return NotStructReturn;
if (Flags.isInReg() || IsMCU)
return RegStructReturn;
return StackStructReturn;
/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
/// \returns the memcpy node created for the copy, chained on \p Chain.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SDValue Chain, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) {
// The byval size is materialised as an i32 constant.
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
// Non-volatile, always inlined, not a tail call, and with no pointer info
// for either side.
return DAG.getMemcpy(
Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
/*isVolatile*/ false, /*AlwaysInline=*/true,
/*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
/// Return true if the calling convention is one that we can guarantee TCO for.
/// The conventions accepted are Fast, GHC, X86_RegCall, HiPE, HHVM and Tail.
static bool canGuaranteeTCO(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
CC == CallingConv::HHVM || CC == CallingConv::Tail);
/// Return true if we might ever do TCO for calls with this calling convention.
/// The conventions listed in the switch are accepted outright; any other
/// convention is accepted only if canGuaranteeTCO() accepts it.
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
// C calling conventions:
case CallingConv::C:
case CallingConv::Win64:
case CallingConv::X86_64_SysV:
// Callee pop conventions:
case CallingConv::X86_ThisCall:
case CallingConv::X86_StdCall:
case CallingConv::X86_VectorCall:
case CallingConv::X86_FastCall:
// Swift:
case CallingConv::Swift:
return true;
// NOTE(review): presumably a default label precedes the next return in the
// complete file - it is not visible in this extract.
return canGuaranteeTCO(CC);
/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
/// That is the case for CallingConv::Tail unconditionally, and for the other
/// conventions accepted by canGuaranteeTCO() only when
/// \p GuaranteedTailCallOpt is set.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
/// Return true if the call instruction \p CI is marked as a tail call and its
/// calling convention is one accepted by mayTailCallThisCC().
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// Calls not marked as tail calls are never candidates.
if (!CI->isTailCall())
return false;
CallingConv::ID CalleeCC = CI->getCallingConv();
if (!mayTailCallThisCC(CalleeCC))
return false;
return true;
/// Lower an incoming argument that lives in memory (location \p VA, argument
/// index \p i in \p Ins). Depending on the argument this yields the frame
/// index of a fixed stack object (byval), or a load from a fixed stack object
/// that is converted back to the value type when the value was extended in
/// memory.
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
MachineFrameInfo &MFI, unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
// Objects are immutable only when TCO is not guaranteed for this convention
// and the argument is not byval.
bool AlwaysUseMutable = shouldGuaranteeTCO(
CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
MVT PtrVT = getPointerTy(DAG.getDataLayout());
// If value is passed by pointer we have address passed instead of the value
// itself. No need to extend if the mask value and location share the same
// absolute size.
bool ExtendedInMem =
VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
// NOTE(review): the declaration of ValVT and the else between the two
// assignments below are not visible in this extract.
if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
ValVT = VA.getLocVT();
ValVT = VA.getValVT();
// FIXME: For now, all byval parameter objects are marked mutable. This can be
// changed with more analysis.
// In case of tail call optimization mark all arguments mutable. Since they
// could be overwritten by lowering of arguments in case of a tail call.
if (Flags.isByVal()) {
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
// FIXME: For now, all byval parameter objects are marked as aliasing. This
// can be improved with deeper analysis.
int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
// A byval argument is represented by the address of its stack object.
return DAG.getFrameIndex(FI, PtrVT);
// This is an argument in memory. We might be able to perform copy elision.
// If the argument is passed directly in memory without any extension, then we
// can perform copy elision. Large vector types, for example, may be passed
// indirectly by pointer.
if (Flags.isCopyElisionCandidate() &&
VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
EVT ArgVT = Ins[i].ArgVT;
SDValue PartAddr;
if (Ins[i].PartOffset == 0) {
// If this is a one-part value or the first part of a multi-part value,
// create a stack object for the entire argument value type and return a
// load from our portion of it. This assumes that if the first part of an
// argument is in memory, the rest will also be in memory.
int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
PartAddr = DAG.getFrameIndex(FI, PtrVT);
return DAG.getLoad(
ValVT, dl, Chain, PartAddr,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
} else {
// This is not the first piece of an argument in memory. See if there is
// already a fixed stack object including this offset. If so, assume it
// was created by the PartOffset == 0 branch above and create a load from
// the appropriate offset into it.
int64_t PartBegin = VA.getLocMemOffset();
int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
// Scan the fixed objects for one whose byte range covers this part.
int FI = MFI.getObjectIndexBegin();
for (; MFI.isFixedObjectIndex(FI); ++FI) {
int64_t ObjBegin = MFI.getObjectOffset(FI);
int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
// FI still naming a fixed object means the scan found a match.
if (MFI.isFixedObjectIndex(FI)) {
SDValue Addr =
DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
return DAG.getLoad(
ValVT, dl, Chain, Addr,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
// General case: create a fixed object sized for ValVT at the argument's
// stack offset and load from it.
int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
VA.getLocMemOffset(), isImmutable);
// Set SExt or ZExt flag.
if (VA.getLocInfo() == CCValAssign::ZExt) {
MFI.setObjectZExt(FI, true);
} else if (VA.getLocInfo() == CCValAssign::SExt) {
MFI.setObjectSExt(FI, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
// A value that was extended in memory was loaded with the location type;
// convert it back to the value type (SCALAR_TO_VECTOR for vectors, TRUNCATE
// otherwise).
return ExtendedInMem
? (VA.getValVT().isVector()
? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
: DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
: Val;
// FIXME: Get this from tablegen.
/// Return the list of 64-bit general purpose argument registers for
/// \p CallConv: RCX, RDX, R8, R9 when the subtarget reports the convention as
/// Win64, otherwise RDI, RSI, RDX, RCX, R8, R9. The returned ArrayRef refers
/// to a function-local static array.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
const X86Subtarget &Subtarget) {
if (Subtarget.isCallingConvWin64(CallConv)) {
static const MCPhysReg GPR64ArgRegsWin64[] = {
X86::RCX, X86::RDX, X86::R8, X86::R9
return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
// Every other 64-bit convention uses the six-register list.
static const MCPhysReg GPR64ArgRegs64Bit[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
// FIXME: Get this from tablegen.
/// Return the list of XMM argument registers for \p CallConv. The list is
/// empty for Win64 conventions and whenever soft-float is in use, the
/// function has the NoImplicitFloat attribute, or SSE1 is unavailable;
/// otherwise it is XMM0 through XMM7 (a function-local static array).
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
CallingConv::ID CallConv,
const X86Subtarget &Subtarget) {
if (Subtarget.isCallingConvWin64(CallConv)) {
// The XMM registers which might contain var arg parameters are shadowed
// in their paired GPR. So we only need to save the GPR to their home
// slots.
// TODO: __vectorcall will change this.
return None;
const Function &F = MF.getFunction();
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool isSoftFloat = Subtarget.useSoftFloat();
// Soft-float together with NoImplicitFloat is treated as an invalid
// combination.
assert(!(isSoftFloat && NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
// Kernel mode asks for SSE to be disabled, so there are no XMM argument
// registers.
return None;
static const MCPhysReg XMMArgRegs64Bit[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
#ifndef NDEBUG
/// Helper compiled only when NDEBUG is not defined: returns true when
/// \p ArgLocs is sorted by the value number (getValNo()) of its entries.
static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
return llvm::is_sorted(
ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
// Order locations by the index of the value they belong to.
return A.getValNo() < B.getValNo();
namespace {
/// This is a helper class for lowering variable arguments parameters.
/// It bundles the state shared by the lowering steps declared below:
/// creating the vararg area and spilling argument registers, and forwarding
/// registers for musttail calls.
/// NOTE(review): access specifiers are not visible in this extract, and the
/// constructor's visible initializer list does not mention the reference
/// members TheMachineFunction, TheFunction, FrameInfo and FrameLowering -
/// confirm against the complete file.
class VarArgsLoweringHelper {
VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
SelectionDAG &DAG, const X86Subtarget &Subtarget,
CallingConv::ID CallConv, CCState &CCInfo)
: FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
CCInfo(CCInfo) {}
// Lower variable arguments parameters.
void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
// Create the vararg area and store unallocated argument registers into it.
void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
// Copy the registers that must be forwarded to a musttail call.
void forwardMustTailParameters(SDValue &Chain);
// Convenience queries on the subtarget and calling convention.
bool is64Bit() { return Subtarget.is64Bit(); }
bool isWin64() { return Subtarget.isCallingConvWin64(CallConv); }
// State captured at construction and used by the methods above.
X86MachineFunctionInfo *FuncInfo;
const SDLoc &DL;
SelectionDAG &DAG;
const X86Subtarget &Subtarget;
MachineFunction &TheMachineFunction;
const Function &TheFunction;
MachineFrameInfo &FrameInfo;
const TargetFrameLowering &FrameLowering;
const TargetLowering &TargLowering;
CallingConv::ID CallConv;
CCState &CCInfo;
} // namespace
/// Create the frame objects needed by va_start and, on 64-bit targets, store
/// the argument registers that were not allocated to named parameters.
/// \param Chain [in,out] replaced by a TokenFactor of the generated stores
/// when any store is emitted.
/// \param StackSize offset used for the fixed object marking the first
/// vararg on the stack.
/// NOTE(review): several statement-opening lines (for example the calls that
/// receive the CreateFixedObject results) are not visible in this extract.
void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
SDValue &Chain, unsigned StackSize) {
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start. We
// can skip this if there are no va_start calls.
if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
CallConv != CallingConv::X86_ThisCall)) {
FrameInfo.CreateFixedObject(1, StackSize, true));
// Figure out if XMM registers are in use.
assert(!(Subtarget.useSoftFloat() &&
TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) &&
"SSE register cannot be used when SSE is disabled!");
// 64-bit calling conventions support varargs and register parameters, so we
// have to do extra work to spill them in the prologue.
if (is64Bit()) {
// Find the first unallocated argument registers.
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs =
get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
if (isWin64()) {
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
// Fixup to set vararg frame on shadow area (4 x i64).
if (NumIntRegs < 4)
} else {
// For X86-64, if there are vararg parameters that are passed via
// registers, then we must store them to their spots on the stack so
// they may be loaded by dereferencing the result of va_next.
// GPR slots are 8 bytes each; XMM slots are 16 bytes each and follow all
// of the GPR slots.
FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
SmallVector<SDValue, 6>
LiveGPRs; // list of SDValue for GPR registers keeping live input value
SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
// keeping live input value
SDValue ALVal; // if applicable keeps SDValue for %al register
// Gather all the live in physical registers.
// Only the registers from index NumIntRegs onwards are collected.
for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
if (!AvailableXmms.empty()) {
// AL is read as well whenever there are XMM registers left to save.
Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
for (MCPhysReg Reg : AvailableXmms) {
Register XMMReg = TheMachineFunction.addLiveIn(Reg, &X86::VR128RegClass);
DAG.getCopyFromReg(Chain, DL, XMMReg, MVT::v4f32));
// Store the integer parameter registers.
SmallVector<SDValue, 8> MemOps;
// Start at the GP offset and advance by 8 bytes per stored register.
unsigned Offset = FuncInfo->getVarArgsGPOffset();
for (SDValue Val : LiveGPRs) {
SDValue FIN = DAG.getNode(ISD::ADD, DL,
RSFIN, DAG.getIntPtrConstant(Offset, DL));
SDValue Store =
DAG.getStore(Val.getValue(1), DL, Val, FIN,
FuncInfo->getRegSaveFrameIndex(), Offset));
Offset += 8;
// Now store the XMM (fp + vector) parameter registers.
if (!LiveXMMRegs.empty()) {
// All live XMM registers are saved by a single VASTART_SAVE_XMM_REGS node.
SmallVector<SDValue, 12> SaveXMMOps;
DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), DL));
DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), DL));
SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
MVT::Other, SaveXMMOps));
// Merge all the generated stores into a single chain.
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
/// Compute the registers that have to be forwarded to a musttail call and copy
/// each of them from its physical register into a fresh virtual register.
/// \param Chain [in,out] threaded through every register copy.
void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
// Find the largest legal vector type.
MVT VecVT = MVT::Other;
// FIXME: Only some x86_32 calling conventions support AVX512.
if (Subtarget.useAVX512Regs() &&
(is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
CallConv == CallingConv::Intel_OCL_BI)))
VecVT = MVT::v16f32;
else if (Subtarget.hasAVX())
VecVT = MVT::v8f32;
else if (Subtarget.hasSSE2())
VecVT = MVT::v4f32;
// We forward some GPRs and some vector types.
SmallVector<MVT, 2> RegParmTypes;
MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
// VecVT stays MVT::Other when none of the vector checks above matched.
if (VecVT != MVT::Other)
// Compute the set of forwarded registers. The rest are scratch.
SmallVectorImpl<ForwardedRegister> &Forwards =
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
// Forward AL for SysV x86_64 targets, since it is used for varargs.
if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
// Copy all forwards from physical to virtual registers.
for (ForwardedRegister &FR : Forwards) {
// FIXME: Can we use a less constrained schedule?
SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
// Replace the live-in register with a new virtual register holding the copy.
FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
/// Entry point for vararg lowering: creates the vararg area when the function
/// has a va_start, and performs the musttail handling when the frame info
/// reports a musttail call in a vararg function.
/// NOTE(review): the statements that mark the frame indices as unset, and the
/// statement guarded by the last condition, are not visible in this extract.
void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
unsigned StackSize) {
// Set FrameIndex to the 0xAAAAAAA value to mark unset state.
// If necessary, it would be set into the correct value later.
if (FrameInfo.hasVAStart())
createVarArgAreaAndStoreRegisters(Chain, StackSize);
if (FrameInfo.hasMustTailInVarArgFunc())
SDValue X86TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Function &F = MF.getFunction();
if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
F.getName() == "main")
MachineFrameInfo &MFI = MF.getFrameInfo();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
!(IsVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeArguments(Ins, CC_X86);
// In vectorcall calling convention a second pass is required for the HVA
// types.
if (CallingConv::X86_VectorCall == CallConv) {
CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
// The next loop assumes that the locations are in the same order of the
// input arguments.
assert(isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering");
SDValue ArgValue;
for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
++I, ++InsIndex) {
assert(InsIndex < Ins.size() && "Invalid Ins index");
CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
if (VA.needsCustom()) {
VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// v64i1 values, in regcall calling convention, that are
// compiled to 32 bit arch, are split up into two registers.
ArgValue =
getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
} else {
const TargetRegisterClass *RC;
if (RegVT == MVT::i8)
RC = &X86::GR8RegClass;
else if (RegVT == MVT::i16)
RC = &X86::GR16RegClass;
else if (RegVT == MVT::i32)
RC = &X86::GR32RegClass;
else if (Is64Bit && RegVT == MVT::i64)
RC = &X86::GR64RegClass;
else if (RegVT == MVT::f32)
RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
else if (RegVT == MVT::f64)
RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
else if (RegVT == MVT::f80)
RC = &X86::RFP80RegClass;
else if (RegVT == MVT::f128)
RC = &X86::VR128RegClass;
else if (RegVT.is512BitVector())
RC = &X86::VR512RegClass;
else if (RegVT.is256BitVector())
RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
else if (RegVT.is128BitVector())
RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
else if (RegVT == MVT::v1i1)
RC = &X86::VK1RegClass;
else if (RegVT == MVT::v8i1)
RC = &X86::VK8RegClass;
else if (RegVT == MVT::v16i1)
RC = &X86::VK16RegClass;
else if (RegVT == MVT::v32i1)
RC = &X86::VK32RegClass;
else if (RegVT == MVT::v64i1)
RC = &X86::VK64RegClass;
llvm_unreachable("Unknown argument type!");
Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
// If this is an 8 or 16-bit value, it is really passed promoted to 32
// bits. Insert an assert[sz]ext to capture this, then truncate to the
// right size.
if (VA.getLocInfo() == CCValAssign::SExt)
ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
else if (VA.getLocInfo() == CCValAssign::ZExt)
ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
else if (VA.getLocInfo() == CCValAssign::BCvt)
ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
if (VA.isExtInLoc()) {
// Handle MMX values passed in XMM regs.
if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
else if (VA.getValVT().isVector() &&
VA.getValVT().getScalarType() == MVT::i1 &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
} else
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
} else {
ArgValue =
LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
// If value is passed via pointer - do a load.
if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
ArgValue =
DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
// Swift calling convention does not require we copy the sret argument
// into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
if (CallConv == CallingConv::Swift)
// All x86 ABIs require that for returning structs by value we copy the
// sret argument into %rax/%eax (depending on ABI) for the return. Save
// the argument into a virtual register so that we can access it from the
// return points.
if (Ins[I].Flags.isSRet()) {
Register Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
MVT PtrTy = getPointerTy(DAG.getDataLayout());
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
unsigned StackSize = CCInfo.getNextStackOffset();
// Align stack specially for tail calls.
if (shouldGuaranteeTCO(CallConv,
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
if (IsVarArg)
VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
.lowerVarArgsParameters(Chain, StackSize);
// Some CCs need callee pop.
if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
// X86 interrupts must pop the error code (and the alignment padding) if
// present.
FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget.getTargetTriple().isOSMSVCRT() &&
argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
if (!Is64Bit) {
// RegSaveFrameIndex is X86-64 only.
if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
if (Personality == EHPersonality::CoreCLR) {
// TODO: Add a mechanism to frame lowering that will allow us to indicate
// that we'd prefer this slot be allocated towards the bottom of the frame
// (i.e. near the stack pointer after allocating the frame). Every
// funclet needs a copy of this slot in its (mostly empty) frame, and the
// offset from the bottom of this and each funclet's frame must be the
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
// space to accommodate holding this slot at the correct offset).
int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSS=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
if (CallConv == CallingConv::X86_RegCall ||
F.hasFnAttribute("no_caller_saved_registers")) {
MachineRegisterInfo &MRI = MF.getRegInfo();
for (std::pair<Register, Register> Pair : MRI.liveins())
return Chain;
/// Emit the node that places an outgoing call argument into its stack slot.
///
/// The slot address is \p StackPtr plus the memory offset recorded in \p VA.
/// When \p isByVal is set the argument is copied into the slot through
/// CreateCopyOfByValArgument; otherwise \p Arg is stored to the slot directly.
/// The SDValue produced by whichever of the two was emitted is returned.
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                            SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            ISD::ArgFlagsTy Flags,
                                            bool isByVal) const {
  const unsigned SlotOffset = VA.getLocMemOffset();

  // Address of the outgoing slot: StackPtr + SlotOffset.
  const SDValue SlotAddr =
      DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr,
                  DAG.getIntPtrConstant(SlotOffset, dl));

  if (!isByVal)
    return DAG.getStore(
        Chain, dl, Arg, SlotAddr,
        MachinePointerInfo::getStack(DAG.getMachineFunction(), SlotOffset));

  return CreateCopyOfByValArgument(Arg, SlotAddr, Chain, Flags, DAG, dl);
}
/// Emit a load of the return address; used when tail call optimization is
/// performed and the return address has to be moved.
///
/// \p OutRetAddr receives the loaded return-address value. The function result
/// is result #1 of that load node. IsTailCall, Is64Bit and FPDiff are not
/// consulted in this body.
SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
  // Frame index of the stack slot that holds the return address.
  const SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);

  // Read the "old" return address out of that slot.
  const EVT PtrVT = getPointerTy(DAG.getDataLayout());
  OutRetAddr =
      DAG.getLoad(PtrVT, dl, Chain, RetAddrSlot, MachinePointerInfo());

  // Hand back the second result of the load node.
  return SDValue(OutRetAddr.getNode(), 1);
}
/// Emit a store of the return address if tail call optimization is performed
/// and it is required (FPDiff != 0).
///
/// \param Chain        Incoming chain; returned unchanged when FPDiff is 0.
/// \param RetAddrFrIdx Value to store (the previously loaded return address).
/// \param PtrVT        Pointer type used for the new frame-index node.
/// \param SlotSize     Size in bytes of the return-address slot.
/// \param FPDiff       Displacement of the return-address slot for this call.
/// \returns the chain of the emitted store, or \p Chain if nothing was emitted.
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
                                        SDValue Chain, SDValue RetAddrFrIdx,
                                        EVT PtrVT, unsigned SlotSize,
                                        int FPDiff, const SDLoc &dl) {
  // Store the return address to the appropriate stack slot.
  if (!FPDiff)
    return Chain;

  // Calculate the new stack slot for the return address. The offset is
  // computed in 64 bits so that subtracting the unsigned SlotSize from a
  // (possibly negative) FPDiff does not wrap.
  int NewReturnAddrFI = MF.getFrameInfo().CreateFixedObject(
      SlotSize, (int64_t)FPDiff - SlotSize, /*IsImmutable=*/false);
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                       MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(), NewReturnAddrFI));
  return Chain;
}
/// Returns a vector_shuffle node for a movs{s|d}, movd operation of the
/// specified width.
///
/// The shuffle mask built here is {NumElems, 1, 2, ..., NumElems-1}: element 0
/// of the result is taken from the first element of \p V2 (mask indices at or
/// above NumElems select from the second operand) and every remaining element
/// is taken from the same position in \p V1.
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
                       SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  // Low element comes from V2.
  Mask.push_back(NumElems);
  // All other elements pass through from V1.
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Lower an outgoing call described by \p CLI into SelectionDAG nodes.
///
/// Visible steps: decide whether the call is a tail call / sibcall, assign
/// argument locations with CC_X86, open the call sequence, promote each
/// argument and place it in a register (RegsToPass) or a stack slot
/// (MemOpChains), obtain the call-preserved register mask, and emit
/// X86ISD::TC_RETURN, X86ISD::NT_CALL or X86ISD::CALL. For non-tail calls the
/// call sequence is then closed and the results are copied out through
/// LowerCallResult into \p InVals.
///
/// NOTE(review): this listing has no closing braces and several statements
/// stop mid-expression (e.g. the GOT visibility test and the
/// X86::isCalleePop condition), so lines appear to have been lost in
/// extraction — confirm against the upstream file before relying on the
/// control flow shown here.
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
// Unpack the call description; isTailCall is a reference so the decision made
// below is written back into CLI.
SelectionDAG &DAG = CLI.DAG;
SDLoc &dl = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
CallingConv::ID CallConv = CLI.CallConv;
bool &isTailCall = CLI.IsTailCall;
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
// HasNCSR: the call site or the called function carries the
// "no_caller_saved_registers" attribute.
const auto *CI = dyn_cast_or_null<CallInst>(CLI.CB);
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
bool HasNoCfCheck =
(CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
MachineFunction::CallSiteInfo CSInfo;
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
// that requires lazy function symbol resolution. Using musttail or
// GuaranteedTailCallOpt will override this.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (!G || (!G->getGlobal()->hasLocalLinkage() &&
isTailCall = false;
bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
// around.
isTailCall = true;
} else if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
MF.getFunction().hasStructRetAttr(), CLI.RetTy,
Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
if (!IsGuaranteeTCO && isTailCall)
IsSibcall = true;
if (isTailCall)
assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeArguments(Outs, CC_X86);
// In vectorcall calling convention a second pass is required for the HVA
// types.
if (CallingConv::X86_VectorCall == CallConv) {
CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
if (IsSibcall)
// This is a sibcall. The memory operands are available in caller's
// own caller's stack.
NumBytes = 0;
else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
if (isTailCall && !IsSibcall && !IsMustTail) {
// Lower arguments at fp - stackoffset + fpdiff.
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
FPDiff = NumBytesCallerPushed - NumBytes;
// Set the delta of movement of the returnaddr stackslot.
// But only set if delta is greater than previous delta.
if (FPDiff < X86Info->getTCReturnAddrDelta())
// NumBytesToPush is the amount handed to CALLSEQ_START below; NumBytesToPop
// is the amount handed to CALLSEQ_END.
unsigned NumBytesToPush = NumBytes;
unsigned NumBytesToPop = NumBytes;
// If we have an inalloca argument, all stack space has already been allocated
// for us and will be right at the top of the stack. We don't support multiple
// arguments passed in memory when using inalloca.
if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
NumBytesToPush = 0;
if (!ArgLocs.back().isMemLoc())
report_fatal_error("cannot use inalloca attribute on a register "
if (ArgLocs.back().getLocMemOffset() != 0)
report_fatal_error("any parameter with the inalloca attribute must be "
"the only memory argument");
} else if (CLI.IsPreallocated) {
assert(ArgLocs.back().isMemLoc() &&
"cannot use preallocated attribute on a register "
SmallVector<size_t, 4> PreallocatedOffsets;
for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
// Record the stack size and argument offsets for this preallocated call
// site in the machine function info.
auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
NumBytesToPush = 0;
if (!IsSibcall && !IsMustTail)
Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
NumBytes - NumBytesToPush, dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
if (isTailCall && FPDiff)
Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
Is64Bit, FPDiff, dl);
SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
// The next loop assumes that the locations are in the same order of the
// input arguments.
assert(isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering");
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handled later.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutIndex) {
assert(OutIndex < Outs.size() && "Invalid Out index");
// Skip inalloca/preallocated arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
if (Flags.isInAlloca() || Flags.isPreallocated())
CCValAssign &VA = ArgLocs[I];
EVT RegVT = VA.getLocVT();
SDValue Arg = OutVals[OutIndex];
bool isByVal = Flags.isByVal();
// Promote the value if needed.
// NOTE(review): no `break;` is visible after the SExt/ZExt/AExt/BCvt cases in
// this listing — presumably dropped in extraction; confirm against upstream.
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
case CCValAssign::AExt:
if (Arg.getValueType().isVector() &&
Arg.getValueType().getVectorElementType() == MVT::i1)
Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
else if (RegVT.is128BitVector()) {
// Special case: passing MMX values in XMM registers.
Arg = DAG.getBitcast(MVT::i64, Arg);
Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
} else
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
case CCValAssign::BCvt:
Arg = DAG.getBitcast(RegVT, Arg);
case CCValAssign::Indirect: {
if (isByVal) {
// Memcpy the argument to a temporary stack slot to prevent
// the caller from seeing any modifications the callee may make
// as guaranteed by the `byval` attribute.
int FrameIdx = MF.getFrameInfo().CreateStackObject(
std::max(Align(16), Flags.getNonZeroByValAlign()), false);
SDValue StackSlot =
DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
Chain =
CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
// From now on treat this as a regular pointer
Arg = StackSlot;
isByVal = false;
} else {
// Store the argument.
SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
Chain = DAG.getStore(
Chain, dl, Arg, SpillSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
Arg = SpillSlot;
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// Split v64i1 value into two registers
// (the ++I consumes the second location of the pair).
Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
const TargetOptions &Options = DAG.getTarget().Options;
if (Options.EmitCallSiteInfo)
CSInfo.emplace_back(VA.getLocReg(), I);
if (isVarArg && IsWin64) {
// Win64 ABI requires argument XMM reg to be copied to the corresponding
// shadow reg if callee is a varargs function.
Register ShadowReg;
switch (VA.getLocReg()) {
case X86::XMM0: ShadowReg = X86::RCX; break;
case X86::XMM1: ShadowReg = X86::RDX; break;
case X86::XMM2: ShadowReg = X86::R8; break;
case X86::XMM3: ShadowReg = X86::R9; break;
if (ShadowReg)
RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
} else if (!IsSibcall && (!isTailCall || isByVal)) {
// Memory argument of a non-sibcall: materialize the stack pointer once and
// queue the store/copy produced by LowerMemOpCallTo.
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
dl, DAG, VA, Flags, isByVal));
// Merge all queued argument stores into a single chain.
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
if (Subtarget.isPICStyleGOT()) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
if (!isTailCall) {
Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
// the tail jump. This is done to circumvent the ebx/callee-saved problem
// for tail calls on PIC/GOT architectures. Normally we would just put the
// address of GOT into ebx and then call target@PLT. But for tail calls
// ebx would be restored (since ebx is callee saved) before jumping to the
// target@PLT.
// Note: The actual moving to ECX is done further down.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (G && !G->getGlobal()->hasLocalLinkage() &&
Callee = LowerGlobalAddress(Callee, DAG);
else if (isa<ExternalSymbolSDNode>(Callee))
Callee = LowerExternalSymbol(Callee, DAG);
if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
// the declaration) %al is used as hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an upper bound on the number of SSE
// registers used and is in the range 0 - 8 inclusive.
// Count the number of XMM registers allocated.
static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget.hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
DAG.getConstant(NumXMMRegs, dl,
// Variadic musttail call: re-pass every register recorded as a forwarded
// musttail parameter.
if (isVarArg && IsMustTail) {
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
RegsToPass.push_back(std::make_pair(F.PReg, Val));
// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
// don't need this because the eligibility check rejects calls that require
// shuffling arguments passed in memory.
if (!IsSibcall && isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
// before any new outgoing arguments are stored to the stack, because the
// outgoing stack slots may alias the incoming argument stack slots, and
// the alias isn't otherwise explicit. This is slightly more conservative
// than necessary, because it means that each store effectively depends
// on every argument instead of just those arguments it would clobber.
SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
assert((CallConv == CallingConv::X86_RegCall) &&
"Expecting custom case only in regcall calling convention");
// This means that we are in special case where one argument was
// passed through two register locations - Skip the next location
SDValue Arg = OutVals[OutsIndex];
ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
// Skip inalloca/preallocated arguments. They don't require any work.
if (Flags.isInAlloca() || Flags.isPreallocated())
// Create frame index.
// The slot is the argument's assigned offset shifted by FPDiff.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
if (Flags.isByVal()) {
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, Source);
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
Flags, DAG, dl));
} else {
// Store relative to framepointer.
ArgChain, dl, Arg, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
if (!MemOpChains2.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
RegInfo->getSlotSize(), FPDiff, dl);
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into registers.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
// through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
} else if (Callee->getOpcode() == ISD::GlobalAddress ||
Callee->getOpcode() == ISD::ExternalSymbol) {
// Lower direct calls to global addresses and external symbols. Setting
// ForCall to true here has the effect of removing WrapperRIP when possible
// to allow direct calls to be selected without first materializing the
// address into a register.
Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
} else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
// Returns a chain & a flag for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SmallVector<SDValue, 8> Ops;
// A guaranteed (non-sibcall, non-musttail) tail call closes its call sequence
// here, before the TC_RETURN node is built.
if (!IsSibcall && isTailCall && !IsMustTail) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
InFlag = Chain.getValue(1);
if (isTailCall)
Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
// Add a register mask operand representing the call-preserved registers.
// If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
// set X86_INTR calling convention because it has the same CSR mask
// (same preserved registers).
const uint32_t *Mask = RegInfo->getCallPreservedMask(
MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
// If this is an invoke in a 32-bit function using a funclet-based
// personality, assume the function clobbers all registers. If an exception
// is thrown, the runtime will not restore CSRs.
// FIXME: Model this more precisely so that we can register allocate across
// the normal edge and spill and fill across the exceptional edge.
if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
const Function &CallerFn = MF.getFunction();
EHPersonality Pers =
? classifyEHPersonality(CallerFn.getPersonalityFn())
: EHPersonality::Unknown;
if (isFuncletEHPersonality(Pers))
Mask = RegInfo->getNoPreservedMask();
// Define a new register mask from the existing mask.
// RegMask stays null unless the regcall / NCSR branch below allocates one; it
// is later forwarded to LowerCallResult.
uint32_t *RegMask = nullptr;
// In some calling conventions we need to remove the used physical registers
// from the reg mask.
if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Allocate a new Reg Mask and copy Mask.
RegMask = MF.allocateRegMask();
unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
// Make sure all sub registers of the argument registers are reset
// in the RegMask.
// (one bit per register, 32 registers per mask word)
for (auto const &RegPair : RegsToPass)
for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
// Create the RegMask Operand according to our updated mask.
} else {
// Create the RegMask Operand according to the static mask.
if (InFlag.getNode())
if (isTailCall) {
// We used to do:
//// If this is the first return lowered for this function, add the regs
//// to the liveout set for the function.
// This isn't right, although it's probably harmless on x86; liveouts
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
return Ret;
// NT_CALL is selected only when the call is marked no-cf-check and the module
// carries the "cf-protection-branch" flag; otherwise a plain CALL is emitted.
if (HasNoCfCheck && IsCFProtectionSupported) {
Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
} else {
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
InFlag = Chain.getValue(1);
DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
// Save heapallocsite metadata.
if (CLI.CB)
if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget.getTargetTriple().isOSMSVCRT() &&
SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
// For MSVC Win32 targets, the caller pops the hidden struct pointer.
NumBytesForCalleeToPop = 4;
NumBytesForCalleeToPop = 0; // Callee pops nothing.
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
InFlag, dl);
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
InVals, RegMask);
// Fast Calling Convention (tail call) implementation
// Like stdcall, the callee cleans up the arguments; the convention differs in
// that ECX is reserved for storing the tail-called function address. Only 2
// registers are free for argument passing (inreg). Tail call optimization is
// performed provided:
// * tailcallopt is enabled
// * caller/callee are fastcc
// On X86_64 architecture with GOT-style position independent code only local
// (within module) calls are supported at the moment.
// To keep the stack aligned according to platform abi the function
// GetAlignedArgumentStackSize ensures that argument delta is always multiples
// of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
// If a tail called function callee has more arguments than the caller the
// caller needs to make sure that there is room to move the RETADDR to. This is
// achieved by reserving an area the size of the argument delta right after the
// original RETADDR, but before the saved framepointer or the spilled registers
// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
// stack layout:
// arg1
// arg2
// [ new RETADDR
// move area ]
// (possible EBP)
// ESI
// EDI
// local1 ..
/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
/// requirement.
X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
SelectionDAG &DAG) const {
const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
assert(StackSize % SlotSize == 0 &&
"StackSize must be a multiple of SlotSize");
return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
/// Return true if the given stack call argument is already available in the
/// same position (relatively) of the caller's incoming argument stack.
///
/// \param Arg    Outgoing argument value.
/// \param Offset Stack offset assigned to the outgoing argument.
/// \param Flags  Argument flags (byval / zext / sext are consulted).
/// \param MFI    Frame info used to inspect the candidate stack object.
/// \param MRI    Used to find the defining instruction of a virtual register.
/// \param TII    Used to recognise loads from stack slots.
/// \param VA     Location assignment of the outgoing argument.
///
/// The argument matches only if it resolves to a fixed stack object whose
/// offset equals \p Offset and whose size equals the argument's byte size
/// (the byval size for byval arguments). Non-byval arguments additionally
/// require the object to be immutable, and when the location type is wider
/// than the value the object's zext/sext flags must agree with \p Flags.
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII, const CCValAssign &VA) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;

  for (;;) {
    // Look through nodes that don't alter the bits of the incoming value.
    unsigned Op = Arg.getOpcode();
    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
      Arg = Arg.getOperand(0);
      continue;
    }
    if (Op == ISD::TRUNCATE) {
      // A truncate of an AssertZext back to the asserted type is a no-op.
      const SDValue &TruncInput = Arg.getOperand(0);
      if (TruncInput.getOpcode() == ISD::AssertZext &&
          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
              Arg.getValueType()) {
        Arg = TruncInput.getOperand(0);
        continue;
      }
    }
    // Nothing further to strip.
    break;
  }

  // INT_MAX marks "no frame index found yet"; every path below either sets FI
  // or returns false.
  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!Register::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      // For byval the register must hold the address of a frame object,
      // i.e. be defined by an LEA of a frame index.
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
           Opcode == X86::LEA64_32r) &&
          Def->getOperand(1).isFI()) {
        FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
    FI = FINode->getIndex();
    Bytes = Flags.getByValSize();
  } else
    return false;

  assert(FI != INT_MAX);
  if (!MFI.isFixedObjectIndex(FI))
    return false;

  if (Offset != MFI.getObjectOffset(FI))
    return false;

  // If this is not byval, check that the argument stack object is immutable.
  // inalloca and argument copy elision can create mutable argument stack
  // objects. Byval objects can be mutated, but a byval call intends to pass the
  // mutated memory.
  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
    return false;

  if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
    // If the argument location is wider than the argument type, check that any
    // extension flags match.
    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
        Flags.isSExt() != MFI.isObjectSExt(FI)) {
      return false;
    }
  }

  return Bytes == MFI.getObjectSize(FI);
}
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
if (!mayTailCallThisCC(CalleeCC))
return false;
// If -tailcallopt is specified, make fastcc functions tail-callable.
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
// then the FP_EXTEND of the call result is not a nop. It's not safe to
// perform a tailcall optimization here.
if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
return false;
CallingConv::ID CallerCC = CallerF.getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
CalleeCC == CallingConv::Tail;
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
// space.
if (IsCalleeWin64 != IsCallerWin64)
return false;
if (IsGuaranteeTCO) {
if (canGuaranteeTCO(CalleeCC) && CCMatch)
return true;
return false;
// Look for obvious safe cases to perform tail call optimization that do not
// require ABI changes. This is what gcc calls sibcall.
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
return false;
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// Optimizing for varargs on Win64 is unlikely to be safe without
// additional testing.
if (IsCalleeWin64 || IsCallerWin64)
return false;
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
if (!ArgLocs[i].isRegLoc())
return false;
// If the call result is in ST0 / ST1, it needs to be popped off the x87
// stack. Therefore, if it's not used by the call it is not safe to optimize
// this into a sibcall.
bool Unused = false;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
if (!Ins[i].Used) {
Unused = true;
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
return false;
// Check that the call results are passed in the same way.
if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
RetCC_X86, RetCC_X86))
return false;
// The callee has to preserve all registers the caller needs to preserve.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
unsigned StackArgsSize = 0;
// If the callee takes no arguments then go on to check the results of the
// call.
if (!Outs.empty()) {
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
// Allocate shadow area for Win64
if (IsCalleeWin64)
CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
StackArgsSize = CCInfo.getNextStackOffset();
if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
MachineFrameInfo &MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
if (!VA.isRegLoc()) {
if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
return false;
bool PositionIndependent = isPositionIndependent();
// If the tailcall address may be in a register, then make sure it's
// possible to register allocate for it. In 32-bit, the call address can
// only target EAX, EDX, or ECX since the tail call must be scheduled after
// callee-saved registers are restored. These happen to be the same
// registers used to pass 'inreg' arguments so watch out for those.
if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
!isa<ExternalSymbolSDNode>(Callee)) ||
PositionIndependent)) {
unsigned NumInRegs = 0;
// In PIC we need an extra register to formulate the address computation
// for the callee.
unsigned MaxInRegs = PositionIndependent ? 2 : 3;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc())
Register Reg = VA.getLocReg();
switch (Reg) {
default: break;
case X86::EAX: case X86::EDX: case X86::ECX:
if (++NumInRegs == MaxInRegs)
return false;
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
return false;
bool CalleeWillPop =
X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
if (unsigned BytesToPop =
MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
// If we have bytes to pop, the callee must pop them.
bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
if (!CalleePopMatches)
return false;
} else if (CalleeWillPop && StackArgsSize > 0) {
// If we don't have bytes to pop, make sure the callee doesn't pop any.
return false;
return true;
/// Create the X86 FastISel instance by forwarding both arguments unchanged to
/// X86::createFastISel.
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
return X86::createFastISel(funcInfo, libInfo);
// Other Lowering Hooks
/// Return true if Op has a single use and its node is a normal load according
/// to ISD::isNormalLoad.
static bool MayFoldLoad(SDValue Op) {
return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
/// Return true if Op has a single use and that user is a normal store
/// according to ISD::isNormalStore.
static bool MayFoldIntoStore(SDValue Op) {
return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
/// Return true if Op has a single use and that user is an ISD::ZERO_EXTEND
/// node; return false otherwise.
static bool MayFoldIntoZeroExtend(SDValue Op) {
if (Op.hasOneUse()) {
// With a single use, use_begin() refers to that sole user.
unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
return (ISD::ZERO_EXTEND == Opcode);
return false;
/// Return true if Opcode is one of the X86ISD shuffle opcodes enumerated in the
/// switch below; every other opcode takes the default and yields false.
static bool isTargetShuffle(unsigned Opcode) {
switch(Opcode) {
default: return false;
// All of the following labels share the single 'return true' at the end.
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::SHUFP:
case X86ISD::EXTRQI:
case X86ISD::VALIGN:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VPERM2X128:
case X86ISD::SHUF128:
case X86ISD::VPERMIL2:
case X86ISD::VPERMI:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
return true;
/// Return true if Opcode is one of the target shuffles listed below, or one of
/// the 'faux' target shuffles (ISD::OR, ISD::AND, X86ISD::ANDNP); every other
/// opcode yields false.
static bool isTargetShuffleVariableMask(unsigned Opcode) {
switch (Opcode) {
default: return false;
// Target Shuffles.
case X86ISD::PSHUFB:
case X86ISD::VPERMIL2:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
return true;
// 'Faux' Target Shuffles.
case ISD::OR:
case ISD::AND:
case X86ISD::ANDNP:
return true;
/// Splat test for target shuffles. The final return accepts
/// X86ISD::VBROADCAST and X86ISD::VBROADCAST_LOAD; the recursive call inspects
/// operand 0 of Op instead of Op itself.
static bool isTargetShuffleSplat(SDValue Op) {
unsigned Opcode = Op.getOpcode();
// NOTE(review): as written this recursion is unconditional, which never
// terminates and leaves the final return unreachable. A guard on Opcode
// presumably belongs directly above this line -- confirm against the
// intended source.
return isTargetShuffleSplat(Op.getOperand(0));
return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
/// Return a frame-index node, typed as a pointer for the DAG's data layout,
/// for the return-address index stored in X86MachineFunctionInfo. When the
/// stored index is 0, a fixed frame object of the register-info slot size is
/// created first.
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
// An index of 0 is treated as "no frame object created yet".
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
// NOTE(review): the remaining CreateFixedObject arguments are not visible
// here, and nothing visible writes the new index back into FuncInfo --
// confirm this statement and its follow-up are complete.
ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
/// Return true if Offset may be used as a displacement under code model M.
/// The offset must always fit in a signed 32-bit immediate. Without a symbolic
/// displacement that is the only restriction; with one, only the small and
/// kernel code models are accepted, each with its own bound on Offset.
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
bool hasSymbolicDisplacement) {
// Offset should fit into 32 bit immediate field.
if (!isInt<32>(Offset))
return false;
// If we don't have a symbolic displacement - we don't have any extra
// restrictions.
if (!hasSymbolicDisplacement)
return true;
// FIXME: Some tweaks might be needed for medium code model.
if (M != CodeModel::Small && M != CodeModel::Kernel)
return false;
// For small code model we assume that latest object is 16MB before end of 31
// bits boundary. We may also accept pretty large negative constants knowing
// that all objects are in the positive half of address space.
if (M == CodeModel::Small && Offset < 16*1024*1024)
return true;
// For kernel code model we know that all objects reside in the negative half
// of the 32-bit address space. We may not accept negative offsets, since they
// may be just off, and we may accept pretty large positive ones.
if (M == CodeModel::Kernel && Offset >= 0)
return true;
return false;
/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
///
/// \returns true for a non-variadic call for which shouldGuaranteeTCO holds,
/// and for the stdcall, fastcall, thiscall and vectorcall conventions when
/// \p is64Bit is false; false in every other case.
bool X86::isCalleePop(CallingConv::ID CallingConv,
                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
  // can guarantee TCO.
  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
    return true;

  switch (CallingConv) {
  default:
    // Every convention not listed below leaves the cleanup to the caller.
    return false;
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_VectorCall:
    // These conventions are callee-pop only outside 64-bit mode.
    return !is64Bit;
  }
}
/// Return true if the condition is a signed comparison operation.
///
/// COND_G, COND_GE, COND_L and COND_LE are reported as signed; COND_E,
/// COND_NE, COND_B, COND_A, COND_BE and COND_AE are not. Any other condition
/// code hits llvm_unreachable.
static bool isX86CCSigned(unsigned X86CC) {
  switch (X86CC) {
  default:
    llvm_unreachable("Invalid integer condition!");
  case X86::COND_E:
  case X86::COND_NE:
  case X86::COND_B:
  case X86::COND_A:
  case X86::COND_BE:
  case X86::COND_AE:
    return false;
  case X86::COND_G:
  case X86::COND_GE:
  case X86::COND_L:
  case X86::COND_LE:
    return true;
  }
}
/// Map an integer ISD::CondCode onto the corresponding X86 condition code.
/// Equality maps to COND_E/COND_NE, the plain ordering predicates to
/// COND_G/GE/L/LE and the SETU* predicates to COND_A/AE/B/BE. Any other
/// predicate hits llvm_unreachable.
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
switch (SetCCOpcode) {
default: llvm_unreachable("Invalid integer condition!");
case ISD::SETEQ: return X86::COND_E;
case ISD::SETGT: return X86::COND_G;
case ISD::SETGE: return X86::COND_GE;
case ISD::SETLT: return X86::COND_L;
case ISD::SETLE: return X86::COND_LE;
case ISD::SETNE: return X86::COND_NE;
case ISD::SETULT: return X86::COND_B;
case ISD::SETUGT: return X86::COND_A;
case ISD::SETULE: return X86::COND_BE;
case ISD::SETUGE: return X86::COND_AE;
/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
///
/// LHS and RHS are in/out parameters: the integer path may replace RHS with a
/// zero constant, and the floating-point path may swap the two operands.
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
bool isFP, SDValue &LHS, SDValue &RHS,
SelectionDAG &DAG) {
if (!isFP) {
// Integer compares against a few special constants are rewritten as
// sign-flag tests or as compares against zero.
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
// X > -1 -> X == 0, jump !sign.
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_NS;
if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
// X < 0 -> X == 0, jump on sign.
return X86::COND_S;
if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
// X >= 0 -> X == 0, jump on !sign.
return X86::COND_NS;
if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_LE;
// No special case applied: use the plain integer mapping.
return TranslateIntegerX86CC(SetCCOpcode);
// First determine if it is required or is profitable to flip the operands.
// If LHS is a foldable load, but RHS is not, flip the condition.
if (ISD::isNON_EXTLoad(LHS.getNode()) &&
!ISD::isNON_EXTLoad(RHS.getNode())) {
SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
std::swap(LHS, RHS);
// NOTE(review): no case labels are visible between 'default: break;' and the
// swap below, so the swap is unreachable as written. The cases marked
// "// flipped" in the next switch suggest SETOLT/SETOLE/SETUGT/SETUGE were
// meant to select it -- confirm against the intended source.
switch (SetCCOpcode) {
default: break;
std::swap(LHS, RHS);
// On a floating point condition, the flags are set as follows:
// ZF PF CF op
// 0 | 0 | 0 | X > Y
// 0 | 0 | 1 | X < Y
// 1 | 0 | 0 | X == Y
// 1 | 1 | 1 | unordered
switch (SetCCOpcode) {
default: llvm_unreachable("Condcode should be pre-legalized away");
case ISD::SETEQ: return X86::COND_E;
case ISD::SETOLT: // flipped
case ISD::SETGT: return X86::COND_A;
case ISD::SETOLE: // flipped
case ISD::SETGE: return X86::COND_AE;
case ISD::SETUGT: // flipped
case ISD::SETLT: return X86::COND_B;
case ISD::SETUGE: // flipped
case ISD::SETLE: return X86::COND_BE;
case ISD::SETNE: return X86::COND_NE;
case ISD::SETUO: return X86::COND_P;
case ISD::SETO: return X86::COND_NP;
// SETUNE has no single-condition mapping here; callers get COND_INVALID.
case ISD::SETUNE: return X86::COND_INVALID;
/// Is there a floating point cmov for the specific X86 condition code?
/// Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
///
/// \returns true for COND_B, COND_BE, COND_E, COND_P, COND_A, COND_AE, COND_NE
/// and COND_NP; false for every other condition code.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}
/// Fill Info with the memory-operand description (pointer value, memory type,
/// alignment, load/store flag) for a chained target intrinsic call.
/// Returns false when getIntrinsicWithChain has no entry for Intrinsic.
///
/// NOTE(review): several lines of this switch are not visible here (the case
/// label(s) of the first arm, the declaration of ScalarVT, the second argument
/// of both std::min calls, and any statements that end each arm) -- confirm
/// against the intended source before relying on the control flow shown.
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
if (!IntrData)
return false;
// Defaults shared by every arm below.
Info.flags = MachineMemOperand::MONone;
Info.offset = 0;
switch (IntrData->Type) {
// First arm: handles the TRUNCATE_TO_MEM_VI8/VI16/VI32 types tested below.
// Operand 0 supplies the pointer and operand 1 the value whose vector length
// is reused for the memory type.
Info.ptrVal = I.getArgOperand(0);
MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
// The stored element type is chosen from the intrinsic type.
if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
ScalarVT = MVT::i8;
else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
ScalarVT = MVT::i16;
else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
ScalarVT = MVT::i32;
Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
Info.align = Align(1);
Info.flags |= MachineMemOperand::MOStore;
case GATHER:
case GATHER_AVX2: {
// Gathers record no single pointer value; the data type is the call's
// result type and the index type comes from operand 2.
Info.ptrVal = nullptr;
MVT DataVT = MVT::getVT(I.getType());
MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
Info.align = Align(1);
Info.flags |= MachineMemOperand::MOLoad;
case SCATTER: {
// Scatters mirror the gather arm, but the data type comes from operand 3
// and the access is flagged as a store.
Info.ptrVal = nullptr;
MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
Info.align = Align(1);
Info.flags |= MachineMemOperand::MOStore;
return false;
return true;
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
///
/// The check is a linear scan of LegalFPImmediates for an entry that is
/// bitwise equal to Imm; VT and ForCodeSize are not consulted in the visible
/// code.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
return true;
return false;
/// Decide whether the simple load Load should be narrowed. Narrowing is
/// rejected for a RIP-relative load of a global flagged MO_GOTTPOFF, and for a
/// multi-use 256/512-bit vector load whose value uses are all
/// extract-subvector + store pairs; otherwise it is allowed. ExtTy and NewVT
/// are not consulted in the visible code.
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtTy,
EVT NewVT) const {
assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
// relocation target a movq or addq instruction: don't let the load shrink.
SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
// If this is an (1) AVX vector load with (2) multiple uses and (3) all of
// those uses are extracted directly into a store, then the extract + store
// can be store-folded. Therefore, it's probably not worth splitting the load.
EVT VT = Load->getValueType(0);
if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
// Skip uses of the chain value. Result 0 of the node is the load value.
// NOTE(review): the statement controlled by this 'if' is not visible here;
// per the comment above it presumably skips to the next use -- confirm.
if (UI.getUse().getResNo() != 0)
// If this use is not an extract + store, it's probably worth splitting.
if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
UI->use_begin()->getOpcode() != ISD::STORE)
return true;
// All non-chain uses are extract + store.
return false;
return true;
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
///
/// The decision depends only on the primitive bit size of Ty: sizes of 0 or
/// above 64 bits are rejected, everything else is accepted. Imm is not
/// consulted.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0 || BitSize > 64)
return false;
return true;
/// Return false only when the compare operand type is floating point (other
/// than f128), the subtarget is a 64-bit LP64 target, and AVX is available;
/// return true in every other case.
bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
// If we are using XMM registers in the ABI and the condition of the select is
// a floating-point compare and we have blendv or conditional move, then it is
// cheaper to select instead of doing a cross-register move and creating a
// load that depends on the compare result.
bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
/// Return true unless VT is a vector type and the subtarget has AVX512.
bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
// TODO: It might be a win to ease or lift this restriction, but the generic
// folds in DAGCombiner conflict with vector folds for an AVX512 target.
if (VT.isVector() && Subtarget.hasAVX512())
return false;
return true;
/// Return true if a multiply of type VT by the constant splat vector C should
/// be decomposed into shift plus add/sub. This requires C to be a constant
/// splat, ISD::MUL to be not legal on the type VT legalizes to, and the splat
/// value to have one of the power-of-two-adjacent forms tested at the end.
bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const {
// TODO: We handle scalars using custom code, but generic combining could make
// that unnecessary.
APInt MulC;
if (!ISD::isConstantSplatVector(C.getNode(), MulC))
return false;
// Find the type this will be legalized to. Otherwise we might prematurely
// convert this to shl+add/sub and then still have to type legalize those ops.
// Another choice would be to defer the decision for illegal types until
// after type legalization. But constant splat vectors of i64 can't make it
// through type legalization on 32-bit targets so we would need to special
// case vXi64.
while (getTypeAction(Context, VT) != TypeLegal)
VT = getTypeToTransformTo(Context, VT);
// If vector multiply is legal, assume that's faster than shl + add/sub.
// TODO: Multiply is a complex op with higher latency and lower throughput in
// most implementations, so this check could be loosened based on type
// and/or a CPU attribute.
if (isOperationLegal(ISD::MUL, VT))
return false;
// shl+add, shl+sub, shl+add+neg
// Accept MulC when MulC + 1, MulC - 1, 1 - MulC or -(MulC + 1) is a power
// of two.
return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
(1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
/// Return true if extracting a ResVT subvector from SrcVT at element Index is
/// considered cheap. EXTRACT_SUBVECTOR must be legal or custom for ResVT.
/// For i1 element types only index 0, or the index/size combination tested
/// below, is accepted; for other element types Index must be a multiple of
/// the result's element count.
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
return false;
// Mask vectors support all subregister combinations and operations that
// extract half of vector.
// NOTE(review): the size test compares ResVT against twice SrcVT, although a
// "half of vector" extract would have SrcVT twice as large as ResVT --
// confirm the operand order is intended.
if (ResVT.getVectorElementType() == MVT::i1)
return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
(Index == ResVT.getVectorNumElements()));
return (Index % ResVT.getVectorNumElements()) == 0;
/// Decide whether the vector binary operation VecOp should be converted to a
/// scalar operation, based on whether its opcode is legal, custom or promoted
/// for the vector type and for the scalar element type.
bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
unsigned Opc = VecOp.getOpcode();
// Assume target opcodes can't be scalarized.
// TODO - do we have any exceptions?
// NOTE(review): no condition is visible in front of this return, so as
// written everything below is unreachable. The comment above suggests a
// check for target-specific opcodes belongs here -- confirm.
return false;
// If the vector op is not supported, try to convert to scalar.
EVT VecVT = VecOp.getValueType();
if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
return true;
// If the vector op is supported, but the scalar op is not, the transform may
// not be worthwhile.
EVT ScalarVT = VecVT.getScalarType();
return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
/// Return true if an overflow-checking node should be formed for Opcode on
/// type VT: never for vectors; otherwise whenever VT is a simple type or the
/// operation is not marked Expand. The unnamed bool parameter is ignored.
bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
bool) const {
// TODO: Allow vectors?
if (VT.isVector())
return false;
return VT.isSimple() || !isOperationExpand(Opcode, VT);
/// Return true if the subtarget has BMI.
bool X86TargetLowering::isCheapToSpeculateCttz() const {
// Speculate cttz only if we can directly use TZCNT.
return Subtarget.hasBMI();
/// Return true if the subtarget has LZCNT.
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
// Speculate ctlz only if we can directly use LZCNT.
return Subtarget.hasLZCNT();
/// Return true if folding a bitcast into a load (LoadVT -> BitcastVT) is
/// beneficial. Two i1-vector cases are rejected up front, a pair of legal
/// vector types is always accepted, and everything else is delegated to
/// TargetLowering::isLoadBitCastBeneficial.
bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
const SelectionDAG &DAG,
const MachineMemOperand &MMO) const {
// Without AVX512, reject scalar loads bitcast to a vector of i1.
if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
BitcastVT.getVectorElementType() == MVT::i1)
return false;
// Without DQI, reject an i8 load bitcast to v8i1.
if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
return false;
// If both types are legal vectors, it's always ok to convert them.
if (LoadVT.isVector() && BitcastVT.isVector() &&
isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
return true;
return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
/// Return true if stores may be merged into a single store of type MemVT.
/// When the function attribute queried below is set, MemVT may be at most 64
/// bits on 64-bit subtargets and 32 bits otherwise; without it, MemVT may not
/// exceed the subtarget's preferred vector width. AddressSpace is not
/// consulted in the visible code.
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
const SelectionDAG &DAG) const {
// Do not merge to float value size (128 bytes) if no implicit
// float attribute is set.
// NOTE(review): the argument of hasFnAttribute is not visible here; per the
// comment above it is presumably the no-implicit-float attribute -- confirm.
bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
if (NoFloat) {
unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
return (MemVT.getSizeInBits() <= MaxIntSize);
// Make sure we don't merge greater than our preferred vector
// width.
if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
return false;
return true;
/// Return true if the subtarget reports a fast LZCNT.
bool X86TargetLowering::isCtlzFast() const {
return Subtarget.hasFastLZCNT();
/// Always returns true; the instruction AndI is not inspected.
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
const Instruction &AndI) const {
return true;
/// Return true if Y is a non-constant scalar of type i32 or i64 and the
/// subtarget has BMI; false for vectors, other widths, constants, or when BMI
/// is unavailable.
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
EVT VT = Y.getValueType();
if (VT.isVector())
return false;
if (!Subtarget.hasBMI())
return false;
// There are only 32-bit and 64-bit forms for 'andn'.
if (VT != MVT::i32 && VT != MVT::i64)
return false;
// A constant operand is rejected.
return !isa<ConstantSDNode>(Y);
/// Return true if an and-not operation is available for Y's type. Scalars
/// defer to hasAndNotCompare. Vectors need SSE1 and at least 128 bits; v4i32
/// is then accepted outright and every other vector type requires SSE2.
bool X86TargetLowering::hasAndNot(SDValue Y) const {
EVT VT = Y.getValueType();
if (!VT.isVector())
return hasAndNotCompare(Y);
// Vector.
if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
return false;
if (VT == MVT::v4i32)
return true;
return Subtarget.hasSSE2();
/// Return true if X has a scalar integer type; Y is not inspected.
bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
return X.getValueType().isScalarInteger(); // 'bt'
/// Return true if the hoisting fold is beneficial on X86. The base-class
/// recommendation is honoured first; after that the fold is accepted for
/// scalar integers, for splat shift amounts, with AVX2, and otherwise only
/// when the new shift is ISD::SHL.
///
/// NOTE(review): the line carrying the member-function name and the opening
/// parenthesis is not visible below; the base-class call in the body suggests
/// it is shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd -- confirm.
bool X86TargetLowering::
SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
unsigned OldShiftOpcode, unsigned NewShiftOpcode,
SelectionDAG &DAG) const {
// Does baseline recommend not to perform the fold by default?
if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
return false;
// For scalars this transform is always beneficial.
if (X.getValueType().isScalarInteger())
return true;
// If all the shift amounts are identical, then transform is beneficial even
// with rudimentary SSE2 shifts.
if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
return true;
// If we have AVX2 with its powerful shift operations, then it's also good.
if (Subtarget.hasAVX2())
return true;
// Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
return NewShiftOpcode == ISD::SHL;
/// Decide whether a shl-of-srl or srl-of-shl pair rooted at N should be folded
/// to a mask. When the subtarget reports fast shift masks for the value's kind
/// (vector or scalar), the fold is allowed only if both shifts use the same
/// amount operand; otherwise the decision is delegated to TargetLoweringBase.
bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
assert(((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) ||
(N->getOpcode() == ISD::SRL &&
N->getOperand(0).getOpcode() == ISD::SHL)) &&
"Expected shift-shift mask");
EVT VT = N->getValueType(0);
if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
(Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
// Only fold if the shift values are equal - so it folds to AND.
// TODO - we should fold if either is a non-uniform vector but we don't do
// the fold for non-splats yet.
return N->getOperand(1) == N->getOperand(0).getOperand(1);
return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
/// Return true if a mask should be folded into a pair of variable shifts for
/// Y's type: false for vectors and for i64 on non-64-bit subtargets, true
/// otherwise.
bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
EVT VT = Y.getValueType();
// For vectors, we don't have a preference, but we probably want a mask.
if (VT.isVector())
return false;
// 64-bit shifts on 32-bit targets produce really bad bloated code.
if (VT == MVT::i64 && !Subtarget.is64Bit())
return false;
return true;
/// Return false when the enclosing function is marked minsize and a second
/// condition holds; return true otherwise. N is not inspected in the visible
/// code.
bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
SDNode *N) const {
// NOTE(review): the right-hand operand of this '&&' is not visible here --
// confirm the full condition against the intended source.
if (DAG.getMachineFunction().getFunction().hasMinSize() &&
return false;
return true;
/// Return true if VT is a legal type.
bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
// Any legal vector type can be splatted more efficiently than
// loading/spilling from memory.
return isTypeLegal(VT);
/// Return a type in which a NumBits-wide equality compare is fast: the integer
/// type of that width when it is legal, otherwise v16i8 for 128 bits or v32i8
/// for 256 bits when those vector types are legal.
///
/// NOTE(review): no return statement is visible for the case where none of the
/// checks below match -- confirm what the fall-through value is meant to be.
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
MVT VT = MVT::getIntegerVT(NumBits);
if (isTypeLegal(VT))
return VT;
// PMOVMSKB can handle this.
if (NumBits == 128 && isTypeLegal(MVT::v16i8))
return MVT::v16i8;
// VPMOVMSKB can handle this.
if (NumBits == 256 && isTypeLegal(MVT::v32i8))
return MVT::v32i8;
// TODO: Allow 64-bit type for 32-bit target.
// TODO: 512-bit types should be allowed, but make sure that those
// cases are handled in combineVectorSizedSetCCEquality().
/// Return true if Val is the undef sentinel value (SM_SentinelUndef) or is
/// equal to CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
return ((Val == SM_SentinelUndef) || (Val == CmpVal));
/// Return true if Val is either the undef sentinel (SM_SentinelUndef) or the
/// zero sentinel (SM_SentinelZero).
static bool isUndefOrZero(int Val) {
return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
/// Return true if every one of the Size elements of Mask starting at position
/// Pos is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
return llvm::all_of(Mask.slice(Pos, Size),
[](int M) { return M == SM_SentinelUndef; });
/// Return true if the mask creates a vector whose lower half is undefined,
/// i.e. the first NumElts / 2 mask elements are all the undef sentinel.
static bool isUndefLowerHalf(ArrayRef<int> Mask) {
unsigned NumElts = Mask.size();
return isUndefInRange(Mask, 0, NumElts / 2);
/// Return true if the mask creates a vector whose upper half is undefined,
/// i.e. the NumElts / 2 mask elements starting at index NumElts / 2 are all
/// the undef sentinel.
static bool isUndefUpperHalf(ArrayRef<int> Mask) {
unsigned NumElts = Mask.size();
return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
/// Return true if Val falls within the half-open range [Low, Hi), i.e.
/// Low <= Val < Hi. Low itself is accepted, Hi is not, and an empty range
/// (Low >= Hi) contains nothing.
static bool isInRange(int Val, int Low, int Hi) {
  return Low <= Val && Val < Hi;
}
/// Return true if the value of any element in Mask falls within the half-open
/// range [Low, Hi), as tested by isInRange.
static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
/// Return true if the value of any element in Mask is the zero sentinel value
/// (SM_SentinelZero).
static bool isAnyZero(ArrayRef<int> Mask) {
return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
/// Return true if the value of any element in Mask is the zero sentinel
/// (SM_SentinelZero) or the undef sentinel (SM_SentinelUndef).
static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
return llvm::any_of(Mask, [](int M) {
return M == SM_SentinelZero || M == SM_SentinelUndef;
/// Return true if Val is undef or if its value falls within the half-open
/// range [Low, Hi), as tested by isInRange.
static bool isUndefOrInRange(int Val, int Low, int Hi) {
return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
/// Return true if every element in Mask is undef or has a value within the
/// half-open range [Low, Hi). Each element is tested with the scalar
/// isUndefOrInRange overload.
static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
return llvm::all_of(
Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
/// Return true if Val is the undef or zero sentinel, or if its value falls
/// within the half-open range [Low, Hi), as tested by isInRange.
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
/// Return true if every element in Mask is the undef or zero sentinel, or has
/// a value within the half-open range [Low, Hi). Each element is tested with
/// the scalar isUndefOrZeroOrInRange overload.
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
return llvm::all_of(
Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos + Size, falls within the specified
/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size, int Low, int Step = 1) {
// Low is advanced by Step alongside i, so it always holds the value
// expected at the current position.
for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
if (!isUndefOrEqual(Mask[i], Low))
return false;
return true;
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, matches the specified sequence
/// (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or is zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size, int Low,
int Step = 1) {
// Low is advanced by Step alongside i, so it always holds the value
// expected at the current position.
for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
return false;
return true;
/// Return true if every one of the Size elements of Mask starting at position
/// Pos is the undef sentinel or the zero sentinel.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size) {
return llvm::all_of(Mask.slice(Pos, Size),
[](int M) { return isUndefOrZero(M); });
/// Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
///
/// Mask is consumed two elements at a time (Mask[i], Mask[i + 1]); each pair
/// produces one entry of WidenedMask at index i / 2.
///
/// NOTE(review): none of the branches below that store into WidenedMask is
/// followed by a visible statement that moves on to the next pair, and no
/// block terminators are visible -- confirm the control flow against the
/// intended source.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
SmallVectorImpl<int> &WidenedMask) {
WidenedMask.assign(Mask.size() / 2, 0);
for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
int M0 = Mask[i];
int M1 = Mask[i + 1];
// If both elements are undef, it's trivial.
if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
WidenedMask[i / 2] = SM_SentinelUndef;
// Check for an undef mask and a mask value properly aligned to fit with
// a pair of values. If we find such a case, use the non-undef mask's value.
if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
WidenedMask[i / 2] = M1 / 2;
if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
WidenedMask[i / 2] = M0 / 2;
// When zeroing, we need to spread the zeroing across both lanes to widen.
if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
(M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
WidenedMask[i / 2] = SM_SentinelZero;
return false;
// Finally check if the two mask values are adjacent and aligned with
// a pair.
if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
WidenedMask[i / 2] = M0 / 2;
// Otherwise we can't safely widen the elements used in this shuffle.
return false;
assert(WidenedMask.size() == Mask.size() / 2 &&
"Incorrect size of mask after widening the elements!");
return true;
/// Overload that takes zeroable-element information into account. When
/// V2IsZero is set, every non-undef mask element whose Zeroable bit is set is
/// replaced by SM_SentinelZero before the two-argument overload is invoked on
/// the adjusted copy of Mask.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
const APInt &Zeroable,
bool V2IsZero,
SmallVectorImpl<int> &WidenedMask) {
// Create an alternative mask with info about zeroable elements.
// Here we do not set undef elements as zeroable.
SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
if (V2IsZero) {
assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
for (int i = 0, Size = Mask.size(); i != Size; ++i)
if (Mask[i] != SM_SentinelUndef && Zeroable[i])
ZeroableMask[i] = SM_SentinelZero;
return canWidenShuffleElements(ZeroableMask, WidenedMask);
/// Convenience overload for callers that only need the yes/no answer: the
/// widened mask is written to a local scratch vector and discarded.
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
SmallVector<int, 32> WidenedMask;
return canWidenShuffleElements(Mask, WidenedMask);
// Attempt to narrow/widen shuffle mask until it matches the target number of
// elements.
//
// Mask is the source mask, NumDstElts the requested element count, and
// ScaledMask receives the result. One of the two element counts must divide
// the other (asserted). Returns true on success; false when widening is
// required but canWidenShuffleElements rejects a step.
static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
SmallVectorImpl<int> &ScaledMask) {
unsigned NumSrcElts = Mask.size();
assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
"Illegal shuffle scale factor");
// Narrowing is guaranteed to work.
if (NumDstElts >= NumSrcElts) {
int Scale = NumDstElts / NumSrcElts;
llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
return true;
// We have to repeat the widening until we reach the target size, but we can
// split out the first widening as it sets up ScaledMask for us.
if (canWidenShuffleElements(Mask, ScaledMask)) {
// Each further pass widens ScaledMask once more via a scratch vector.
while (ScaledMask.size() > NumDstElts) {
SmallVector<int, 16> WidenedMask;
if (!canWidenShuffleElements(ScaledMask, WidenedMask))
return false;
ScaledMask = std::move(WidenedMask);
return true;
return false;
/// Returns true if Elt is a constant zero or a floating point constant +0.0,
/// as determined by isNullConstant and isNullFPConstant.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in the 32-bit mode.
//
// Values supplies one int per element of VT. When IsMask is set, any negative
// value becomes an UNDEF element. If i64 is not a legal type and VT has i64
// elements, the vector is built with twice as many i32 elements and bitcast
// back to VT at the end.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
const SDLoc &dl, bool IsMask = false) {
SmallVector<SDValue, 32> Ops;
bool Split = false;
MVT ConstVecVT = VT;
unsigned NumElts = VT.getVectorNumElements();
bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
Split = true;
MVT EltVT = ConstVecVT.getVectorElementType();
for (unsigned i = 0; i < NumElts; ++i) {
bool IsUndef = Values[i] < 0 && IsMask;
// NOTE(review): OpNode is computed but never appended to Ops in the visible
// code -- confirm a push_back of OpNode is not missing here.
SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
DAG.getConstant(Values[i], dl, EltVT);
// In split mode a second i32 element follows: UNDEF or constant 0.
if (Split)
Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
DAG.getConstant(0, dl, EltVT));
SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
if (Split)
ConstsNode = DAG.getBitcast(VT, ConstsNode);
return ConstsNode;
/// Build a constant vector of type VT from per-element bit patterns. Bits
/// holds one APInt per element and Undefs has one bit per element marking it
/// as UNDEF. If i64 is not a legal type and VT has i64 elements, each element
/// is emitted as two i32 halves (low half first). f32/f64 element types are
/// emitted as FP constants built from the bit pattern. The build vector is
/// bitcast to VT before being returned.
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert(Bits.size() == Undefs.getBitWidth() &&
"Unequal constant and undef arrays");
SmallVector<SDValue, 32> Ops;
bool Split = false;
MVT ConstVecVT = VT;
unsigned NumElts = VT.getVectorNumElements();
bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
Split = true;
MVT EltVT = ConstVecVT.getVectorElementType();
for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
if (Undefs[i]) {
// NOTE(review): nothing visible skips the rest of this iteration after the
// UNDEF element(s) are appended -- confirm against the intended source.
Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
const APInt &V = Bits[i];
assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
if (Split) {
// Low 32 bits first, then the high 32 bits.
Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
} else if (EltVT == MVT::f32) {
APFloat FV(APFloat::IEEEsingle(), V);
Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
} else if (EltVT == MVT::f64) {
APFloat FV(APFloat::IEEEdouble(), V);
Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
} else {
Ops.push_back(DAG.getConstant(V, dl, EltVT));
SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
return DAG.getBitcast(VT, ConstsNode);
/// Returns a vector of specified type with all zero elements.
///
/// VT must be a 128-, 256- or 512-bit vector or have i1 elements (asserted).
/// The zero is built in a canonical type chosen below and then bitcast to VT.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
VT.getVectorElementType() == MVT::i1) &&
"Unexpected vector type");
// Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
// type. This ensures they get CSE'd. But if the integer type is not
// available, use a floating-point +0.0 instead.
SDValue Vec;
if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
// No SSE2: 128-bit zeros are built as v4f32 +0.0.
Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
} else if (VT.isFloatingPoint()) {
Vec = DAG.getConstantFP(+0.0, dl, VT);
} else if (VT.getVectorElementType() == MVT::i1) {
assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
"Unexpected vector type");
Vec = DAG.getConstant(0, dl, VT);
} else {
// Remaining integer vectors are built as i32 vectors of the same width.
unsigned Num32BitElts = VT.getSizeInBits() / 32;
Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
return DAG.getBitcast(VT, Vec);
/// Extract the vectorWidth-bit chunk of Vec that contains element IdxVal.
/// IdxVal does not have to be chunk aligned; it is rounded down to the first
/// element of its chunk. A BUILD_VECTOR input yields a smaller BUILD_VECTOR,
/// anything else yields an EXTRACT_SUBVECTOR node.
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
const SDLoc &dl, unsigned vectorWidth) {
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
unsigned Factor = VT.getSizeInBits()/vectorWidth;
// NOTE(review): the statement below is cut off in this copy (its last
// argument is missing); presumably the element count derived from Factor.
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
// we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
IdxVal &= ~(ElemsPerChunk - 1);
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(ResultVT, dl,
Vec->ops().slice(IdxVal, ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
/// Vec must be a 256-bit or 512-bit vector (asserted); the work is done by
/// extractSubVector with a width of 128.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert((Vec.getValueType().is256BitVector() ||
Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
return extractSubVector(Vec, IdxVal, DAG, dl, 128);
/// Generate a DAG to grab 256-bits from a 512-bit vector.
/// IdxVal is an element index inside the wanted 256 bits; Vec must be a
/// 512-bit vector (asserted). Forwards to extractSubVector with width 256.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
return extractSubVector(Vec, IdxVal, DAG, dl, 256);
/// Insert Vec into Result as the vectorWidth-bit chunk that contains element
/// IdxVal. Only widths of 128 and 256 are accepted (asserted). IdxVal is
/// rounded down to the first element of its chunk. If Vec is undef, Result is
/// returned unchanged; otherwise an INSERT_SUBVECTOR node of Result's type is
/// produced.
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl,
unsigned vectorWidth) {
assert((vectorWidth == 128 || vectorWidth == 256) &&
"Unsupported vector width");
// Inserting UNDEF is Result
if (Vec.isUndef())
return Result;
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
EVT ResultVT = Result.getValueType();
// Insert the relevant vectorWidth bits.
unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
// we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
IdxVal &= ~(ElemsPerChunk - 1);
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
/// Vec must be a 128-bit vector (asserted); the work is done by
/// insertSubVector with a width of 128.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
/// VT is the wide result type; it must be strictly larger than Vec's type and
/// share its scalar type (asserted). Vec is inserted at element index 0.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl) {
assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
Vec.getValueType().getScalarType() == VT.getScalarType() &&
"Unsupported vector widening type");
// NOTE(review): the conditional expression below is cut off in this copy;
// the alternative for ZeroNewElements == false is presumably an undef vector.
SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
DAG.getIntPtrConstant(0, dl));
/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
/// This overload takes the target width in bits instead of a type:
/// WideSizeInBits must exceed Vec's size and be a multiple of its scalar size
/// (asserted). The wide type is derived from Vec's scalar type and the call
/// is forwarded to the MVT-based overload.
static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl, unsigned WideSizeInBits) {
assert(Vec.getValueSizeInBits() < WideSizeInBits &&
(WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
"Unsupported vector widening type");
unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
MVT SVT = Vec.getSimpleValueType().getScalarType();
MVT VT = MVT::getVectorVT(SVT, WideNumElts);
return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
// Helper function to collect subvector ops that are concatenated together,
// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
// Ops must be empty on entry (asserted). Returns true when N matched one of
// the recognised patterns and Ops was filled, false otherwise.
static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
assert(Ops.empty() && "Expected an empty ops vector");
if (N->getOpcode() == ISD::CONCAT_VECTORS) {
Ops.append(N->op_begin(), N->op_end());
return true;
if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Src = N->getOperand(0);
SDValue Sub = N->getOperand(1);
const APInt &Idx = N->getConstantOperandAPInt(2);
EVT VT = Src.getValueType();
EVT SubVT = Sub.getValueType();
// TODO - Handle more general insert_subvector chains.
// Only handle a half-width subvector inserted into the upper half.
if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
Idx == (VT.getVectorNumElements() / 2)) {
// insert_subvector(insert_subvector(undef, x, lo), y, hi)
// NOTE(review): no statement filling Ops is visible for this pattern in
// this copy - confirm against upstream.
if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
Src.getOperand(1).getValueType() == SubVT &&
isNullConstant(Src.getOperand(2))) {
return true;
// insert_subvector(x, extract_subvector(x, lo), hi)
if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
// Both halves are the same extracted low subvector.
Ops.append(2, Sub);
return true;
return false;
/// Split vector Op into two halves and return them as (Lo, Hi). Lo starts at
/// element 0 and Hi at element NumElems / 2; each half is SizeInBits / 2
/// wide. The element count and bit size must both be even (asserted).
static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) {
EVT VT = Op.getValueType();
unsigned NumElems = VT.getVectorNumElements();
unsigned SizeInBits = VT.getSizeInBits();
assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
"Can't split odd sized vector");
SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
return std::make_pair(Lo, Hi);
// Split an unary integer op into 2 half sized ops.
// The operand is split into Lo/Hi halves, Op's opcode is applied to each half
// with the matching half result type, and the two results are concatenated
// back into a value of Op's type.
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// Make sure we only try to split 256/512-bit types to avoid creating
// narrow vectors.
assert((Op.getOperand(0).getValueType().is256BitVector() ||
Op.getOperand(0).getValueType().is512BitVector()) &&
(VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
VT.getVectorNumElements() &&
"Unexpected VTs!");
SDLoc dl(Op);
// Extract the Lo/Hi vectors
SDValue Lo, Hi;
std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
// NOTE(review): the declaration of LoVT/HiVT is not visible in this copy.
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
/// Break a binary integer operation into 2 half sized ops and then
/// concatenate the result back.
/// Both operands must have Op's result type, which must be a 256-bit or
/// 512-bit vector (asserted).
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// Sanity check that all the types match.
assert(Op.getOperand(0).getValueType() == VT &&
Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
SDLoc dl(Op);
// Extract the LHS Lo/Hi vectors
SDValue LHS1, LHS2;
std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
// Extract the RHS Lo/Hi vectors
SDValue RHS1, RHS2;
std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
// NOTE(review): the declaration of LoVT/HiVT is not visible in this copy.
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
// Low halves are combined with each other, likewise the high halves.
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
// The argument Builder is a function that will be applied on each split part:
// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
// CheckBWI selects which 512-bit test is used: useBWIRegs() when true,
// useAVX512Regs() when false. If no split is needed Builder is called once on
// Ops; otherwise the per-part results are joined with CONCAT_VECTORS.
template <typename F>
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
F Builder, bool CheckBWI = true) {
assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
unsigned NumSubs = 1;
// Pick the chunk width: 512 bits when the 512-bit registers are usable,
// otherwise 256 bits with AVX2, otherwise 128 bits.
if ((CheckBWI && Subtarget.useBWIRegs()) ||
(!CheckBWI && Subtarget.useAVX512Regs())) {
if (VT.getSizeInBits() > 512) {
NumSubs = VT.getSizeInBits() / 512;
assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
} else if (Subtarget.hasAVX2()) {
if (VT.getSizeInBits() > 256) {
NumSubs = VT.getSizeInBits() / 256;
assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
} else {
if (VT.getSizeInBits() > 128) {
NumSubs = VT.getSizeInBits() / 128;
assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
// VT already fits: no splitting required.
if (NumSubs == 1)
return Builder(DAG, DL, Ops);
SmallVector<SDValue, 4> Subs;
for (unsigned i = 0; i != NumSubs; ++i) {
SmallVector<SDValue, 2> SubOps;
for (SDValue Op : Ops) {
// Each operand is divided by NumSubs using its own type, not VT.
EVT OpVT = Op.getValueType();
unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
Subs.push_back(Builder(DAG, DL, SubOps));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
/// Insert i1-subvector to i1-vector.
/// Op is expected to carry three operands: the destination vector, the
/// subvector and a constant insertion index. The insertion is expressed with
/// KSHIFTL/KSHIFTR, AND/OR and INSERT/EXTRACT_SUBVECTOR nodes on a widened
/// mask type (WideOpVT), and the result is narrowed back to Op's type.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue SubVec = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
unsigned IdxVal = Op.getConstantOperandVal(2);
// Inserting undef is a nop. We can just return the original vector.
if (SubVec.isUndef())
return Vec;
if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
return Op;
MVT OpVT = Op.getSimpleValueType();
unsigned NumElems = OpVT.getVectorNumElements();
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
// Extend to natively supported kshift.
// Fewer than 8 elements (or exactly 8 without DQI) are widened to v8i1 when
// DQI is available and to v16i1 otherwise.
MVT WideOpVT = OpVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
// Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
// if necessary.
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
// May need to promote to a legal type.
// NOTE(review): the start of the statement these arguments belong to is not
// visible in this copy.
DAG.getConstant(0, dl, WideOpVT),
SubVec, Idx);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
MVT SubVecVT = SubVec.getSimpleValueType();
unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
assert(IdxVal + SubVecNumElems <= NumElems &&
IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR");
SDValue Undef = DAG.getUNDEF(WideOpVT);
if (IdxVal == 0) {
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
// NOTE(review): the statement below is cut off in this copy (trailing
// argument missing).
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
// Shift right then left by the subvector length to clear the low bits.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
SubVec, ZeroIdx);
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
// From here on the subvector is widened with undefined upper elements.
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, SubVec, ZeroIdx);
// Undef destination: shifting the subvector into position is enough.
if (Vec.isUndef()) {
assert(IdxVal != 0 && "Unexpected index");
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
// All-zero destination: shift left to drop the widened upper elements, then
// right to land the subvector at IdxVal with zeros around it.
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
assert(IdxVal != 0 && "Unexpected index");
NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
if (ShiftRight != 0)
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
// Simple case when we put subvector in the upper part
if (IdxVal + SubVecNumElems == NumElems) {
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
if (SubVecNumElems * 2 == NumElems) {
// Special case, use legal zero extending insert_subvector. This allows
// isel to optimize when bits are known zero.
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
Vec, ZeroIdx);
} else {
// Otherwise use explicit shifts to zero the bits.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, Vec, ZeroIdx);
NumElems = WideOpVT.getVectorNumElements();
SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
// Inserting into the middle is more complicated.
NumElems = WideOpVT.getVectorNumElements();
// Widen the vector if needed.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
// Do an optimization for the most frequently used types.
// This path is taken unless the wide type is v64i1 on a non-64-bit target.
if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
// NOTE(review): as shown, Mask0 has the insertion range SET, so the AND
// below would keep only those bits of Vec; clearing the range would need the
// mask inverted first - confirm against upstream.
APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
// Reduce to original width if needed.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
// Clear the upper bits of the subvector and move it to its insert position.
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
// Isolate the bits below the insertion point.
unsigned LowShift = NumElems - IdxVal;
SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
DAG.getTargetConstant(LowShift, dl, MVT::i8));
Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
DAG.getTargetConstant(LowShift, dl, MVT::i8));
// Isolate the bits after the last inserted bit.
unsigned HighShift = IdxVal + SubVecNumElems;
SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
DAG.getTargetConstant(HighShift, dl, MVT::i8));
High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
DAG.getTargetConstant(HighShift, dl, MVT::i8));
// Now OR all 3 pieces together.
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
// Reduce to original width if needed.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
/// Concatenate two vectors of identical type (asserted) into one vector with
/// twice the element count: V1 is inserted at element 0 of an undef vector
/// and V2 at element SubNumElts, both via insertSubVector.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
const SDLoc &dl) {
assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
EVT SubVT = V1.getValueType();
EVT SubSVT = SubVT.getScalarType();
unsigned SubNumElts = SubVT.getVectorNumElements();
unsigned SubVectorWidth = SubVT.getSizeInBits();
EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
/// VT must be a 128-bit, 256-bit or 512-bit vector type (asserted).
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Expected a 128/256/512-bit vector type");
APInt Ones = APInt::getAllOnesValue(32);
unsigned NumElts = VT.getSizeInBits() / 32;
SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
return DAG.getBitcast(VT, Vec);
// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
// NOTE(review): the case labels of the switch are not visible in this copy;
// only the unreachable fallback for unknown opcodes remains.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
switch (Opcode) {
llvm_unreachable("Unknown opcode");
/// Build an extension of vector In to vector type VT. Opcode must be
/// ANY_EXTEND, SIGN_EXTEND or ZERO_EXTEND (asserted). Inputs wider than 128
/// bits are first narrowed to the low part that is actually needed, and the
/// opcode is switched to its *_EXTEND_VECTOR_INREG form when the element
/// counts of VT and the (possibly narrowed) input differ.
static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue In, SelectionDAG &DAG) {
EVT InVT = In.getValueType();
assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
ISD::ZERO_EXTEND == Opcode) &&
"Unknown extension opcode");
// For 256-bit vectors, we only need the lower (128-bit) input half.
// For 512-bit vectors, we only need the lower input half or quarter.
if (InVT.getSizeInBits() > 128) {
assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
"Expected VTs to be the same size!");
// Scale is the ratio between result and input element widths; the extracted
// input is never narrower than 128 bits.
unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
In = extractSubVector(In, 0, DAG, DL,
std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
InVT = In.getValueType();
if (VT.getVectorNumElements() != InVT.getVectorNumElements())
Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
return DAG.getNode(Opcode, DL, VT, In);
// Match (xor X, -1) -> X.
// Match extract_subvector(xor X, -1) -> extract_subvector(X).
// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
// Returns the matched un-negated value, or an empty SDValue if V matches none
// of the patterns. Bitcasts on V are looked through first; with OneUse set
// only one-use bitcasts are looked through.
static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {
V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V);
// NOTE(review): the second half of this condition is not visible in this
// copy; per the pattern above it presumably tests for an all-ones operand.
if (V.getOpcode() == ISD::XOR &&
return V.getOperand(0);
// Only look through an extract when it starts at index 0 or its source has
// a single use.
if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
(isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
Not, V.getOperand(1));
SmallVector<SDValue, 2> CatOps;
if (collectConcatOps(V.getNode(), CatOps)) {
// Every concatenated piece must itself be a NOT, otherwise give up.
for (SDValue &CatOp : CatOps) {
SDValue NotCat = IsNOT(CatOp, DAG);
if (!NotCat) return SDValue();
CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
return SDValue();
/// Compute the shuffle indices of an unpack-low (Lo == true) or unpack-high
/// operation on VT, worked out separately for every 128-bit lane. Mask must
/// be empty on entry (asserted). In the binary form (Unary == false) the odd
/// result elements are offset by NumElts, i.e. they index the second input.
/// NOTE(review): the statement storing Pos into Mask is not visible in this
/// copy.
void llvm::createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
bool Lo, bool Unary) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
int NumElts = VT.getVectorNumElements();
int NumEltsInLane = 128 / VT.getScalarSizeInBits();
for (int i = 0; i < NumElts; ++i) {
// First element index of the 128-bit lane that holds result element i.
unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
int Pos = (i % NumEltsInLane) / 2 + LaneStart;
Pos += (Unary ? 0 : NumElts * (i % 2));
// The high variant reads from the upper half of each lane.
Pos += (Lo ? 0 : NumEltsInLane / 2);
/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
/// imposed by AVX and specific to the unary pattern. Example:
/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
/// Mask must be empty on entry (asserted).
/// NOTE(review): the statement storing Pos into Mask is not visible in this
/// copy.
void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
bool Lo) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
int NumElts = VT.getVectorNumElements();
for (int i = 0; i < NumElts; ++i) {
// Each source element is used twice; Hi starts at the middle of the vector.
int Pos = i / 2;
Pos += (Lo ? 0 : NumElts / 2);
/// Returns a vector_shuffle node for an unpackl operation.
/// The mask is the binary, low-half mask from createUnpackShuffleMask and is
/// applied to V1 and V2.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
/// Returns a vector_shuffle node for an unpackh operation.
/// The mask is the binary, high-half mask from createUnpackShuffleMask and is
/// applied to V1 and V2.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
/// Return a vector_shuffle of the specified vector of zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
/// IsZero selects a zero vector (true) or an undef vector (false) as the
/// first shuffle input.
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
bool IsZero,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = V2.getSimpleValueType();
SDValue V1 = IsZero
? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
int NumElems = VT.getVectorNumElements();
SmallVector<int, 16> MaskVec(NumElems);
for (int i = 0; i != NumElems; ++i)
// If this is the insertion idx, put the low elt of V2 here.
// (Mask index NumElems refers to element 0 of the second input, V2.)
MaskVec[i] = (i == Idx) ? NumElems : i;
return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
/// Return the IR constant addressed by Ptr, looking through one
/// X86ISD::Wrapper / X86ISD::WrapperRIP node. Returns nullptr unless the
/// pointer is a ConstantPoolSDNode that is not a machine constant pool entry
/// and has a zero offset.
static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
if (Ptr.getOpcode() == X86ISD::Wrapper ||
Ptr.getOpcode() == X86ISD::WrapperRIP)
Ptr = Ptr.getOperand(0);
auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
return nullptr;
return CNode->getConstVal();
/// Return the constant-pool constant read by Load, or nullptr if Load is null,
/// is not a normal load, or its base pointer does not resolve to a constant
/// (see getTargetConstantFromBasePtr).
static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
if (!Load || !ISD::isNormalLoad(Load))
return nullptr;
return getTargetConstantFromBasePtr(Load->getBasePtr());
/// SDValue convenience overload: looks through bitcasts on Op and forwards to
/// the LoadSDNode overload (which receives nullptr when Op is not a load).
static const Constant *getTargetConstantFromNode(SDValue Op) {
Op = peekThroughBitcasts(Op);
return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
/// Member entry point that exposes the file-local getTargetConstantFromNode
/// helper. LD must not be null (asserted).
const Constant *
X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
assert(LD && "Unexpected null LoadSDNode");
return getTargetConstantFromNode(LD);
// Extract raw constant bits from constant pools.
// Looks through bitcasts on Op and tries to describe its value as
// SizeInBits / EltSizeInBits elements of EltSizeInBits bits each. On success
// EltBits holds the element values and UndefElts has one bit per element
// marking undef elements. Returns false when Op is not one of the constant
// forms handled below, or when undefs are found that AllowWholeUndefs /
// AllowPartialUndefs do not permit. EltBits must be empty on entry (asserted).
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
APInt &UndefElts,
SmallVectorImpl<APInt> &EltBits,
bool AllowWholeUndefs = true,
bool AllowPartialUndefs = true) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
Op = peekThroughBitcasts(Op);
EVT VT = Op.getValueType();
unsigned SizeInBits = VT.getSizeInBits();
assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
// Bitcast a source array of element bits to the target size.
// Writes the results into UndefElts / EltBits of the enclosing function.
auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
unsigned NumSrcElts = UndefSrcElts.getBitWidth();
unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
"Constant bit sizes don't match");
// Don't split if we don't allow undef bits.
bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
if (UndefSrcElts.getBoolValue() && !AllowUndefs)
return false;
// If we're already the right size, don't bother bitcasting.
if (NumSrcElts == NumElts) {
UndefElts = UndefSrcElts;
EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
return true;
// Extract all the undef/constant element data and pack into single bitsets.
APInt UndefBits(SizeInBits, 0);
APInt MaskBits(SizeInBits, 0);
for (unsigned i = 0; i != NumSrcElts; ++i) {
unsigned BitOffset = i * SrcEltSizeInBits;
if (UndefSrcElts[i])
UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
MaskBits.insertBits(SrcEltBits[i], BitOffset);
// Split the undef/constant single bitset data into the target elements.
UndefElts = APInt(NumElts, 0);
EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
for (unsigned i = 0; i != NumElts; ++i) {
unsigned BitOffset = i * EltSizeInBits;
APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
// Only treat an element as UNDEF if all bits are UNDEF.
if (UndefEltBits.isAllOnesValue()) {
if (!AllowWholeUndefs)
return false;
// If only some bits are UNDEF then treat them as zero (or bail if not
// supported).
if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
return false;
EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
return true;
// Collect constant bits and insert into mask/undef bit masks.
// Handles UndefValue, ConstantInt and ConstantFP; anything else (or a null
// constant) makes it return false.
auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
unsigned UndefBitIndex) {
if (!Cst)
return false;
// NOTE(review): no statement recording the undef in Undefs at UndefBitIndex
// is visible in this copy.
if (isa<UndefValue>(Cst)) {
return true;
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
Mask = CInt->getValue();
return true;
if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
Mask = CFP->getValueAPF().bitcastToAPInt();
return true;
return false;
// Handle UNDEFs.
if (Op.isUndef()) {
APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
// Extract scalar constant bits.
if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt UndefSrcElts = APInt::getNullValue(1);
SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
return CastBitData(UndefSrcElts, SrcEltBits);
if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt UndefSrcElts = APInt::getNullValue(1);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
SmallVector<APInt, 64> SrcEltBits(1, RawBits);
return CastBitData(UndefSrcElts, SrcEltBits);
// Extract constant bits from build vector.
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
if (Src.isUndef()) {
auto *Cst = cast<ConstantSDNode>(Src);
// Operand constants are zero-extended or truncated to the element width.
SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
return CastBitData(UndefSrcElts, SrcEltBits);
// Same as above for build vectors of floating-point constants, using the
// raw bit pattern of each value.
if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
if (Src.isUndef()) {
auto *Cst = cast<ConstantFPSDNode>(Src);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
return CastBitData(UndefSrcElts, SrcEltBits);
// Extract constant bits from constant pool vector.
if (auto *Cst = getTargetConstantFromNode(Op)) {
Type *CstTy = Cst->getType();
unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
// Only vector constants whose size is a multiple of Op's size are handled.
if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
return false;
unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0; i != NumSrcElts; ++i)
if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
UndefSrcElts, i))
return false;
return CastBitData(UndefSrcElts, SrcEltBits);
// Extract constant bits from a broadcasted constant pool scalar.
if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
EltSizeInBits <= VT.getScalarSizeInBits()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
// The loaded memory scalar must be as wide as the vector's scalar type.
if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
return false;
SDValue Ptr = MemIntr->getBasePtr();
if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
// Replicate the single scalar (and its undef state) across all elements.
if (UndefSrcElts[0])
UndefSrcElts.setBits(0, NumSrcElts);
SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
return CastBitData(UndefSrcElts, SrcEltBits);
// Extract constant bits from a subvector broadcast.
if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
SmallVector<APInt, 16> SubEltBits;
if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, SubEltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
// Repeat the subvector's undef mask and element bits to fill NumElts.
UndefElts = APInt::getSplat(NumElts, UndefElts);
while (EltBits.size() < NumElts)
EltBits.append(SubEltBits.begin(), SubEltBits.end());
return true;
// Extract a rematerialized scalar constant insertion.
if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits;
auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
// NOTE(review): the statement adding CN's value as the first element is not
// visible in this copy; only the zero padding below remains.
SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
// Insert constant bits from a base and sub vector sources.
if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
// TODO - support insert_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
APInt UndefSubElts;
SmallVector<APInt, 32> EltSubBits;
if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
UndefSubElts, EltSubBits,
AllowWholeUndefs, AllowPartialUndefs) &&
getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, EltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
// Overlay the subvector's undef bits and element values at BaseIdx.
unsigned BaseIdx = Op.getConstantOperandVal(2);
UndefElts.insertBits(UndefSubElts, BaseIdx);
for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
EltBits[BaseIdx + i] = EltSubBits[i];
return true;
// Extract constant bits from a subvector's source.
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
// TODO - support extract_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts, EltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
EVT SrcVT = Op.getOperand(0).getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned NumSubElts = VT.getVectorNumElements();
unsigned BaseIdx = Op.getConstantOperandVal(1);
UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
// Drop the elements after the extracted range first, then those before it,
// so BaseIdx stays valid for the first erase.
if ((BaseIdx + NumSubElts) != NumSrcElts)
EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
if (BaseIdx != 0)
EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
return true;
// Extract constant bits from shuffle node sources.
if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
// TODO - support shuffle through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
ArrayRef<int> Mask = SVN->getMask();
// Undef mask entries are only accepted when both undef kinds are allowed.
if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
llvm::any_of(Mask, [](int M) { return M < 0; }))
return false;
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
// An input is only decoded if the mask actually references it.
// NOTE(review): both calls below are cut off in this copy (their final
// argument and closing parentheses are missing).
if (isAnyInRange(Mask, 0, NumElts) &&
!getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
UndefElts0, EltBits0, AllowWholeUndefs,
return false;
if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
!getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
UndefElts1, EltBits1, AllowWholeUndefs,
return false;
UndefElts = APInt::getNullValue(NumElts);
for (int i = 0; i != (int)NumElts; ++i) {
int M = Mask[i];
// M < 0: undef mask entry; M < NumElts: first input; otherwise second input.
if (M < 0) {
} else if (M < (int)NumElts) {
if (UndefElts0[M])
} else {
if (UndefElts1[M - NumElts])
EltBits.push_back(EltBits1[M - NumElts]);
return true;
return false;
namespace llvm {
namespace X86 {
/// Check whether \p Op is a constant vector whose defined elements all hold
/// the same value.
///
/// \param Op                  Value to inspect; its constant bits are read at
///                            the scalar width of \p Op.
/// \param SplatVal            Receives the common element value on success; it
///                            is left untouched on failure.
/// \param AllowPartialUndefs  Forwarded to getTargetConstantBitsFromNode.
/// \returns true if at least one element is defined and every defined element
///          compares equal; false if the constant bits cannot be extracted,
///          if all elements are undef, or if two defined elements differ.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
  APInt UndefElts;
  SmallVector<APInt, 16> EltBits;
  if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
                                    UndefElts, EltBits,
                                    /* AllowWholeUndefs */ true,
                                    AllowPartialUndefs)) {
    // Index of a defined element that every other defined element must match;
    // -1 means "none seen yet" or, after the loop, "not a splat".
    int SplatIndex = -1;
    for (int i = 0, e = EltBits.size(); i != e; ++i) {
      // Undef elements are compatible with any splat value.
      if (UndefElts[i])
        continue;
      if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
        // Two defined elements disagree: stop scanning and report failure.
        SplatIndex = -1;
        break;
      }
      SplatIndex = i;
    }
    if (0 <= SplatIndex) {
      SplatVal = EltBits[SplatIndex];
      return true;
    }
  }

  return false;
}
} // namespace X86
} // namespace llvm
/// Extract the raw per-element shuffle indices from a constant mask node.
///
/// \param MaskNode           Node expected to hold the constant shuffle mask.
/// \param MaskEltSizeInBits  Width of each mask element to extract.
/// \param RawMask            Receives one zero-extended value per extracted
///                           element, appended in element order.
/// \param UndefElts          Receives the undef-element bitmask reported by
///                           getTargetConstantBitsFromNode.
/// \returns false if the constant bits could not be extracted (whole-element
///          undefs are allowed, partially undef elements are not).
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                        unsigned MaskEltSizeInBits,
                                        SmallVectorImpl<uint64_t> &RawMask,
                                        APInt &UndefElts) {
  // Extract the raw target constant bits.
  SmallVector<APInt, 64> EltBits;
  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
                                     EltBits, /* AllowWholeUndefs */ true,
                                     /* AllowPartialUndefs */ false))
    return false;

  // Insert the extracted elements into the mask.
  for (const APInt &Elt : EltBits)
    RawMask.push_back(Elt.getZExtValue());

  return true;
}
/// Build the shuffle mask that is equivalent to a PACKSS/PACKUS truncation.
/// Passing NumStages > 1 yields the mask of a multi-stage pack.
/// Saturation is not modelled here; callers must have ruled it out already.
///
/// \param VT         Vector type being produced; lanes are 128 bits wide.
/// \param Mask       Output mask; must be empty on entry.
/// \param Unary      When true the second-source indices alias the first
///                   source (offset 0) instead of starting at NumElts.
/// \param NumStages  Number of pack stages folded into the mask.
static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
                                  bool Unary, unsigned NumStages = 1) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  const unsigned TotalElts = VT.getVectorNumElements();
  const unsigned LaneCount = VT.getSizeInBits() / 128;
  const unsigned EltsPerLane = 128 / VT.getScalarSizeInBits();
  const unsigned SecondSrcBase = Unary ? 0 : TotalElts;
  const unsigned Passes = 1u << (NumStages - 1);
  const unsigned Stride = 1u << NumStages;
  assert((EltsPerLane >> NumStages) > 0 && "Illegal packing compaction");

  for (unsigned L = 0; L != LaneCount; ++L) {
    // First element index of the current 128-bit lane.
    const unsigned LaneBase = L * EltsPerLane;
    for (unsigned P = 0; P != Passes; ++P) {
      // Strided picks from the first source ...
      for (unsigned E = 0; E != EltsPerLane; E += Stride)
        Mask.push_back(E + LaneBase);
      // ... followed by the same picks from the second source.
      for (unsigned E = 0; E != EltsPerLane; E += Stride)
        Mask.push_back(E + LaneBase + SecondSrcBase);
    }
  }
}
// Split the demanded elts of a PACKSS/PACKUS node between its operands.
//
// DemandedElts has one bit per result element. Within each 128-bit lane the
// first half of the result elements is mapped to DemandedLHS and the second
// half to DemandedRHS; both outputs are NumElts / 2 bits wide and are reset to
// zero before being filled in.
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
                                APInt &DemandedLHS, APInt &DemandedRHS) {
  int NumLanes = VT.getSizeInBits() / 128;
  int NumElts = DemandedElts.getBitWidth();
  int NumInnerElts = NumElts / 2;
  int NumEltsPerLane = NumElts / NumLanes;
  int NumInnerEltsPerLane = NumInnerElts / NumLanes;

  DemandedLHS = APInt::getNullValue(NumInnerElts);
  DemandedRHS = APInt::getNullValue(NumInnerElts);

  // Map DemandedElts to the packed operands.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
      int OuterIdx = (Lane * NumEltsPerLane) + Elt;
      int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
      // Low half of the result lane is produced from the first operand.
      if (DemandedElts[OuterIdx])
        DemandedLHS.setBit(InnerIdx);
      // High half of the result lane is produced from the second operand.
      if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
        DemandedRHS.setBit(InnerIdx);
    }
  }
}
// Split the demanded elts of a HADD/HSUB node between its operands.
//
// DemandedElts has one bit per result element. For every demanded result
// element the adjacent pair of source elements (2 * LocalIdx and
// 2 * LocalIdx + 1 within the same lane) is marked as demanded: in DemandedLHS
// for the first half of each lane, in DemandedRHS for the second half. Both
// outputs are NumElts bits wide and are reset to zero first.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
                                 APInt &DemandedLHS, APInt &DemandedRHS) {
  int NumLanes = VT.getSizeInBits() / 128;
  int NumElts = DemandedElts.getBitWidth();
  int NumEltsPerLane = NumElts / NumLanes;
  int HalfEltsPerLane = NumEltsPerLane / 2;

  DemandedLHS = APInt::getNullValue(NumElts);
  DemandedRHS = APInt::getNullValue(NumElts);

  // Map DemandedElts to the horizontal operands.
  for (int Idx = 0; Idx != NumElts; ++Idx) {
    // Elements nobody asked for contribute no demand on the sources.
    if (!DemandedElts[Idx])
      continue;
    int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
    int LocalIdx = Idx % NumEltsPerLane;
    if (LocalIdx < HalfEltsPerLane) {
      DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
      DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
    } else {
      LocalIdx -= HalfEltsPerLane;
      DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
      DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
    }
  }
}
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
///
/// \param N                  Shuffle node; its opcode selects the decoder.
/// \param VT                 Value type used to derive the element count and
///                           element width handed to the decoders.
/// \param AllowSentinelZero  When false, a decoded mask for which isAnyZero()
///                           holds makes the function return false.
/// \returns false when decoding leaves \p Mask empty, when a variable mask
///          operand cannot be read via getTargetShuffleMaskIndices, or when
///          zero sentinels are present but not allowed.
///
/// NOTE(review): this listing appears to have dropped lines (closing braces,
/// `break` statements, several `case` labels and some repeated statements);
/// verify against the pristine file before relying on the control flow as
/// written here.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
SmallVectorImpl<SDValue> &Ops,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
unsigned MaskEltSize = VT.getScalarSizeInBits();
SmallVector<uint64_t, 32> RawMask;
APInt RawUndefs;
uint64_t ImmN;
assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
IsUnary = false;
// Set alongside IsUnary when a two-operand shuffle is given the same node for
// both operands; the mask is then folded onto the first input further down.
bool IsFakeUnary = false;
switch (N->getOpcode()) {
case X86ISD::BLENDI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
// The immediate control is always read from the last operand.
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodeBLENDMask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
case X86ISD::SHUFP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
// NOTE(review): no case label precedes this group in the listing; the
// DecodeINSERTPSMask call suggests it handles an INSERTPS opcode — confirm.
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodeINSERTPSMask(ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
case X86ISD::EXTRQI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
// Only decodable when both the bit length and bit index are constants.
if (isa<ConstantSDNode>(N->getOperand(1)) &&
isa<ConstantSDNode>(N->getOperand(2))) {
int BitLen = N->getConstantOperandVal(1);
int BitIdx = N->getConstantOperandVal(2);
DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
IsUnary = true;
// NOTE(review): no case label is visible for this group; the
// DecodeINSERTQIMask call suggests an INSERTQI opcode — confirm.
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
if (isa<ConstantSDNode>(N->getOperand(2)) &&
isa<ConstantSDNode>(N->getOperand(3))) {
int BitLen = N->getConstantOperandVal(2);
int BitIdx = N->getConstantOperandVal(3);
DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
case X86ISD::UNPCKH:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
case X86ISD::UNPCKL:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVHLPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVLHPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
case X86ISD::VALIGN:
assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
"Only 32-bit and 64-bit elements are supported!");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodeVALIGNMask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodePALIGNRMask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
case X86ISD::VSHLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodePSLLDQMask(NumElems, ImmN, Mask);
IsUnary = true;
case X86ISD::VSRLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodePSRLDQMask(NumElems, ImmN, Mask);
IsUnary = true;
case X86ISD::PSHUFD:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = true;
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodePSHUFHWMask(NumElems, ImmN, Mask);
IsUnary = true;
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodePSHUFLWMask(NumElems, ImmN, Mask);
IsUnary = true;
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeZeroMoveLowMask(NumElems, Mask);
IsUnary = true;
// We only decode broadcasts of same-sized vectors, peeking through to
// extracted subvectors is likely to cause hasOneUse issues with
// SimplifyDemandedBits etc.
if (N->getOperand(0).getValueType() == VT) {
DecodeVectorBroadcast(NumElems, Mask);
IsUnary = true;
return false;
// Variable-mask shuffle: the mask operand (operand 1) must be readable as
// constants, otherwise decoding fails.
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
return false;
case X86ISD::PSHUFB: {
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
// The PSHUFB mask is always read as 8-bit elements.
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
DecodePSHUFBMask(RawMask, RawUndefs, Mask);
return false;
case X86ISD::VPERMI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodeVPERMMask(NumElems, ImmN, Mask);
IsUnary = true;
case X86ISD::MOVSS:
case X86ISD::MOVSD:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
case X86ISD::VPERM2X128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
case X86ISD::SHUF128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSLDUPMask(NumElems, Mask);
IsUnary = true;
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSHDUPMask(NumElems, Mask);
IsUnary = true;
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVDDUPMask(NumElems, Mask);
IsUnary = true;
case X86ISD::VPERMIL2: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
SDValue MaskNode = N->getOperand(2);
SDValue CtrlNode = N->getOperand(3);
// Needs both a constant control operand and a readable mask operand.
if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
unsigned CtrlImm = CtrlOp->getZExtValue();
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
return false;
case X86ISD::VPPERM: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
SDValue MaskNode = N->getOperand(2);
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
DecodeVPPERMMask(RawMask, RawUndefs, Mask);
return false;
case X86ISD::VPERMV: {
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
SDValue MaskNode = N->getOperand(0);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMVMask(RawMask, RawUndefs, Mask);
return false;
case X86ISD::VPERMV3: {
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
SDValue MaskNode = N->getOperand(1);
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
RawUndefs)) {
DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
return false;
default: llvm_unreachable("unknown target shuffle node");
// Empty mask indicates the decode failed.
if (Mask.empty())
return false;
// Check if we're getting a shuffle mask with zero'd elements.
if (!AllowSentinelZero && isAnyZero(Mask))
return false;
// If we have a fake unary shuffle, the shuffle mask is spread across two
// inputs that are actually the same node. Re-map the mask to always point
// into the first input.
if (IsFakeUnary)
for (int &M : Mask)
if (M >= (int)Mask.size())
M -= Mask.size();
// If we didn't already add operands in the opcode-specific code, default to
// adding 1 or 2 operands starting at 0.
// NOTE(review): the statements that push the operands onto Ops are not
// present in this listing — confirm against the pristine file.
if (Ops.empty()) {
if (!IsUnary || IsFakeUnary)
return true;
/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
///
/// \param Mask        Shuffle mask; indices in [0, Size) select from \p V1 and
///                    indices in [Size, 2*Size) select from \p V2.
/// \param V1, V2      Shuffle inputs (bitcasts are looked through).
/// \param KnownUndef  Receives one bit per mask element known to be undef.
/// \param KnownZero   Receives one bit per mask element known to be zero.
static void computeZeroableShuffleElements(ArrayRef<int> Mask,
                                           SDValue V1, SDValue V2,
                                           APInt &KnownUndef, APInt &KnownZero) {
  int Size = Mask.size();
  KnownUndef = KnownZero = APInt::getNullValue(Size);

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Size;
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0) {
      KnownUndef.setBit(i);
      continue;
    }
    if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      KnownZero.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef())
        KnownUndef.setBit(i);
      if (X86::isZeroNode(Op))
        KnownZero.setBit(i);
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        // Only the slice of the wider constant that this mask element
        // references has to be zero.
        APInt Val = Cst->getAPIntValue();
        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
        if (Val == 0)
          KnownZero.setBit(i);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
        if (Val == 0)
          KnownZero.setBit(i);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements then all the (smaller) source
    // elements must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllUndef = true;
      bool AllZero = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllUndef &= Op.isUndef();
        AllZero &= X86::isZeroNode(Op);
      }
      if (AllUndef)
        KnownUndef.setBit(i);
      if (AllZero)
        KnownZero.setBit(i);
      continue;
    }
  }
}
/// Decode a target shuffle mask and inputs and see if any values are
/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
/// FIXME: Merge this with computeZeroableShuffleElements?
///
/// \param N           Node to decode; must satisfy isTargetShuffle().
/// \param Mask        Receives the decoded mask (zero sentinels allowed).
/// \param Ops         Receives the shuffle operands.
/// \param KnownUndef  Receives one bit per mask element known to be undef.
/// \param KnownZero   Receives one bit per mask element known to be zero.
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
                                         SmallVectorImpl<SDValue> &Ops,
                                         APInt &KnownUndef, APInt &KnownZero) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
    return false;

  int Size = Mask.size();
  SDValue V1 = Ops[0];
  SDValue V2 = IsUnary ? V1 : Ops[1];
  KnownUndef = KnownZero = APInt::getNullValue(Size);

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  assert((VT.getSizeInBits() % Size) == 0 &&
         "Illegal split of shuffle value type");
  unsigned EltSizeInBits = VT.getSizeInBits() / Size;

  // Extract known constant input data.
  APInt UndefSrcElts[2];
  SmallVector<APInt, 32> SrcEltBits[2];
  bool IsSrcConstant[2] = {
      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
                                    SrcEltBits[0], true, false),
      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
                                    SrcEltBits[1], true, false)};

  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0) {
      assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
      if (SM_SentinelUndef == M)
        KnownUndef.setBit(i);
      if (SM_SentinelZero == M)
        KnownZero.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    unsigned SrcIdx = M / Size;
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      KnownUndef.setBit(i);
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      int Idx = M / Scale;
      if (Idx != 0 && !VT.isFloatingPoint())
        KnownUndef.setBit(i);
      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
        KnownZero.setBit(i);
      continue;
    }

    // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
    // base vectors.
    if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
      SDValue Vec = V.getOperand(0);
      int NumVecElts = Vec.getValueType().getVectorNumElements();
      if (Vec.isUndef() && Size == NumVecElts) {
        int Idx = V.getConstantOperandVal(2);
        int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
        // Elements outside the inserted range come from the UNDEF base.
        if (M < Idx || (Idx + NumSubElts) <= M)
          KnownUndef.setBit(i);
      }
      continue;
    }

    // Attempt to extract from the source's constant bits.
    if (IsSrcConstant[SrcIdx]) {
      if (UndefSrcElts[SrcIdx][M])
        KnownUndef.setBit(i);
      else if (SrcEltBits[SrcIdx][M] == 0)
        KnownZero.setBit(i);
    }
  }

  assert(VT.getVectorNumElements() == (unsigned)Size &&
         "Different mask size from vector size!");
  return true;
}
// Overwrite shuffle mask entries with the undef/zero sentinels wherever the
// corresponding bit is set in KnownUndef / KnownZero. Undef takes priority
// over zero; zero entries are only rewritten when ResolveKnownZeros is set.
// Both bitmasks must be exactly as wide as the mask.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
                                              const APInt &KnownUndef,
                                              const APInt &KnownZero,
                                              bool ResolveKnownZeros= true) {
  const unsigned MaskLen = Mask.size();
  assert(KnownUndef.getBitWidth() == MaskLen &&
         KnownZero.getBitWidth() == MaskLen && "Shuffle mask size mismatch");

  for (unsigned Idx = 0; Idx < MaskLen; ++Idx) {
    if (KnownUndef[Idx]) {
      Mask[Idx] = SM_SentinelUndef;
      continue;
    }
    if (ResolveKnownZeros && KnownZero[Idx])
      Mask[Idx] = SM_SentinelZero;
  }
}
// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
//
// KnownUndef and KnownZero are reset to all-zero bitmasks with one bit per
// mask element; bit i is then set in KnownUndef when Mask[i] is
// SM_SentinelUndef and in KnownZero when Mask[i] is SM_SentinelZero.
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
                                              APInt &KnownUndef,
                                              APInt &KnownZero) {
  unsigned NumElts = Mask.size();
  KnownUndef = KnownZero = APInt::getNullValue(NumElts);

  for (unsigned i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (SM_SentinelUndef == M)
      KnownUndef.setBit(i);
    if (SM_SentinelZero == M)
      KnownZero.setBit(i);
  }
}
// Forward declaration (for getFauxShuffleMask recursive check).
// getFauxShuffleMask calls this on operands of the node it is decoding, so the
// prototype has to be visible before that function's body.
// TODO: Use DemandedElts variant.
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts);
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
//
// N is the node to decode and DemandedElts has one bit per element of N's
// value type (asserted below). On success the mask is appended to Mask, the
// shuffle sources to Ops, and true is returned. Depth is passed on (as
// Depth + 1) to the known-bits / sign-bits queries and to the recursive
// getTargetShuffleInputs calls; ResolveKnownElts is forwarded to
// getTargetShuffleInputs in the subvector-insertion handling.
//
// NOTE(review): this listing appears to have dropped lines (closing braces,
// `break`/`continue`, several `case` labels and some repeated statements);
// verify against the pristine file before relying on the control flow as
// written here.
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts) {
MVT VT = N.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
unsigned NumSizeInBits = VT.getSizeInBits();
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
// The byte-level masks built below need whole-byte element and vector sizes.
if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
return false;
assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
unsigned NumSizeInBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
unsigned Opcode = N.getOpcode();
switch (Opcode) {
// Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
// NOTE(review): the case label for this group is not present in the listing.
ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
Mask.append(ShuffleMask.begin(), ShuffleMask.end());
return true;
return false;
case ISD::AND:
case X86ISD::ANDNP: {
// Attempt to decode as a per-byte mask.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
bool IsAndN = (X86ISD::ANDNP == Opcode);
// Constant byte value that zeroes the result byte: 0 for AND, 255 for ANDNP.
uint64_t ZeroMask = IsAndN ? 255 : 0;
if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
return false;
for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
if (UndefElts[i]) {
const APInt &ByteBits = EltBits[i];
// Only all-zero or all-one bytes can be expressed as a byte shuffle.
if (ByteBits != 0 && ByteBits != 255)
return false;
Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
Ops.push_back(IsAndN ? N1 : N0);
return true;
case ISD::OR: {
// Inspect each operand at the byte level. We can merge these into a
// blend shuffle mask if for each byte at least one is masked out (zero).
KnownBits Known0 =
DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1);
KnownBits Known1 =
DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
bool IsByteMask = true;
APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
if (LHS == 255 && RHS == 0)
else if (LHS == 255 && RHS == 255)
else if (!(LHS == 0 && RHS == 255))
IsByteMask = false;
if (IsByteMask) {
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
for (unsigned j = 0; j != NumBytesPerElt; ++j) {
unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
return true;
// Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
// is a valid shuffle index.
SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
return false;
SmallVector<int, 64> SrcMask0, SrcMask1;
SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
true) ||
!getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
return false;
// Shuffle inputs must be the same size as the result.
if (llvm::any_of(SrcInputs0, [VT](SDValue Op) {
return VT.getSizeInBits() != Op.getValueSizeInBits();
return false;
if (llvm::any_of(SrcInputs1, [VT](SDValue Op) {
return VT.getSizeInBits() != Op.getValueSizeInBits();
return false;
// Bring both source masks to the finer of the two granularities so they can
// be merged element by element.
size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
for (size_t i = 0; i != MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
else if (Mask1[i] == SM_SentinelZero)
else if (Mask0[i] == SM_SentinelZero)
Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
return false;
Ops.append(SrcInputs0.begin(), SrcInputs0.end());
Ops.append(SrcInputs1.begin(), SrcInputs1.end());
return true;
// NOTE(review): no case label is visible for this group; operand 2 is read as
// an insertion index and operand 1 as a subvector, which suggests a
// subvector-insertion opcode — confirm.
SDValue Src = N.getOperand(0);
SDValue Sub = N.getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
if (!N->isOnlyUserOf(Sub.getNode()))
return false;
uint64_t InsertIdx = N.getConstantOperandVal(2);
if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Sub.getOperand(0).getValueType() == VT) {
uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
for (int i = 0; i != (int)NumElts; ++i)
for (int i = 0; i != (int)NumSubElts; ++i)
Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
return true;
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubInputs;
if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
SubMask, DAG, Depth + 1, ResolveKnownElts))
return false;
// Subvector shuffle inputs must not be larger than the subvector.
if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
return SubVT.getSizeInBits() < SubInput.getValueSizeInBits();
return false;
// Reconcile the sub-shuffle's mask granularity with the subvector's element
// count: either narrow the submask or scale up the outer element counts.
if (SubMask.size() != NumSubElts) {
assert(((SubMask.size() % NumSubElts) == 0 ||
(NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
if ((NumSubElts % SubMask.size()) == 0) {
int Scale = NumSubElts / SubMask.size();
SmallVector<int,64> ScaledSubMask;
narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
SubMask = ScaledSubMask;
} else {
int Scale = SubMask.size() / NumSubElts;
NumSubElts = SubMask.size();
NumElts *= Scale;
InsertIdx *= Scale;
Ops.append(SubInputs.begin(), SubInputs.end());
for (int i = 0; i != (int)NumElts; ++i)
for (int i = 0; i != (int)NumSubElts; ++i) {
int M = SubMask[i];
if (0 <= M) {
int InputIdx = M / NumSubElts;
M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
Mask[i + InsertIdx] = M;
return true;
case X86ISD::PINSRB:
case X86ISD::PINSRW:
// Match against a insert_vector_elt/scalar_to_vector of an extract from a
// vector, for matching src/dst vector types.
SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
unsigned DstIdx = 0;
if (Opcode != ISD::SCALAR_TO_VECTOR) {
// Check we have an in-range constant insertion index.
if (!isa<ConstantSDNode>(N.getOperand(2)) ||
return false;
DstIdx = N.getConstantOperandVal(2);
// Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
if (X86::isZeroNode(Scl)) {
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
return true;
// Peek through trunc/aext/zext.
// TODO: aext shouldn't require SM_SentinelZero padding.
// TODO: handle shift of scalars.
unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
while (Scl.getOpcode() == ISD::TRUNCATE ||
Scl.getOpcode() == ISD::ANY_EXTEND ||
Scl.getOpcode() == ISD::ZERO_EXTEND) {
Scl = Scl.getOperand(0);
MinBitsPerElt =
std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
if ((MinBitsPerElt % 8) != 0)
return false;
// Attempt to find the source vector the scalar was extracted from.
SDValue SrcExtract;
if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
Scl.getOpcode() == X86ISD::PEXTRW ||
Scl.getOpcode() == X86ISD::PEXTRB) &&
Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
SrcExtract = Scl;
if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
return false;
SDValue SrcVec = SrcExtract.getOperand(0);
EVT SrcVT = SrcVec.getValueType();
if (!SrcVT.getScalarType().isByteSized())
return false;
unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
unsigned DstByte = DstIdx * NumBytesPerElt;
MinBitsPerElt =
std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
// Create 'identity' byte level shuffle mask and then add inserted bytes.
if (Opcode == ISD::SCALAR_TO_VECTOR) {
Mask.append(NumSizeInBytes, SM_SentinelUndef);
} else {
for (int i = 0; i != (int)NumSizeInBytes; ++i)
Mask.push_back(NumSizeInBytes + i);
// Bytes actually carried over from the source; the remaining bytes of the
// destination element are filled with zero sentinels.
unsigned MinBytesPerElts = MinBitsPerElt / 8;
MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
for (unsigned i = 0; i != MinBytesPerElts; ++i)
Mask[DstByte + i] = SrcByte + i;
for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
Mask[DstByte + i] = SM_SentinelZero;
return true;
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
"Unexpected input value type");
APInt EltsLHS, EltsRHS;
getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
// If we know input saturation won't happen we can treat this
// as a truncation shuffle.
if (Opcode == X86ISD::PACKSS) {
if ((!N0.isUndef() &&
DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
(!N1.isUndef() &&
DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
return false;
} else {
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
if ((!N0.isUndef() &&
!DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
(!N1.isUndef() &&
!DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
return false;
bool IsUnary = (N0 == N1);
if (!IsUnary)
createPackShuffleMask(VT, Mask, IsUnary);
return true;
case X86ISD::VTRUNC: {
SDValue Src = N.getOperand(0);
EVT SrcVT = Src.getValueType();
// Truncated source must be a simple vector.
if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
(SrcVT.getScalarSizeInBits() % 8) != 0)
return false;
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
for (unsigned i = 0; i != NumSrcElts; ++i)
Mask.push_back(i * Scale);
// Result elements beyond the truncated source are zero.
Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
return true;
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
uint64_t ShiftVal = N.getConstantOperandVal(1);
// Out of range bit shifts are guaranteed to be zero.
if (NumBitsPerElt <= ShiftVal) {
Mask.append(NumElts, SM_SentinelZero);
return true;
// We can only decode 'whole byte' bit shifts as shuffles.
if ((ShiftVal % 8) != 0)
uint64_t ByteShift = ShiftVal / 8;
// Clear mask to all zeros and insert the shifted byte indices.
Mask.append(NumSizeInBytes, SM_SentinelZero);
if (X86ISD::VSHLI == Opcode) {
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j] = i + j - ByteShift;
} else {
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j - ByteShift] = i + j;
return true;
case X86ISD::VROTLI:
case X86ISD::VROTRI: {
// We can only decode 'whole byte' bit rotates as shuffles.
uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
if ((RotateVal % 8) != 0)
return false;
int Offset = RotateVal / 8;
// A left rotate by Offset bytes is expressed as the complementary offset.
Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
for (int i = 0; i != (int)NumElts; ++i) {
int BaseIdx = i * NumBytesPerElt;
for (int j = 0; j != (int)NumBytesPerElt; ++j) {
Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
return true;
// NOTE(review): no case label is visible for this group; it emits a mask of
// NumElts zeros (every element reads source element 0) for a vector source —
// presumably a broadcast-style opcode, confirm.
SDValue Src = N.getOperand(0);
if (!Src.getSimpleValueType().isVector())
return false;
Mask.append(NumElts, 0);
return true;
// NOTE(review): no case label is visible for this group either; the
// DecodeZeroExtendMask call suggests vector extension opcodes — confirm.
SDValue Src = N.getOperand(0);
EVT SrcVT = Src.getValueType();
// Extended source must be a simple vector.
if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
(SrcVT.getScalarSizeInBits() % 8) != 0)
return false;
bool IsAnyExtend =
DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
IsAnyExtend, Mask);
return true;
return false;
/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
///
/// Mask indices are grouped per input: the k'th surviving input owns the index
/// range [k * Mask.size(), (k + 1) * Mask.size()). Inputs that are UNDEF have
/// their mask references replaced by SM_SentinelUndef, inputs that are never
/// referenced are dropped, and a repeated input is folded onto its first
/// occurrence. In the latter two cases all higher mask indices are shifted down
/// by one input width so the mask stays consistent with the compacted list.
///
/// \param Inputs [in,out] shuffle source operands; replaced by the used set.
/// \param Mask   [in,out] shuffle mask indexing into \p Inputs.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
                                              SmallVectorImpl<int> &Mask) {
  int MaskWidth = Mask.size();
  SmallVector<SDValue, 16> UsedInputs;
  for (int i = 0, e = Inputs.size(); i < e; ++i) {
    // Index range this input occupies once the earlier inputs are compacted.
    int lo = UsedInputs.size() * MaskWidth;
    int hi = lo + MaskWidth;

    // Strip UNDEF input usage.
    if (Inputs[i].isUndef())
      for (int &M : Mask)
        if ((lo <= M) && (M < hi))
          M = SM_SentinelUndef;

    // Check for unused inputs.
    if (none_of(Mask, [lo, hi](int Idx) { return (lo <= Idx) && (Idx < hi); })) {
      // Nothing references this input: close the gap it leaves in the mask.
      for (int &M : Mask)
        if (lo <= M)
          M -= MaskWidth;
      continue;
    }

    // Check for repeated inputs.
    bool IsRepeat = false;
    for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
      if (UsedInputs[j] != Inputs[i])
        continue;
      // Redirect references to the earlier copy and shift later inputs down.
      for (int &M : Mask)
        if (lo <= M)
          M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
      IsRepeat = true;
      break;
    }
    if (IsRepeat)
      continue;

    UsedInputs.push_back(Inputs[i]);
  }
  Inputs = UsedInputs;
}
/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
/// Returns true if the target shuffle mask was decoded.
///
/// If \p Op is not decodable as a real target shuffle, getFauxShuffleMask is
/// tried as a fallback, and the known undef/zero element sets are then derived
/// from the decoded mask instead.
///
/// \param Op               node to decode; must have a simple vector type.
/// \param DemandedElts     forwarded to getFauxShuffleMask.
/// \param Inputs           [out] shuffle source operands.
/// \param Mask             [out] decoded shuffle mask.
/// \param KnownUndef       [out] elements known to be undef.
/// \param KnownZero        [out] elements known to be zero.
/// \param Depth            current recursion depth, forwarded to the fallback.
/// \param ResolveKnownElts if true, fold known undef/zero elements into
///                         \p Mask as sentinel values.
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
                                   SmallVectorImpl<SDValue> &Inputs,
                                   SmallVectorImpl<int> &Mask,
                                   APInt &KnownUndef, APInt &KnownZero,
                                   const SelectionDAG &DAG, unsigned Depth,
                                   bool ResolveKnownElts) {
  EVT VT = Op.getValueType();
  if (!VT.isSimple() || !VT.isVector())
    return false;

  // Preferred path: a genuine target shuffle.
  if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
    if (ResolveKnownElts)
      resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
    return true;
  }

  // Fallback: nodes that merely behave like a shuffle.
  if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
                         ResolveKnownElts)) {
    resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
    return true;
  }
  return false;
}
/// Convenience overload of getTargetShuffleInputs that demands every element
/// of \p Op and discards the known undef/zero element sets.
/// Returns true if a shuffle mask was decoded into \p Inputs / \p Mask.
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
                                   SmallVectorImpl<int> &Mask,
                                   const SelectionDAG &DAG, unsigned Depth = 0,
                                   bool ResolveKnownElts = true) {
  EVT VT = Op.getValueType();
  if (!VT.isSimple() || !VT.isVector())
    return false;

  APInt KnownUndef, KnownZero;
  unsigned NumElts = Op.getValueType().getVectorNumElements();
  APInt DemandedElts = APInt::getAllOnesValue(NumElts);
  return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
                                KnownZero, DAG, Depth, ResolveKnownElts);
}
/// Returns the scalar element that will make up the i'th
/// element of the result of the vector shuffle.
///
/// Walks through shuffles, sub-vector insert/extract/concat nodes and
/// same-element-count bitcasts until a node that directly provides the scalar
/// is reached. Returns an empty SDValue when the element cannot be determined
/// or the recursion limit is hit.
///
/// \param Op    vector-typed node to look into.
/// \param Index element of \p Op being queried.
/// \param Depth current recursion depth.
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
                                   SelectionDAG &DAG, unsigned Depth) {
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return SDValue(); // Limit search depth.

  EVT VT = Op.getValueType();
  unsigned Opcode = Op.getOpcode();
  unsigned NumElems = VT.getVectorNumElements();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
    int Elt = SV->getMaskElt(Index);

    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = VT.getSimpleVT();
    MVT ShufSVT = ShufVT.getVectorElementType();
    int NumElems = (int)ShufVT.getVectorNumElements();
    SmallVector<int, 16> ShuffleMask;
    SmallVector<SDValue, 16> ShuffleOps;
    bool IsUnary;

    if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
                              ShuffleMask, IsUnary))
      return SDValue();

    int Elt = ShuffleMask[Index];
    if (Elt == SM_SentinelZero)
      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
                                 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
    if (Elt == SM_SentinelUndef)
      return DAG.getUNDEF(ShufSVT);

    assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
    SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
  }

  // Recurse into insert_subvector base/sub vector to find scalars.
  if (Opcode == ISD::INSERT_SUBVECTOR) {
    SDValue Vec = Op.getOperand(0);
    SDValue Sub = Op.getOperand(1);
    uint64_t SubIdx = Op.getConstantOperandVal(2);
    unsigned NumSubElts = Sub.getValueType().getVectorNumElements();

    if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
      return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
    return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
  }

  // Recurse into concat_vectors sub vector to find scalars.
  if (Opcode == ISD::CONCAT_VECTORS) {
    EVT SubVT = Op.getOperand(0).getValueType();
    unsigned NumSubElts = SubVT.getVectorNumElements();
    uint64_t SubIdx = Index / NumSubElts;
    uint64_t SubElt = Index % NumSubElts;
    return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
  }

  // Recurse into extract_subvector src vector to find scalars.
  if (Opcode == ISD::EXTRACT_SUBVECTOR) {
    SDValue Src = Op.getOperand(0);
    uint64_t SrcIdx = Op.getConstantOperandVal(1);
    return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
  }

  // We only peek through bitcasts of the same vector width.
  if (Opcode == ISD::BITCAST) {
    SDValue Src = Op.getOperand(0);
    EVT SrcVT = Src.getValueType();
    if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
      return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
    return SDValue();
  }

  // Actual nodes that may contain scalar elements

  // For insert_vector_elt - either return the index matching scalar or recurse
  // into the base vector.
  if (Opcode == ISD::INSERT_VECTOR_ELT &&
      isa<ConstantSDNode>(Op.getOperand(2))) {
    if (Op.getConstantOperandAPInt(2) == Index)
      return Op.getOperand(1);
    return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
  }

  // Only element 0 of a scalar_to_vector is defined.
  if (Opcode == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? Op.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (Opcode == ISD::BUILD_VECTOR)
    return Op.getOperand(Index);

  return SDValue();
}
// Use PINSRB/PINSRW/PINSRD to create a build vector.
//
// Builds the vector by inserting each non-zero operand of Op one at a time.
//   NonZeros   - bitmask; bit i set means operand i must be inserted.
//   NumNonZero - number of set bits in NonZeros (not read here).
//   NumZero    - number of zero operands; non-zero forces a zero base vector.
// Returns the resulting vector, or an empty SDValue if NonZeros has no bit set
// for any element.
static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
                                        unsigned NumNonZero, unsigned NumZero,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
         "Illegal vector insertion");

  SDLoc dl(Op);
  SDValue V;
  bool First = true;

  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsNonZero = (NonZeros & (1 << i)) != 0;
    if (!IsNonZero)
      continue;

    // If the build vector contains zeros or our first insertion is not the
    // first index then insert into zero vector to break any register
    // dependency else use SCALAR_TO_VECTOR.
    if (First) {
      First = false;
      if (NumZero || 0 != i)
        V = getZeroVector(VT, Subtarget, DAG, dl);
      else {
        assert(0 == i && "Expected insertion into zero-index");
        V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
        V = DAG.getBitcast(VT, V);
        // Element 0 is already in place; no explicit insert needed.
        continue;
      }
    }
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
                    DAG.getIntPtrConstant(i, dl));
  }

  return V;
}
/// Custom lower build_vector of v16i8.
///
/// With SSE4.1 each byte is inserted directly. Otherwise adjacent byte pairs
/// are merged into a 16-bit value and inserted into a v8i16 vector, which is
/// bitcast back to v16i8. Returns an empty SDValue when more than 8 elements
/// are non-zero and SSE4.1 is unavailable.
///
/// \param NonZeros   bitmask of operands that must be inserted.
/// \param NumNonZero number of set bits in \p NonZeros.
/// \param NumZero    number of zero operands.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (NumNonZero > 8 && !Subtarget.hasSSE41())
    return SDValue();

  // SSE4.1 - use PINSRB to insert each byte directly.
  if (Subtarget.hasSSE41())
    return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
                                    Subtarget);

  SDLoc dl(Op);
  SDValue V;

  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
  for (unsigned i = 0; i < 16; i += 2) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
    if (!ThisIsNonZero && !NextIsNonZero)
      continue;

    // FIXME: Investigate combining the first 4 bytes as a i32 instead.
    SDValue Elt;
    if (ThisIsNonZero) {
      // Zero-extend when the upper bits must be clean (they will be OR'd with
      // the next byte or are expected to be zero); otherwise any-extend.
      if (NumZero || NextIsNonZero)
        Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
      else
        Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
    }

    if (NextIsNonZero) {
      SDValue NextElt = Op.getOperand(i + 1);
      if (i == 0 && NumZero)
        NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
      else
        NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
      // Move the odd byte into bits [15:8] of the pair.
      NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
                            DAG.getConstant(8, dl, MVT::i8));
      if (ThisIsNonZero)
        Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
      else
        Elt = NextElt;
    }

    // If our first insertion is not the first index or zeros are needed, then
    // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
    // elements undefined).
    if (!V) {
      if (i != 0 || NumZero)
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
      else {
        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
        V = DAG.getBitcast(MVT::v8i16, V);
        continue;
      }
    }
    Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
                    DAG.getIntPtrConstant(i / 2, dl));
  }

  return DAG.getBitcast(MVT::v16i8, V);
}
/// Custom lower build_vector of v8i16.
///
/// Returns an empty SDValue when more than 4 elements are non-zero and SSE4.1
/// is unavailable; otherwise defers to LowerBuildVectorAsInsert.
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (NumNonZero > 4 && !Subtarget.hasSSE41())
    return SDValue();

  // Use PINSRW to insert each 16-bit element directly.
  return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
                                  Subtarget);
}
/// Custom lower build_vector of v4i32 or v4f32.
///
/// Tries, in order:
///  1. a MOVDDUP of the first two elements when the vector is a splat of an
///     element pair (SSE3 without XOP),
///  2. a blend of a single source vector with zero/undef when every non-zero
///     element is extracted from the same lane of that source,
///  3. an INSERTPS (SSE4.1) when exactly one element breaks that pattern.
/// Returns an empty SDValue if none of these apply.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // If this is a splat of a pair of elements, use MOVDDUP (unless the target
  // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
  // Because we're creating a less complicated build vector here, we may enable
  // further folding of the MOVDDUP via shuffle transforms.
  if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
      Op.getOperand(0) == Op.getOperand(2) &&
      Op.getOperand(1) == Op.getOperand(3) &&
      Op.getOperand(0) != Op.getOperand(1)) {
    SDLoc DL(Op);
    MVT VT = Op.getSimpleValueType();
    MVT EltVT = VT.getVectorElementType();
    // Create a new build vector with the first 2 elements followed by undef
    // padding, bitcast to v2f64, duplicate, and bitcast back.
    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
    SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
    SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
    return DAG.getBitcast(VT, Dup);
  }

  // Find all zeroable elements.
  std::bitset<4> Zeroable, Undefs;
  for (int i = 0; i < 4; ++i) {
    SDValue Elt = Op.getOperand(i);
    Undefs[i] = Elt.isUndef();
    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
  }
  assert(Zeroable.size() - Zeroable.count() > 1 &&
         "We expect at least two non-zero elements!");

  // We only know how to deal with build_vector nodes where elements are either
  // zeroable or extract_vector_elt with constant index.
  SDValue FirstNonZero;
  unsigned FirstNonZeroIdx;
  for (unsigned i = 0; i < 4; ++i) {
    if (Zeroable[i])
      continue;
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Elt.getOperand(1)))
      return SDValue();
    // Make sure that this node is extracting from a 128-bit vector.
    MVT VT = Elt.getOperand(0).getSimpleValueType();
    if (!VT.is128BitVector())
      return SDValue();
    if (!FirstNonZero.getNode()) {
      FirstNonZero = Elt;
      FirstNonZeroIdx = i;
    }
  }

  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
  SDValue V1 = FirstNonZero.getOperand(0);
  MVT VT = V1.getSimpleValueType();

  // See if this build_vector can be lowered as a blend with zero.
  SDValue Elt;
  unsigned EltMaskIdx, EltIdx;
  int Mask[4];
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
    if (Zeroable[EltIdx]) {
      // The zero vector will be on the right hand side.
      Mask[EltIdx] = EltIdx+4;
      continue;
    }

    Elt = Op->getOperand(EltIdx);
    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
    EltMaskIdx = Elt.getConstantOperandVal(1);
    // Stop at the first element that is not an in-place extract from V1; it
    // becomes the INSERTPS candidate below.
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
      break;
    Mask[EltIdx] = EltIdx;
  }

  if (EltIdx == 4) {
    // Let the shuffle legalizer deal with blend operations.
    SDValue VZeroOrUndef = (Zeroable == Undefs)
                               ? DAG.getUNDEF(VT)
                               : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
    if (V1.getSimpleValueType() != VT)
      V1 = DAG.getBitcast(VT, V1);
    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
  }

  // See if we can lower this build_vector to a INSERTPS.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDValue V2 = Elt.getOperand(0);
  // If the mismatching element is the very first non-zero one, the base vector
  // is still undetermined; pick it up from the remaining elements.
  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    V1 = SDValue();

  bool CanFold = true;
  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
    if (Zeroable[i])
      continue;

    SDValue Current = Op->getOperand(i);
    SDValue SrcVector = Current->getOperand(0);
    if (!V1.getNode())
      V1 = SrcVector;
    CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
  }

  if (!CanFold)
    return SDValue();

  assert(V1.getNode() && "Expected at least two non-zero elements!");
  if (V1.getSimpleValueType() != MVT::v4f32)
    V1 = DAG.getBitcast(MVT::v4f32, V1);
  if (V2.getSimpleValueType() != MVT::v4f32)
    V2 = DAG.getBitcast(MVT::v4f32, V2);

  // Ok, we can emit an INSERTPS instruction.
  unsigned ZMask = Zeroable.to_ulong();

  // Immediate layout: [7:6] source lane, [5:4] destination lane, [3:0] zero
  // mask.
  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
  SDLoc DL(Op);
  SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                               DAG.getIntPtrConstant(InsertPSMask, DL, true));
  return DAG.getBitcast(VT, Result);
}
/// Return a vector logical shift node.
///
/// The shift is performed on the whole 128-bit register as a byte shift: the
/// source is bitcast to v16i8, shifted with VSHLDQ/VSRLDQ, and bitcast back.
///
/// \param isLeft  true for a left shift, false for a right shift.
/// \param VT      result type; must be a 128-bit vector.
/// \param NumBits shift amount in bits; must be a multiple of 8.
/// \param TLI     unused.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
                         SelectionDAG &DAG, const TargetLowering &TLI,
                         const SDLoc &dl) {
  assert(VT.is128BitVector() && "Unknown type for VShift");
  MVT ShVT = MVT::v16i8;
  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
  SrcOp = DAG.getBitcast(ShVT, SrcOp);
  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
  SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
  return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
/// Try to turn a splat of a scalar stack load into a full-width vector load
/// followed by a splat shuffle.
///
/// Only simple, normal i32/f32 loads whose address is a frame index (optionally
/// plus a non-negative constant offset) are handled. The stack object's
/// alignment may be raised to the vector size as a side effect. Returns an
/// empty SDValue when the transformation does not apply.
///
/// \param SrcOp the scalar value being splatted.
/// \param VT    the vector type being built; determines the required alignment
///              and the element count.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
                                      SelectionDAG &DAG) {

  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
    SDValue Ptr = LD->getBasePtr();
    if (!ISD::isNormalLoad(LD) || !LD->isSimple())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Offset = Ptr.getConstantOperandVal(1);
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    // FIXME: 256-bit vector instructions don't require a strict alignment,
    // improve this code to support it better.
    Align RequiredAlign(VT.getSizeInBits() / 8);
    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16 or 32.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
    if (!InferredAlign || *InferredAlign < RequiredAlign) {
      if (MFI.isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead.
        // If someone *really* cares about this. That's the way to implement it.
        return SDValue();
      } else {
        MFI.setObjectAlignment(FI, RequiredAlign);
      }
    }

    // (Offset % 16 or 32) must be a multiple of 4. The address is then
    // Ptr + (Offset & ~15).
    if (Offset < 0)
      return SDValue();
    if ((Offset % RequiredAlign.value()) & 3)
      return SDValue();
    int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
    if (StartOffset) {
      SDLoc DL(Ptr);
      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
    }

    // Index of the wanted 4-byte element inside the aligned vector.
    int EltNo = (Offset - StartOffset) >> 2;
    unsigned NumElems = VT.getVectorNumElements();

    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
    // NOTE(review): the pointer-info operand below was restored after being
    // lost from this listing - confirm against the upstream revision.
    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                             LD->getPointerInfo().getWithOffset(StartOffset));

    SmallVector<int, 8> Mask(NumElems, EltNo);

    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
  }

  return SDValue();
}
// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
//
// On success returns true, sets Ld to the simple non-extending load that Elt is
// ultimately derived from, and sets ByteOffset to the byte position of Elt
// within that load's value. Returns false if no such load is found; Ld and
// ByteOffset may have been modified in that case.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
  if (ISD::isNON_EXTLoad(Elt.getNode())) {
    auto *BaseLd = cast<LoadSDNode>(Elt);
    if (!BaseLd->isSimple())
      return false;
    Ld = BaseLd;
    ByteOffset = 0;
    return true;
  }

  switch (Elt.getOpcode()) {
  // NOTE(review): the case labels for this pass-through arm were lost from this
  // listing and have been restored - confirm against the upstream revision.
  case ISD::BITCAST:
  case ISD::TRUNCATE:
  case ISD::SCALAR_TO_VECTOR:
    return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
  case ISD::SRL:
    // A whole-byte right shift by a constant selects a higher byte offset.
    if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
      uint64_t Idx = IdxC->getZExtValue();
      if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
        ByteOffset += Idx / 8;
        return true;
      }
    }
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    // A constant-index extract of a same-width element advances the offset by
    // whole elements.
    if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
      SDValue Src = Elt.getOperand(0);
      unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
      unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
      if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
          findEltLoadSrc(Src, Ld, ByteOffset)) {
        uint64_t Idx = IdxC->getZExtValue();
        ByteOffset += Idx * (SrcSizeInBits / 8);
        return true;
      }
    }
    break;
  }

  return false;
}
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
///
/// Each element is classified as undef, zero or loaded. Depending on the
/// resulting pattern the function produces a plain vector load (optionally
/// shuffled with zero), a half-width load inserted into undef, a VZEXT_LOAD,
/// or a broadcast of a repeated load pattern. Returns an empty SDValue when no
/// pattern matches.
///
/// \param isAfterLegalize if true, only legal load operations are created and
///                        the zero-clearing shuffle form is not attempted.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                        const SDLoc &DL, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget,
                                        bool isAfterLegalize) {
  if ((VT.getScalarSizeInBits() % 8) != 0)
    return SDValue();

  unsigned NumElems = Elts.size();

  int LastLoadedElt = -1;
  APInt LoadMask = APInt::getNullValue(NumElems);
  APInt ZeroMask = APInt::getNullValue(NumElems);
  APInt UndefMask = APInt::getNullValue(NumElems);

  SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
  SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);

  // For each element in the initializer, see if we've found a load, zero or an
  // undef.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = peekThroughBitcasts(Elts[i]);
    if (!Elt.getNode())
      return SDValue();
    if (Elt.isUndef()) {
      UndefMask.setBit(i);
      continue;
    }
    if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
      ZeroMask.setBit(i);
      continue;
    }

    // Each loaded element must be the correct fractional portion of the
    // requested vector load.
    unsigned EltSizeInBits = Elt.getValueSizeInBits();
    if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
      return SDValue();

    if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
      return SDValue();
    // The element must lie entirely inside the load it was derived from.
    unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
    if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
      return SDValue();

    LoadMask.setBit(i);
    LastLoadedElt = i;
  }
  assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
          LoadMask.countPopulation()) == NumElems &&
         "Incomplete element masks");

  // Handle Special Cases - all undef or undef/zero.
  if (UndefMask.countPopulation() == NumElems)
    return DAG.getUNDEF(VT);

  // FIXME: Should we return this as a BUILD_VECTOR instead?
  if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
                          : DAG.getConstantFP(0.0, DL, VT);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  int FirstLoadedElt = LoadMask.countTrailingZeros();
  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
  EVT EltBaseVT = EltBase.getValueType();
  assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
         "Register/Memory size mismatch");
  LoadSDNode *LDBase = Loads[FirstLoadedElt];
  assert(LDBase && "Did not find base load for merging consecutive loads");
  unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
  unsigned BaseSizeInBytes = BaseSizeInBits / 8;
  int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
  assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");

  // TODO: Support offsetting the base load.
  if (ByteOffsets[FirstLoadedElt] != 0)
    return SDValue();

  // Check to see if the element's load is consecutive to the base load
  // or offset from a previous (already checked) load.
  auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
    LoadSDNode *Ld = Loads[EltIdx];
    int64_t ByteOffset = ByteOffsets[EltIdx];
    if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
      // This element is a slice of a wider load; it is consecutive if the
      // element where that load starts uses the same load at offset 0.
      int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
      return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
              Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
    }
    return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
                                              EltIdx - FirstLoadedElt);
  };

  // Consecutive loads can contain UNDEFS but not ZERO elements.
  // Consecutive loads with UNDEFs and ZEROs elements require an
  // additional shuffle stage to clear the ZERO elements.
  bool IsConsecutiveLoad = true;
  bool IsConsecutiveLoadWithZeros = true;
  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
    if (LoadMask[i]) {
      if (!CheckConsecutiveLoad(LDBase, i)) {
        IsConsecutiveLoad = false;
        IsConsecutiveLoadWithZeros = false;
        break;
      }
    } else if (ZeroMask[i]) {
      IsConsecutiveLoad = false;
    }
  }

  // Emit one load of type VT from the base pointer and make every original
  // load's memory ordering equivalent to it.
  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
    auto MMOFlags = LDBase->getMemOperand()->getFlags();
    assert(LDBase->isSimple() &&
           "Cannot merge volatile or atomic loads.");
    SDValue NewLd =
        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                    LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
                    MMOFlags);
    for (auto *LD : Loads)
      if (LD)
        DAG.makeEquivalentMemoryOrdering(LD, NewLd);
    return NewLd;
  };

  // Check if the base load is entirely dereferenceable.
  bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
      VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());

  // LOAD - all consecutive load/undefs (must start/end with a load or be
  // entirely dereferenceable). If we have found an entire vector of loads and
  // undefs, then return a large load of the entire vector width starting at the
  // base pointer. If the vector contains zeros, then attempt to shuffle those
  // elements.
  if (FirstLoadedElt == 0 &&
      (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
    if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
      return SDValue();

    // Don't create 256-bit non-temporal aligned loads without AVX2 as these
    // will lower to regular temporal loads and use the cache.
    if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
        VT.is256BitVector() && !Subtarget.hasInt256())
      return SDValue();

    if (NumElems == 1)
      return DAG.getBitcast(VT, Elts[FirstLoadedElt]);

    if (!ZeroMask)
      return CreateLoad(VT, LDBase);

    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
    // vector and a zero vector to clear out the zero elements.
    if (!isAfterLegalize && VT.isVector()) {
      unsigned NumMaskElts = VT.getVectorNumElements();
      if ((NumMaskElts % NumElems) == 0) {
        unsigned Scale = NumMaskElts / NumElems;
        SmallVector<int, 4> ClearMask(NumMaskElts, -1);
        for (unsigned i = 0; i < NumElems; ++i) {
          if (UndefMask[i])
            continue;
          // Zero elements select from the second (all-zero) shuffle operand.
          int Offset = ZeroMask[i] ? NumMaskElts : 0;
          for (unsigned j = 0; j != Scale; ++j)
            ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
        }
        SDValue V = CreateLoad(VT, LDBase);
        SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
                                   : DAG.getConstantFP(0.0, DL, VT);
        return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
      }
    }
  }

  // If the upper half of a ymm/zmm load is undef then just load the lower half.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    unsigned HalfNumElems = NumElems / 2;
    if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
      EVT HalfVT =
          EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
      SDValue HalfLD =
          EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
                                   DAG, Subtarget, isAfterLegalize);
      if (HalfLD)
        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
                           HalfLD, DAG.getIntPtrConstant(0, DL));
    }
  }

  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
      (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
                                      : MVT::getIntegerVT(LoadSizeInBits);
    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
    // Allow v4f32 on SSE1 only targets.
    // FIXME: Add more isel patterns so we can just use VT directly.
    if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
      VecVT = MVT::v4f32;
    if (TLI.isTypeLegal(VecVT)) {
      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
      SDValue ResNode = DAG.getMemIntrinsicNode(
          X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
          LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
      for (auto *LD : Loads)
        if (LD)
          DAG.makeEquivalentMemoryOrdering(LD, ResNode);
      return DAG.getBitcast(VT, ResNode);
    }
  }

  // BROADCAST - match the smallest possible repetition pattern, load that
  // scalar/subvector element and then broadcast to the entire vector.
  if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
      (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
    for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
      unsigned RepeatSize = SubElems * BaseSizeInBits;
      unsigned ScalarSize = std::min(RepeatSize, 64u);
      if (!Subtarget.hasAVX2() && ScalarSize < 32)
        continue;

      // Collect one representative per position of the candidate period and
      // check that every loaded element agrees with it.
      bool Match = true;
      SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
      for (unsigned i = 0; i != NumElems && Match; ++i) {
        if (!LoadMask[i])
          continue;
        SDValue Elt = peekThroughBitcasts(Elts[i]);
        if (RepeatedLoads[i % SubElems].isUndef())
          RepeatedLoads[i % SubElems] = Elt;
        else
          Match &= (RepeatedLoads[i % SubElems] == Elt);
      }

      // We must have loads at both ends of the repetition.
      Match &= !RepeatedLoads.front().isUndef();
      Match &= !RepeatedLoads.back().isUndef();
      if (!Match)
        continue;

      EVT RepeatVT =
          VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
              ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
              : EVT::getFloatingPointVT(ScalarSize);
      if (RepeatSize > ScalarSize)
        RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
                                    RepeatSize / ScalarSize);
      EVT BroadcastVT =
          EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
                           VT.getSizeInBits() / ScalarSize);
      if (TLI.isTypeLegal(BroadcastVT)) {
        if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
                RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
          unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
                                                    : X86ISD::VBROADCAST;
          SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
          return DAG.getBitcast(VT, Broadcast);
        }
      }
    }
  }

  return SDValue();
}
// Combine vector ops (shuffles etc.) that are equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
//
// Every element of Op is resolved to its scalar source with
// getShuffleScalarElt; if any element cannot be resolved an empty SDValue is
// returned, otherwise the scalars are handed to EltsFromConsecutiveLoads.
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
                                         SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget,
                                         bool isAfterLegalize) {
  SmallVector<SDValue, 64> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
    if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
      Elts.push_back(Elt);
      continue;
    }
    return SDValue();
  }
  assert(Elts.size() == VT.getVectorNumElements());
  return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
                                  isAfterLegalize);
}
/// Build an IR constant vector holding one repetition of a splat pattern.
///
/// \p SplatValue is cut into SplatBitSize / ScalarSize consecutive pieces of
/// VT's scalar width, lowest bits first. Each piece becomes a float/double
/// constant when \p VT is floating point (only 32- and 64-bit scalars are
/// supported there) or an integer constant otherwise.
///
/// \param VT           vector type whose scalar type defines the element type.
/// \param SplatValue   bits of the repeated pattern.
/// \param SplatBitSize width of the repeated pattern in bits.
/// \param C            context in which the constants are created.
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
                                   unsigned SplatBitSize, LLVMContext &C) {
  unsigned ScalarSize = VT.getScalarSizeInBits();
  unsigned NumElm = SplatBitSize / ScalarSize;

  SmallVector<Constant *, 32> ConstantVec;
  for (unsigned i = 0; i < NumElm; i++) {
    APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
    Constant *Const;
    if (VT.isFloatingPoint()) {
      if (ScalarSize == 32) {
        Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
      } else {
        assert(ScalarSize == 64 && "Unsupported floating point scalar size");
        Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
      }
    } else
      Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
    ConstantVec.push_back(Const);
  }
  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
/// Return true if \p N is used in a way that a shuffle could fold.
///
/// The uses of \p N are visited in order and the first one that yields a
/// decision wins: a VPERMV/VPERMV3 use where \p N is the index operand gives
/// false, any other target shuffle gives true, a bitcast defers to the
/// bitcast's own uses, and any other user gives true only when \p N has a
/// single use. Returns false if no use decides.
static bool isFoldableUseOfShuffle(SDNode *N) {
  for (auto *U : N->uses()) {
    unsigned Opc = U->getOpcode();
    // VPERMV/VPERMV3 shuffles can never fold their index operands.
    if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
      return false;
    if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
      return false;
    if (isTargetShuffle(Opc))
      return true;
    if (Opc == ISD::BITCAST) // Ignore bitcasts
      return isFoldableUseOfShuffle(U);
    if (N->hasOneUse())
      return true;
  }
  return false;
}
// Check if the current node of build vector is a zero extended vector.
// If so, return the value extended.
// For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
// NumElt - return the number of zero extended identical values.
// EltType - return the type of the value include the zero extend.
//
// Operand 0 is taken as the candidate value. It must repeat with a period
// (Delta) that is a power of two greater than 1, and every operand in between
// must be undef or a null constant. Returns an empty SDValue otherwise, in
// which case NumElt and EltType are left untouched.
static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
                                   unsigned &NumElt, MVT &EltType) {
  SDValue ExtValue = Op->getOperand(0);
  unsigned NumElts = Op->getNumOperands();
  unsigned Delta = NumElts;

  // Find the distance to the first repetition of operand 0.
  for (unsigned i = 1; i < NumElts; i++) {
    if (Op->getOperand(i) == ExtValue) {
      Delta = i;
      break;
    }
    if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
      return SDValue();
  }
  if (!isPowerOf2_32(Delta) || Delta == 1)
    return SDValue();

  // Verify that the rest of the vector follows the same period.
  for (unsigned i = Delta; i < NumElts; i++) {
    if (i % Delta == 0) {
      if (Op->getOperand(i) != ExtValue)
        return SDValue();
    } else if (!(isNullConstant(Op->getOperand(i)) ||
                 Op->getOperand(i).isUndef()))
      return SDValue();
  }
  unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
  unsigned ExtVTSize = EltSize * Delta;
  EltType = MVT::getIntegerVT(ExtVTSize);
  NumElt = NumElts / Delta;
  return ExtValue;
}
/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
///  a. A single scalar load, or a constant.
///  b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  // VBROADCAST requires AVX.
  // TODO: Splats could be generated for non-AVX CPUs using SSE
  // instructions, but there's less potential gain for only 128-bit vectors.
  if (!Subtarget.hasAVX())
    return SDValue();

  MVT VT = BVOp->getSimpleValueType(0);
  SDLoc dl(BVOp);

  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Unsupported vector type for broadcast.");

  BitVector UndefElements;
  SDValue Ld = BVOp->getSplatValue(&UndefElements);

  // Attempt to use VBROADCASTM
  // From this pattern:
  // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
  // b. t1 = (build_vector t0 t0)
  //
  // Create (VBROADCASTM v2i1 X)
  if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
    MVT EltType = VT.getScalarType();
    unsigned NumElts = VT.getVectorNumElements();
    SDValue BOperand;
    SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
    if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
        (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
         Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
      // Only one of the two sources is valid; Ld may be null when the
      // zero-extended-splat form matched, so the two cases are exclusive.
      if (ZeroExtended)
        BOperand = ZeroExtended.getOperand(0);
      else
        BOperand = Ld.getOperand(0).getOperand(0);
      MVT MaskVT = BOperand.getSimpleValueType();
      if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
          (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
        SDValue Brdcst =
            DAG.getNode(X86ISD::VBROADCASTM, dl,
                        MVT::getVectorVT(EltType, NumElts), BOperand);
        return DAG.getBitcast(VT, Brdcst);
      }
    }
  }

  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumUndefElts = UndefElements.count();
  if (!Ld || (NumElts - NumUndefElts) <= 1) {
    APInt SplatValue, Undef;
    unsigned SplatBitSize;
    bool HasUndef;
    // Check if this is a repeated constant pattern suitable for broadcasting.
    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
        SplatBitSize > VT.getScalarSizeInBits() &&
        SplatBitSize < VT.getSizeInBits()) {
      // Avoid replacing with broadcast when it's a use of a shuffle
      // instruction to preserve the present custom lowering of shuffles.
      if (isFoldableUseOfShuffle(BVOp))
        return SDValue();
      // replace BUILD_VECTOR with broadcast of the repeated constants.
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      LLVMContext *Ctx = DAG.getContext();
      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
      if (Subtarget.hasAVX()) {
        if (SplatBitSize == 32 || SplatBitSize == 64 ||
            (SplatBitSize < 32 && Subtarget.hasAVX2())) {
          // Splatted value can fit in one INTEGER constant in constant pool.
          // Load the constant and broadcast it.
          MVT CVT = MVT::getIntegerVT(SplatBitSize);
          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
          SDValue CP = DAG.getConstantPool(C, PVT);
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

          Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
          SDVTList Tys =
              DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
          SDValue Ops[] = {DAG.getEntryNode(), CP};
          MachinePointerInfo MPI =
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
          SDValue Brdcst = DAG.getMemIntrinsicNode(
              X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
              MachineMemOperand::MOLoad);
          return DAG.getBitcast(VT, Brdcst);
        }
        if (SplatBitSize > 64) {
          // Load the vector of constants and broadcast it.
          MVT CVT = VT.getScalarType();
          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
                                             *Ctx);
          SDValue VCP = DAG.getConstantPool(VecC, PVT);
          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
          Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
          Ld = DAG.getLoad(
              MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
          return DAG.getBitcast(VT, Brdcst);
        }
      }
    }

    // If we are moving a scalar into a vector (Ld must be set and all elements
    // but 1 are undef) and that operation is not obviously supported by
    // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
    // That's better than general shuffling and may eliminate a load to GPR and
    // move from scalar to vector register.
    if (!Ld || NumElts - NumUndefElts != 1)
      return SDValue();
    unsigned ScalarSize = Ld.getValueSizeInBits();
    if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
      return SDValue();
  }

  bool ConstSplatVal =
      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

  // Make sure that all of the users of a non-constant load are from the
  // BUILD_VECTOR node.
  // FIXME: Is the use count needed for non-constant, non-load case?
  if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
    return SDValue();

  unsigned ScalarSize = Ld.getValueSizeInBits();
  bool IsGE256 = (VT.getSizeInBits() >= 256);

  // When optimizing for size, generate up to 5 extra bytes for a broadcast
  // instruction to save 8 or more bytes of constant pool data.
  // TODO: If multiple splats are generated to load the same constant,
  // it may be detrimental to overall size. There needs to be a way to detect
  // that condition to know if this is truly a size win.
  bool OptForSize = DAG.shouldOptForSize();

  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector.
  // On Sandybridge (no AVX2), it is still better to load a constant vector
  // from the constant pool and not to broadcast it from a scalar.
  // But override that restriction when optimizing for size.
  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");

    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
    // For size optimization, also splat v2f64 and v2i64, and for size opt
    // with AVX2, also splat i8 and i16.
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
      const Constant *C = nullptr;
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
        C = CI->getConstantIntValue();
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue CP =
          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
      Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();

      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
      SDValue Ops[] = {DAG.getEntryNode(), CP};
      MachinePointerInfo MPI =
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
      return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
                                     MPI, Alignment, MachineMemOperand::MOLoad);
    }
  }

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget.hasInt256() &&
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  // Make sure the non-chain result is only used by this build vector.
  if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
    return SDValue();

  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (Subtarget.hasVLX() && ScalarSize == 64)) {
    auto *LN = cast<LoadSDNode>(Ld);
    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
    SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
    SDValue BCast =
        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
                                LN->getMemoryVT(), LN->getMemOperand());
    // Redirect users of the original load's chain to the broadcast's chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
    return BCast;
  }

  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
  // double since there is no vbroadcastsd xmm
  if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
      (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
    auto *LN = cast<LoadSDNode>(Ld);
    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
    SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
    SDValue BCast =
        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
                                LN->getMemoryVT(), LN->getMemOperand());
    // Redirect users of the original load's chain to the broadcast's chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
    return BCast;
  }

  // Unsupported broadcast.
  return SDValue();
}
/// For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
                                         SDValue ExtIdx) {
  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
  // Nothing to look through unless the source is a shuffle.
  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
    return Idx;

  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
  // lowered this:
  //   (extract_vector_elt (v8f32 %1), Constant<6>)
  // to:
  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
  //                           (extract_subvector (v8f32 %0), Constant<4>),
  //                           undef)
  //                       Constant<0>)
  // In this case the vector is the extract_subvector expression and the index
  // is 2, as specified by the shuffle.
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
  SDValue ShuffleVec = SVOp->getOperand(0);
  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
  assert(ShuffleVecVT.getVectorElementType() ==
         ExtractedFromVec.getSimpleValueType().getVectorElementType());

  int ShuffleIdx = SVOp->getMaskElt(Idx);
  // Only redirect when the mask element is undef or selects from the first
  // shuffle operand.
  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
    ExtractedFromVec = ShuffleVec;
    return ShuffleIdx;
  }
  return Idx;
}
/// Try to lower a BUILD_VECTOR whose operands are mostly EXTRACT_VECTOR_ELTs
/// from at most two source vectors of the same type as a vector shuffle of
/// those sources, followed by INSERT_VECTOR_ELT for the remaining operands.
/// Returns SDValue() when the pattern does not apply.
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Skip if insert_vec_elt is not supported.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  SDLoc DL(Op);
  unsigned NumElems = Op.getNumOperands();

  SDValue VecIn1;
  SDValue VecIn2;
  SmallVector<unsigned, 4> InsertIndices;
  SmallVector<int, 8> Mask(NumElems, -1);

  for (unsigned i = 0; i != NumElems; ++i) {
    unsigned Opc = Op.getOperand(i).getOpcode();

    if (Opc == ISD::UNDEF)
      continue;

    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than 1 elements need inserting.
      if (InsertIndices.size() > 1)
        return SDValue();

      InsertIndices.push_back(i);
      continue;
    }

    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
    SDValue ExtIdx = Op.getOperand(i).getOperand(1);

    // Quit if non-constant index.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);

    // Quit if extracted from vector of different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();

    if (!VecIn1.getNode())
      VecIn1 = ExtractedFromVec;
    else if (VecIn1 != ExtractedFromVec) {
      if (!VecIn2.getNode())
        VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
        // Quit if more than 2 vectors to shuffle
        return SDValue();
    }

    // Indices into the second shuffle input are offset by NumElems.
    if (ExtractedFromVec == VecIn1)
      Mask[i] = Idx;
    else if (ExtractedFromVec == VecIn2)
      Mask[i] = Idx + NumElems;
  }

  if (!VecIn1.getNode())
    return SDValue();

  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

  // Patch in the operands that were not extracts.
  for (unsigned Idx : InsertIndices)
    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
                     DAG.getIntPtrConstant(Idx, DL));

  return NV;
}
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
// All-zeros / all-ones vectors are returned unchanged; splats become a scalar
// select bitcast to the mask type; otherwise the constant bits are
// materialized as an immediate and non-constant elements are inserted one by
// one.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {

  MVT VT = Op.getSimpleValueType();
  assert((VT.getVectorElementType() == MVT::i1) &&
         "Unexpected type in LowerBUILD_VECTORvXi1!");

  SDLoc dl(Op);
  if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
      ISD::isBuildVectorAllOnes(Op.getNode()))
    return Op;

  uint64_t Immediate = 0;
  SmallVector<unsigned, 16> NonConstIdx;
  bool IsSplat = true;
  bool HasConstElts = false;
  int SplatIdx = -1;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (In.isUndef())
      continue;
    if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
      // Collect bit 0 of every constant element into the immediate.
      Immediate |= (InC->getZExtValue() & 0x1) << idx;
      HasConstElts = true;
    } else {
      NonConstIdx.push_back(idx);
    }
    // Track whether every non-undef element equals the first one seen.
    if (SplatIdx < 0)
      SplatIdx = idx;
    else if (In != Op.getOperand(SplatIdx))
      IsSplat = false;
  }

  // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
  if (IsSplat) {
    // The build_vector allows the scalar element to be larger than the vector
    // element type. We need to mask it to use as a condition unless we know
    // the upper bits are zero.
    // FIXME: Use computeKnownBits instead of checking specific opcode?
    SDValue Cond = Op.getOperand(SplatIdx);
    assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
    if (Cond.getOpcode() != ISD::SETCC)
      Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
                         DAG.getConstant(1, dl, MVT::i8));

    // Perform the select in the scalar domain so we can use cmov.
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
                                     DAG.getAllOnesConstant(dl, MVT::i32),
                                     DAG.getConstant(0, dl, MVT::i32));
      Select = DAG.getBitcast(MVT::v32i1, Select);
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
    } else {
      MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
      SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
                                     DAG.getAllOnesConstant(dl, ImmVT),
                                     DAG.getConstant(0, dl, ImmVT));
      MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
      Select = DAG.getBitcast(VecVT, Select);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
                         DAG.getIntPtrConstant(0, dl));
    }
  }

  // insert elements one by one
  SDValue DstVec;
  if (HasConstElts) {
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      // Build the 64-bit mask from two 32-bit halves.
      SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
      SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
      ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
      ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
      DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
    } else {
      MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
      SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
      MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
      DstVec = DAG.getBitcast(VecVT, Imm);
      DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
                           DAG.getIntPtrConstant(0, dl));
    }
  } else
    DstVec = DAG.getUNDEF(VT);

  for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
    unsigned InsertIdx = NonConstIdx[i];
    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(InsertIdx),
                         DAG.getIntPtrConstant(InsertIdx, dl));
  }
  return DstVec;
}
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
/// may not match the layout of an x86 256-bit horizontal instruction.
/// In other words, if this returns true, then some extraction/insertion will
/// be required to produce a valid horizontal instruction.
///
/// Parameter \p Opcode defines the kind of horizontal operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
///
/// On return, \p V0 and \p V1 hold the source vectors found for the first and
/// second half of the analyzed range (or UNDEF if none was found).
///
/// TODO: This function was originally used to match both real and fake partial
/// horizontal operations, but the index-matching logic is incorrect for that.
/// See the corrected implementation in isHopBuildVector(). Can we reduce this
/// code because it is only used for partial h-op matching now?
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
                                  SelectionDAG &DAG,
                                  unsigned BaseIdx, unsigned LastIdx,
                                  SDValue &V0, SDValue &V1) {
  EVT VT = N->getValueType(0);
  assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
         "Invalid Vector in input!");

  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
  bool CanFold = true;
  unsigned ExpectedVExtractIdx = BaseIdx;
  unsigned NumElts = LastIdx - BaseIdx;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // Check if N implements a horizontal binop.
  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
    SDValue Op = N->getOperand(i + BaseIdx);

    // Skip UNDEFs.
    if (Op->isUndef()) {
      // Update the expected vector extract index.
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
      ExpectedVExtractIdx += 2;
      continue;
    }

    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

    if (!CanFold)
      break;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op0.getOperand(0) == Op1.getOperand(0) &&
               isa<ConstantSDNode>(Op0.getOperand(1)) &&
               isa<ConstantSDNode>(Op1.getOperand(1)));
    if (!CanFold)
      break;

    unsigned I0 = Op0.getConstantOperandVal(1);
    unsigned I1 = Op1.getConstantOperandVal(1);

    // The first half of the range reads from V0, the second half from V1.
    if (i * 2 < NumElts) {
      if (V0.isUndef()) {
        V0 = Op0.getOperand(0);
        if (V0.getValueType() != VT)
          return false;
      }
    } else {
      if (V1.isUndef()) {
        V1 = Op0.getOperand(0);
        if (V1.getValueType() != VT)
          return false;
      }
      // Restart the expected index when crossing into the second half.
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
    }

    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
    if (I0 == ExpectedVExtractIdx)
      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
      // Try to match the following dag sequence:
      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
    } else
      CanFold = false;

    ExpectedVExtractIdx += 2;
  }

  return CanFold;
}
/// Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
/// horizontal binary operations.
///
/// The kind of horizontal binary operation is defined by \p X86Opcode.
///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
/// the two new horizontal binop.
/// When Mode is set, the first horizontal binop dag node would take as input
/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
/// horizontal binop dag node would take as input the lower 128-bit of V1
/// and the upper 128-bit of V1.
///   Example:
///     HADD V0_LO, V0_HI
///     HADD V1_LO, V1_HI
///
/// Otherwise, the first horizontal binop dag node takes as input the lower
/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
///   Example:
///     HADD V0_LO, V1_LO
///     HADD V0_HI, V1_HI
///
/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128-bits of the result.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
                                     const SDLoc &DL, SelectionDAG &DAG,
                                     unsigned X86Opcode, bool Mode,
                                     bool isUndefLO, bool isUndefHI) {
  MVT VT = V0.getSimpleValueType();
  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
         "Invalid nodes in input!");

  // Split both inputs into their 128-bit halves.
  unsigned NumElts = VT.getVectorNumElements();
  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
  SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
  SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
  MVT NewVT = V0_LO.getSimpleValueType();

  // Each result half stays UNDEF unless a horizontal op is emitted for it.
  SDValue LO = DAG.getUNDEF(NewVT);
  SDValue HI = DAG.getUNDEF(NewVT);

  if (Mode) {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && !V0->isUndef())
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
    if (!isUndefHI && !V1->isUndef())
      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
  } else {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);

    if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of ADDSUB/SUBADD operation.
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
/// \p Opnd0 and \p Opnd1.
/// \p NumExtracts receives the number of non-undef elements that matched, and
/// \p IsSubAdd is set when the even-numbered elements are additions.
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
                             const X86Subtarget &Subtarget, SelectionDAG &DAG,
                             SDValue &Opnd0, SDValue &Opnd1,
                             unsigned &NumExtracts,
                             bool &IsSubAdd) {

  MVT VT = BV->getSimpleValueType(0);
  if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  SDValue InVec0 = DAG.getUNDEF(VT);
  SDValue InVec1 = DAG.getUNDEF(VT);

  NumExtracts = 0;

  // Odd-numbered elements in the input build vector are obtained from
  // adding/subtracting two integer/float elements.
  // Even-numbered elements in the input build vector are obtained from
  // subtracting/adding two integer/float elements.
  unsigned Opc[2] = {0, 0};
  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Op = BV->getOperand(i);

    // Skip 'undef' values.
    unsigned Opcode = Op.getOpcode();
    if (Opcode == ISD::UNDEF)
      continue;

    // Early exit if we found an unexpected opcode.
    if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
      return false;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
    // Early exit if we cannot match that sequence.
    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
        Op0.getOperand(1) != Op1.getOperand(1))
      return false;

    unsigned I0 = Op0.getConstantOperandVal(1);
    if (I0 != i)
      return false;

    // We found a valid add/sub node, make sure its the same opcode as previous
    // elements for this parity.
    if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
      return false;
    Opc[i % 2] = Opcode;

    // Update InVec0 and InVec1.
    if (InVec0.isUndef()) {
      InVec0 = Op0.getOperand(0);
      if (InVec0.getSimpleValueType() != VT)
        return false;
    }
    if (InVec1.isUndef()) {
      InVec1 = Op1.getOperand(0);
      if (InVec1.getSimpleValueType() != VT)
        return false;
    }

    // Make sure that operands in input to each add/sub node always
    // come from a same pair of vectors.
    if (InVec0 != Op0.getOperand(0)) {
      if (Opcode == ISD::FSUB)
        return false;

      // FADD is commutable. Try to commute the operands
      // and then test again.
      std::swap(Op0, Op1);
      if (InVec0 != Op0.getOperand(0))
        return false;
    }

    if (InVec1 != Op1.getOperand(0))
      return false;

    // Increment the number of extractions done.
    ++NumExtracts;
  }

  // Ensure we have found an opcode for both parities and that they are
  // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
  // inputs are undef.
  if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
      InVec0.isUndef() || InVec1.isUndef())
    return false;

  IsSubAdd = Opc[0] == ISD::FADD;

  Opnd0 = InVec0;
  Opnd1 = InVec1;
  return true;
}
/// Returns true if it is possible to fold MUL and an idiom that has already
/// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1,
/// \p Opnd2.
///
/// Prior to calling this function it should be known that there is some
/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
/// before replacement of such SDNode with ADDSUB operation. Thus the number
/// of \p Opnd0 uses is expected to be equal to \p ExpectedUses (2 in the
/// example below).
/// For example, this function may be called for the following IR:
///    %AB = fmul fast <2 x double> %A, %B
///    %Sub = fsub fast <2 x double> %AB, %C
///    %Add = fadd fast <2 x double> %AB, %C
///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
///                            <2 x i32> <i32 0, i32 3>
/// There is a def for %Addsub here, which potentially can be replaced by
/// X86ISD::ADDSUB operation:
///    %Addsub = X86ISD::ADDSUB %AB, %C
/// and such ADDSUB can further be replaced with FMADDSUB:
///    %Addsub = FMADDSUB %A, %B, %C.
///
/// The main reason why this method is called before the replacement of the
/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG,
                                 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
                                 unsigned ExpectedUses) {
  // The multiply must feed exactly the expected number of users, and the
  // target must have some form of FMA.
  if (Opnd0.getOpcode() != ISD::FMUL ||
      !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
    return false;

  // FIXME: These checks must match the similar ones in
  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
  // or MUL + ADDSUB to FMADDSUB.
  const TargetOptions &Options = DAG.getTarget().Options;
  bool AllowFusion =
      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
  if (!AllowFusion)
    return false;

  // Rewrite (Opnd0 = A*B, Opnd1 = C) as (Opnd0 = A, Opnd1 = B, Opnd2 = C).
  Opnd2 = Opnd1;
  Opnd1 = Opnd0.getOperand(1);
  Opnd0 = Opnd0.getOperand(0);

  return true;
}
/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
/// X86ISD::FMSUBADD node.
/// Returns SDValue() when no such node can be formed.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDValue Opnd0, Opnd1;
  unsigned NumExtracts;
  bool IsSubAdd;
  if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
                        IsSubAdd))
    return SDValue();

  MVT VT = BV->getSimpleValueType(0);
  SDLoc DL(BV);

  // Try to generate X86ISD::FMADDSUB node here.
  SDValue Opnd2;
  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
    unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
    return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
  }

  // We only support ADDSUB.
  if (IsSubAdd)
    return SDValue();

  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
  // the ADDSUB idiom has been successfully recognized. There are no known
  // X86 targets with 512-bit ADDSUB instructions!
  // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
  // recognition.
  if (VT.is512BitVector())
    return SDValue();

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
/// Check whether build vector \p BV computes a horizontal add/sub whose layout
/// matches an x86 horizontal instruction. On success returns true and writes
/// the X86ISD horizontal opcode to \p HOpcode and the two source vectors to
/// \p V0 and \p V1 (either may remain UNDEF if only undef elements map to it).
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
                             unsigned &HOpcode, SDValue &V0, SDValue &V1) {
  // Initialize outputs to known values.
  MVT VT = BV->getSimpleValueType(0);
  HOpcode = ISD::DELETED_NODE;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
  // half of the result is calculated independently from the 128-bit halves of
  // the inputs, so that makes the index-checking logic below more complicated.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned GenericOpcode = ISD::DELETED_NODE;
  unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
  unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
  unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
  for (unsigned i = 0; i != Num128BitChunks; ++i) {
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      // Ignore undef elements.
      SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
      if (Op.isUndef())
        continue;

      // If there's an opcode mismatch, we're done.
      if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
        return false;

      // Initialize horizontal opcode.
      if (HOpcode == ISD::DELETED_NODE) {
        GenericOpcode = Op.getOpcode();
        switch (GenericOpcode) {
        case ISD::ADD: HOpcode = X86ISD::HADD; break;
        case ISD::SUB: HOpcode = X86ISD::HSUB; break;
        case ISD::FADD: HOpcode = X86ISD::FHADD; break;
        case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
        default: return false;
        }
      }

      SDValue Op0 = Op.getOperand(0);
      SDValue Op1 = Op.getOperand(1);
      if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Op0.getOperand(0) != Op1.getOperand(0) ||
          !isa<ConstantSDNode>(Op0.getOperand(1)) ||
          !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
        return false;

      // The source vector is chosen based on which 64-bit half of the
      // destination vector is being calculated.
      if (j < NumEltsIn64Bits) {
        if (V0.isUndef())
          V0 = Op0.getOperand(0);
      } else {
        if (V1.isUndef())
          V1 = Op0.getOperand(0);
      }

      SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
      if (SourceVec != Op0.getOperand(0))
        return false;

      // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
      unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
      unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
      unsigned ExpectedIndex = i * NumEltsIn128Bits +
                               (j % NumEltsIn64Bits) * 2;
      if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
        continue;

      // If this is not a commutative op, this does not match.
      if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
        return false;

      // Addition is commutative, so try swapping the extract indexes.
      // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
      if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
        continue;

      // Extract indexes do not match horizontal requirement.
      return false;
    }
  }
  // We matched. Opcode and operands are returned by reference as arguments.
  return true;
}
/// Build the horizontal-op node \p HOpcode for build vector \p BV from source
/// vectors \p V0 and \p V1, resizing the sources to the result width and
/// narrowing to a 128-bit op when the upper half of a 256-bit result is
/// entirely undef.
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
                                    SelectionDAG &DAG, unsigned HOpcode,
                                    SDValue V0, SDValue V1) {
  // If either input vector is not the same size as the build vector,
  // extract/insert the low bits to the correct size.
  // This is free (examples: zmm --> xmm, xmm --> ymm).
  MVT VT = BV->getSimpleValueType(0);
  unsigned Width = VT.getSizeInBits();
  if (V0.getValueSizeInBits() > Width)
    V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
  else if (V0.getValueSizeInBits() < Width)
    V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);

  if (V1.getValueSizeInBits() > Width)
    V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
  else if (V1.getValueSizeInBits() < Width)
    V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);

  // An element is demanded unless the corresponding operand is undef.
  unsigned NumElts = VT.getVectorNumElements();
  APInt DemandedElts = APInt::getAllOnesValue(NumElts);
  for (unsigned i = 0; i != NumElts; ++i)
    if (BV->getOperand(i).isUndef())
      DemandedElts.clearBit(i);

  // If we don't need the upper xmm, then perform as a xmm hop.
  unsigned HalfNumElts = NumElts / 2;
  if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
    MVT HalfVT = VT.getHalfNumVectorElementsVT();
    V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
    V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
    SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
    return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
  }

  return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
/// Native-layout patterns are tried first via isHopBuildVector(); for 256-bit
/// vectors on AVX targets, partial matches are then expanded into two 128-bit
/// horizontal ops plus a concat. Returns SDValue() if nothing matches.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  // We need at least 2 non-undef elements to make this worthwhile by default.
  unsigned NumNonUndefs =
      count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
  if (NumNonUndefs < 2)
    return SDValue();

  // There are 4 sets of horizontal math operations distinguished by type:
  // int/FP at 128-bit/256-bit. Each type was introduced with a different
  // subtarget feature. Try to match those "native" patterns first.
  MVT VT = BV->getSimpleValueType(0);
  if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
      ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
      ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
      ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
    unsigned HOpcode;
    SDValue V0, V1;
    if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
      return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
  }

  // Try harder to match 256-bit ops by using extract/concat.
  if (!Subtarget.hasAVX() || !VT.is256BitVector())
    return SDValue();

  // Count the number of UNDEF operands in the build_vector in input.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned Half = NumElts / 2;
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  for (unsigned i = 0, e = Half; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsLO++;

  for (unsigned i = Half, e = NumElts; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsHI++;

  SDLoc DL(BV);
  SDValue InVec0, InVec1;
  if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    SDValue InVec2, InVec3;
    unsigned X86Opcode;
    bool CanFold = true;

    // Match each 128-bit half separately; the halves must agree on their
    // source vectors unless one side is undef.
    if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
                              InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
                                   InVec1) &&
             isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
                                   InVec3) &&
             ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
             ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
      // Do not try to expand this build_vector into a pair of horizontal
      // add/sub if we can emit a pair of scalar add/sub.
      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
        return SDValue();

      // Convert this build_vector into a pair of horizontal binops followed by
      // a concat vector. We must adjust the outputs from the partial horizontal
      // matching calls above to account for undefined vector halves.
      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
      bool isUndefLO = NumUndefsLO == Half;
      bool isUndefHI = NumUndefsHI == Half;
      return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
                                   isUndefHI);
    }
  }

  if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
      VT == MVT::v16i16) {
    unsigned X86Opcode;
    if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
                                   InVec1))
      X86Opcode = X86ISD::HSUB;
    else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
                                   InVec1))
      X86Opcode = X86ISD::FHADD;
    else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
                                   InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();

    // Don't try to expand this build_vector into a pair of horizontal add/sub
    // if we can simply emit a pair of scalar add/sub.
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      return SDValue();

    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                 isUndefLO, isUndefHI);
  }

  return SDValue();
}
// Forward declaration: lowerBuildVectorToBitOp (below) passes the shift nodes
// it creates to LowerShift. The definition is not in this part of the file.
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG);
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
/// just apply the bit operation to the vectors.
/// NOTE: It's not in our interest to start making a general-purpose vectorizer
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
///
/// Returns an empty SDValue() whenever the pattern is rejected (mixed opcodes,
/// unsupported opcode, splat input, illegal vector op, non-constant RHS, or
/// non-uniform shift amounts).
// NOTE(review): this listing appears to have lost lines -- closing braces, the
// switch's `default:` / `break;`, and the code that fills LHSElts/RHSElts are
// not visible. Confirm against the upstream file before editing the logic.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op->getSimpleValueType(0);
unsigned NumElems = VT.getVectorNumElements();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Check that all elements have the same opcode.
// TODO: Should we allow UNDEFS and if so how many?
unsigned Opcode = Op->getOperand(0).getOpcode();
for (unsigned i = 1; i < NumElems; ++i)
if (Opcode != Op->getOperand(i).getOpcode())
return SDValue();
// TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
// IsShift records whether the shared opcode is SHL/SRL/SRA; shifts get extra
// checks on the RHS below and are lowered immediately at the end.
bool IsShift = false;
switch (Opcode) {
return SDValue();
case ISD::SHL:
case ISD::SRL:
case ISD::SRA:
IsShift = true;
case ISD::AND:
case ISD::XOR:
case ISD::OR:
// Don't do this if the buildvector is a splat - we'd replace one
// constant with an entire vector.
if (Op->getSplatValue())
return SDValue();
if (!TLI.isOperationLegalOrPromote(Opcode, VT))
return SDValue();
// Split every scalar op into its two operands; the LHS values and the RHS
// constants each become one BUILD_VECTOR below.
SmallVector<SDValue, 4> LHSElts, RHSElts;
for (SDValue Elt : Op->ops()) {
SDValue LHS = Elt.getOperand(0);
SDValue RHS = Elt.getOperand(1);
// We expect the canonicalized RHS operand to be the constant.
if (!isa<ConstantSDNode>(RHS))
return SDValue();
// Extend shift amounts.
// A RHS whose width differs from the vector element width is only accepted
// for shifts, where it is zero-extended or truncated to the element type.
if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
if (!IsShift)
return SDValue();
RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
// Limit to shifts by uniform immediates.
// TODO: Only accept vXi8/vXi64 special cases?
// TODO: Permit non-uniform XOP/AVX2/MULLO cases?
if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
return SDValue();
// Rebuild as one vector operation over the two operand vectors.
SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
if (!IsShift)
return Res;
// Immediately lower the shift to ensure the constant build vector doesn't
// get converted to a constant pool before the shift is lowered.
return LowerShift(Res, Subtarget, DAG);
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
///
/// \param Op        The BUILD_VECTOR value to inspect.
/// \param DAG       SelectionDAG used to create any replacement node.
/// \param Subtarget Queried for SSE2 support.
/// \returns Op itself for an all-zeros vector (and for all-ones v4i32, v8i32
/// or v16i32 when SSE2 is available), the result of getOnesVector for other
/// all-ones types with SSE2, and an empty SDValue() otherwise.
// NOTE(review): closing braces are missing from this listing; the nesting is
// inferred from the order of the return statements.
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
// Vectors containing all zeros can be matched by pxor and xorps.
if (ISD::isBuildVectorAllZeros(Op.getNode()))
return Op;
// Vectors containing all ones can be matched by pcmpeqd on 128-bit width
// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
// vpcmpeqd on 256-bit vectors.
if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
// These i32-element types are returned unchanged; every other type is
// rebuilt through getOnesVector.
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
return Op;
return getOnesVector(VT, DAG, DL);
return SDValue();
/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
/// from a vector of source values and a vector of extraction indices.
/// The vectors might be manipulated to match the type of the permute op.
///
/// \param VT         Result type of the permute.
/// \param SrcVec     Vector the elements are taken from; may be wider or
///                   narrower than VT (handled below).
/// \param IndicesVec Vector of element indices; must have at least as many
///                   elements as VT (asserted).
/// \returns The permuted value bitcast back to VT, or an empty SDValue() when
/// no suitable opcode is available for the type/subtarget combination.
// NOTE(review): this listing appears to have lost lines (closing braces,
// `break;` statements between switch cases, some `Opcode = ...` assignments
// and the tails of several multi-line calls). The comments below describe only
// what is visible; confirm against the upstream file before changing logic.
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// ShuffleVT is the type the final shuffle node is emitted in; some cases
// below switch it to a narrower-element type (e.g. v16i8 for PSHUFB).
MVT ShuffleVT = VT;
EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
unsigned NumElts = VT.getVectorNumElements();
unsigned SizeInBits = VT.getSizeInBits();
// Adjust IndicesVec to match VT size.
assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
"Illegal variable permute mask size");
if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
NumElts * VT.getScalarSizeInBits());
IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
// Handle SrcVec that don't match VT type.
if (SrcVec.getValueSizeInBits() != SizeInBits) {
if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
// Handle larger SrcVec by treating it as a larger permute.
// The function recurses with the widened VT and then extracts the low
// SizeInBits bits of the result.
unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
Subtarget, DAG, SDLoc(IndicesVec));
SDValue NewSrcVec =
createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
if (NewSrcVec)
return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
return SDValue();
} else if (SrcVec.getValueSizeInBits() < SizeInBits) {
// Widen smaller SrcVec to match VT.
SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
} else
return SDValue();
// Helper: converts indices for Scale-times-wider elements into indices for
// the narrower shuffle element type by multiplying each index by Scale and
// adding the per-sub-element offsets 0..Scale-1 (both applied as splat
// constants built in the loop below).
auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
EVT SrcVT = Idx.getValueType();
unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
uint64_t IndexScale = 0;
uint64_t IndexOffset = 0;
// If we're scaling a smaller permute op, then we need to repeat the
// indices, scaling and offsetting them as well.
// e.g. v4i32 -> v16i8 (Scale = 4)
// IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
// IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
for (uint64_t i = 0; i != Scale; ++i) {
IndexScale |= Scale << (i * NumDstBits);
IndexOffset |= i << (i * NumDstBits);
Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
return Idx;
// Pick the permute opcode (and possibly a different ShuffleVT) per vector
// type and subtarget. Opcode stays 0 when nothing applies, which makes the
// function return an empty SDValue() after the switch. Several cases build
// and return their result directly instead of setting Opcode.
unsigned Opcode = 0;
switch (VT.SimpleTy) {
case MVT::v16i8:
if (Subtarget.hasSSSE3())
Opcode = X86ISD::PSHUFB;
case MVT::v8i16:
if (Subtarget.hasVLX() && Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasSSSE3()) {
Opcode = X86ISD::PSHUFB;
ShuffleVT = MVT::v16i8;
case MVT::v4f32:
case MVT::v4i32:
// NOTE(review): the AVX branch below sets ShuffleVT but no Opcode assignment
// is visible -- presumably a dropped line; verify upstream.
if (Subtarget.hasAVX()) {
ShuffleVT = MVT::v4f32;
} else if (Subtarget.hasSSSE3()) {
Opcode = X86ISD::PSHUFB;
ShuffleVT = MVT::v16i8;
case MVT::v2f64:
case MVT::v2i64:
if (Subtarget.hasAVX()) {
// VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
ShuffleVT = MVT::v2f64;
} else if (Subtarget.hasSSE41()) {
// SSE41 can compare v2i64 - select between indices 0 and 1.
return DAG.getSelectCC(
DL, IndicesVec,
getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
case MVT::v32i8:
if (Subtarget.hasVLX() && Subtarget.hasVBMI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasXOP()) {
// XOP: permute each 128-bit half of the indices against both source halves
// with VPPERM.
SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
return DAG.getNode(
DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
} else if (Subtarget.hasAVX()) {
SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Permute Lo and Hi and then select based on index range.
// This works as SHUFB uses bits[3:0] to permute elements and we don't
// care about the bit[7] as its just an index vector.
SDValue Idx = Ops[2];
EVT VT = Idx.getValueType();
return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
SDValue Ops[] = {LoLo, HiHi, IndicesVec};
return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
case MVT::v16i16:
if (Subtarget.hasVLX() && Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
// Scale to v32i8 and perform as v32i8.
IndicesVec = ScaleIndices(IndicesVec, 2);
return DAG.getBitcast(
VT, createVariablePermute(
MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
case MVT::v8f32:
case MVT::v8i32:
if (Subtarget.hasAVX2())
Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
// Without AVX2: duplicate each 128-bit half of the source across the whole
// vector, then combine per-half permutes.
SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{0, 1, 2, 3, 0, 1, 2, 3});
SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{4, 5, 6, 7, 4, 5, 6, 7});
if (Subtarget.hasXOP())
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPS only uses index bits[0:1] to permute elements.
SDValue Res = DAG.getSelectCC(
DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
return DAG.getBitcast(VT, Res);
case MVT::v4i64:
case MVT::v4f64:
if (Subtarget.hasAVX512()) {
if (!Subtarget.hasVLX()) {
// AVX512 without VLX: widen source and indices to 8 elements, permute at
// that width, and extract the low 256 bits of the result.
MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
DAG, SDLoc(IndicesVec));
SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
DAG, Subtarget);
return extract256BitVector(Res, 0, DAG, DL);
Opcode = X86ISD::VPERMV;
} else if (Subtarget.hasAVX()) {
SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
SDValue LoLo =
DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
SDValue HiHi =
DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
// VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
if (Subtarget.hasXOP())
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPD only uses index bit[1] to permute elements.
SDValue Res = DAG.getSelectCC(
DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
return DAG.getBitcast(VT, Res);
case MVT::v64i8:
if (Subtarget.hasVBMI())
Opcode = X86ISD::VPERMV;
case MVT::v32i16:
if (Subtarget.hasBWI())
Opcode = X86ISD::VPERMV;
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8f64:
case MVT::v8i64:
if (Subtarget.hasAVX512())
Opcode = X86ISD::VPERMV;
// No opcode was selected for this type/subtarget: report failure.
if (!Opcode)
return SDValue();
assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
(VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
"Illegal variable permute shuffle type");
// When the shuffle works on narrower elements than VT, rescale the indices
// so that each original index addresses all of its sub-elements.
uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
if (Scale > 1)
IndicesVec = ScaleIndices(IndicesVec, Scale);
EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
// VPERMV is built with the index vector as its first operand; the other
// opcodes are built with the source vector first.
SDValue Res = Opcode == X86ISD::VPERMV
? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
: DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
return DAG.getBitcast(VT, Res);
// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
// reasoned to be a permutation of a vector by indices in a non-constant vector.
// (build_vector (extract_elt V, (extract_elt I, 0)),
// (extract_elt V, (extract_elt I, 1)),
// ...
// ->
// (vpermv I, V)
// Returns an empty SDValue() as soon as any operand does not fit the pattern;
// on a full match the work is delegated to createVariablePermute.
// TODO: Handle undefs
// TODO: Utilize pshufb and zero mask blending to support more efficient
// construction of vectors with constant-0 elements.
// NOTE(review): closing braces (including the end of the for loop) are missing
// from this listing; the final three statements presumably run after the loop.
static SDValue
LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Both start out null and are latched from the first operand that is seen.
SDValue SrcVec, IndicesVec;
// Check for a match of the permute source vector and permute index elements.
// This is done by checking that the i-th build_vector operand is of the form:
// (extract_elt SrcVec, (extract_elt IndicesVec, i)).
for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
SDValue Op = V.getOperand(Idx);
if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// If this is the first extract encountered in V, set the source vector,
// otherwise verify the extract is from the previously defined source
// vector.
if (!SrcVec)
SrcVec = Op.getOperand(0);
else if (SrcVec != Op.getOperand(0))
return SDValue();
SDValue ExtractedIndex = Op->getOperand(1);
// Peek through extends.
if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
ExtractedIndex = ExtractedIndex.getOperand(0);
if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// If this is the first extract from the index vector candidate, set the
// indices vector, otherwise verify the extract is from the previously
// defined indices vector.
if (!IndicesVec)
IndicesVec = ExtractedIndex.getOperand(0);
else if (IndicesVec != ExtractedIndex.getOperand(0))
return SDValue();
// The inner extract must read lane Idx of the indices vector, i.e. operand
// number Idx of the build_vector uses index element number Idx.
auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
if (!PermIdx || PermIdx->getAPIntValue() != Idx)
return SDValue();
SDLoc DL(V);
MVT VT = V.getSimpleValueType();
return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
// Lowers a BUILD_VECTOR node for X86 by trying a sequence of strategies in
// order and returning the first one that produces a value: predicate (vXi1)
// vectors, cheap constants, add/sub and horizontal-op patterns, broadcasts,
// bit-op vectorization, single-variable insertion into a constant vector,
// single non-zero element cases, splats, variable permutes, consecutive loads,
// splitting of >128-bit vectors, and finally insert/shuffle/unpack sequences.
// An empty SDValue() is returned where the comments below defer to a
// constant-pool load or to the legalizer's own expansion.
// NOTE(review): this listing appears to have lost lines -- the return-type line
// (presumably `SDValue`), closing braces, `else`/`break;`/`continue;` lines,
// counter updates, and the tails of several multi-line calls are not visible.
// The comments describe only what is visible; confirm against upstream.
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
// Generate vectors for predicate vectors.
if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
return VectorConstant;
// Pattern-based lowerings, tried in this fixed order.
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
return HorizontalOp;
if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
return Broadcast;
if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
return BitOp;
// Operand statistics used by the remaining strategies. NonZeros is a bitmask
// with bit i set when operand i is neither undef nor a zero node.
// NOTE(review): the updates to NumZero, NumNonZero, NumConstants and Values
// are not visible in the loop below -- presumably dropped from this listing.
unsigned EVTBits = EltVT.getSizeInBits();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
uint64_t NonZeros = 0;
bool IsAllConstants = true;
SmallSet<SDValue, 8> Values;
unsigned NumConstants = NumElems;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (Elt.isUndef())
if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
IsAllConstants = false;
if (X86::isZeroNode(Elt))
else {
assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
NonZeros |= ((uint64_t)1 << i);
// All undef vector. Return an UNDEF. All zero vectors were handled above.
if (NumNonZero == 0)
return DAG.getUNDEF(VT);
// If we are inserting one variable into a vector of non-zero constants, try
// to avoid loading each constant element as a scalar. Load the constants as a
// vector and then insert the variable scalar element. If insertion is not
// supported, fall back to a shuffle to get the scalar blended with the
// constants. Insertion into a zero vector is handled as a special-case
// somewhere below here.
if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
(isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
// Create an all-constant vector. The variable element in the old
// build vector is replaced by undef in the constant vector. Save the
// variable scalar element and its index for use in the insertelement.
LLVMContext &Context = *DAG.getContext();
Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
SDValue VarElt;
SDValue InsIndex;
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (auto *C = dyn_cast<ConstantSDNode>(Elt))
ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
else if (!Elt.isUndef()) {
assert(!VarElt.getNode() && !InsIndex.getNode() &&
"Expected one variable element in this vector");
VarElt = Elt;
InsIndex = DAG.getVectorIdxConstant(i, dl);
Constant *CV = ConstantVector::get(ConstVecOps);
SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
// The constants we just created may not be legal (eg, floating point). We
// must lower the vector right here because we can not guarantee that we'll
// legalize it before loading it. This is also why we could not just create
// a new build vector here. If the build vector contains illegal constants,
// it could get split back up into a series of insert elements.
// TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
// Insert directly when the variable element lands in the low 128 bits;
// otherwise blend it in with a shuffle (see below).
unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
if (InsertC < NumEltsInLow128Bits)
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
// There's no good way to insert into the high elements of a >128-bit
// vector, so use shuffles to avoid an extract/insert sequence.
assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
// Identity mask except at InsertC, which selects element 0 of the second
// shuffle input (mask value NumElts), i.e. the scalar placed by
// SCALAR_TO_VECTOR.
SmallVector<int, 8> ShuffleMask;
unsigned NumElts = VT.getVectorNumElements();
for (unsigned i = 0; i != NumElts; ++i)
ShuffleMask.push_back(i == InsertC ? NumElts : i);
SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
// Special case for single non-zero, non-undef, element.
if (NumNonZero == 1) {
// The lowest set bit of NonZeros is the index of that single element.
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
// If we have a constant or non-constant insertion into the low element of
// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
// the rest of the elements. This will be matched as movd/movq/movss/movsd
// depending on what the source datatype is.
if (Idx == 0) {
if (NumZero == 0)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
(EltVT == MVT::i64 && Subtarget.is64Bit())) {
assert((VT.is128BitVector() || VT.is256BitVector() ||
VT.is512BitVector()) &&
"Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
// We can't directly insert an i8 or i16 into a vector, so zero extend
// it to i32 first.
if (EltVT == MVT::i16 || EltVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
return DAG.getBitcast(VT, Item);
// Is it a vector logical left shift?
if (NumElems == 2 && Idx == 1 &&
X86::isZeroNode(Op.getOperand(0)) &&
!X86::isZeroNode(Op.getOperand(1))) {
unsigned NumBits = VT.getSizeInBits();
return getVShift(true, VT,
VT, Op.getOperand(1)),
NumBits/2, DAG, *this, dl);
if (IsAllConstants) // Otherwise, it's better to do a constpool load.
return SDValue();
// Otherwise, if this is a vector with i32 or f32 elements, and the element
// is a non-constant being inserted into an element other than the low one,
// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
// movd/movss) to move this into the low element, then shuffle it into
// place.
if (EVTBits == 32) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
// Splat is obviously ok. Let legalizer expand it to a shuffle.
if (Values.size() == 1) {
if (EVTBits == 32) {
// Instead of a shuffle like this:
// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
// Check if it's possible to issue this instead.
// shuffle (vload ptr)), undef, <1, 1, 1, 1>
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
return SDValue();
// A vector full of immediates; various special cases are already
// handled, so this is best done with a single constant-pool load.
if (IsAllConstants)
return SDValue();
if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
return V;
// See if we can use a vector load to get all of the elements.
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
if (SDValue LD =
EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
return LD;
// If this is a splat of pairs of 32-bit elements, we can use a narrower
// build_vector and broadcast it.
// TODO: We could probably generalize this more.
if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
// Make sure all the even/odd operands match.
for (unsigned i = 2; i != NumElems; ++i)
if (Ops[i % 2] != Op.getOperand(i))
return false;
return true;
if (CanSplat(Op, NumElems, Ops)) {
MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
// Create a new build vector and cast to v2i64/v2f64.
SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
DAG.getBuildVector(NarrowVT, dl, Ops));
// Broadcast from v2i64/v2f64 and cast to final VT.
MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.getSizeInBits() > 128) {
MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
// Build both the lower and upper subvector.
SDValue Lower =
DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
SDValue Upper = DAG.getBuildVector(
HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
// Recreate the wider vector with the lower and upper part.
return concatSubVectors(Lower, Upper, DAG, dl);
// Let legalizer expand 2-wide build_vectors.
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
unsigned Idx = countTrailingZeros(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
return SDValue();
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
if (EVTBits == 16 && NumElems == 8)
if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
if (EVTBits == 32 && NumElems == 4)
if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
return V;
// If element VT is == 32 bits, turn it into a number of shuffles.
if (NumElems == 4 && NumZero > 0) {
// First give every element its own vector: a zero vector for elements whose
// NonZeros bit is clear, otherwise the scalar placed via SCALAR_TO_VECTOR.
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1ULL << i));
if (isZero)
Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
// Then merge each pair (0,1) and (2,3), choosing the merge by which of the
// two elements are non-zero (the 2-bit field of NonZeros for that pair).
for (unsigned i = 0; i < 2; ++i) {
switch ((NonZeros >> (i*2)) & 0x3) {
default: llvm_unreachable("Unexpected NonZero count");
case 0:
Ops[i] = Ops[i*2]; // Must be a zero vector.
case 1:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
case 2:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
case 3:
Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
// A pair whose field equals 2 (only the odd element non-zero) has its two
// lanes swapped by the final shuffle mask.
bool Reverse1 = (NonZeros & 0x3) == 2;
bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
int MaskVec[] = {
Reverse1 ? 1 : 0,
Reverse1 ? 0 : 1,
static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
static_cast<int>(Reverse2 ? NumElems : NumElems+1)
return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
// Check for a build vector from mostly shuffle plus few inserting.
if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
if (Subtarget.hasSSE41()) {
SDValue Result;
if (!Op.getOperand(0).isUndef())
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
Result = DAG.getUNDEF(VT);
for (unsigned i = 1; i < NumElems; ++i) {
if (Op.getOperand(i).isUndef()) continue;
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
return Result;
// Otherwise, expand into a number of unpckl*, start by extending each of
// our (non-undef) elements to the full vector width with the element in the
// bottom slot of the vector (which generates no code for SSE).
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < NumElems; ++i) {
if (!Op.getOperand(i).isUndef())
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
Ops[i] = DAG.getUNDEF(VT);
// Next, we iteratively mix elements, e.g. for v4f32:
// Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
// : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
// Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
// Generate scaled UNPCKL shuffle mask.
// NOTE(review): the bodies of the two mask-filling loops below are not
// visible in this listing.
SmallVector<int, 16> Mask;
for(unsigned i = 0; i != Scale; ++i)
for (unsigned i = 0; i != Scale; ++i)
Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
return Ops[0];
// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
// Lowers a 256-/512-bit CONCAT_VECTORS: with more than two non-zero operands
// the node is split into two half-width concats; otherwise the result is built
// by inserting the non-zero operands into a zero (or undef) vector.
// TODO: Detect subvector broadcast here instead of DAG combine?
// NOTE(review): this listing appears to have lost lines (closing braces, the
// NumZero/NumNonZero updates, `continue;`, and the tails of two multi-line
// calls). Confirm against the upstream file before changing logic.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
assert((ResVT.is256BitVector() ||
ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
unsigned NumOperands = Op.getNumOperands();
// Classify the operands; NonZeros gets bit i set for every operand that is
// neither undef nor an all-zeros build vector.
unsigned NumZero = 0;
unsigned NumNonZero = 0;
unsigned NonZeros = 0;
for (unsigned i = 0; i != NumOperands; ++i) {
SDValue SubVec = Op.getOperand(i);
if (SubVec.isUndef())
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
else {
assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
NonZeros |= 1 << i;
// If we have more than 2 non-zeros, build each half separately.
if (NumNonZero > 2) {
MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(0, NumOperands/2));
SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
// Otherwise, build it up through insert_subvectors.
// Start from zeros when any operand was an explicit zero vector, else undef.
SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
: DAG.getUNDEF(ResVT);
MVT SubVT = Op.getOperand(0).getSimpleValueType();
unsigned NumSubElems = SubVT.getVectorNumElements();
for (unsigned i = 0; i != NumOperands; ++i) {
if ((NonZeros & (1 << i)) == 0)
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
DAG.getIntPtrConstant(i * NumSubElems, dl));
return Vec;
// Lowers a CONCAT_VECTORS node into the result type ResVT by classifying each
// operand as undef, all-zeros, or non-zero (bitmasks Zeros / NonZeros) and
// then choosing between: a KSHIFTL of the single non-zero operand, a single
// INSERT_SUBVECTOR into zero/undef, a split into two half-width concats, or
// (for exactly two non-zero operands) leaving the node as is / two inserts.
// NOTE(review): the previous header comment ("Returns true if the given node
// is a type promotion ...") described a boolean predicate and does not match
// this SDValue-returning function; it looked stale.
// NOTE(review): this listing appears to have lost lines (closing braces,
// `continue;`/`else`, and the tail of one multi-line call). Confirm against
// the upstream file before changing logic.
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG & DAG) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
unsigned NumOperands = Op.getNumOperands();
assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
// Bit i of Zeros / NonZeros describes operand i.
uint64_t Zeros = 0;
uint64_t NonZeros = 0;
for (unsigned i = 0; i != NumOperands; ++i) {
SDValue SubVec = Op.getOperand(i);
if (SubVec.isUndef())
assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
Zeros |= (uint64_t)1 << i;
NonZeros |= (uint64_t)1 << i;
unsigned NumElems = ResVT.getVectorNumElements();
// If we are inserting non-zero vector and there are zeros in LSBs and undef
// in the MSBs we need to emit a KSHIFTL. The generic lowering to
// insert_subvector will give us two kshifts.
if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
Log2_64(NonZeros) != NumOperands - 1) {
// The shift is done in ShiftVT: ResVT itself, or v8i1/v16i1 (depending on
// DQI) when ResVT has fewer than 8 elements or has 8 without DQI.
MVT ShiftVT = ResVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
unsigned Idx = Log2_64(NonZeros);
SDValue SubVec = Op.getOperand(Idx);
unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
DAG.getUNDEF(ShiftVT), SubVec,
DAG.getIntPtrConstant(0, dl));
Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
DAG.getIntPtrConstant(0, dl));
// If there are zero or one non-zeros we can handle this very simply.
if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
if (!NonZeros)
return Vec;
unsigned Idx = Log2_64(NonZeros);
SDValue SubVec = Op.getOperand(Idx);
unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
// More than two operands: concat each half separately, then concat the two
// halves.
if (NumOperands > 2) {
MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(0, NumOperands/2));
SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
if (ResVT.getVectorNumElements() >= 16)
return Op; // The operation is legal with KUNPCK
// Fewer than 16 result elements: insert operand 0 at element 0 and operand 1
// at element NumElems/2.
SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
DAG.getUNDEF(ResVT), Op.getOperand(0),
DAG.getIntPtrConstant(0, dl));
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
DAG.getIntPtrConstant(NumElems/2, dl));
/// Lower a CONCAT_VECTORS node.
///
/// Vectors of i1 are dispatched to LowerCONCAT_VECTORSvXi1. Every other case
/// must be a 256-bit result with two operands or a 512-bit result with two or
/// four operands, and is forwarded to LowerAVXCONCAT_VECTORS.
static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getVectorElementType() == MVT::i1)
    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

  assert(((VT.is256BitVector() && Op.getNumOperands() == 2) ||
          (VT.is512BitVector() &&
           (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))) &&
         "Unexpected CONCAT_VECTORS type or operand count");

  // AVX can use the vinsertf128 instruction to create 256-bit vectors
  // from two other 128-bit ones.

  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
  return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
}
// Vector shuffle lowering
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns while staying
// within a framework that allows reasonably efficient handling of all vector
// shuffle patterns.
/// Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
/// in-place shuffle are 'no-op's.
///
/// \returns true when every element is either negative (undef) or equal to its
/// own index.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  }
  return true;
}
/// Test whether there are elements crossing LaneSizeInBits lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
///
/// \param LaneSizeInBits   lane width; must be a non-zero multiple of
///                         \p ScalarSizeInBits.
/// \param ScalarSizeInBits width of one mask element.
/// \param Mask             shuffle mask; negative entries are ignored and
///                         indices are reduced modulo the mask size before
///                         the lane comparison.
/// \returns true if any defined element is sourced from a lane other than the
/// one it is written to.
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
                                      unsigned ScalarSizeInBits,
                                      ArrayRef<int> Mask) {
  assert(LaneSizeInBits && ScalarSizeInBits &&
         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
         "Illegal shuffle lane size");
  int LaneSize = LaneSizeInBits / ScalarSizeInBits;
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}
/// Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
/// Convenience wrapper around isLaneCrossingShuffleMask with a 128-bit lane
/// and the scalar size taken from \p VT.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
}
/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
///
/// \p RepeatedMask is overwritten even when the function returns false.
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                  ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
    // Undef entries place no constraint on the repeated mask.
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      // This is the first non-undef entry in this slot of a lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
/// Test whether a shuffle mask is equivalent within each 128-bit lane.
///
/// On success \p RepeatedMask holds the per-lane mask computed by
/// isRepeatedShuffleMask.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
/// Overload for callers that only need the yes/no answer: the repeated mask is
/// computed into a local scratch vector and discarded.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
  SmallVector<int, 32> RepeatedMask;
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
/// Test whether a shuffle mask is equivalent within each 256-bit lane.
///
/// On success \p RepeatedMask holds the per-lane mask computed by
/// isRepeatedShuffleMask.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
///
/// A zero entry is only compatible with a slot that is still undef or already
/// zero. \p RepeatedMask is overwritten even when the function returns false.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] == SM_SentinelZero) {
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM =
        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
      // This is the first non-undef entry in this slot of a lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
/// Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
///
/// A differing element is still accepted when both indices select from
/// BUILD_VECTOR inputs and the two selected operands are the same value.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                ArrayRef<int> ExpectedMask) {
  if (Mask.size() != ExpectedMask.size())
    return false;

  int Size = Mask.size();

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
      // Indices below Size select from V1, the rest from V2.
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (!MaskBV || !ExpectedBV ||
          MaskBV->getOperand(Mask[i] % Size) !=
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        return false;
    }
  }

  return true;
}
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
///
/// \p V1 and \p V2 are optional. When one is a BUILD_VECTOR with as many
/// operands as the mask has elements, two different non-negative indices are
/// also accepted if they select the same operand value.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask,
                                      SDValue V1 = SDValue(),
                                      SDValue V2 = SDValue()) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;
  assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
         "Illegal target shuffle mask");

  // Check for out-of-range target shuffle mask indices.
  if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
    return false;

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2);
  // Ignore build vectors whose operand count differs from the mask width.
  BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1);
  BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2);

  for (int i = 0; i < Size; ++i) {
    if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i])
      continue;
    if (0 <= Mask[i] && 0 <= ExpectedMask[i]) {
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (MaskBV && ExpectedBV &&
          MaskBV->getOperand(Mask[i] % Size) ==
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        continue;
    }
    // TODO - handle SM_Sentinel equivalences.
    return false;
  }
  return true;
}
/// Attempt to create a shuffle mask from a VSELECT condition mask.
///
/// \param Mask [out] resized to the number of condition elements; element i
///             becomes i when the condition element is a non-zero constant,
///             and i + Size when it is zero or undef.
/// \param Cond the select condition; must be a BUILD_VECTOR of constants.
/// \returns false (leaving \p Mask untouched) if \p Cond is not a build vector
/// of constant nodes, true otherwise.
static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
                                         SDValue Cond) {
  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
    return false;

  unsigned Size = Cond.getValueType().getVectorNumElements();
  Mask.resize(Size, SM_SentinelUndef);

  for (int i = 0; i != (int)Size; ++i) {
    SDValue CondElt = Cond.getOperand(i);
    Mask[i] = i;
    // Arbitrarily choose from the 2nd operand if the select condition element
    // is undef.
    // TODO: Can we do better by matching patterns such as even/odd?
    if (CondElt.isUndef() || isNullConstant(CondElt))
      Mask[i] += Size;
  }

  return true;
}
/// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
/// instructions.
///
/// Only v8i32 and v8f32 are accepted. The mask is compared against the binary
/// low and high unpack patterns generated for v8i16.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
  if (VT != MVT::v8i32 && VT != MVT::v8f32)
    return false;

  SmallVector<int, 8> Unpcklwd;
  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
                          /* Unary = */ false);
  SmallVector<int, 8> Unpckhwd;
  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
                          /* Unary = */ false);
  return isTargetShuffleEquivalent(Mask, Unpcklwd) ||
         isTargetShuffleEquivalent(Mask, Unpckhwd);
}
/// Test whether a mask matches any 128-bit unpack pattern.
///
/// The element type is derived from the mask width (128 / Mask.size() bits).
/// The mask and its operand-commuted form are compared against all four
/// combinations of low/high and unary/binary unpack masks.
static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
  // Create 128-bit vector type based on mask size.
  MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
  MVT VT = MVT::getVectorVT(EltVT, Mask.size());

  // We can't assume a canonical shuffle mask, so try the commuted version too.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);

  // Match any of unary/binary or low/high.
  for (unsigned i = 0; i != 4; ++i) {
    SmallVector<int, 16> UnpackMask;
    createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
    if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
        isTargetShuffleEquivalent(CommutedMask, UnpackMask))
      return true;
  }
  return false;
}
/// Return true if a shuffle mask chooses elements identically in its top and
/// bottom halves. For example, any splat mask has the same top and bottom
/// halves. If an element is undefined in only one half of the mask, the halves
/// are not considered identical.
static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
  assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
  unsigned HalfSize = Mask.size() / 2;
  for (unsigned i = 0; i != HalfSize; ++i) {
    if (Mask[i] != Mask[i + HalfSize])
      return false;
  }
  return true;
}
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
/// example.
///
/// Each element occupies two bits, element 0 in the lowest pair.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");

  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    assert(Mask[i] >= -1 && Mask[i] < 4 && "Out of bound mask element!");
    // An undef lane is encoded as its own index, i.e. left in place.
    Imm |= (Mask[i] < 0 ? i : Mask[i]) << (2 * i);
  }
  return Imm;
}
/// Wrap the immediate computed by getV4X86ShuffleImm for \p Mask in an i8
/// target constant node.
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
/// Check that the non-zeroable elements of \p Mask form one consecutive,
/// increasing run.
///
/// The shuffle result has the form
///   0*a[0]0*a[1]...0*a[n], n >= 0, where the a[] elements are in ascending
///   order.
/// Each bit of \p Zeroable corresponds to the Mask element at that position,
/// as described in computeZeroableShuffleElements.
///
/// The first non-zeroable element must be index 0, or else it is expected to
/// be the element count of \p VectorType. Each later non-zeroable element
/// must be exactly one greater than the previous one.
///
/// \param IsZeroSideLeft [out] written when the first non-zeroable element is
///        found: false if the run starts at index 0, true otherwise.
/// \returns false if any mask element is undef or the run is broken.
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
                                     ArrayRef<int> Mask, const EVT &VectorType,
                                     bool &IsZeroSideLeft) {
  int NextElement = -1;
  // Check if the Mask's nonzero elements are in increasing order.
  for (int i = 0, e = Mask.size(); i < e; i++) {
    // Checks if the mask's zeros elements are built from only zeros.
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    // Find the lowest non zero element
    if (NextElement < 0) {
      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
      IsZeroSideLeft = NextElement != 0;
    }
    // Exit if the mask's non zero elements are not in increasing order.
    if (NextElement != Mask[i])
      return false;
    NextElement++;
  }
  return true;
}
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
///
/// Builds a per-byte control vector: undef mask elements become undef bytes,
/// zeroable elements become 0x80 (sign bit set means "write zero"), and the
/// remaining elements become the in-lane byte index of their source.
///
/// \returns an empty SDValue if the mask uses both inputs or would need a byte
/// to cross a 128-bit lane; otherwise the PSHUFB result bitcast back to \p VT.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                      ArrayRef<int> Mask, SDValue V1,
                                      SDValue V2, const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  SDValue V;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= Size ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;
    M %= Size;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      return SDValue();

    // Convert the element index into a byte index within its lane.
    M = M % LaneSize;
    M = M * NumEltBytes + (i % NumEltBytes);
    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
  }
  assert(V && "Failed to find a source input");

  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
// Forward declaration of getMaskNode so that lowerShuffleToEXPAND below can
// call it. NOTE(review): the definition is not visible in this part of the
// file - presumably it appears later; confirm.
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl);
/// X86 has dedicated shuffle that can be lowered to VEXPAND.
///
/// Applies when the non-zeroable mask elements form one increasing run (see
/// isNonZeroElementsInOrder). The inverted \p Zeroable bits become the
/// expansion mask, and the vector being expanded is V2 when the run starts in
/// the second input, V1 otherwise.
///
/// \returns the X86ISD::EXPAND node, or an empty SDValue if the mask does not
/// have the required shape.
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
                                    const APInt &Zeroable,
                                    ArrayRef<int> Mask, SDValue &V1,
                                    SDValue &V2, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  bool IsLeftZeroSide = true;
  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
                                IsLeftZeroSide))
    return SDValue();
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  // The mask constant is materialised in an integer of at least 8 bits.
  MVT IntegerType =
      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");
  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
                              Subtarget, DAG, DL);
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
  return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
/// Try to match a target shuffle mask against UNPCKL/UNPCKH.
///
/// Attempts, in order: a direct match against the low/high unpack patterns; a
/// unary match where one side of the interleave is known zero (unpack with a
/// zero vector); and, for binary shuffles, a match with the operands commuted.
///
/// \param V1,V2        [in,out] on success rewritten to the operands the
///                     unpack node should use (possibly undef, zero, or
///                     swapped).
/// \param UnpackOpcode [out] X86ISD::UNPCKL or X86ISD::UNPCKH on success.
/// \returns true if a match was found.
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
                                  unsigned &UnpackOpcode, bool IsUnary,
                                  ArrayRef<int> TargetMask, const SDLoc &DL,
                                  SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();

  // Classify the even (first-operand) and odd (second-operand) positions.
  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
  for (int i = 0; i != NumElts; i += 2) {
    int M1 = TargetMask[i + 0];
    int M2 = TargetMask[i + 1];
    Undef1 &= (SM_SentinelUndef == M1);
    Undef2 &= (SM_SentinelUndef == M2);
    Zero1 &= isUndefOrZero(M1);
    Zero2 &= isUndefOrZero(M2);
  }
  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
         "Zeroable shuffle detected");

  // Attempt to match the target mask against the unpack lo/hi mask patterns.
  SmallVector<int, 64> Unpckl, Unpckh;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
    UnpackOpcode = X86ISD::UNPCKL;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
    UnpackOpcode = X86ISD::UNPCKH;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
  if (IsUnary && (Zero1 || Zero2)) {
    // Don't bother if we can blend instead.
    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
      return false;

    bool MatchLo = true, MatchHi = true;
    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
      int M = TargetMask[i];

      // Ignore if the input is known to be zero or the index is undef.
      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
          (M == SM_SentinelUndef))
        continue;

      MatchLo &= (M == Unpckl[i]);
      MatchHi &= (M == Unpckh[i]);
    }

    if (MatchLo || MatchHi) {
      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      return true;
    }
  }

  // If a binary shuffle, commute and try again.
  if (!IsUnary) {
    ShuffleVectorSDNode::commuteMask(Unpckl);
    if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
      UnpackOpcode = X86ISD::UNPCKL;
      std::swap(V1, V2);
      return true;
    }

    ShuffleVectorSDNode::commuteMask(Unpckh);
    if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
      UnpackOpcode = X86ISD::UNPCKH;
      std::swap(V1, V2);
      return true;
    }
  }

  return false;
}
/// X86 has dedicated unpack instructions that can handle specific blend
/// operations: UNPCKH and UNPCKL.
///
/// The mask is compared against the binary low and high unpack patterns,
/// first as given and then with the patterns commuted (operands swapped).
///
/// \returns the UNPCKL/UNPCKH node, or an empty SDValue if nothing matches.
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
                                     ArrayRef<int> Mask, SDValue V1, SDValue V2,
                                     SelectionDAG &DAG) {
  SmallVector<int, 8> Unpckl;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

  SmallVector<int, 8> Unpckh;
  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

  // Commute and try again.
  ShuffleVectorSDNode::commuteMask(Unpckl);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

  ShuffleVectorSDNode::commuteMask(Unpckh);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

  return SDValue();
}
/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
/// followed by unpack 256-bit.
///
/// The mask is compared against the low/high patterns produced by
/// createSplat2ShuffleMask. On a match V1 is permuted as v4f64 with
/// {0, 2, 1, 3} and then unpacked with itself.
///
/// \returns the unpack node, or an empty SDValue if neither pattern matches.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
                                        ArrayRef<int> Mask, SDValue V1,
                                        SDValue V2, SelectionDAG &DAG) {
  SmallVector<int, 32> Unpckl, Unpckh;
  createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
  createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);

  unsigned UnpackOpcode;
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    UnpackOpcode = X86ISD::UNPCKL;
  else if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    UnpackOpcode = X86ISD::UNPCKH;
  else
    return SDValue();

  // This is a "natural" unpack operation (rather than the 128-bit sectored
  // operation implemented by AVX). We need to rearrange 64-bit chunks of the
  // input in order to use the x86 instruction.
  V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
                            DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
  V1 = DAG.getBitcast(VT, V1);
  return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
}
/// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
/// source into the lower elements and zeroing the upper elements.
///
/// Scales 2, 4, ... are tried up to a 64-bit source element. A scale matches
/// when the low NumElts/Scale mask entries are a stride-Scale sequence from 0
/// (or undef) and every remaining element is zeroable.
///
/// \param SrcVT [out] the wide source vector type on success.
/// \param DstVT [out] the truncated vector type on success; padded to 128
///              bits when the truncated elements occupy less than that.
/// \returns true if some scale matched.
// TODO: Merge with matchShuffleAsVPMOV.
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
                                 ArrayRef<int> Mask, const APInt &Zeroable,
                                 const X86Subtarget &Subtarget) {
  if (!VT.is512BitVector() && !Subtarget.hasVLX())
    return false;

  unsigned NumElts = Mask.size();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  unsigned MaxScale = 64 / EltSizeInBits;

  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
    // Source elements narrower than 32 bits need BWI.
    unsigned SrcEltBits = EltSizeInBits * Scale;
    if (SrcEltBits < 32 && !Subtarget.hasBWI())
      continue;
    unsigned NumSrcElts = NumElts / Scale;
    if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
      continue;
    unsigned UpperElts = NumElts - NumSrcElts;
    if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
      continue;
    SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
    SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
    DstVT = MVT::getIntegerVT(EltSizeInBits);
    if ((NumSrcElts * EltSizeInBits) >= 128) {
      DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
    } else {
      DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
    }
    return true;
  }

  return false;
}
/// Test whether \p Mask takes every \p Delta-th element of the truncated
/// vector in its leading Size / Delta entries and nothing from that vector in
/// the remaining entries.
///
/// \param SwappedOps true if the truncated vector is the second shuffle
///                   operand, so its indices start at Mask.size() rather
///                   than 0.
/// \param Delta      stride between consecutive selected elements.
static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
                                int Delta) {
  int Size = (int)Mask.size();
  int Split = Size / Delta;
  int TruncatedVectorStart = SwappedOps ? Size : 0;

  // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
  if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
    return false;

  // The rest of the mask should not refer to the truncated vector's elements.
  if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
                   TruncatedVectorStart + Size))
    return false;

  return true;
}
// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
//
// An example is the following:
//
// t0: ch = EntryToken
//           t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
//         t25: v4i32 = truncate t2
//       t41: v8i16 = bitcast t25
//       t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
//       Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
//     t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
//   t18: v2i64 = bitcast t51
//
// Without avx512vl, this is lowered to:
//
// vpmovqd         %zmm0, %ymm0
// vpshufb {{.*#+}} xmm0 =
// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
//
// But when avx512vl is available, one can just use a single vpmovdw
// instruction.
//
// Returns the X86ISD::VTRUNC node, or an empty SDValue when the pattern does
// not apply. One operand must be an all-zeros build vector and the other a
// bitcast of a truncate.
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
                                     MVT VT, SDValue V1, SDValue V2,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (VT != MVT::v16i8 && VT != MVT::v8i16)
    return SDValue();

  if (Mask.size() != VT.getVectorNumElements())
    return SDValue();

  bool SwappedOps = false;

  // Canonicalize so that V2 is the all-zeros operand.
  if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
    if (!ISD::isBuildVectorAllZeros(V1.getNode()))
      return SDValue();

    std::swap(V1, V2);
    SwappedOps = true;
  }

  // Look for:
  //
  // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
  // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
  //
  // and similar ones.
  if (V1.getOpcode() != ISD::BITCAST)
    return SDValue();
  if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
    return SDValue();

  SDValue Src = V1.getOperand(0).getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();

  // The vptrunc** instructions truncating 128 bit and 256 bit vectors
  // are only available with avx512vl.
  if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
    return SDValue();

  // Down Convert Word to Byte is only available with avx512bw. The case with
  // 256-bit output doesn't contain a shuffle and is therefore not handled here.
  if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
      !Subtarget.hasBWI())
    return SDValue();

  // The first half/quarter of the mask should refer to every second/fourth
  // element of the vector truncated and bitcasted.
  if (!matchShuffleAsVPMOV(Mask, SwappedOps, 2) &&
      !matchShuffleAsVPMOV(Mask, SwappedOps, 4))
    return SDValue();

  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}
/// Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero. When several N remain
/// viable (e.g. because of undef lanes) the smallest is returned.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
                                          bool IsSingleInput) {
  // The modulus for the shuffle vector entries is based on whether this is
  // a single input or not.
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
         "We should only be called with masks with a power-of-2 size!");

  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
  // and 2^3 simultaneously. This is because we may have ambiguity with
  // partially undef inputs.
  bool ViableForN[3] = {true, true, true};

  for (int i = 0, e = Mask.size(); i < e; ++i) {
    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
    // want.
    if (Mask[i] < 0)
      continue;

    bool IsAnyViable = false;
    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
      if (ViableForN[j]) {
        uint64_t N = j + 1;

        // The shuffle mask must be equal to (i * 2^N) % M.
        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
          IsAnyViable = true;
        else
          ViableForN[j] = false;
      }
    // Early exit if we exhaust the possible powers of two.
    if (!IsAnyViable)
      break;
  }

  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
    if (ViableForN[j])
      return j + 1;

  // Return 0 as there is no viable power of two.
  return 0;
}
/// X86 has dedicated pack instructions that can handle specific truncation
/// operations: PACKSS and PACKUS.
///
/// Checks for compaction shuffle masks if MaxStages > 1.
///
/// For each stage count the mask is compared against the binary and then the
/// unary pack pattern. A pattern only counts if the (bitcast) inputs are known
/// to survive the pack: high bits zero for PACKUS, enough sign bits for
/// PACKSS. PACKUS is only considered with SSE4.1 or for 8-bit results.
///
/// \param SrcVT      [out] the wide vector type the inputs were bitcast to.
/// \param V1,V2      [in,out] replaced by the bitcast inputs on success.
/// \param PackOpcode [out] X86ISD::PACKUS or X86ISD::PACKSS on success.
/// \returns true if a pack pattern matched.
// TODO: Add support for matching multiple PACKSS/PACKUS stages.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
                                 unsigned &PackOpcode, ArrayRef<int> TargetMask,
                                 SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget,
                                 unsigned MaxStages = 1) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned BitSize = VT.getScalarSizeInBits();
  assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
         "Illegal maximum compaction");

  // Checks that N1/N2, viewed as PackVT, can be packed without losing data and
  // records the result in the output parameters.
  auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
    unsigned NumSrcBits = PackVT.getScalarSizeInBits();
    unsigned NumPackedBits = NumSrcBits - BitSize;
    SDValue VV1 = DAG.getBitcast(PackVT, N1);
    SDValue VV2 = DAG.getBitcast(PackVT, N2);
    if (Subtarget.hasSSE41() || BitSize == 8) {
      APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
      if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
          (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
        V1 = VV1;
        V2 = VV2;
        SrcVT = PackVT;
        PackOpcode = X86ISD::PACKUS;
        return true;
      }
    }
    if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&
        (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {
      V1 = VV1;
      V2 = VV2;
      SrcVT = PackVT;
      PackOpcode = X86ISD::PACKSS;
      return true;
    }
    return false;
  };

  // Attempt to match against wider and wider compaction patterns.
  for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
    MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
    MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);

    // Try binary shuffle.
    SmallVector<int, 32> BinaryMask;
    createPackShuffleMask(VT, BinaryMask, false, NumStages);
    if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
      if (MatchPACK(V1, V2, PackVT))
        return true;

    // Try unary shuffle.
    SmallVector<int, 32> UnaryMask;
    createPackShuffleMask(VT, UnaryMask, true, NumStages);
    if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
      if (MatchPACK(V1, V1, PackVT))
        return true;
  }

  return false;
}
/// Try to lower a shuffle as one or more PACKSS/PACKUS nodes.
///
/// matchShuffleWithPACK picks the opcode and the wide source type; the inputs
/// are then packed repeatedly, halving the element width each stage, until
/// the element width of \p VT is reached.
///
/// \returns the final pack node, or an empty SDValue if the mask is not a
/// pack pattern or is a multi-stage 128-bit pack on a VLX target.
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
                                    SDValue V1, SDValue V2, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT PackVT;
  unsigned PackOpcode;
  unsigned SizeBits = VT.getSizeInBits();
  unsigned EltBits = VT.getScalarSizeInBits();
  unsigned MaxStages = Log2_32(64 / EltBits);
  if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
                            Subtarget, MaxStages))
    return SDValue();

  unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
  unsigned NumStages = Log2_32(CurrentEltBits / EltBits);

  // Don't lower multi-stage packs on AVX512, truncation is better.
  if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
    return SDValue();

  // Pack to the largest type possible:
  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
  unsigned MaxPackBits = 16;
  if (CurrentEltBits > 16 &&
      (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
    MaxPackBits = 32;

  // Repeatedly pack down to the target size.
  SDValue Res;
  for (unsigned i = 0; i != NumStages; ++i) {
    unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
    unsigned NumSrcElts = SizeBits / SrcEltBits;
    MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
    MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
    MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
    MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
    Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
                      DAG.getBitcast(SrcVT, V2));
    // Later stages pack the previous result with itself.
    V1 = V2 = Res;
    CurrentEltBits /= 2;
  }
  assert(Res && Res.getValueType() == VT &&
         "Failed to lower compaction shuffle");
  return Res;
}
/// Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
///
/// Every non-zeroable element must stay in its own position and all of them
/// must come from the same input. That input is then ANDed with a constant
/// vector holding all-ones in the kept positions and zero elsewhere.
///
/// \returns the AND result bitcast to \p VT, or an empty SDValue if the
/// shuffle is not such a blend or has no non-zeroable element.
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
                                     SDValue V2, ArrayRef<int> Mask,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT MaskVT = VT;
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero, AllOnes;
  // Use f64 if i64 isn't legal.
  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
    EltVT = MVT::f64;
    MaskVT = MVT::getVectorVT(EltVT, Mask.size());
  }

  // Floating-point masks are built as FP constants but the AND itself is done
  // on the same-width integer vector type.
  MVT LogicVT = VT;
  if (EltVT == MVT::f32 || EltVT == MVT::f64) {
    Zero = DAG.getConstantFP(0.0, DL, EltVT);
    APFloat AllOnesValue = APFloat::getAllOnesValue(
        SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
    AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
    LogicVT =
        MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
  } else {
    Zero = DAG.getConstant(0, DL, EltVT);
    AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  }

  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue(); // Can only let one input through the mask.

    VMaskOps[i] = AllOnes;
  }
  if (!V)
    return SDValue(); // No non-zeroable elements!

  SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
  VMask = DAG.getBitcast(LogicVT, VMask);
  V = DAG.getBitcast(LogicVT, V);
  SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
  return DAG.getBitcast(VT, And);
}
/// Try to emit a blend instruction for a shuffle using bit math.
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
///
/// A constant mask is built with all-ones where Mask[i] < Size and zero
/// elsewhere; the result is OR(AND(V1, Mask), ANDNP(Mask, V2)).
///
/// \returns the OR node, or an empty SDValue if any defined mask element is
/// neither i nor i + Size (i.e. the inputs would have to be shuffled).
// NOTE(review): the closing braces of the loop and of the function are absent
// from this text - confirm against the upstream file.
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> MaskOps;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
return SDValue(); // Shuffled input!
// Negative (undef) mask entries also satisfy Mask[i] < Size and so get
// the all-ones element.
MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
// Presumably ANDNP computes (~V1Mask & V2) - confirm against the node
// definition.
V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
return DAG.getNode(ISD::OR, DL, VT, V1, V2);
// Forward declaration: lowerShuffleAsBlend below calls this helper with
// (V2, MaskNode, V1, Subtarget, DAG). The definition is not visible in this
// part of the file.
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG);
/// Test whether \p Mask describes an element-wise blend of \p V1 and \p V2 and
/// compute the corresponding immediate.
///
/// Bit i of \p BlendMask is set when result element i is taken from V2
/// (Mask[i] == i + Size). For zeroable elements, an input that is undef or an
/// all-zeros build vector may be used instead: \p Mask is rewritten in place
/// and \p ForceV1Zero / \p ForceV2Zero is set to record which input was used.
///
/// \returns true if the mask was matched as a blend, false otherwise.
// NOTE(review): the `continue;` statements and closing braces that separate
// the cases in the loop are absent from this text - confirm against the
// upstream file before relying on the nesting shown here.
static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
MutableArrayRef<int> Mask,
const APInt &Zeroable, bool &ForceV1Zero,
bool &ForceV2Zero, uint64_t &BlendMask) {
bool V1IsZeroOrUndef =
V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZeroOrUndef =
V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
// Reset all outputs before matching.
BlendMask = 0;
ForceV1Zero = false, ForceV2Zero = false;
assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
// Attempt to generate the binary blend mask. If an input is zero then
// we can use any lane.
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
if (M == i)
if (M == i + Size) {
BlendMask |= 1ull << i;
if (Zeroable[i]) {
if (V1IsZeroOrUndef) {
// Take the zero from V1: keep the blend bit clear.
ForceV1Zero = true;
Mask[i] = i;
if (V2IsZeroOrUndef) {
// Take the zero from V2: set the blend bit.
ForceV2Zero = true;
BlendMask |= 1ull << i;
Mask[i] = i + Size;
return false;
return true;
/// Widen a per-element blend mask so that each element covers \p Scale bits.
///
/// For every set bit i (0 <= i < \p Size) in \p BlendMask, the result has the
/// \p Scale consecutive bits starting at bit i * Scale set. Bits of
/// \p BlendMask at positions >= \p Size are ignored.
///
/// \param BlendMask one bit per original element.
/// \param Size number of elements described by \p BlendMask.
/// \param Scale number of result bits produced per original element.
/// \returns the scaled mask.
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
                                            int Scale) {
  // A shift by the full width of the type is undefined, so the all-ones
  // pattern for Scale == 64 is spelled out explicitly.
  const uint64_t EltBits = Scale >= 64 ? ~0ull : ((1ull << Scale) - 1);
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= EltBits << (i * Scale);
  return ScaledMask;
}
/// Try to emit a blend instruction for a shuffle.
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
///
/// The mask is first matched with matchShuffleAsBlend on a private copy of
/// \p Original; the lowering strategy is then chosen by \p VT.
///
/// \returns the blend node, or an empty SDValue if the mask is not a blend.
// NOTE(review): this text is missing a number of lines - closing braces,
// the switch fall-through markers between case groups, and the continuation
// lines of a few calls (the matchShuffleAsBlend call, the VSELECT mask
// construction and the final getBitcast). Confirm against the upstream file
// before relying on the structure shown here.
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Original,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
// Work on a mutable copy: the matcher may rewrite zeroable elements.
SmallVector<int, 64> Mask(Original.begin(), Original.end());
if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
if (ForceV1Zero)
V1 = getZeroVector(VT, Subtarget, DAG, DL);
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
switch (VT.SimpleTy) {
case MVT::v4i64:
case MVT::v8i32:
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
case MVT::v4f64:
case MVT::v8f32:
assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
case MVT::v2f64:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v4i32:
case MVT::v8i16:
assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
// These types use the blend mask directly as an 8-bit immediate.
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
DAG.getTargetConstant(BlendMask, DL, MVT::i8));
case MVT::v16i16: {
assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
// Recompute the immediate from the 8-element per-lane mask.
BlendMask = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
BlendMask |= 1ull << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getTargetConstant(BlendMask, DL, MVT::i8));
// Use PBLENDW for lower/upper lanes and then blend lanes.
// TODO - we should allow 2 PBLENDW here and leave shuffle combine to
// merge to VSELECT where useful.
uint64_t LoMask = BlendMask & 0xFF;
uint64_t HiMask = (BlendMask >> 8) & 0xFF;
// Only taken when at least one half of the mask is all-zeros or all-ones.
if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getTargetConstant(LoMask, DL, MVT::i8));
SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getTargetConstant(HiMask, DL, MVT::i8));
// Elements 0-7 come from Lo, elements 8-15 from the upper half of Hi.
return DAG.getVectorShuffle(
MVT::v16i16, DL, Lo, Hi,
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
case MVT::v32i8:
assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
case MVT::v16i8: {
assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Masked;
// With BWI and VLX, materialize the blend mask as an integer constant of at
// least 8 bits and select through getVectorMaskingNode.
if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
// If we have VPTERNLOG, we can use that as a bit blend.
if (Subtarget.hasVLX())
if (SDValue BitBlend =
lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return BitBlend;
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
// This form of blend is always done on bytes. Compute the byte vector
// type.
MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
// x86 allows load folding with blendvb from the 2nd source operand. But
// we are still using LLVM select here (see comment below), so that's V1.
// If V2 can be load-folded and V1 cannot be load-folded, then commute to
// allow that load-folding possibility.
if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
std::swap(V1, V2);
// Compute the VSELECT mask. Note that VSELECT is really confusing in the
// mix of LLVM's code generator and the x86 backend. We tell the code
// generator that boolean values in the elements of an x86 vector register
// are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
// mapping a select to operand #1, and 'false' mapping to operand #2. The
// reality in x86 is that vector masks (pre-AVX-512) use only the high bit
// of the element (the remaining are ignored) and 0 in that high bit would
// mean operand #1 while 1 in the high bit would mean operand #2. So while
// the LLVM model for boolean values in vector elements gets the relevant
// bit set, it is set backwards and over constrained relative to x86's
// actual model.
SmallVector<SDValue, 32> VSELECTMask;
// One mask byte per byte of each element: undef for undef mask entries,
// -1 for elements with Mask[i] < Size, 0 otherwise.
for (int i = 0, Size = Mask.size(); i < Size; ++i)
for (int j = 0; j < Scale; ++j)
Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
: DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(
DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
V1, V2));
case MVT::v16f32:
case MVT::v8f64:
case MVT::v8i64:
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8: {
// Attempt to lower to a bitmask if we can. Only if not optimizing for size.
bool OptForSize = DAG.shouldOptForSize();
if (!OptForSize) {
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Masked;
// Otherwise load an immediate into a GPR, cast to k-register, and use a
// masked move.
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
llvm_unreachable("Not a supported integer vector type!");
/// Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
///
/// \param ImmBlends when true, 8-bit element blends are only accepted if the
/// blend mask passes canWidenShuffleElements.
/// \returns the permute of the blend, or an empty SDValue if two different
/// inputs are needed for the same element position.
// NOTE(review): the `continue;` after `if (Mask[i] < 0)` and the closing
// braces are absent from this text - confirm against the upstream file.
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG,
bool ImmBlends = false) {
// We build up the blend mask while checking whether a blend is a viable way
// to reduce the shuffle.
SmallVector<int, 32> BlendMask(Mask.size(), -1);
SmallVector<int, 32> PermuteMask(Mask.size(), -1);
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] < 0)
assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
// Position (Mask[i] % Size) of the blend may hold only one source element;
// a second, different request for the same position cannot be blended.
if (BlendMask[Mask[i] % Size] < 0)
BlendMask[Mask[i] % Size] = Mask[i];
else if (BlendMask[Mask[i] % Size] != Mask[i])
return SDValue(); // Can't blend in the needed input!
// After the blend, result element i is read from that position.
PermuteMask[i] = Mask[i] % Size;
// If only immediate blends, then bail if the blend mask can't be widened to
// i16.
unsigned EltSize = VT.getScalarSizeInBits();
if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
return SDValue();
SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
/// Try to lower as an unpack of elements from two inputs followed by
/// a single-input permutation.
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation.
///
/// \returns the permute of an X86ISD::UNPCKL/UNPCKH node, or an empty SDValue
/// if the mask cannot be expressed that way.
// NOTE(review): `continue;`, an `else` before the first `return SDValue();`
// in the operand-selection chain, and the closing braces are absent from this
// text - confirm against the upstream file.
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
int NumHalfLaneElts = NumLaneElts / 2;
bool MatchLo = true, MatchHi = true;
// Ops[0] feeds the even result elements, Ops[1] the odd ones.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
// Determine UNPCKL/UNPCKH type and operand order.
for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
int M = Mask[Lane + Elt];
if (M < 0)
SDValue &Op = Ops[Elt & 1];
if (M < NumElts && (Op.isUndef() || Op == V1))
Op = V1;
else if (NumElts <= M && (Op.isUndef() || Op == V2))
Op = V2;
return SDValue();
// Track whether every referenced element lies in the low half or in the
// high half of this lane (of either input).
int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
if (!MatchLo && !MatchHi)
return SDValue();
assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
// Now check that each pair of elts come from the same unpack pair
// and set the permute mask based on each pair.
// TODO - Investigate cases where we permute individual elements.
SmallVector<int, 32> PermuteMask(NumElts, -1);
for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
int M0 = Mask[Lane + Elt + 0];
int M1 = Mask[Lane + Elt + 1];
if (0 <= M0 && 0 <= M1 &&
(M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
return SDValue();
// Source element k of a half lane sits at positions 2k (first operand)
// and 2k + 1 (second operand) of the unpacked lane.
if (0 <= M0)
PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
if (0 <= M1)
PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
///
/// The per-lane index range used from each input is computed first. If one
/// range lies entirely below the other, both inputs are merged with a single
/// X86ISD::PALIGNR and the result is permuted.
///
/// \returns the permuted rotate, or an empty SDValue if the subtarget lacks
/// the required feature, the mask crosses 128-bit lanes, only one input is
/// used, or the ranges overlap.
// NOTE(review): `continue;` after the `if (M < 0)` tests, the `else` between
// the two PermMask assignments in the lambda, and the closing braces are
// absent from this text - confirm against the upstream file.
static SDValue lowerShuffleAsByteRotateAndPermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
(VT.is256BitVector() && !Subtarget.hasAVX2()) ||
(VT.is512BitVector() && !Subtarget.hasBWI()))
return SDValue();
// We don't currently support lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(VT, Mask))
return SDValue();
// Scale is the element size in bytes, used for the PALIGNR immediate.
int Scale = VT.getScalarSizeInBits() / 8;
int NumLanes = VT.getSizeInBits() / 128;
int NumElts = VT.getVectorNumElements();
int NumEltsPerLane = NumElts / NumLanes;
// Determine range of mask elts.
// Blend1/Blend2 stay true while every element taken from V1/V2 is already in
// its final position.
bool Blend1 = true;
bool Blend2 = true;
// (min, max) in-lane index used from each input; starts as an empty range.
std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
int M = Mask[Lane + Elt];
if (M < 0)
if (M < NumElts) {
Blend1 &= (M == (Lane + Elt));
assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
M = M % NumEltsPerLane;
Range1.first = std::min(Range1.first, M);
Range1.second = std::max(Range1.second, M);
} else {
M -= NumElts;
Blend2 &= (M == (Lane + Elt));
assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
M = M % NumEltsPerLane;
Range2.first = std::min(Range2.first, M);
Range2.second = std::max(Range2.second, M);
// Bail if we don't need both elements.
// TODO - it might be worth doing this for unary shuffles if the permute
// can be widened.
if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
!(0 <= Range2.first && Range2.second < NumEltsPerLane))
return SDValue();
if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
return SDValue();
// Rotate the 2 ops so we can access both ranges, then permute the result.
auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
// The rotate amount is given in elements; PALIGNR takes it in bytes.
SDValue Rotate = DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
DAG.getBitcast(ByteVT, Lo),
DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
int M = Mask[Lane + Elt];
if (M < 0)
if (M < NumElts)
PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
// Check if the ranges are small enough to rotate from either direction.
if (Range2.second < Range1.first)
return RotateAndPermute(V1, V2, Range1.first, 0);
if (Range1.second < Range2.first)
return RotateAndPermute(V2, V1, Range2.first, NumElts);
return SDValue();
/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
///
/// \returns a shuffle tree equivalent to the two-input \p Mask; the final
/// fallback (two single-input shuffles plus a blend) does not fail.
// NOTE(review): closing braces and the trailing argument lines of two of the
// calls below (the UNPCK attempt and the second blend+permute attempt) are
// absent from this text - confirm against the upstream file.
static SDValue lowerShuffleAsDecomposedShuffleBlend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
// Shuffle the input elements into the desired positions in V1 and V2 and
// blend them together.
SmallVector<int, 32> V1Mask(Mask.size(), -1);
SmallVector<int, 32> V2Mask(Mask.size(), -1);
SmallVector<int, 32> BlendMask(Mask.size(), -1);
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (Mask[i] >= 0 && Mask[i] < Size) {
// Element comes from V1: move it into place there, blend selects V1.
V1Mask[i] = Mask[i];
BlendMask[i] = i;
} else if (Mask[i] >= Size) {
// Element comes from V2: same, with the index rebased onto V2.
V2Mask[i] = Mask[i] - Size;
BlendMask[i] = i + Size;
// Try to lower with the simpler initial blend/unpack/rotate strategies unless
// one of the input shuffles would be a no-op. We prefer to shuffle inputs as
// the shuffle may be able to fold with a load or other benefit. However, when
// we'll have to do 2x as many shuffles in order to achieve this, a 2-input
// pre-shuffle first is a better strategy.
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
// Only prefer immediate blends to unpack/rotate.
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
DAG, true))
return BlendPerm;
if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
return UnpackPerm;
if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
DL, VT, V1, V2, Mask, Subtarget, DAG))
return RotatePerm;
// Unpack/rotate failed - try again with variable blends.
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
return BlendPerm;
// Fallback: permute each input on its own, then blend the two results.
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
/// Try to lower a vector shuffle as a bit rotation.
/// Look for a repeated rotation pattern in each sub group.
/// Returns a ISD::ROTL element rotation amount or -1 if failed.
///
/// \param Mask shuffle mask; its size must be a multiple of \p NumSubElts.
/// \param NumSubElts number of consecutive elements forming one rotated group.
// NOTE(review): the `continue;` after `if (M < 0)` and the closing braces are
// absent from this text - confirm against the upstream file.
static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
int NumElts = Mask.size();
assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
// -1 until the first defined mask element fixes the rotation amount.
int RotateAmt = -1;
for (int i = 0; i != NumElts; i += NumSubElts) {
for (int j = 0; j != NumSubElts; ++j) {
int M = Mask[i + j];
if (M < 0)
// Every element must stay inside its own group.
if (!isInRange(M, i, i + NumSubElts))
return -1;
// Rotation (in elements, modulo the group size) implied by this element;
// all defined elements must agree on it.
int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
if (0 <= RotateAmt && Offset != RotateAmt)
return -1;
RotateAmt = Offset;
return RotateAmt;
/// Search for a group size for which \p Mask is a per-group bit rotation.
///
/// Group sizes are tried in increasing powers of two. On success \p RotateVT
/// is set to the integer vector type whose elements are whole groups.
///
/// \returns the rotation amount in bits, or -1 if no group size matches.
// NOTE(review): the `continue;` after `if (RotateAmt < 0)` and the closing
// braces are absent from this text - confirm against the upstream file.
static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
const X86Subtarget &Subtarget,
ArrayRef<int> Mask) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
// AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
// A group is at most 64 bits wide.
int MaxSubElts = 64 / EltSizeInBits;
for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
if (RotateAmt < 0)
int NumElts = Mask.size();
MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
// Convert the rotation from elements to bits.
return RotateAmt * EltSizeInBits;
return -1;
/// Lower shuffle using X86ISD::VROTLI rotations.
///
/// When no rotate instruction is considered legal, the rotation is emitted as
/// OR(VSHLI, VSRLI) instead.
///
/// \returns the rotated value bitcast to \p VT, or an empty SDValue if the
/// mask is not a bit rotation or this lowering is rejected below.
// NOTE(review): closing braces are absent from this text - confirm against
// the upstream file.
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// Only XOP + AVX512 targets have bit rotation instructions.
// If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
// NOTE(review): the comment above says SSSE3, but the condition below tests
// hasSSE3() - confirm which one is intended.
bool IsLegal =
(VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
if (!IsLegal && Subtarget.hasSSE3())
return SDValue();
MVT RotateVT;
int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
Subtarget, Mask);
if (RotateAmt < 0)
return SDValue();
// For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
// expanded to OR(SRL,SHL), will be more efficient, but if they can
// widen to vXi16 or more then existing lowering will be better.
if (!IsLegal) {
if ((RotateAmt % 16) == 0)
return SDValue();
// TODO: Use getTargetVShiftByConstNode.
// rotl(x, n) == (x << n) | (x >> (width - n)).
unsigned ShlAmt = RotateAmt;
unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
V1 = DAG.getBitcast(RotateVT, V1);
SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
return DAG.getBitcast(VT, Rot);
SDValue Rot =
DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
return DAG.getBitcast(VT, Rot);
/// Try to match a vector shuffle as an element rotation.
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
///
/// \param V1 in: first shuffle input; out: the value matched as the low part.
/// \param V2 in: second shuffle input; out: the value matched as the high
/// part.
/// \returns the rotation amount in elements, or -1 if the mask is not a
/// rotation. V1/V2 are only written on success.
// NOTE(review): the `continue;` statements after `if (M < 0)` and after the
// consistency checks, and the closing braces, are absent from this text -
// confirm against the upstream file.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
ArrayRef<int> Mask) {
int NumElts = Mask.size();
// We need to detect various ways of spelling a rotation:
// [11, 12, 13, 14, 15, 0, 1, 2]
// [-1, 12, 13, 14, -1, -1, 1, -1]
// [-1, -1, -1, -1, -1, -1, 1, 2]
// [ 3, 4, 5, 6, 7, 8, 9, 10]
// [-1, 4, 5, 6, -1, -1, 9, -1]
// [-1, 4, 5, 6, -1, -1, -1, -1]
// Rotation == 0 means "not determined yet".
int Rotation = 0;
SDValue Lo, Hi;
for (int i = 0; i < NumElts; ++i) {
int M = Mask[i];
assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
"Unexpected mask index.");
if (M < 0)
// Determine where a rotated vector would have started.
int StartIdx = i - (M % NumElts);
if (StartIdx == 0)
// The identity rotation isn't interesting, stop.
return -1;
// If we found the tail of a vector the rotation must be the missing
// front. If we found the head of a vector, it must be how much of the
// head.
int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
if (Rotation == 0)
Rotation = CandidateRotation;
else if (Rotation != CandidateRotation)
// The rotations don't match, so we can't match this mask.
return -1;
// Compute which value this mask is pointing at.
SDValue MaskV = M < NumElts ? V1 : V2;
// Compute which of the two target values this index should be assigned
// to. This reflects whether the high elements are remaining or the low
// elements are remaining.
SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
// Either set up this value if we've not encountered it before, or check
// that it remains consistent.
if (!TargetV)
TargetV = MaskV;
else if (TargetV != MaskV)
// This may be a rotation, but it pulls from the inputs in some
// unsupported interleaving.
return -1;
// Check that we successfully analyzed the mask, and normalize the results.
assert(Rotation != 0 && "Failed to locate a viable rotation!");
assert((Lo || Hi) && "Failed to find a rotated input vector!");
// If only one side was seen, use the same value for both.
if (!Lo)
Lo = Hi;
else if (!Hi)
Hi = Lo;
V1 = Lo;
V2 = Hi;
return Rotation;
/// Try to lower a vector shuffle as a byte rotation.
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
///
/// \param V1,V2 updated by matchShuffleAsElementRotate when it matches.
/// \returns the rotation amount in bytes, or -1 if the mask does not match.
// NOTE(review): the closing brace of the function is absent from this text.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask) {
// Don't accept any shuffles with zero elements.
if (isAnyZero(Mask))
return -1;
// PALIGNR works on 128-bit lanes.
SmallVector<int, 16> RepeatedMask;
if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
return -1;
int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
if (Rotation <= 0)
return -1;
// PALIGNR rotates bytes, so we need to scale the
// rotation based on how many bytes are in the vector lane.
int NumElts = RepeatedMask.size();
// Bytes per element within a 16-byte lane.
int Scale = 16 / NumElts;
return Rotation * Scale;
/// Lower a shuffle matched by matchShuffleAsByteRotate.
///
/// With SSSE3 a single X86ISD::PALIGNR is emitted; otherwise the rotation is
/// built from VSHLDQ, VSRLDQ and OR on v16i8.
///
/// \returns the rotated value bitcast to \p VT, or an empty SDValue if the
/// mask is not a byte rotation.
// NOTE(review): the closing braces of the SSSE3 block and of the function are
// absent from this text - confirm against the upstream file.
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
// The matcher may replace Lo/Hi with the inputs it actually matched.
SDValue Lo = V1, Hi = V2;
int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
if (ByteRotation <= 0)
return SDValue();
// Cast the inputs to i8 vector of correct length to match PALIGNR or
// the VSHLDQ/VSRLDQ byte shifts used below.
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
Lo = DAG.getBitcast(ByteVT, Lo);
Hi = DAG.getBitcast(ByteVT, Hi);
// SSSE3 targets can use the palignr instruction.
if (Subtarget.hasSSSE3()) {
assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
"512-bit PALIGNR requires BWI instructions");
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
assert(VT.is128BitVector() &&
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
"Can shuffle at most 16 bytes in a 128-bit vector!");
assert(ByteVT == MVT::v16i8 &&
"SSE2 rotate lowering only needed for v16i8!");
// Default SSE2 implementation
// Lo is shifted by (16 - ByteRotation) bytes with VSHLDQ and Hi by
// ByteRotation bytes with VSRLDQ; the two results are ORed together.
int LoByteShift = 16 - ByteRotation;
int HiByteShift = ByteRotation;
SDValue LoShift =
DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
SDValue HiShift =
DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
return DAG.getBitcast(VT,
DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
/// Try to lower a vector shuffle as a dword/qword rotation.
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
///
/// \returns an X86ISD::VALIGN node, or an empty SDValue if the mask is not an
/// element rotation.
// NOTE(review): the closing brace of the function is absent from this text.
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
"Only 32-bit and 64-bit elements are supported!");
// 128/256-bit vectors are only supported with VLX.
assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
&& "VLX required for 128/256-bit vectors");
// The matcher may replace Lo/Hi with the inputs it actually matched.
SDValue Lo = V1, Hi = V2;
int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
if (Rotation <= 0)
return SDValue();
// The rotation is passed in elements, not bytes.
return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
DAG.getTargetConstant(Rotation, DL, MVT::i8));
/// Try to lower a vector shuffle as a byte shift sequence.
///
/// Handles 128-bit shuffles that are zeroable at one or both ends with a
/// sequential run from a single input in between; the result is built from
/// two or three VSHLDQ/VSRLDQ whole-vector byte shifts.
///
/// \returns the shifted value bitcast to \p VT, or an empty SDValue if the
/// pattern does not match, or if both ends are zeroable and the subtarget has
/// SSSE3.
// NOTE(review): the closing brace of the function is absent from this text.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
assert(VT.is128BitVector() && "Only 128-bit vectors supported");
// We need a shuffle that has zeros at one/both ends and a sequential
// shuffle from one source within.
// ZeroLo/ZeroHi: number of zeroable elements at the low/high end.
unsigned ZeroLo = Zeroable.countTrailingOnes();
unsigned ZeroHi = Zeroable.countLeadingOnes();
if (!ZeroLo && !ZeroHi)
return SDValue();
unsigned NumElts = Mask.size();
// Length of the run between the zeroable ends.
unsigned Len = NumElts - (ZeroLo + ZeroHi);
if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
return SDValue();
// Element size in bytes; all shift amounts below are scaled to bytes.
unsigned Scale = VT.getScalarSizeInBits() / 8;
// The run must come entirely from V1 or entirely from V2.
ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
if (!isUndefOrInRange(StubMask, 0, NumElts) &&
!isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
return SDValue();
SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
Res = DAG.getBitcast(MVT::v16i8, Res);
// Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
// inner sequential set of elements, possibly offset:
// 01234567 --> zzzzzz01 --> 1zzzzzzz
// 01234567 --> 4567zzzz --> zzzzz456
// 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
if (ZeroLo == 0) {
// Zeros only at the high end: VSHLDQ so the last used source element is at
// the top, then VSRLDQ by ZeroHi elements.
unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
} else if (ZeroHi == 0) {
// Zeros only at the low end: VSRLDQ so the first used source element is at
// the bottom, then VSHLDQ by ZeroLo elements.
unsigned Shift = Mask[ZeroLo] % NumElts;
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
} else if (!Subtarget.hasSSSE3()) {
// If we don't have PSHUFB then it's worth avoiding an AND constant mask
// by performing 3 byte shifts. Shuffle combining can kick in above that.
// TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Shift += Mask[ZeroLo] % NumElts;
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
} else
return SDValue();
return DAG.getBitcast(VT, Res);
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz, 2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [ 1, zz, 3, zz]
/// [ -1, -1, 7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz, 0, 1, 2, 3, 4, 5, 6]
/// [ zz, zz, -1, -1, 2, 3, 4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1, 1]
/// PSRLDQ : (little-endian) right byte shift
/// [ 5, 6, 7, zz, zz, zz, zz, zz]
/// [ -1, 5, 6, 7, zz, zz, zz, zz]
/// [ 1, 2, -1, -1, -1, -1, zz, zz]
///
/// \param ShiftVT out: vector type in which the shift must be performed.
/// \param Opcode out: one of X86ISD::VSHLI, VSRLI, VSHLDQ or VSRLDQ.
/// \param MaskOffset added to the expected source index (callers in view pass
/// 0 to match the first input and Mask.size() to match the second).
/// \returns the shift amount (bytes for the *DQ opcodes, bits otherwise), or
/// -1 if no shift matches.
// NOTE(review): the closing `};` of both lambdas and the closing braces of the
// loops and the function are absent from this text - confirm against the
// upstream file.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
unsigned ScalarSizeInBits, ArrayRef<int> Mask,
int MaskOffset, const APInt &Zeroable,
const X86Subtarget &Subtarget) {
int Size = Mask.size();
unsigned SizeInBits = Size * ScalarSizeInBits;
// Check that, in every group of Scale elements, the Shift elements that the
// shift would fill with zeros are zeroable.
auto CheckZeros = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i < Size; i += Scale)
for (int j = 0; j < Shift; ++j)
if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
return false;
return true;
// Check that the remaining elements of every group are sequential from the
// shifted source position; on success set Opcode/ShiftVT and return the
// shift amount.
auto MatchShift = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i != Size; i += Scale) {
unsigned Pos = Left ? i + Shift : i;
unsigned Low = Left ? i : i + Shift;
unsigned Len = Scale - Shift;
if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
return -1;
int ShiftEltBits = ScalarSizeInBits * Scale;
// Groups wider than 64 bits use the whole-lane byte shifts.
bool ByteShift = ShiftEltBits > 64;
Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
: (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
// Normalize the scale for byte shifts to still produce an i64 element
// type.
Scale = ByteShift ? Scale / 2 : Scale;
// We need to round trip through the appropriate type for the shift.
MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
: MVT::getVectorVT(ShiftSVT, Size / Scale);
return (int)ShiftAmt;
// SSE/AVX supports logical shifts up to 64-bit integers - so we can just
// keep doubling the size of the integer elements up to that. We can
// then shift the elements of the integer vector by whole multiples of
// their width within the elements of the larger integer vector. Test each
// multiple to see if we can find a match with the moved element indices
// and that the shifted in elements are all zeroable.
// 512-bit vectors without BWI are limited to 64-bit groups (no byte shifts).
unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
for (int Shift = 1; Shift != Scale; ++Shift)
for (bool Left : {true, false})
if (CheckZeros(Shift, Scale, Left)) {
int ShiftAmt = MatchShift(Shift, Scale, Left);
if (0 < ShiftAmt)
return ShiftAmt;
// no match
return -1;
/// Try to lower a shuffle as a single logical shift of one of its inputs.
/// The mask is matched with matchShuffleAsShift, first against V1 (mask
/// offset 0) and, if that fails, against V2 (mask offset Size). On a match the
/// chosen input is bitcast to the matched shift type, shifted by the matched
/// immediate amount, and bitcast back to \p VT.
/// \returns The lowered value, or an empty SDValue when neither input matches.
/// NOTE(review): brace-only lines are missing from this copy, so the end of
/// the `if (ShiftAmt < 0) {` block is not visible here.
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
// Both are filled in by matchShuffleAsShift when it finds a match.
MVT ShiftVT;
SDValue V = V1;
unsigned Opcode;
// Try to match shuffle against V1 shift.
int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
Mask, 0, Zeroable, Subtarget);
// If V1 failed, try to match shuffle against V2 shift.
if (ShiftAmt < 0) {
ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
Mask, Size, Zeroable, Subtarget);
V = V2;
if (ShiftAmt < 0)
return SDValue();
assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
"Illegal integer vector type");
// Round trip through ShiftVT: bitcast in, shift by the immediate, bitcast out.
V = DAG.getBitcast(ShiftVT, V);
V = DAG.getNode(Opcode, DL, ShiftVT, V,
DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
return DAG.getBitcast(VT, V);
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
//
// On success returns true, rewrites V1 to the matched source vector, and
// stores the extraction length and start index in BitLen / BitIdx, both
// converted to bits and masked with 0x3f. V2 is only read. Returns false when
// the mask does not have this shape.
//
// NOTE(review): this copy has lost its brace-only lines and, apparently, the
// statements guarded by some of the `if`s below (presumably `break;` /
// `continue;`) - confirm against upstream before editing.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask, uint64_t &BitLen,
uint64_t &BitIdx, const APInt &Zeroable) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
if (!isUndefUpperHalf(Mask))
return false;
// Determine the extraction length from the part of the
// lower half that isn't zeroable.
int Len = HalfSize;
for (; Len > 0; --Len)
if (!Zeroable[Len - 1])
assert(Len > 0 && "Zeroable shuffle mask");
// Attempt to match first Len sequential elements from the lower half.
SDValue Src;
int Idx = -1;
for (int i = 0; i != Len; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
// Mask values below Size select from V1, the rest from V2; M is then reduced
// to an index within the selected vector.
SDValue &V = (M < Size ? V1 : V2);
M = M % Size;
// The extracted elements must start at a valid index and all mask
// elements must be in the lower half.
if (i > M || M >= HalfSize)
return false;
// The first defined element fixes the source and start index; later elements
// must agree with both.
if (Idx < 0 || (Src == V && Idx == (M - i))) {
Src = V;
Idx = M - i;
return false;
if (!Src || Idx < 0)
return false;
assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
V1 = Src;
return true;
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
//
// On success returns true, rewrites V1 to the base vector and V2 to the
// inserted vector, and stores the insertion length and start index in
// BitLen / BitIdx, both converted to bits and masked with 0x3f. V1 is left as
// an empty SDValue when every element outside the insertion is undef.
// Returns false when no (Idx, Len) pair fits.
//
// NOTE(review): this copy has lost its brace-only lines and the bodies of the
// bare `} else {` arms below (presumably `continue;`/`break;`) - confirm
// against upstream before editing.
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask, uint64_t &BitLen,
uint64_t &BitIdx) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
// Upper half must be undefined.
if (!isUndefUpperHalf(Mask))
return false;
// Try every insertion start index in the lower half.
for (int Idx = 0; Idx != HalfSize; ++Idx) {
SDValue Base;
// Attempt to match first source from mask before insertion point.
if (isUndefInRange(Mask, 0, Idx)) {
/* EMPTY */
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
Base = V1;
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
Base = V2;
} else {
// Extend the extraction length looking to match both the insertion of
// the second source and the remaining elements of the first.
for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
SDValue Insert;
int Len = Hi - Idx;
// Match insertion.
if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
Insert = V1;
} else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
Insert = V2;
} else {
// Match the remaining elements of the lower half.
// A base chosen from the elements before the insertion point must be the
// same vector that supplies the elements after it.
if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
/* EMPTY */
} else if ((!Base || (Base == V1)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
Base = V1;
} else if ((!Base || (Base == V2)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
Size + Hi)) {
Base = V2;
} else {
BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
V1 = Base;
V2 = Insert;
return true;
return false;
/// Lower a vector shuffle to one SSE4a EXTRQI or INSERTQI node if it matches.
/// The EXTRQ pattern is tried first and the INSERTQ pattern second; both
/// matchers may rewrite the local copies of \p V1 and \p V2 and report the
/// field length and start index in bits. For INSERTQI an operand the matcher
/// left empty is replaced by undef.
/// \returns The new node, or an empty SDValue when neither pattern matches.
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                     SDValue V2, ArrayRef<int> Mask,
                                     const APInt &Zeroable, SelectionDAG &DAG) {
  uint64_t BitLen, BitIdx;

  // Extraction form: a single source operand.
  if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) {
    SDValue LenOp = DAG.getTargetConstant(BitLen, DL, MVT::i8);
    SDValue IdxOp = DAG.getTargetConstant(BitIdx, DL, MVT::i8);
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, LenOp, IdxOp);
  }

  // Insertion form: bail out early when it does not match either.
  if (!matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
    return SDValue();

  SDValue BaseOp = V1 ? V1 : DAG.getUNDEF(VT);
  SDValue InsertOp = V2 ? V2 : DAG.getUNDEF(VT);
  SDValue LenOp = DAG.getTargetConstant(BitLen, DL, MVT::i8);
  SDValue IdxOp = DAG.getTargetConstant(BitIdx, DL, MVT::i8);
  return DAG.getNode(X86ISD::INSERTQI, DL, VT, BaseOp, InsertOp, LenOp, IdxOp);
}
/// Lower a vector shuffle as a zero or any extension.
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and can
/// start from an offset element index in the input; to avoid excess shuffling
/// the offset must either be in the bottom lane or at the start of a higher
/// lane. All extended elements must be from the same lane.
///
/// \param Scale Extension factor; asserted to be greater than 1 and to keep
///        Scale * element-bits at or below 64.
/// \param Offset Index of the first source element; asserted non-negative and
///        either inside the first 128-bit lane or at the start of a lane.
/// \param AnyExt True if the extended bits may be anything, false if they
///        must be zero.
/// \param InputV The vector whose elements are extended.
/// \param Mask Only consulted on the SSE4A EXTRQ path.
/// \returns The lowered value. The SSE4.1 path returns an empty SDValue for an
///          offset, scale-2, 128-bit extension.
///
/// NOTE(review): this copy has lost its brace-only lines and a few statement
/// tails (the end of the first PSHUFDMask initialiser, the `continue;`/`else`
/// in the PSHUFB loop) - confirm against upstream before editing.
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
int EltBits = VT.getScalarSizeInBits();
int NumElements = VT.getVectorNumElements();
int NumEltsPerLane = 128 / EltBits;
int OffsetLane = Offset / NumEltsPerLane;
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.");
assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
assert(0 <= Offset && "Extension offset must be positive.");
assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
"Extension offset must be in the first lane or start an upper lane.");
// Check that an index is in same lane as the base offset.
auto SafeOffset = [&](int Idx) {
return OffsetLane == (Idx / NumEltsPerLane);
// Shift along an input so that the offset base moves to the first element.
// Source indices that leave the offset's lane become undef (-1).
auto ShuffleOffset = [&](SDValue V) {
if (!Offset)
return V;
SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
for (int i = 0; i * Scale < NumElements; ++i) {
int SrcIdx = i + Offset;
ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
// Found a valid a/zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget.hasSSE41()) {
// Not worth offsetting 128-bit vectors if scale == 2, a pattern using
// PUNPCK will catch this in a later shuffle match.
if (Offset && Scale == 2 && VT.is128BitVector())
return SDValue();
// Extend in-register to Scale-times wider elements, then cast back to VT.
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL,
ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
// For any extends we can cheat for larger element sizes and use shuffle
// instructions that can fold with a load and/or copy.
if (AnyExt && EltBits == 32) {
int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
if (AnyExt && EltBits == 16 && Scale > 2) {
// First place the containing 32-bit elements with PSHUFD, then use a word
// shuffle chosen by the parity of Offset.
int PSHUFDMask[4] = {Offset / 2, -1,
SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
int PSHUFWMask[4] = {1, -1, -1, -1};
unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
return DAG.getBitcast(
VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
DAG.getBitcast(MVT::v8i16, InputV),
getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
// to 64-bits.
if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
assert(VT.is128BitVector() && "Unexpected vector width!");
// One EXTRQI of EltBits bits starting at bit LoIdx yields the low result.
int LoIdx = Offset * EltBits;
SDValue Lo = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getTargetConstant(EltBits, DL, MVT::i8),
DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
// The second element is skipped when the upper half of the mask is undef or
// its source would leave the offset's lane.
if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
return DAG.getBitcast(VT, Lo);
int HiIdx = (Offset + 1) * EltBits;
SDValue Hi = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getTargetConstant(EltBits, DL, MVT::i8),
DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
return DAG.getBitcast(VT,
DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
// If this would require more than 2 unpack instructions to expand, use
// pshufb when available. We can only use more than 2 unpack instructions
// when zero extending i8 elements which also makes it easier to use pshufb.
if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
assert(NumElements == 16 && "Unexpected byte vector width!");
// Build the byte selector: a source index at each multiple of Scale that
// stays in the offset's lane; the other assignment writes undef (any-extend)
// or 0x80 (zero-extend).
SDValue PSHUFBMask[16];
for (int i = 0; i < 16; ++i) {
int Idx = Offset + (i / Scale);
if ((i % Scale == 0 && SafeOffset(Idx))) {
PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
PSHUFBMask[i] =
AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
InputV = DAG.getBitcast(MVT::v16i8, InputV);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
// If we are extending from an offset, ensure we start on a boundary that
// we can unpack from.
int AlignToUnpack = Offset % (NumElements / Scale);
if (AlignToUnpack) {
SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
for (int i = AlignToUnpack; i < NumElements; ++i)
ShMask[i - AlignToUnpack] = i;
InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
Offset -= AlignToUnpack;
// Otherwise emit a sequence of unpacks.
// Each round interleaves InputV with undef or zero, which halves Scale and
// the element count and doubles the element width.
do {
unsigned UnpackLoHi = X86ISD::UNPCKL;
if (Offset >= (NumElements / 2)) {
UnpackLoHi = X86ISD::UNPCKH;
Offset -= (NumElements / 2);
MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
: getZeroVector(InputVT, Subtarget, DAG, DL);
InputV = DAG.getBitcast(InputVT, InputV);
InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
Scale /= 2;
EltBits *= 2;
NumElements /= 2;
} while (Scale > 1);
return DAG.getBitcast(VT, InputV);
/// Try to lower a vector shuffle as a zero extension on any microarch.
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
///
/// Scales are tried from the widest (64-bit result elements) downwards; for
/// 128-bit vectors a final attempt matches "low half kept, high half zero"
/// and emits VZEXT_MOVL on v2i64.
/// \returns The lowered value, or an empty SDValue when nothing matches.
///
/// NOTE(review): this copy has lost its brace-only lines and some short
/// statements. In particular no update of `Matches` and no `continue;` after
/// the zeroable check are visible here - confirm against upstream.
static SDValue lowerShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Bits = VT.getSizeInBits();
int NumLanes = Bits / 128;
int NumElements = VT.getVectorNumElements();
int NumEltsPerLane = NumElements / NumLanes;
assert(VT.getScalarSizeInBits() <= 32 &&
"Exceeds 32-bit integer zero extension limit");
assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
// Define a helper function to check a particular ext-scale and lower to it if
// valid.
auto Lower = [&](int Scale) -> SDValue {
SDValue InputV;
bool AnyExt = true;
int Offset = 0;
int Matches = 0;
for (int i = 0; i < NumElements; ++i) {
int M = Mask[i];
if (M < 0)
continue; // Valid anywhere but doesn't tell us anything.
// Positions that are not a multiple of Scale hold the extension bits.
if (i % Scale != 0) {
// Each of the extended elements need to be zeroable.
if (!Zeroable[i])
return SDValue();
// We no longer are in the anyext case.
AnyExt = false;
// Each of the base elements needs to be consecutive indices into the
// same input vector.
SDValue V = M < NumElements ? V1 : V2;
M = M % NumElements;
// The first base element seen fixes the input vector and the offset.
if (!InputV) {
InputV = V;
Offset = M - (i / Scale);
} else if (InputV != V)
return SDValue(); // Flip-flopping inputs.
// Offset must start in the lowest 128-bit lane or at the start of an
// upper lane.
// FIXME: Is it ever worth allowing a negative base offset?
if (!((0 <= Offset && Offset < NumEltsPerLane) ||
(Offset % NumEltsPerLane) == 0))
return SDValue();
// If we are offsetting, all referenced entries must come from the same
// lane.
if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
return SDValue();
if ((M % NumElements) != (Offset + (i / Scale)))
return SDValue(); // Non-consecutive strided elements.
// If we fail to find an input, we have a zero-shuffle which should always
// have already been handled.
// FIXME: Maybe handle this here in case during blending we end up with one?
if (!InputV)
return SDValue();
// If we are offsetting, don't extend if we only match a single input, we
// can always do better by using a basic PSHUF or PUNPCK.
if (Offset != 0 && Matches < 2)
return SDValue();
return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
InputV, Mask, Subtarget, DAG);
// The widest scale possible for extending is to a 64-bit integer.
assert(Bits % 64 == 0 &&
"The number of bits in a vector must be divisible by 64 on x86!");
int NumExtElements = Bits / 64;
// Each iteration, try extending the elements half as much, but into twice as
// many elements.
for (; NumExtElements < NumElements; NumExtElements *= 2) {
assert(NumElements % NumExtElements == 0 &&
"The input vector size must be divisible by the extended size.");
if (SDValue V = Lower(NumElements / NumExtElements))
return V;
// General extends failed, but 128-bit vectors may be able to use MOVQ.
if (Bits != 128)
return SDValue();
// Returns one of the source operands if the shuffle can be reduced to a
// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
auto CanZExtLowHalf = [&]() {
for (int i = NumElements / 2; i != NumElements; ++i)
if (!Zeroable[i])
return SDValue();
if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
return V1;
if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
return V2;
return SDValue();
if (SDValue V = CanZExtLowHalf()) {
V = DAG.getBitcast(MVT::v2i64, V);
V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
return DAG.getBitcast(VT, V);
// No viable ext lowering found.
return SDValue();
/// Try to get a scalar value for a specific element of a vector.
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
/// \param V Vector to inspect; bitcasts on it are looked through first.
/// \param Idx Element index. For SCALAR_TO_VECTOR only index 0 is accepted.
/// \returns The scalar operand bitcast to V's original element type, or an
///          empty SDValue when the looked-through node is not a vector with
///          the same scalar width, is not one of the two handled opcodes, or
///          its operand differs in size from the element type.
/// NOTE(review): brace-only lines are missing from this copy, so the end of
/// the opcode `if` block is not visible here.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
SelectionDAG &DAG) {
// Remember the element type before looking through bitcasts.
MVT VT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
V = peekThroughBitcasts(V);
// If the bitcasts shift the element size, we can't extract an equivalent
// element from it.
MVT NewVT = V.getSimpleValueType();
if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
if (V.getOpcode() == ISD::BUILD_VECTOR ||
(Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
// Ensure the scalar operand is the same size as the destination.
// FIXME: Add support for scalar truncation where possible.
SDValue S = V.getOperand(Idx);
if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
return DAG.getBitcast(EltVT, S);
return SDValue();
/// Report whether \p V is a load that x86 shuffles can fold.
/// Bitcasts wrapped around \p V are looked through first; the node underneath
/// is then tested with ISD::isNON_EXTLoad. Callers care because the set of
/// usable instructions differs a lot between load and non-load operands.
static bool isShuffleFoldableLoad(SDValue V) {
  const SDValue Underlying = peekThroughBitcasts(V);
  return ISD::isNON_EXTLoad(Underlying.getNode());
}
/// Try to lower insertion of a single element into a zero vector.
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
///
/// The element to insert is the first mask entry that refers to V2. When all
/// other result elements are zeroable, V2's low element is isolated with
/// VZEXT_MOVL and then moved into position if needed; otherwise only a MOVSS /
/// MOVSD insertion into element 0 of a 128-bit floating-point vector is
/// attempted.
/// \returns The lowered value, or an empty SDValue when the pattern does not
///          apply.
///
/// NOTE(review): this copy has lost its brace-only lines and several statement
/// pieces: the right-hand operand of the `find_if(...) -` expression, the
/// declaration of `ExtVT`, the tail of the getScalarValueForVectorElement
/// call, and the start of the constant operand in the VSHLDQ node - confirm
/// against upstream before editing.
static SDValue lowerShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT EltVT = VT.getVectorElementType();
// Locate the first mask entry that selects from V2.
int V2Index =
find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
// V1 counts as zero when every result element other than V2Index is zeroable.
bool IsV1Zeroable = true;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (i != V2Index && !Zeroable[i]) {
IsV1Zeroable = false;
// Check for a single input from a SCALAR_TO_VECTOR node.
// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
// all the smarts here sunk into that routine. However, the current
// lowering of BUILD_VECTOR makes that nearly impossible until the old
// vector shuffle lowering is dead.
SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
// We need to zext the scalar if it is smaller than an i32.
V2S = DAG.getBitcast(EltVT, V2S);
if (EltVT == MVT::i8 || EltVT == MVT::i16) {
// Using zext to expand a narrow element won't work for non-zero
// insertions.
if (!IsV1Zeroable)
return SDValue();
// Zero-extend directly to i32.
ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
} else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
EltVT == MVT::i16) {
// Either not inserting from the low element of the input or the input
// element size is too small to use VZEXT_MOVL to clear the high bits.
return SDValue();
if (!IsV1Zeroable) {
// If V1 can't be treated as a zero vector we have fewer options to lower
// this. We can't support integer vectors or non-zero targets cheaply, and
// the V1 elements can't be permuted in any way.
assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
if (!VT.isFloatingPoint() || V2Index != 0)
return SDValue();
// Ignoring the inserted slot, the rest of the mask must leave V1 in place.
SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
V1Mask[V2Index] = -1;
if (!isNoopShuffleMask(V1Mask))
return SDValue();
if (!VT.is128BitVector())
return SDValue();
// Otherwise, use MOVSD or MOVSS.
assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
"Only two types of floating point element types to handle!");
return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
ExtVT, V1, V2);
// This lowering only works for the low element with floating point vectors.
if (VT.isFloatingPoint() && V2Index != 0)
return SDValue();
V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
if (ExtVT != VT)
V2 = DAG.getBitcast(VT, V2);
if (V2Index != 0) {
// If we have 4 or fewer lanes we can cheaply shuffle the element into
// the desired position. Otherwise it is more efficient to do a vector
// shift left. We know that we can do a vector shift left because all
// the inputs are zero.
if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
// Every slot reads element 1 of V2 except V2Index, which reads element 0.
SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
V2Shuffle[V2Index] = 0;
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
} else {
// Byte-shift left by V2Index elements, expressed in bytes.
V2 = DAG.getBitcast(MVT::v16i8, V2);
V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
V2 = DAG.getBitcast(VT, V2);
return V2;
/// Try to lower broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
/// This assumes we have AVX2.
///
/// \param VT Integer result vector type (asserted).
/// \param V0 Source vector whose integer elements are wider than VT's.
/// \param BroadcastIdx Index of the broadcast element, counted in VT-sized
///        elements.
/// \returns A VBROADCAST of the truncated (and, if needed, right-shifted)
///          scalar operand, or an empty SDValue when V0's elements are not
///          wider integers or the opcode check below rejects V0.
///
/// NOTE(review): the second half of the opcode condition (after `&&`) is
/// missing from this copy - confirm against upstream before editing.
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
int BroadcastIdx,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX2() &&
"We can only lower integer broadcasts with AVX2!");
MVT EltVT = VT.getVectorElementType();
MVT V0VT = V0.getSimpleValueType();
assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
MVT V0EltVT = V0VT.getVectorElementType();
if (!V0EltVT.isInteger())
return SDValue();
const unsigned EltSize = EltVT.getSizeInBits();
const unsigned V0EltSize = V0EltVT.getSizeInBits();
// This is only a truncation if the original element type is larger.
if (V0EltSize <= EltSize)
return SDValue();
assert(((V0EltSize % EltSize) == 0) &&
"Scalar type sizes must all be powers of 2 on x86!");
const unsigned V0Opc = V0.getOpcode();
// Scale narrow elements fit in one V0 element; V0BroadcastIdx is the V0
// element that contains the requested narrow element.
const unsigned Scale = V0EltSize / EltSize;
const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
return SDValue();
SDValue Scalar = V0.getOperand(V0BroadcastIdx);
// If we're extracting non-least-significant bits, shift so we can truncate.
// Hopefully, we can fold away the trunc/srl/load into the broadcast.
// Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
// vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
if (const int OffsetIdx = BroadcastIdx % Scale)
Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
/// Decide whether a 4-element mask can be lowered with one SHUFPS.
/// Specialized lowerings consult this so they can step aside when the plain
/// shufps lowering is already efficient. A single SHUFPS works when the low
/// pair and the high pair of the mask each draw from only one input; undef
/// (negative) entries never conflict.
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  // True when both entries of the pair starting at First are defined and
  // refer to different inputs (indices 0-3 versus 4-7).
  auto PairMixesInputs = [&Mask](int First) {
    const int A = Mask[First];
    const int B = Mask[First + 1];
    if (A < 0 || B < 0)
      return false;
    const bool AFromFirstInput = A < 4;
    const bool BFromFirstInput = B < 4;
    return AFromFirstInput != BFromFirstInput;
  };

  return !PairMixesInputs(0) && !PairMixesInputs(2);
}
/// If we are extracting two 128-bit halves of a vector and shuffling the
/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
/// multi-shuffle lowering.
///
/// \param N0, N1 The two shuffle operands; both must be single-use
///        EXTRACT_SUBVECTOR nodes of the same 256-bit vector.
/// \param Mask The shuffle mask over N0 and N1.
/// \returns An EXTRACT_SUBVECTOR (index 0) of a shuffle of the wide vector, or
///          an empty SDValue when the operands do not match or the mask is
///          simple enough for a narrow shuffle.
///
/// NOTE(review): this copy has lost its brace-only lines, the statement
/// guarded by `if (ExtIndex1 == 0 && ExtIndex0 == NumElts)`, and the final
/// argument of the getVectorShuffle call - confirm against upstream.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
SDValue N1, ArrayRef<int> Mask,
SelectionDAG &DAG) {
MVT VT = N0.getSimpleValueType();
assert((VT.is128BitVector() &&
(VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
"VPERM* family of shuffles requires 32-bit or 64-bit elements");
// Check that both sources are extracts of the same source vector.
if (!N0.hasOneUse() || !N1.hasOneUse() ||
N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
N0.getOperand(0) != N1.getOperand(0))
return SDValue();
SDValue WideVec = N0.getOperand(0);
MVT WideVT = WideVec.getSimpleValueType();
if (!WideVT.is256BitVector())
return SDValue();
// Match extracts of each half of the wide source vector. Commute the shuffle
// if the extract of the low half is N1.
unsigned NumElts = VT.getVectorNumElements();
SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
return SDValue();
// Final bailout: if the mask is simple, we are better off using an extract
// and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
// because that avoids a constant load from memory.
if (NumElts == 4 &&
(isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
return SDValue();
// Extend the shuffle mask with undef elements.
NewMask.append(NumElts, -1);
// shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
// This is free: ymm -> xmm.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
DAG.getIntPtrConstant(0, DL));
/// Try to lower broadcast of a single element.
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
///
/// Outline: reject unsupported type/feature combinations, require the mask to
/// be a splat, walk up through the nodes that produce V1 while tracking the
/// broadcast element as a bit offset, then emit the broadcast from a scalar,
/// from a load, or from (a 128-bit piece of) a vector register.
/// \returns The broadcast value, or an empty SDValue when the shuffle is not
///          a supported broadcast.
///
/// NOTE(review): this copy has lost its brace-only lines and several
/// statements: the `?:` arms that pick `Opcode`, all `case` labels except
/// ISD::BITCAST together with their `continue;`/`break;`, the head of both
/// memory-operand argument lists, and the second getVectorVT argument for
/// `BroadcastVT` - confirm against upstream before editing.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// Supported: v2f64 with SSE3, any FP vector with AVX, integer vectors with
// AVX2.
if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
(Subtarget.hasAVX() && VT.isFloatingPoint()) ||
(Subtarget.hasAVX2() && VT.isInteger())))
return SDValue();
// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
// we can only broadcast from a register with AVX2.
unsigned NumEltBits = VT.getScalarSizeInBits();
unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
// Check that the mask is a broadcast.
int BroadcastIdx = getSplatIndex(Mask);
if (BroadcastIdx < 0)
return SDValue();
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast "
"comes from V1.");
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
// TODO: Combine this logic with findEltLoadSrc() used by
// EltsFromConsecutiveLoads().
// BitOffset is the position of the broadcast element inside V, in bits.
int BitOffset = BroadcastIdx * NumEltBits;
SDValue V = V1;
for (;;) {
switch (V.getOpcode()) {
case ISD::BITCAST: {
V = V.getOperand(0);
// NOTE(review): the case label for the next four statements is not visible.
// They pick the operand that contains BitOffset and rebase the offset to it.
int OpBitWidth = V.getOperand(0).getValueSizeInBits();
int OpIdx = BitOffset / OpBitWidth;
V = V.getOperand(OpIdx);
BitOffset %= OpBitWidth;
// The extraction index adds to the existing offset.
unsigned EltBitWidth = V.getScalarValueSizeInBits();
unsigned Idx = V.getConstantOperandVal(1);
unsigned BeginOffset = Idx * EltBitWidth;
BitOffset += BeginOffset;
V = V.getOperand(0);
// NOTE(review): the case label for the following statements is not visible.
// They continue into operand 1 when BitOffset lies inside the bit range it
// occupies (starting at constant operand 2), otherwise into operand 0.
SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
int EltBitWidth = VOuter.getScalarValueSizeInBits();
int Idx = (int)V.getConstantOperandVal(2);
int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
int BeginOffset = Idx * EltBitWidth;
int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
BitOffset -= BeginOffset;
V = VInner;
} else {
V = VOuter;
assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
BroadcastIdx = BitOffset / NumEltBits;
// Do we need to bitcast the source to retrieve the original broadcast index?
bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
// Check if this is a broadcast of a scalar. We special case lowering
// for scalars so that we can more effectively fold with loads.
// If the original value has a larger element type than the shuffle, the
// broadcast element is in essence truncated. Make that explicit to ease
// folding.
if (BitCastSrc && VT.isInteger())
if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
DL, VT, V, BroadcastIdx, Subtarget, DAG))
return TruncBroadcast;
// Also check the simpler case, where we can directly reuse the scalar.
if (!BitCastSrc &&
((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
V = V.getOperand(BroadcastIdx);
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
} else if (ISD::isNormalLoad(V.getNode()) &&
cast<LoadSDNode>(V)->isSimple()) {
// We do not check for one-use of the vector load because a broadcast load
// is expected to be a win for code size, register pressure, and possibly
// uops even if the original vector load is not eliminated.
// Reduce the vector load and shuffle to a broadcasted scalar load.
LoadSDNode *Ld = cast<LoadSDNode>(V);
SDValue BaseAddr = Ld->getOperand(1);
MVT SVT = VT.getScalarType();
// Byte offset of the broadcast element from the load's base address.
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
// Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
// than MOVDDUP.
// FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
if (Opcode == X86ISD::VBROADCAST) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = {Ld->getChain(), NewAddr};
V = DAG.getMemIntrinsicNode(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
DAG.makeEquivalentMemoryOrdering(Ld, V);
return DAG.getBitcast(VT, V);
assert(SVT == MVT::f64 && "Unexpected VT!");
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
DAG.makeEquivalentMemoryOrdering(Ld, V);
} else if (!BroadcastFromReg) {
// We can't broadcast from a vector register.
return SDValue();
} else if (BitOffset != 0) {
// We can only broadcast from the zero-element of a vector register,
// but it can be advantageous to broadcast from the zero-element of a
// subvector.
if (!VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
// VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
if (VT == MVT::v4f64 || VT == MVT::v4i64)
return SDValue();
// Only broadcast the zero-element of a 128-bit subvector.
if ((BitOffset % 128) != 0)
return SDValue();
assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
"Unexpected bit-offset");
assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
"Unexpected vector size");
unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
V = extract128BitVector(V, ExtractIdx, DAG, DL);
// For MOVDDUP a scalar source is first wrapped into a v2f64 vector.
if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
DAG.getBitcast(MVT::f64, V));
// If this is a scalar, do the broadcast on this type and bitcast.
if (!V.getValueType().isVector()) {
assert(V.getScalarValueSizeInBits() == NumEltBits &&
"Unexpected scalar size");
MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
// We only support broadcasting from 128-bit vectors to minimize the
// number of patterns we need to deal with in isel. So extract down to
// 128-bits, removing as many bitcasts as possible.
if (V.getValueSizeInBits() > 128)
V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
// Otherwise cast V to a vector with the same element type as VT, but
// possibly narrower than VT. Then perform the broadcast.
unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
// Check for whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can just always use two SHUFPS instructions which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
//
// On success returns true and rewrites \p V1, \p V2 and \p InsertPSMask in
// place; on failure returns false and leaves all three untouched.
// \p InsertPSMask is laid out as: source element index in bits 7:6,
// destination element index in bits 5:4 and the zero mask in bits 3:0.
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                   unsigned &InsertPSMask,
                                   const APInt &Zeroable,
                                   ArrayRef<int> Mask, SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  // are updated.
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).
      // A zeroed lane needs no source element, so it must not be considered
      // as an insertion candidate below.
      if (Zeroable[i]) {
        ZMask |= 1 << i;
        continue;
      }

      // Flag if we use any VA inputs in place. Such a lane is already
      // correct and likewise is not an insertion candidate.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;
        continue;
      }

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)
        return false;

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        VADstIndex = i;
      } else {
        // VB input for insertion.
        VBDstIndex = i;
      }
    }

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)
      return false;

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
    }

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion - so remove V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Update V1, V2 and InsertPSMask accordingly.
    V1 = VA;
    V2 = VB;

    // Insert the V2 element into the desired position.
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
    return true;
  };

  if (matchAsInsertPS(V1, V2, Mask))
    return true;

  // Commute and try again. The mask indices must be commuted together with
  // the operands, otherwise the retry would match the swapped operands
  // against indices that still refer to the original operand order.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
    return true;

  return false;
}
/// Try to lower a v4f32 shuffle of \p V1 and \p V2 as a single INSERTPS.
///
/// Returns the X86ISD::INSERTPS node when matchShuffleAsInsertPS accepts the
/// mask, or an empty SDValue otherwise so that the caller can try another
/// lowering.
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
                                      ArrayRef<int> Mask, const APInt &Zeroable,
                                      SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // Attempt to match the insertps pattern. V1 and V2 are local copies here,
  // so the matcher is free to rewrite them.
  unsigned InsertPSMask = 0;
  if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();

  // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
/// Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
///
/// Returns an empty SDValue when no profitable unpack-based lowering is found.
static SDValue lowerShuffleAsPermuteAndUnpack(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() &&
         "This routine only supports integer vectors.");
  assert(VT.is128BitVector() &&
         "This routine only works on 128-bit vectors.");
  assert(!V2.isUndef() &&
         "This routine should only be used when blending two inputs.");
  assert(Mask.size() >= 2 && "Single element masks are invalid.");

  int Size = Mask.size();

  // Count how many mask entries read from the low and the high half of their
  // input. Undef (negative) entries are counted in neither: the low predicate
  // rejects them explicitly and a negative remainder never reaches Size / 2.
  int NumLoInputs =
      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
  int NumHiInputs =
      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

  bool UnpackLo = NumLoInputs >= NumHiInputs;

  // Try to express the shuffle as per-input permutes feeding one unpack whose
  // elements are ScalarSize bits wide, i.e. Scale original elements each.
  auto TryUnpack = [&](int ScalarSize, int Scale) {
    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
    SmallVector<int, 16> V2Mask((unsigned)Size, -1);

    for (int i = 0; i < Size; ++i) {
      // Undef lanes place no constraint on either input permute.
      if (Mask[i] < 0)
        continue;

      // Each element of the unpack contains Scale elements from this mask.
      int UnpackIdx = i / Scale;

      // We only handle the case where V1 feeds the first slots of the unpack.
      // We rely on canonicalization to ensure this is the case.
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
        return SDValue();

      // Setup the mask for this input. The indexing is tricky as we have to
      // handle the unpack stride.
      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
          Mask[i] % Size;
    }

    // If we will have to shuffle both inputs to use the unpack, check whether
    // we can just unpack first and shuffle the result. If so, skip this unpack.
    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
        !isNoopShuffleMask(V2Mask))
      return SDValue();

    // Shuffle the inputs into place.
    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

    // Cast the inputs to the type we will use to unpack them.
    MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
    V1 = DAG.getBitcast(UnpackVT, V1);
    V2 = DAG.getBitcast(UnpackVT, V2);

    // Unpack the inputs and cast the result back to the desired type.
    return DAG.getBitcast(
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        UnpackVT, V1, V2));
  };

  // We try each unpack from the largest to the smallest to try and find one
  // that fits this mask.
  int OrigScalarSize = VT.getScalarSizeInBits();
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
      return Unpack;

  // If we're shuffling with a zero vector then we're better off not doing
  // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
  if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
      ISD::isBuildVectorAllZeros(V2.getNode()))
    return SDValue();

  // If none of the unpack-rooted lowerings worked (or were profitable) try an
  // initial unpack.
  if (NumLoInputs == 0 || NumHiInputs == 0) {
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
           "We have to have *some* inputs!");
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

    // FIXME: We could consider the total complexity of the permute of each
    // possible unpacking. Or at the least we should consider how many
    // half-crossings are created.
    // FIXME: We could consider commuting the unpacks.

    SmallVector<int, 32> PermMask((unsigned)Size, -1);
    for (int i = 0; i < Size; ++i) {
      // Leave undef lanes undef in the permute as well.
      if (Mask[i] < 0)
        continue;

      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

      // After the unpack, element k of the chosen half of V1 sits at 2*k and
      // the matching V2 element at 2*k + 1.
      PermMask[i] =
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
    }
    return DAG.getVectorShuffle(
        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
                            DL, VT, V1, V2),
        DAG.getUNDEF(VT), PermMask);
  }

  return SDValue();
}
/// Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
///
/// Always produces a node: the final fallback is an unconditional SHUFPD.
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPD which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
    }

    // An undef mask lane does not need to read V1, so pass UNDEF for it.
    return DAG.getNode(
        X86ISD::SHUFP, DL, MVT::v2f64,
        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
  }
  assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  if (Subtarget.hasAVX2())
    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
      return Extract;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerShuffleAsElementInsertion(
          DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                        Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
  if (SDValue Insertion = lowerShuffleAsElementInsertion(
          DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // Try to use one of the special instruction patterns to handle two common
  // blend patterns if a zero-blend above didn't work.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
      isShuffleEquivalent(V1, V2, Mask, {1, 3}))
    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
      // We can either use a special instruction to load over the low double or
      // to move just the low double.
      return DAG.getNode(
          X86ISD::MOVSD, DL, MVT::v2f64, V2,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

  if (Subtarget.hasSSE41())
    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
    return V;

  // Bit 0 selects the V1 element for lane 0; bit 1 selects the V2 element
  // (mask index rebased by 2) for lane 1.
  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
                     DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}
/// Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
    // Undef (negative) lanes are clamped to element 0 by the std::max.
    V1 = DAG.getBitcast(MVT::v4i32, V1);
    int WidenedMask[4] = {
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
    return DAG.getBitcast(
        MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
  }
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  if (Subtarget.hasAVX2())
    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
      return Extract;

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerShuffleAsElementInsertion(
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other. Negative entries are
  // kept as -1, matching the v2f64 lowering, so that they are never XOR-ed
  // into a bogus index in builds where the asserts above are compiled out.
  int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                        Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
  if (SDValue Insertion = lowerShuffleAsElementInsertion(
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3()) {
    if (Subtarget.hasVLX())
      if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
                                                Subtarget, DAG))
        return Rotate;

    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Rotate;
  }

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                Subtarget, DAG);

  // We implement this with SHUFPD which is pretty lame because it will likely
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
  // However, all the alternatives are still more cycles and newer chips don't
  // have this problem. It would be really nice if x86 had better shuffles here.
  V1 = DAG.getBitcast(MVT::v2f64, V1);
  V2 = DAG.getBitcast(MVT::v2f64, V2);
  return DAG.getBitcast(MVT::v2i64,
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
/// Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
///
/// The final SHUFP takes its two low result lanes from LowV and its two high
/// result lanes from HighV, so the code below arranges (possibly with one
/// preparatory SHUFP) for every requested element to live in the right one.
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
                                      ArrayRef<int> Mask, SDValue V1,
                                      SDValue V2, SelectionDAG &DAG) {
  SDValue LowV = V1, HighV = V2;
  SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 1) {
    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

    // Compute the index adjacent to V2Index and in the same half by toggling
    // the low bit.
    int V2AdjIndex = V2Index ^ 1;

    if (Mask[V2AdjIndex] < 0) {
      // Handles all the cases where we have a single V2 element and an undef.
      // This will only ever happen in the high lanes because we commute the
      // vector otherwise.
      if (V2Index < 2)
        std::swap(LowV, HighV);
      NewMask[V2Index] -= 4;
    } else {
      // Handle the case where the V2 element ends up adjacent to a V1 element.
      // To make this work, blend them together as the first step.
      int V1Index = V2AdjIndex;
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now proceed to reconstruct the final blend as we have the necessary
      // high or low half formed.
      if (V2Index < 2) {
        LowV = V2;
        HighV = V1;
      } else {
        HighV = V2;
      }
      // These fix-ups apply to both arms above: the blended vector always
      // holds the V2 element in lane 0 and the V1 element in lane 2.
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
    }
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
      // Handle the easy case where we have V1 in the low lanes and V2 in the
      // high lanes.
      NewMask[2] -= 4;
      NewMask[3] -= 4;
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      // We also handle the reversed case because this utility may get called
      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
      // arrange things in the right direction.
      NewMask[0] -= 4;
      NewMask[1] -= 4;
      HighV = V1;
      LowV = V2;
    } else {
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
      // trying to place elements directly, just blend them and set up the final
      // shuffle to place them.

      // The first two blend mask elements are for V1, the second two are for
      // V2.
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now we do a normal shuffle of V1 by giving V1 as both operands to
      // a blend.
      LowV = HighV = V1;
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
    }
  }
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
/// Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
///
/// Always produces a node: the final fallback is the generic SHUFPS lowering.
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  // Single-input shuffles: every path inside this block returns.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Broadcast;

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (Subtarget.hasSSE3()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
    }

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
    }

    // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
    // in SSE1 because otherwise they are widened to v2f64 and never get here.
    if (!Subtarget.hasSSE2()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
        return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
        return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
    }

    // Otherwise, use a straight shuffle of a single input vector. We pass the
    // input vector to both operands to simulate this with a SHUFPS.
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  if (Subtarget.hasAVX2())
    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
      return Extract;

  // There are special ways we can lower some single-element blends. However, we
  // have custom ways we can lower more complex single-element blends below that
  // we defer to if both this and BLENDPS fail to match, so restrict this to
  // when the V2 input is targeting element 0 of the mask -- that is the fast
  // case here.
  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue V = lowerShuffleAsElementInsertion(
            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (Subtarget.hasSSE41()) {
    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
      return Blend;

    // Use INSERTPS if we can complete the shuffle efficiently.
    if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
      return V;

    if (!isSingleSHUFPSMask(Mask))
      if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
                                                            V2, Mask, DAG))
        return BlendPerm;
  }

  // Use low/high mov instructions. These are only valid in SSE1 because
  // otherwise they are widened to v2f64 and never get here.
  if (!Subtarget.hasSSE2()) {
    if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
    if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
  }

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
    return V;

  // Otherwise fall back to a SHUFPS lowering strategy.
  return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
/// Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
///
/// Always produces a node: the final fallback is a v4f32 shuffle wrapped in
/// bitcasts.
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  // Single-input shuffles: every path inside this block returns.
  if (NumV2Elements == 0) {
    // Try to use broadcast unless the mask only has one non-undef element.
    if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
      if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
                                                      Mask, Subtarget, DAG))
        return Broadcast;
    }

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We coerce the shuffle pattern to be compatible with UNPCK instructions
    // but we aren't actually going to use the UNPCK instruction because doing
    // so prevents folding a load into this instruction or making a copy.
    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
      Mask = UnpackLoMask;
    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
      Mask = UnpackHiMask;

    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  if (Subtarget.hasAVX2())
    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
      return Extract;

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerShuffleAsElementInsertion(
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
                                             Zeroable, Subtarget, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3()) {
    if (Subtarget.hasVLX())
      if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
                                                Subtarget, DAG))
        return Rotate;

    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Rotate;
  }

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (!isSingleSHUFPSMask(Mask)) {
    // If we have direct support for blends, we should lower by decomposing into
    // a permute. That will be faster than the domain cross.
    if (IsBlendSupported)
      return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                  Subtarget, DAG);

    // Try to lower by permuting the inputs into an unpack instruction.
    if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
                                                        Mask, Subtarget, DAG))
      return Unpack;
  }

  // We implement this with SHUFPS because it can blend from two vectors.
  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
  // relevant.
  SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
  SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
  SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
  return DAG.getBitcast(MVT::v4i32, ShufPS);
}
/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputShuffle(
const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
// Attempt to directly match PSHUFLW or PSHUFHW.
if (isUndefOrInRange(LoMask, 0, 4) &&
isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
if (isUndefOrInRange(HiMask, 4, 8) &&
isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
for (int i = 0; i != 4; ++i)
HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
SmallVector<int, 4> LoInputs;
copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
int NumHToL = LoInputs.size() - NumLToL;
int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
int NumHToH = HiInputs.size() - NumLToH;
MutableArrayRef<int> LToLInputs(, NumLToL);
MutableArrayRef<int> LToHInputs(, NumLToH);
MutableArrayRef<int> HToLInputs( + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs( + NumLToH, NumHToH);
// If we are shuffling values from one half - check how many different DWORD
// pairs we need to create. If only 1 or 2 then we can perform this as a
auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
V = DAG.getNode(ShufWOp, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
V = DAG.getBitcast(PSHUFDVT, V);
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
return DAG.getBitcast(VT, V);
if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
int PSHUFDMask[4] = { -1, -1, -1, -1 };
SmallVector<std::pair<int, int>, 4> DWordPairs;
int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
// Collect the different DWORD pairs.
for (int DWord = 0; DWord != 4; ++DWord) {
int M0 = Mask[2 * DWord + 0];
int M1 = Mask[2 * DWord + 1];
M0 = (M0 >= 0 ? M0 % 4 : M0);
M1 = (M1 >= 0 ? M1 % 4 : M1);
if (M0 < 0 && M1 < 0)
bool Match = false;
for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
auto &DWordPair = DWordPairs[j];
if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
(M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
PSHUFDMask[DWord] = DOffset + j;
Match = true;
if (!Match) {
PSHUFDMask[DWord] = DOffset + DWordPairs.size();
DWordPairs.push_back(std::make_pair(M0, M1));
if (DWordPairs.size() <= 2) {
DWordPairs.resize(2, std::make_pair(-1, -1));
int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
DWordPairs[1].first, DWordPairs[1].second};
if ((NumHToL + NumHToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
if ((NumLToL + NumLToH) == 0)
return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs to each half in each half. Once there, we can fall through
// to the generic code below. For example:
// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
// Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
// However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
// and an existing 2-into-2 on the other half. In this case we may have to
// pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
// 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
// Fortunately, we don't have to handle anything but a 2-into-2 pattern
// because any other situation (including a 3-into-1 or 1-into-3 in the other
// half than the one we target for fixing) will be fixed when we re-enter this
// path. We will also combine away any sequence of PSHUFD instructions that
// result into a single instruction. Here is an example of the tricky case:
// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
// This now has a 1-into-3 in the high half! Instead, we do two shuffles:
// Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
// Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
// The result is fine to be handled by the generic logic.
auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
int AOffset, int BOffset) {
assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
"Must call this with A having 3 or 1 inputs from the A half.");
assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
"Must call this with B having 1 or 3 inputs from the B half.");
assert(AToAInputs.size() + BToAInputs.size() == 4 &&
"Must call this with either 3:1 or 1:3 inputs (summing to 4).");
bool ThreeAInputs = AToAInputs.size() == 3;
// Compute the index of dword with only one word among the three inputs in
// a half by taking the sum of the half with three inputs and subtracting
// the sum of the actual three inputs. The difference is the remaining
// slot.
int ADWord = 0, BDWord = 0;
int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
int TripleNonInputIdx =
TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
TripleDWord = TripleNonInputIdx / 2;
// We use xor with one to compute the adjacent DWord to whichever one the
// OneInput is in.
OneInputDWord = (OneInput / 2) ^ 1;
// Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
// and BToA inputs. If there is also such a problem with the BToB and AToB
// inputs, we don't try to fix it necessarily -- we'll recurse and see it in
// the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
// is essential that we don't *create* a 3<-1 as then we might oscillate.
if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
// Compute how many inputs will be flipped by swapping these DWords. We
// need to balance this to ensure we don't form a 3-1 shuffle in the other
// half.
int NumFlippedAToBInputs =
std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
int NumFlippedBToBInputs =
std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
if ((NumFlippedAToBInputs == 1 &&
(NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
(NumFlippedBToBInputs == 1 &&
(NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
// We choose whether to fix the A half or B half based on whether that
// half has zero flipped inputs. At zero, we may not be able to fix it
// with that half. We also bias towards fixing the B half because that
// will more commonly be the high half, and we have to bias one way.
auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
ArrayRef<int> Inputs) {
int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
// Determine whether the free index is in the flipped dword or the
// unflipped dword based on where the pinned index is. We use this bit
// in an xor to conditionally select the adjacent dword.
int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
if (IsFixIdxInput == IsFixFreeIdxInput)
FixFreeIdx += 1;
IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
assert(IsFixIdxInput != IsFixFreeIdxInput &&
"We need to be changing the number of flipped inputs!");
int PSHUFHalfMask[] = {0, 1, 2, 3};
std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
V = DAG.getNode(
MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
for (int &M : Mask)
if (M >= 0 && M == FixIdx)
M = FixFreeIdx;
else if (M >= 0 && M == FixFreeIdx)
M = FixIdx;
if (NumFlippedBToBInputs != 0) {
int BPinnedIdx =
BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
} else {
assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
int PSHUFDMask[] = {0, 1, 2, 3};
PSHUFDMask[ADWord] = BDWord;
PSHUFDMask[BDWord] = ADWord;
V = DAG.getBitcast(
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// Adjust the mask to match the new locations of A and B.
for (int &M : Mask)
if (M >= 0 && M/2 == ADWord)
M = 2 * BDWord + M % 2;
else if (M >= 0 && M/2 == BDWord)
M = 2 * ADWord + M % 2;
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
// At this point there are at most two inputs to the low and high halves from
// each half. That means the inputs can always be grouped into dwords and
// those dwords can then be moved to the correct half with a dword shuffle.
// We use at most one low and one high word shuffle to collect these paired
// inputs into dwords, and finally a dword shuffle to place them.
int PSHUFLMask[4] = {-1, -1, -1, -1};
int PSHUFHMask[4] = {-1, -1, -1, -1};
int PSHUFDMask[4] = {-1, -1, -1, -1};
// First fix the masks for all the inputs that are staying in their
// original halves. This will then dictate the targets of the cross-half
// shuffles.
auto fixInPlaceInputs =
[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
MutableArrayRef<int> SourceHalfMask,
MutableArrayRef<int> HalfMask, int HalfOffset) {
if (InPlaceInputs.empty())
if (InPlaceInputs.size() == 1) {
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
InPlaceInputs[0] - HalfOffset;
PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
if (IncomingInputs.empty()) {
// Just fix all of the in place inputs.
for (int Input : InPlaceInputs) {
SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
PSHUFDMask[Input / 2] = Input / 2;
assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
InPlaceInputs[0] - HalfOffset;
// Put the second input next to the first so that they are packed into
// a dword. We find the adjacent index by toggling the low bit.
int AdjIndex = InPlaceInputs[0] ^ 1;
SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
// Now gather the cross-half inputs and place them into a free dword of
// their target half.
// FIXME: This operation could almost certainly be simplified dramatically to
// look more like the 3-1 fixing operation.
auto moveInputsToRightHalf = [&PSHUFDMask](
MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
int DestOffset) {
auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
int Word) {
int LowWord = Word & ~1;
int HighWord = Word | 1;
return isWordClobbered(SourceHalfMask, LowWord) ||
isWordClobbered(SourceHalfMask, HighWord);
if (IncomingInputs.empty())
if (ExistingInputs.empty()) {
// Map any dwords with inputs from them into the right half.
for (int Input : IncomingInputs) {
// If the source half mask maps over the inputs, turn those into
// swaps and use the swapped lane.
if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
Input - SourceOffset;
// We have to swap the uses in our half mask in one sweep.
for (int &M : HalfMask)
if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
M = Input;
else if (M == Input)
M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
} else {
assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
Input - SourceOffset &&
"Previous placement doesn't match!");
// Note that this correctly re-maps both when we do a swap and when
// we observe the other side of the swap above. We rely on that to
// avoid swapping the members of the input list directly.
Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
// Map the input's dword into the correct half.
if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
Input / 2 &&
"Previous placement doesn't match!");
// And just directly shift any other-half mask elements to be same-half
// as we will have mirrored the dword containing the element into the
// same position within that half.
for (int &M : HalfMask)
if (M >= SourceOffset && M < SourceOffset + 4) {
M = M - SourceOffset + DestOffset;
assert(M >= 0 && "This should never wrap below zero!");
// Ensure we have the input in a viable dword of its current half. This
// is particularly tricky because the original position may be clobbered
// by inputs being moved and *staying* in that half.
if (IncomingInputs.size() == 1) {
if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
SourceHalfMask[InputFixed - SourceOffset] =
IncomingInputs[0] - SourceOffset;
std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
IncomingInputs[0] = InputFixed;
} else if (IncomingInputs.size() == 2) {
if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
// We have two non-adjacent or clobbered inputs we need to extract from
// the source half. To do this, we need to map them into some adjacent
// dword slot in the source mask.
int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
IncomingInputs[1] - SourceOffset};
// If there is a free slot in the source half mask adjacent to one of
// the inputs, place the other input in it. We use (Index XOR 1) to
// compute an adjacent index.
if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
InputsFixed[1] = InputsFixed[0] ^ 1;
} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
InputsFixed[0] = InputsFixed[1] ^ 1;
} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
// The two inputs are in the same DWord but it is clobbered and the
// adjacent DWord isn't used at all. Move both inputs to the free
// slot.
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
} else {
// The only way we hit this point is if there is no clobbering
// (because there are no off-half inputs to this half) and there is no
// free slot adjacent to one of the inputs. In this case, we have to
// swap an input with a non-input.
for (int i = 0; i < 4; ++i)
assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
"We can't handle any clobbers here!");
assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
"Cannot have adjacent inputs here!");
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
// We also have to update the final source mask in this case because
// it may need to undo the above swap.
for (int &M : FinalSourceHalfMask)
if (M == (InputsFixed[0] ^ 1) + SourceOffset)
M = InputsFixed[1] + SourceOffset;
else if (M == InputsFixed[1] + SourceOffset)
M = (InputsFixed[0] ^ 1) + SourceOffset;
InputsFixed[1] = InputsFixed[0] ^ 1;
// Point everything at the fixed inputs.
for (int &M : HalfMask)
if (M == IncomingInputs[0])
M = InputsFixed[0] + SourceOffset;
else if (M == IncomingInputs[1])
M = InputsFixed[1] + SourceOffset;
IncomingInputs[0] = InputsFixed[0] + SourceOffset;
IncomingInputs[1] = InputsFixed[1] + SourceOffset;
} else {
llvm_unreachable("Unhandled input size!");
// Now hoist the DWord down to the right half.
int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
for (int &M : HalfMask)
for (int Input : IncomingInputs)
if (M == Input)
M = FreeDWord * 2 + Input % 2;
moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
/*SourceOffset*/ 4, /*DestOffset*/ 0);
moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
/*SourceOffset*/ 0, /*DestOffset*/ 4);
// Now enact all the shuffles we've computed to move the inputs into their
// target half.
if (!isNoopShuffleMask(PSHUFLMask))
V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFHMask))
V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFDMask))
V = DAG.getBitcast(
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// At this point, each half should contain all its inputs, and we can then
// just shuffle them into their final position.
assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
"Failed to lift all the high half inputs to the low mask!");
assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
"Failed to lift all the low half inputs to the high mask!");
// Do a half shuffle for the low mask.
if (!isNoopShuffleMask(LoMask))
V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
// Do a half shuffle with the high mask after shifting its values down.
for (int &M : HiMask)
if (M >= 0)
M -= 4;
if (!isNoopShuffleMask(HiMask))
V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
return V;
/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
///
/// Every element of \p Mask is expanded into `Scale` consecutive byte
/// selectors (Scale = bytes in \p VT / number of mask elements), producing one
/// selector vector for \p V1 and one for \p V2. A selector of 0x80 is used for
/// bytes that must not come from that input; undef mask elements leave the
/// corresponding selector bytes undef.
///
/// \param DL        Debug location attached to every created node.
/// \param VT        Result type; the result is bitcast back to it.
/// \param V1        First shuffle input (mask indices [0, Size)).
/// \param V2        Second shuffle input (mask indices [Size, 2*Size)).
/// \param Mask      Shuffle mask; must not cross 128-bit lanes (asserted).
/// \param Zeroable  Per-mask-element flags; a set bit forces the zero selector
///                  for both inputs.
/// \param DAG       DAG used to build the nodes.
/// \param V1InUse   [out] true iff some V1 selector byte is not the zero
///                  selector.
/// \param V2InUse   [out] true iff some V2 selector byte is not the zero
///                  selector.
/// \returns The OR of both PSHUFBs when both inputs are used, otherwise the
///          single PSHUFB that is used (or \p V2 untouched when neither is),
///          bitcast to \p VT.
static SDValue lowerShuffleAsBlendOfPSHUFBs(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
  assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
         "Lane crossing shuffle masks not supported");

  int NumBytes = VT.getSizeInBits() / 8;
  int Size = Mask.size();
  int Scale = NumBytes / Size;

  SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
  SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
  V1InUse = false;
  V2InUse = false;

  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Scale];
    // Undef element: keep the undef selector bytes set up above.
    if (M < 0)
      continue;

    // PSHUFB writes zero to a destination byte whose selector has bit 7 set.
    const int ZeroMask = 0x80;
    int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
    int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
    if (Zeroable[i / Scale])
      V1Idx = V2Idx = ZeroMask;

    V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
    V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
    V1InUse |= (ZeroMask != V1Idx);
    V2InUse |= (ZeroMask != V2Idx);
  }

  MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
  if (V1InUse)
    V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
                     DAG.getBuildVector(ShufVT, DL, V1Mask));
  if (V2InUse)
    V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
                     DAG.getBuildVector(ShufVT, DL, V2Mask));

  // If we need shuffled inputs from both, blend the two. Bytes not taken from
  // an input were zeroed by its PSHUFB, so a plain OR merges them.
  SDValue V;
  if (V1InUse && V2InUse)
    V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
  else
    V = V1InUse ? V1 : V2;

  // Cast the result back to the correct type.
  return DAG.getBitcast(VT, V);
}
/// Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
///
/// \param DL         Debug location attached to every created node.
/// \param Mask       8-element shuffle mask; indices >= 8 select from \p V2.
/// \param Zeroable   Per-element flags forwarded to the zero-aware strategies.
/// \param V1         First v8i16 input (asserted).
/// \param V2         Second v8i16 input (asserted).
/// \param Subtarget  Queried for SSE4A / SSE4.1 / SSSE3 / VLX availability.
/// \param DAG        DAG used to build the nodes.
/// \returns The node produced by the first strategy that succeeds; the
///          strategies are attempted strictly in the order written below.
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

  if (NumV2Inputs == 0) {
    // Try to use shift instructions.
    if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
                                            Zeroable, Subtarget, DAG))
      return Shift;

    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Broadcast;

    // Try to use bit rotation instructions.
    if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
                                                 Subtarget, DAG))
      return Rotate;

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
      return V;

    // Use dedicated pack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
                                         Subtarget))
      return V;

    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
                                                  Subtarget, DAG))
      return Rotate;

    // Make a copy of the mask so it can be modified.
    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
    return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
                                               Subtarget, DAG);
  }

  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
         "All single-input shuffles should be canonicalized to be V1-input "
         "shuffles.");

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
                                          Zeroable, DAG))
      return V;

  // There are special ways we can lower some single-element blends.
  if (NumV2Inputs == 1)
    if (SDValue V = lowerShuffleAsElementInsertion(
            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
                                             Zeroable, Subtarget, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
                                       Subtarget))
    return V;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  if (SDValue BitBlend =
          lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return BitBlend;

  // Try to use byte shift instructions to mask.
  if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG))
    return V;

  // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
  // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
  // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
  int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
  if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
      !Subtarget.hasVLX()) {
    // Keep the low 16 bits of every dword (NumEvenDrops == 1) or of every
    // second dword (NumEvenDrops == 2); all other bits are cleared so the
    // unsigned-saturating pack below cannot saturate.
    SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
    for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
      DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
    SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
    V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
                     DWordClearMask);
    V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
                     DWordClearMask);
    // Now pack things back together.
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
    if (NumEvenDrops == 2) {
      // A second pack of the result with itself drops the even elements once
      // more.
      Result = DAG.getBitcast(MVT::v4i32, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
    }
    return Result;
  }

  // Try to lower by permuting the inputs into an unpack instruction.
  if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
                                                      Mask, Subtarget, DAG))
    return Unpack;

  // If we can't directly blend but can use PSHUFB, that will be better as it
  // can both shuffle and set up the inefficient blend.
  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
    // The in-use flags are required out-parameters; they are not needed here.
    bool V1InUse, V2InUse;
    return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
                                        Zeroable, DAG, V1InUse, V2InUse);
  }

  // We can always bit-blend if we have to so the fallback strategy is to
  // decompose into single-input permutes and blends.
  return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
                                              Mask, Subtarget, DAG);
}
/// Lower a shuffle as a variable permute whose index vector is built from
/// \p Mask.
///
/// The index vector uses integer elements of the same bit width and count as
/// \p VT. When \p V2 is undef a single-source X86ISD::VPERMV is emitted,
/// otherwise a two-source X86ISD::VPERMV3.
///
/// \param DL    Debug location attached to the created nodes.
/// \param VT    Vector type of the inputs and of the result.
/// \param Mask  Shuffle mask converted into the constant index vector.
/// \param V1    First shuffle input.
/// \param V2    Second shuffle input, or undef for a unary shuffle.
/// \param DAG   DAG used to build the nodes.
/// \returns The VPERMV or VPERMV3 node of type \p VT.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
                                     ArrayRef<int> Mask, SDValue V1,
                                     SDValue V2, SelectionDAG &DAG) {
  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
  MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());

  // NOTE(review): the trailing `true` is presumably getConstVector's IsMask
  // flag -- confirm against its declaration.
  SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);

  // Operand order differs between the two nodes: VPERMV takes the index
  // vector first, VPERMV3 takes it between the two sources.
  if (V2.isUndef())
    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);

  return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
/// Generic lowering of v16i8 shuffles.
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
return V;
// Try to use a zext lowering.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, DAG))
return V;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Try to use bit rotation instructions.
if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
// Notably, this handles splat and partial-splat shuffles more efficiently.
// However, it only makes sense if the pre-duplication shuffle simplifies
// things significantly. Currently, this means we need to be able to
// express the pre-duplication shuffle as an i16 shuffle.
// FIXME: We should check for other patterns which can be widened into an
// i16 shuffle as well.
auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
for (int i = 0; i < 16; i += 2)
if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
return false;
return true;
auto tryToWidenViaDuplication = [&]() -> SDValue {
if (!canWidenViaDuplication(Mask))
return SDValue();
SmallVector<int, 4> LoInputs;
copy_if(Mask, std::back_inserter(LoInputs),
[](int M) { return M >= 0 && M < 8; });
array_pod_sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
SmallVector<int, 4> HiInputs;
copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
array_pod_sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
bool TargetLo = LoInputs.size() >= HiInputs.size();
ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
SmallDenseMap<int, int, 8> LaneMap;
for (int I : InPlaceInputs) {
PreDupI16Shuffle[I/2] = I/2;
LaneMap[I] = I;
int j = TargetLo ? 0 : 4, je = j + 4;
for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
// Check if j is already a shuffle of this input. This happens when
// there are two adjacent bytes after we move the low one.
if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
// If we haven't yet mapped the input, search for a slot into which
// we can map it.
while (j < je && PreDupI16Shuffle[j] >= 0)
if (j == je)
// We can't place the inputs into a single half with a simple i16 shuffle, so bail.
return SDValue();
// Map this input with the i16 shuffle.
PreDupI16Shuffle[j] = MovingInputs[i] / 2;
// Update the lane map based on the mapping we ended up with.
LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
V1 = DAG.getBitcast(
DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
// Unpack the bytes to form the i16s that will be shuffled into place.
bool EvenInUse = false, OddInUse = false;
for (int i = 0; i < 16; i += 2) {
EvenInUse |= (Mask[i + 0] >= 0);
OddInUse |= (Mask[i + 1] >= 0);
if (EvenInUse && OddInUse)
V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0) {
int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
if (PostDupI16Shuffle[i / 2] < 0)
PostDupI16Shuffle[i / 2] = MappedMask;
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
"Conflicting entries in the original shuffle!");
return DAG.getBitcast(
DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
if (SDValue V = tryToWidenViaDuplication())
return V;
if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Try to use byte shift instructions to mask.
if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return V;
// Check for compaction patterns.
bool IsSingleInput = V2.isUndef();
int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
// blends but after all of the single-input lowerings. If the single input
// lowerings can find an instruction sequence that is faster than a PSHUFB, we
// want to preserve that and we can DAG combine any longer sequences into
// a PSHUFB in the end. But once we start blending from multiple inputs,
// the complexity of DAG combining bad patterns back into PSHUFB is too high,
// and there are *very* few patterns that would actually be faster than the
// PSHUFB approach because of its ability to zero lanes.
// If the mask is a binary compaction, we can more efficiently perform this
// as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
// FIXME: The only exceptions to the above are blends which are exact
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
bool V1InUse = false;
bool V2InUse = false;
SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
// If both V1 and V2 are in use and we can use a direct blend or an unpack,
// do so. This avoids using them to handle blends-with-zero which is
// important as a single pshufb is significantly faster for that.
if (V1InUse && V2InUse) {
if (Subtarget.hasSSE41())
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// We can use an unpack to do the blending rather than an or in some
// cases. Even though the or may be (very minorly) more efficient, we
// preference this lowering because there are common cases where part of
// the complexity of the shuffles goes away when we do the final blend as
// an unpack.
// FIXME: It might be worth trying to detect if the unpack-feeding
// shuffles will both be pshufb, in which case we shouldn't bother with
// this.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Unpack;
// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
// PALIGNR will be cheaper than the second PSHUFB+OR.
if (SDValue V = lowerShuffleAsByteRotateAndPermute(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return V;
return PSHUFB;
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
if (SDValue V = lowerShuffleAsElementInsertion(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
return Blend;
// Check whether a compaction lowering can be done. This handles shuffles
// which take every Nth element for some even N. See the helper function for
// details.
// We special case these as they can be particularly efficiently handled with
// the PACKUSB instruction on x86 and they show up in common patterns of
// rearranging bytes to truncate wide elements.
if (NumEvenDrops) {
// NumEvenDrops is the power of two stride of the elements. Another way of
// thinking about it is that we need to drop the even elements this many
// times to get the original input.
// First we need to zero all the dropped bytes.
assert(NumEvenDrops <= 3 &&
"No support for dropping even elements more than 3 times.");
SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
if (!IsSingleInput)
V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
// Now pack things back together.
SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
IsSingleInput ? V1 : V2);
for (int i = 1; i < NumEvenDrops; ++i) {
Result = DAG.getBitcast(MVT::v8i16, Result);
Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
return Result;
// Handle multi-input cases by blending single-input shuffles.
if (NumV2Elements > 0)
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
Subtarget, DAG);
// The fallback path for single-input shuffles widens this into two v8i16
// vectors with unpacks, shuffles those, and then pulls them back together
// with a pack.
SDValue V = V1;
std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0)
(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
SDValue VLoHalf, VHiHalf;
// Check if any of the odd lanes in the v16i8 are used. If not, we can mask
// them out and avoid using UNPCK{L,H} to extract the elements of V as
// i16s.
if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
// Use a mask to drop the high bytes.
VLoHalf = DAG.getBitcast(MVT::v8i16, V);
VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
DAG.getConstant(0x00FF, DL, MVT::v8i16));
// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
VHiHalf = DAG.getUNDEF(MVT::v8i16);
// Squash the masks to point directly into VLoHalf.
for (int &M : LoBlendMask)
if (M >= 0)
M /= 2;
for (int &M : HiBlendMask)
if (M >= 0)
M /= 2;
} else {
// Otherwise just unpack the low half of V into VLoHalf and the high half into
// VHiHalf so that we can blend them as i16s.
SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
VLoHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
VHiHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
/// Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly. \p VT selects the
/// per-type routine; all other arguments are forwarded to it unchanged and
/// its result is returned as-is.
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  MVT VT, SDValue V1, SDValue V2,
                                  const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    // Only the six vector types listed above have a lowering routine here;
    // without this the function would fall off its end with no return value.
    llvm_unreachable("Unimplemented!");
  }
}
/// Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
///
/// \p VT must be at least 256 bits wide and both \p V1 and \p V2 must have
/// type \p VT (asserted). The result is a CONCAT_VECTORS of the two
/// half-width blends built from the low and high halves of \p Mask.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                    SDValue V2, ArrayRef<int> Mask,
                                    SelectionDAG &DAG) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getVectorElementType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);

  // Use splitVector/extractSubVector so that split build-vectors just build two
  // narrower build vectors. This helps shuffling with splats and zeros.
  auto SplitVector = [&](SDValue V) {
    SDValue LoV, HiV;
    std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                          DAG.getBitcast(SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  // Now create two 4-way blends of these half-width vectors.
  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        // Element comes from V2: record which half of V2 supplies it. The
        // two flags must be exclusive per element, otherwise the
        // "only one half is used" shortcuts below could never trigger.
        if (M >= NumElements + SplitNumElements)
          UseHiV2 = true;
        else
          UseLoV2 = true;
        V2BlendMask[i] = M - NumElements;
        BlendMask[i] = SplitNumElements + i;
      } else if (M >= 0) {
        // Element comes from V1: record which half of V1 supplies it.
        if (M >= SplitNumElements)
          UseHiV1 = true;
        else
          UseLoV1 = true;
        V1BlendMask[i] = M;
        BlendMask[i] = i;
      }
    }

    // Because the lowering happens after all combining takes place, we need to
    // manually combine these blend masks as much as possible so that we create
    // a minimal number of high-level vector shuffle nodes.

    // First try just blending the halves of V1 or V2.
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
      return DAG.getUNDEF(SplitVT);
    if (!UseLoV2 && !UseHiV2)
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    if (!UseLoV1 && !UseHiV1)
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

    SDValue V1Blend, V2Blend;
    if (UseLoV1 && UseHiV1) {
      V1Blend =
        DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    } else {
      // We only use half of V1 so map the usage down into the final blend mask.
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    }
    if (UseLoV2 && UseHiV2) {
      V2Blend =
        DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
    } else {
      // We only use half of V2 so map the usage down into the final blend mask.
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
    }
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
  };
  SDValue Lo = HalfBlend(LoMask);
  SDValue Hi = HalfBlend(HiMask);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
/// Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
///
/// \p V2 must not be undef (asserted): single-input shuffles could otherwise
/// recurse back into this routine.
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
                          "shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  // If this can be modeled as a broadcast of two elements followed by a blend,
  // prefer that lowering. This is especially important because broadcasts can
  // often fold with memory operands.
  auto DoBothBroadcast = [&] {
    // Each input may contribute at most one distinct element index.
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
    for (int M : Mask)
      if (M >= Size) {
        if (V2BroadcastIdx < 0)
          V2BroadcastIdx = M - Size;
        else if (M - Size != V2BroadcastIdx)
          return false;
      } else if (M >= 0) {
        if (V1BroadcastIdx < 0)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx)
          return false;
      }
    return true;
  };
  if (DoBothBroadcast())
    return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
                                                Subtarget, DAG);

  // If the inputs all stem from a single 128-bit lane of each input, then we
  // split them rather than blending because the split will decompose to
  // unusually few instructions.
  int LaneCount = VT.getSizeInBits() / 128;
  int LaneSize = Size / LaneCount;
  SmallBitVector LaneInputs[2];
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);

  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
  // that the decomposed single-input shuffles don't end up here.
  return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
                                              DAG);
}
// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
// TODO: Extend to support v8f32 (+ 512-bit shuffles).
//
// Builds two intermediate shuffles of V1/V2 (LHS feeds the even result
// elements, RHS the odd ones) and combines them with an X86ISD::SHUFP node
// whose immediate has one selector bit per result element. Only valid for
// v4f64 (asserted).
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
                                                 SDValue V1, SDValue V2,
                                                 ArrayRef<int> Mask,
                                                 SelectionDAG &DAG) {
  assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");

  int LHSMask[4] = {-1, -1, -1, -1};
  int RHSMask[4] = {-1, -1, -1, -1};
  unsigned SHUFPMask = 0;

  // As SHUFPD uses a single LHS/RHS element per lane, we can always
  // perform the shuffle once the lanes have been shuffled in place.
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    // Undef elements place no constraint on either intermediate shuffle and
    // must not contribute a selector bit.
    if (M < 0)
      continue;
    int LaneBase = i & ~1;
    auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
    LaneMask[LaneBase + (M & 1)] = M;
    SHUFPMask |= (M & 1) << i;
  }

  SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
  SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
                     DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
}
/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a lane permutation followed by a per-lane permutation.
///
/// This is mainly for cases where we can have non-repeating permutes
/// in each lane.
///
/// Returns an empty SDValue when a destination lane would need elements from
/// more than one source lane, or when only the lowest lane is shuffled and
/// all other lanes are identity.
///
/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
/// we should investigate merging them.
static SDValue lowerShuffleAsLanePermuteAndPermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumEltsPerLane = NumElts / NumLanes;

  SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
  SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    // Undef elements constrain neither the lane nor the in-lane permute.
    if (M < 0)
      continue;

    // Ensure that each lane comes from a single source lane.
    int SrcLane = M / NumEltsPerLane;
    int DstLane = i / NumEltsPerLane;
    if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
      return SDValue();
    SrcLaneMask[DstLane] = SrcLane;

    PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
  }

  // Make sure we set all elements of the lane mask, to avoid undef propagation.
  SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
  for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
    int SrcLane = SrcLaneMask[DstLane];
    if (0 <= SrcLane)
      for (int j = 0; j != NumEltsPerLane; ++j) {
        LaneMask[(DstLane * NumEltsPerLane) + j] =
            (SrcLane * NumEltsPerLane) + j;
      }
  }

  // If we're only shuffling a single lowest lane and the rest are identity
  // then don't bother.
  // TODO - isShuffleMaskInputInPlace could be extended to something like this.
  int NumIdentityLanes = 0;
  bool OnlyShuffleLowestLane = true;
  for (int i = 0; i != NumLanes; ++i) {
    if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
                                   i * NumEltsPerLane))
      NumIdentityLanes++;
    else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
      OnlyShuffleLowestLane = false;
  }
  if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
    return SDValue();

  SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
  return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
}
/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
/// source with a lane permutation.
///
/// This lowering strategy results in four instructions in the worst case for a
/// single-input cross lane shuffle which is lower than any other fully general
/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
/// shuffle pattern should be handled prior to trying this lowering.
///
/// \p VT must be a 256-bit vector type (asserted). The final lane-flip path
/// additionally requires \p V2 to be undef (asserted); the earlier SHUFPD and
/// split paths accept two inputs.
static SDValue lowerShuffleAsLanePermuteAndShuffle(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int Size = Mask.size();
  int LaneSize = Size / 2;

  // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
  // Only do this if the elements aren't all from the lower lane,
  // otherwise we're (probably) better off doing a split.
  if (VT == MVT::v4f64 &&
      !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
    if (SDValue V =
            lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
      return V;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  if (!Subtarget.hasAVX2()) {
    bool LaneCrossing[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
    if (!LaneCrossing[0] || !LaneCrossing[1])
      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
  } else {
    bool LaneUsed[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0)
        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
    if (!LaneUsed[0] || !LaneUsed[1])
      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
  }

  // TODO - we could support shuffling V2 in the Flipped input.
  assert(V2.isUndef() &&
         "This last part of this routine only works on single input shuffles");

  // Redirect every lane-crossing element to the same in-lane position of the
  // second shuffle operand (the lane-flipped copy of V1 built below).
  SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
  for (int i = 0; i < Size; ++i) {
    int &M = InLaneMask[i];
    // Leave undef entries untouched.
    if (M < 0)
      continue;
    if (((M % Size) / LaneSize) != (i / LaneSize))
      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
  }
  assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
         "In-lane shuffle mask expected");

  // Flip the lanes, and shuffle the results which should now be in-lane.
  MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
  SDValue Flipped = DAG.getBitcast(PVT, V1);
  Flipped =
      DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
  Flipped = DAG.getBitcast(VT, Flipped);
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
}
/// Handle lowering 2-lane 128-bit shuffles.
///
/// Tries, in order: an insert into a zero vector, a blend, a single 128-bit
/// subvector insert, SHUF128 (with VLX), and finally an X86ISD::VPERM2X128
/// node. Returns an empty SDValue when the shuffle is unary on AVX2 or when
/// the mask cannot be widened.
///
/// NOTE(review): several statement heads and closing braces are not visible
/// in this excerpt (see the notes below); confirm against upstream.
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
// WidenedMask is filled in by canWidenShuffleElements and is indexed below
// as a two-entry mask (one entry per half).
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
return SDValue();
// Bits 0-1 of Zeroable cover the low half, bits 2-3 the high half.
bool IsLowZero = (Zeroable & 0x3) == 0x3;
bool IsHighZero = (Zeroable & 0xc) == 0xc;
// Try to use an insert into a zero vector.
if (WidenedMask[0] == 0 && IsHighZero) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
// NOTE(review): the heads of the two statements these argument lines belong
// to are not visible here; presumably an extract of V1's low subvector into
// LoV followed by an insert of LoV into the zero vector — confirm.
DAG.getIntPtrConstant(0, DL));
getZeroVector(VT, Subtarget, DAG, DL), LoV,
DAG.getIntPtrConstant(0, DL));
// TODO: If minimizing size and one of the inputs is a zero vector and the
// zero vector has only one use, we could use a VPERM2X128 to save the
// instruction bytes needed to explicitly generate the zero vector.
// Blends are faster and handle all the non-lane-crossing cases.
if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return Blend;
// If either input operand is a zero vector, use VPERM2X128 because its mask
// allows us to replace the zero input with an implicit zero.
if (!IsLowZero && !IsHighZero) {
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
// NOTE(review): the head of the statement defining SubVec is not visible
// here; the two lines below are its trailing arguments — confirm.
OnlyUsesV1 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
// Insert the chosen low subvector into the upper half (element index 2) of V1.
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
DAG.getIntPtrConstant(2, DL));
// Try to use SHUF128 if possible.
if (Subtarget.hasVLX()) {
// Requires the low half to come from V1 and the high half from V2.
if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
((WidenedMask[1] % 2) << 1);
return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
DAG.getTargetConstant(PermMask, DL, MVT::i8));
// Otherwise form a 128-bit permutation. After accounting for undefs,
// convert the 64-bit shuffle mask selection values into 128-bit
// selection bits by dividing the indexes by 2 and shifting into positions
// defined by a vperm2*128 instruction's immediate control byte.
// The immediate permute control byte looks like this:
// [1:0] - select 128 bits from sources for low half of destination
// [2] - ignore
// [3] - zero low half of destination
// [5:4] - select 128 bits from sources for high half of destination
// [6] - ignore
// [7] - zero high half of destination
assert((WidenedMask[0] >= 0 || IsLowZero) &&
(WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
unsigned PermMask = 0;
PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
// Check the immediate mask and replace unused sources with undef.
// V1 is unused when neither half selects source 0 without zeroing; V2 is
// unused when neither half selects source 1 without zeroing.
if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
V1 = DAG.getUNDEF(VT);
if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
V2 = DAG.getUNDEF(VT);
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
DAG.getTargetConstant(PermMask, DL, MVT::i8));
/// Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
/// This attempts to create a repeated lane shuffle where each lane uses one
/// or two of the lanes of the inputs. The lanes of the input vectors are
/// shuffled in one or two independent shuffles to get the lanes into the
/// position needed by the final shuffle.
///
/// Requires a two-input shuffle (asserted). Returns an empty SDValue when the
/// mask is already lane-repeated, when a lane needs more than two source
/// lanes, when no common repeat mask can be formed, or when either
/// intermediate lane shuffle comes back with the original mask.
///
/// NOTE(review): a number of control-flow lines (for example the bodies of
/// the `if (M < 0)` and "already processed" checks, some `else` keywords and
/// all closing braces) are not visible in this excerpt; confirm the exact flow
/// against upstream before relying on it as printed.
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!V2.isUndef() && "This is only useful with multiple inputs.");
if (is128BitLaneRepeatedShuffleMask(VT, Mask))
return SDValue();
int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = 128 / VT.getScalarSizeInBits();
// RepeatMask is the per-lane mask shared by all lanes; entries >= NumElts
// refer to the second operand of the final shuffle.
SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
// LaneSrcs[Lane] holds the (up to two) source lane indices for each lane.
SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
// First pass will try to fill in the RepeatMask from lanes that need two
// sources.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Srcs[2] = {-1, -1};
SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
for (int i = 0; i != NumLaneElts; ++i) {
int M = Mask[(Lane * NumLaneElts) + i];
if (M < 0)
// Determine which of the possible input lanes (NumLanes from each source)
// this element comes from. Assign that as one of the sources for this
// lane. We can assign up to 2 sources for this lane. If we run out
// sources we can't do anything.
int LaneSrc = M / NumLaneElts;
int Src;
if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
Src = 0;
else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
Src = 1;
return SDValue();
Srcs[Src] = LaneSrc;
InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
// If this lane has two sources, see if it fits with the repeat mask so far.
if (Srcs[1] < 0)
LaneSrcs[Lane][0] = Srcs[0];
LaneSrcs[Lane][1] = Srcs[1];
// Two masks match when every position defined in both holds the same value.
auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
assert(M1.size() == M2.size() && "Unexpected mask size");
for (int i = 0, e = M1.size(); i != e; ++i)
if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
return false;
return true;
// Copy the defined entries of Mask into MergedMask; a conflict is asserted.
auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
for (int i = 0, e = MergedMask.size(); i != e; ++i) {
int M = Mask[i];
if (M < 0)
assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
"Unexpected mask element");
MergedMask[i] = M;
if (MatchMasks(InLaneMask, RepeatMask)) {
// Merge this lane mask into the final repeat mask.
MergeMasks(InLaneMask, RepeatMask);
// Didn't find a match. Swap the operands and try again.
std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
if (MatchMasks(InLaneMask, RepeatMask)) {
// Merge this lane mask into the final repeat mask.
MergeMasks(InLaneMask, RepeatMask);
// Couldn't find a match with the operands in either order.
return SDValue();
// Now handle any lanes with only one source.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
// If this lane has already been processed, skip it.
if (LaneSrcs[Lane][0] >= 0)
for (int i = 0; i != NumLaneElts; ++i) {
int M = Mask[(Lane * NumLaneElts) + i];
if (M < 0)
// If RepeatMask isn't defined yet we can define it ourself.
if (RepeatMask[i] < 0)
RepeatMask[i] = M % NumLaneElts;
// A RepeatMask entry below NumElts reads the first operand, otherwise the
// second; record this lane's source lane in the matching LaneSrcs slot.
if (RepeatMask[i] < NumElts) {
if (RepeatMask[i] != M % NumLaneElts)
return SDValue();
LaneSrcs[Lane][0] = M / NumLaneElts;
} else {
if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
return SDValue();
LaneSrcs[Lane][1] = M / NumLaneElts;
if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
return SDValue();
// Build the first lane-permuted operand from each lane's source 0.
SmallVector<int, 16> NewMask(NumElts, -1);
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][0];
for (int i = 0; i != NumLaneElts; ++i) {
int M = -1;
if (Src >= 0)
M = Src * NumLaneElts + i;
NewMask[Lane * NumLaneElts + i] = M;
SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
// Ensure we didn't get back the shuffle we started with.
// FIXME: This is a hack to make up for some splat handling code in
// getVectorShuffle.
if (isa<ShuffleVectorSDNode>(NewV1) &&
cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
return SDValue();
// Build the second lane-permuted operand from each lane's source 1.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][1];
for (int i = 0; i != NumLaneElts; ++i) {
int M = -1;
if (Src >= 0)
M = Src * NumLaneElts + i;
NewMask[Lane * NumLaneElts + i] = M;
SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
// Ensure we didn't get back the shuffle we started with.
// FIXME: This is a hack to make up for some splat handling code in
// getVectorShuffle.
if (isa<ShuffleVectorSDNode>(NewV2) &&
cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
return SDValue();
// Expand the per-lane RepeatMask to a full-width mask over NewV1/NewV2.
for (int i = 0; i != NumElts; ++i) {
NewMask[i] = RepeatMask[i % NumLaneElts];
if (NewMask[i] < 0)
NewMask[i] += (i / NumLaneElts) * NumLaneElts;
return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
/// If the input shuffle mask results in a vector that is undefined in all upper
/// or lower half elements and that mask accesses only 2 halves of the
/// shuffle's operands, return true. A mask of half the width with mask indexes
/// adjusted to access the extracted halves of the original shuffle operands is
/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
/// lower half of each input operand is accessed.
///
/// \p Mask must be exactly twice as long as \p HalfMask (asserted). On a
/// false return the output parameters may have been partially written.
static bool
getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
                   int &HalfIdx1, int &HalfIdx2) {
  assert((Mask.size() == HalfMask.size() * 2) &&
         "Expected input mask to be twice as long as output");

  // Exactly one half of the result must be undef to allow narrowing.
  bool UndefLower = isUndefLowerHalf(Mask);
  bool UndefUpper = isUndefUpperHalf(Mask);
  if (UndefLower == UndefUpper)
    return false;

  unsigned HalfNumElts = HalfMask.size();
  // Read from whichever half of Mask is defined.
  unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
  HalfIdx1 = -1;
  HalfIdx2 = -1;
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + MaskIndexOffset];
    // Negative (sentinel) entries are copied through unchanged.
    if (M < 0) {
      HalfMask[i] = M;
      continue;
    }

    // Determine which of the 4 half vectors this element is from.
    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
    int HalfIdx = M / HalfNumElts;

    // Determine the element index into its half vector source.
    int HalfElt = M % HalfNumElts;

    // We can shuffle with up to 2 half vectors, set the new 'half'
    // shuffle mask accordingly.
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
      continue;
    }
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
      continue;
    }

    // Too many half vectors referenced.
    return false;
  }

  return true;
}
/// Given the output values from getHalfShuffleMask(), create a half width
/// shuffle of extracted vectors followed by an insert back to full width.
///
/// \p HalfIdx1 / \p HalfIdx2 use the encoding 0 = lower V1, 1 = upper V1,
/// 2 = lower V2, 3 = upper V2; a negative index yields an undef half.
/// \p UndefLower selects which half of the result receives the shuffled
/// value. With \p UseConcat the result is built with CONCAT_VECTORS.
///
/// NOTE(review): the heads of the two statements ending in
/// `DAG.getIntPtrConstant(...)`, the lambda terminator and the closing braces
/// are not visible in this excerpt; confirm against upstream.
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
ArrayRef<int> HalfMask, int HalfIdx1,
int HalfIdx2, bool UndefLower,
SelectionDAG &DAG, bool UseConcat = false) {
assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
assert(V1.getValueType().isSimple() && "Expecting only simple types");
MVT VT = V1.getSimpleValueType();
MVT HalfVT = VT.getHalfNumVectorElementsVT();
unsigned HalfNumElts = HalfVT.getVectorNumElements();
// Map a half index (0-3, or negative for "unused") to a half-width value.
auto getHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
return DAG.getUNDEF(HalfVT);
// Indices 0-1 select V1, 2-3 select V2.
SDValue V = (HalfIdx < 2 ? V1 : V2);
// Convert the half index into an element offset within V (0 or HalfNumElts).
HalfIdx = (HalfIdx % 2) * HalfNumElts;
// NOTE(review): presumably the tail of a subvector extract of V at HalfIdx — confirm.
DAG.getIntPtrConstant(HalfIdx, DL));
// ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
SDValue Half1 = getHalfVector(HalfIdx1);
SDValue Half2 = getHalfVector(HalfIdx2);
SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
if (UseConcat) {
// Place the shuffled half in the defined position and undef in the other.
SDValue Op0 = V;
SDValue Op1 = DAG.getUNDEF(HalfVT);
if (UndefLower)
std::swap(Op0, Op1);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
// Element offset of the defined half within the full-width result.
unsigned Offset = UndefLower ? HalfNumElts : 0;
// NOTE(review): presumably the tail of a subvector insert of V at Offset — confirm.
DAG.getIntPtrConstant(Offset, DL));
/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
///
/// Returns an empty SDValue when neither half of \p Mask is undef, when the
/// defined half references more than two input halves, or when one of the
/// subtarget checks below decides a wide shuffle is preferable.
///
/// NOTE(review): several statement heads, one condition operand and all
/// closing braces are not visible in this excerpt (see notes below); confirm
/// against upstream.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT.is256BitVector() || VT.is512BitVector()) &&
"Expected 256-bit or 512-bit vector");
bool UndefLower = isUndefLowerHalf(Mask);
if (!UndefLower && !isUndefUpperHalf(Mask))
return SDValue();
assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
"Completely undef shuffle mask should have been simplified already");
// Upper half is undef and lower half is whole upper subvector.
// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
MVT HalfVT = VT.getHalfNumVectorElementsVT();
unsigned HalfNumElts = HalfVT.getVectorNumElements();
if (!UndefLower &&
isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
// NOTE(review): statement heads not visible; judging by the offsets, an
// extract at HalfNumElts followed by an insert at 0 — confirm.
DAG.getIntPtrConstant(HalfNumElts, DL));
DAG.getIntPtrConstant(0, DL));
// Lower half is undef and upper half is whole lower subvector.
// e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
if (UndefLower &&
isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
// NOTE(review): statement heads not visible; judging by the offsets, an
// extract at 0 followed by an insert at HalfNumElts — confirm.
DAG.getIntPtrConstant(0, DL));
DAG.getIntPtrConstant(HalfNumElts, DL));
int HalfIdx1, HalfIdx2;
SmallVector<int, 8> HalfMask(HalfNumElts);
if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
return SDValue();
assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
// Only shuffle the halves of the inputs when useful.
// Half indices 0 and 2 are lower halves (of V1 / V2); 1 and 3 are upper halves.
unsigned NumLowerHalves =
(HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
unsigned NumUpperHalves =
(HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
// Determine the larger pattern of undef/halves, then decide if it's worth
// splitting the shuffle based on subtarget capabilities and types.
unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
if (!UndefLower) {
// XXXXuuuu: no insert is needed.
// Always extract lowers when setting lower - these are all free subreg ops.
if (NumUpperHalves == 0)
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
if (NumUpperHalves == 1) {
// AVX2 has efficient 32/64-bit element cross-lane shuffles.
if (Subtarget.hasAVX2()) {
// extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
// NOTE(review): the final operand of this condition is not visible in this
// excerpt — confirm against upstream.
if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
!is128BitUnpackShuffleMask(HalfMask) &&
(!isSingleSHUFPSMask(HalfMask) ||
return SDValue();
// If this is a unary shuffle (assume that the 2nd operand is
// canonicalized to undef), then we can use vpermpd. Otherwise, we
// are better off extracting the upper half of 1 operand and using a
// narrow shuffle.
if (EltWidth == 64 && V2.isUndef())
return SDValue();
// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
if (Subtarget.hasAVX512() && VT.is512BitVector())
return SDValue();
// Extract + narrow shuffle is better than the wide alternative.
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
// Don't extract both uppers, instead shuffle and then extract.
assert(NumUpperHalves == 2 && "Half vector count went wrong");
return SDValue();
// UndefLower - uuuuXXXX: an insert to high half is required if we split this.
if (NumUpperHalves == 0) {
// AVX2 has efficient 64-bit element cross-lane shuffles.
// TODO: Refine to account for unary shuffle, splat, and other masks?
if (Subtarget.hasAVX2() && EltWidth == 64)
return SDValue();
// AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
if (Subtarget.hasAVX512() && VT.is512BitVector())
return SDValue();
// Narrow shuffle + insert is better than the wide alternative.
return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
UndefLower, DAG);
// NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
return SDValue();
/// Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in the
/// slot required by the given mask and require no permutation.
///
/// \param Input which shuffle operand to test; must be 0 or 1 (asserted).
/// \param Mask  the shuffle mask; negative entries are ignored.
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    // An element taken from Input is out of place if its index within that
    // input differs from its position in the result.
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  }

  return true;
}
/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
///
/// Returns an empty SDValue when one of the visible bail-out checks fires
/// (the mask does not cross 128-bit lanes, the mask is already lane-repeated,
/// a destination sub-lane draws from more than one source lane, or no
/// candidate repeated sub-lane mask matches).
///
/// NOTE(review): in this listing several guards (e.g. `if (M < 0)`) appear
/// without a guarded statement, two getVectorShuffle calls are cut off, and
/// block-closing lines are absent; confirm the exact control flow against the
/// upstream file before editing.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
int NumElts = VT.getVectorNumElements();
// Lane geometry: number of 128-bit lanes and elements per lane.
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
// On AVX2 we may be able to just shuffle the lowest elements and then
// broadcast the result.
if (Subtarget.hasAVX2()) {
// Candidate broadcast widths, in bits.
for (unsigned BroadcastSize : {16, 32, 64}) {
if (BroadcastSize <= VT.getScalarSizeInBits())
// Number of vector elements covered by one broadcast unit.
int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
// Attempt to match a repeating pattern every NumBroadcastElts,
// accounting for UNDEFs but only references the lowest 128-bit
// lane of the inputs.
auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j) {
int M = Mask[i + j];
if (M < 0)
// R is the repeat-mask slot shared by position j of every group.
int &R = RepeatMask[j];
// Reject elements that are not sourced from lane 0 of their input.
if (0 != ((M % NumElts) / NumLaneElts))
return false;
// Reject a conflict with the value already recorded for this slot.
if (0 <= R && R != M)
return false;
R = M;
return true;
SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
if (!FindRepeatingBroadcastMask(RepeatMask))
// Shuffle the (lowest) repeated elements in place for broadcast.
SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
// Shuffle the actual broadcast.
SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j)
BroadcastMask[i + j] = j;
return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
// Bail if the shuffle mask doesn't cross 128-bit lanes.
if (!is128BitLaneCrossingShuffleMask(VT, Mask))
return SDValue();
// Bail if we already have a repeated lane shuffle mask.
SmallVector<int, 8> RepeatedShuffleMask;
if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
return SDValue();
// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
int NumSubLanes = NumLanes * SubLaneScale;
int NumSubLaneElts = NumLaneElts / SubLaneScale;
// Check that all the sources are coming from the same lane and see if we can
// form a repeating shuffle mask (local to each sub-lane). At the same time,
// determine the source sub-lane for each destination sub-lane.
int TopSrcSubLane = -1;
// Dst2SrcSubLanes[d] is the source sub-lane chosen for destination sub-lane
// d, or -1 while unassigned.
SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
// One candidate repeated mask per sub-lane position (at most two, matching
// the largest SubLaneScale computed above).
SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
// Extract the sub-lane mask, check that it all comes from the same lane
// and normalize the mask entries to come from the first lane.
int SrcLane = -1;
SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
if (M < 0)
int Lane = (M % NumElts) / NumLaneElts;
if ((0 <= SrcLane) && (SrcLane != Lane))
return SDValue();
SrcLane = Lane;
// Rebase the index to lane 0 while keeping the operand selection
// (indices >= NumElts keep their NumElts offset).
int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
SubLaneMask[Elt] = LocalM;
// Whole sub-lane is UNDEF.
if (SrcLane < 0)
// Attempt to match against the candidate repeated sub-lane masks.
for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
// Element-wise comparison of two sub-lane masks; negative entries are
// tested separately from the equality check.
auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
for (int i = 0; i != NumSubLaneElts; ++i) {
if (M1[i] < 0 || M2[i] < 0)
if (M1[i] != M2[i])
return false;
return true;
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
// Merge the sub-lane mask into the matching repeated sub-lane mask.
for (int i = 0; i != NumSubLaneElts; ++i) {
int M = SubLaneMask[i];
if (M < 0)
assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
"Unexpected mask element");
RepeatedSubLaneMask[i] = M;
// Track the top most source sub-lane - by setting the remaining to UNDEF
// we can greatly simplify shuffle matching.
int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
// Bail if we failed to find a matching repeated sub-lane mask.
if (Dst2SrcSubLanes[DstSubLane] < 0)
return SDValue();
assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
"Unexpected source lane");
// Create a repeating shuffle mask for the entire vector.
SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
// Only sub-lanes up to TopSrcSubLane are populated; the rest stay -1.
for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
int Lane = SubLane / SubLaneScale;
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = RepeatedSubLaneMask[Elt];
if (M < 0)
int Idx = (SubLane * NumSubLaneElts) + Elt;
// Move the lane-0-relative index back into the lane being filled.
RepeatedMask[Idx] = M + (Lane * NumLaneElts);
SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
// Shuffle each source sub-lane to its destination.
SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
for (int i = 0; i != NumElts; i += NumSubLaneElts) {
int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
if (SrcSubLane < 0)
for (int j = 0; j != NumSubLaneElts; ++j)
SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
/// Try to match \p Mask as a SHUFPD-style shuffle of \p V1 and \p V2.
///
/// \param VT           Vector type; asserted to have 64-bit elements and 2, 4
///                     or 8 elements.
/// \param V1,V2        Shuffle operands; swapped in place when only the
///                     commuted pattern matches.
/// \param ForceV1Zero  Set on success to whether every even-indexed result
///                     element is zeroable.
/// \param ForceV2Zero  Set on success to whether every odd-indexed result
///                     element is zeroable.
/// \param ShuffleImm   Reset to 0 on entry; bit i receives `Mask[i] % 2`.
///                     May hold a partial value if the match fails.
/// \param Mask         Shuffle mask over the two operands.
/// \param Zeroable     Per-element flags for result elements that may be zero.
/// \returns true if the mask fits the direct or the commuted pattern.
///
/// NOTE(review): the first `if` inside the main loop is shown without its
/// guarded statement and block-closing lines are absent in this listing;
/// confirm the control flow against the upstream file.
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
bool &ForceV1Zero, bool &ForceV2Zero,
unsigned &ShuffleImm, ArrayRef<int> Mask,
const APInt &Zeroable) {
int NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
"Unexpected data type for VSHUFPD");
assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
"Illegal shuffle mask");
// ZeroLane[0] / ZeroLane[1]: all even / all odd result elements are zeroable.
bool ZeroLane[2] = { true, true };
for (int i = 0; i < NumElts; ++i)
ZeroLane[i & 1] &= Zeroable[i];
// Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
// Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
ShuffleImm = 0;
bool ShufpdMask = true;
bool CommutableMask = true;
for (int i = 0; i < NumElts; ++i) {
if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
if (Mask[i] < 0)
return false;
// Val is the first of the two indices accepted at position i for the direct
// pattern; CommutVal is the same with the operands exchanged. For the
// asserted NumElts (at most 8) `i & 6` and `i & 0xe` yield the same value.
int Val = (i & 6) + NumElts * (i & 1);
int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
if (Mask[i] < Val || Mask[i] > Val + 1)
ShufpdMask = false;
if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
CommutableMask = false;
// Bit i of the immediate is the low bit of the selected index.
ShuffleImm |= (Mask[i] % 2) << i;
if (!ShufpdMask && !CommutableMask)
return false;
// Only the commuted form matched: exchange the operands for the caller.
if (!ShufpdMask && CommutableMask)
std::swap(V1, V2);
ForceV1Zero = ZeroLane[0];
ForceV2Zero = ZeroLane[1];
return true;
/// Lower a shuffle to a single X86ISD::SHUFP node when the mask fits the
/// pattern recognised by matchShuffleWithSHUFPD.
///
/// \param VT        Must be v2f64, v4f64 or v8f64 (asserted).
/// \param V1,V2     Shuffle operands (taken by value; the matcher may swap the
///                  local copies, and either may be replaced by a zero vector).
/// \param Mask      Shuffle mask to match.
/// \param Zeroable  Per-element zeroable flags forwarded to the matcher.
/// \returns the SHUFP node, or an empty SDValue if the mask does not match.
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
                                      SDValue V2, ArrayRef<int> Mask,
                                      const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
         "Unexpected data type for VSHUFPD");

  unsigned Imm8 = 0;
  bool ZeroV1 = false;
  bool ZeroV2 = false;
  const bool Matched = matchShuffleWithSHUFPD(VT, V1, V2, ZeroV1, ZeroV2, Imm8,
                                              Mask, Zeroable);
  if (!Matched)
    return SDValue();

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ZeroV1)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ZeroV2)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

  SDValue ImmOp = DAG.getTargetConstant(Imm8, DL, MVT::i8);
  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, ImmOp);
}
// Match a v32i8 shuffle whose first 8 result bytes are {0, 8, 16, 24, 32, 40,
// 48, 56} (or undef) and whose remaining 24 bytes are zeroable. Such a shuffle
// is built from two truncations (vmovqb) that are then interleaved.
//
// Returns an empty SDValue when the mask does not have that shape.
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             const APInt &Zeroable,
                                             SelectionDAG &DAG) {
  assert(VT == MVT::v32i8 && "Unexpected type!");

  // The leading 8 indices have to step through every 8th element, and
  // everything after them has to be zeroable.
  const bool LowBytesMatch = isSequentialOrUndefInRange(Mask, 0, 8, 0, 8);
  if (!LowBytesMatch)
    return SDValue();
  if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
    return SDValue();

  // Reinterpret both operands as 4 x i64 and truncate each to bytes.
  V1 = DAG.getBitcast(MVT::v4i64, V1);
  V2 = DAG.getBitcast(MVT::v4i64, V2);
  V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
  V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);

  // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
  // the upper bits of the result using an unpckldq.
  const int UnpackLoDQMask[16] = {0, 1, 2, 3, 16, 17, 18, 19,
                                  4, 5, 6, 7, 20, 21, 22, 23};
  SDValue Interleaved =
      DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, UnpackLoDQMask);

  // Widen to v32i8 by inserting the unpckldq result at index 0 of a zero
  // vector.
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
                     DAG.getConstant(0, DL, MVT::v32i8), Interleaved,
                     DAG.getIntPtrConstant(0, DL));
}
/// Handle lowering of 4-lane 64-bit floating point shuffles.
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
///
/// Lowering strategies are tried in the order written; the first helper that
/// yields a non-null SDValue wins, and the final return is the generic
/// fallback.
///
/// \param Mask      4-entry shuffle mask (asserted).
/// \param Zeroable  Per-element zeroable flags forwarded to the helpers.
/// \param V1,V2     v4f64 operands (asserted).
///
/// NOTE(review): block-closing lines are absent in this listing; the nesting
/// of the single-input (`V2.isUndef()`) section must be confirmed against the
/// upstream file.
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// First try the whole-128-bit-lane shuffle helper.
if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
// Single-input shuffles.
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
// Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
// Bit i of the immediate is set when result element i takes the odd element
// of its 128-bit lane (index 1 in the low lane, index 3 in the high lane).
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
// With AVX2 we have direct support for this permutation.
if (Subtarget.hasAVX2())
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
Mask, DAG, Subtarget))
return V;
// Otherwise, fall back.
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
DAG, Subtarget);
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Op;
// If we have lane crossing shuffles AND they don't all come from the lower
// lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
// TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
// canonicalize to a blend of splat which isn't necessary for this combine.
// The lambda accepts mask values 0-1 and 4-5, i.e. the low two elements of
// either operand, plus negative entries.
if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
!all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
(V1.getOpcode() != ISD::BUILD_VECTOR) &&
(V2.getOpcode() != ISD::BUILD_VECTOR))
if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
Mask, DAG))
return Op;
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
// instruction so skip this pattern.
// NOTE(review): the in-place check a few statements above returns
// unconditionally, so this guard looks always true when reached — confirm.
if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
isShuffleMaskInputInPlace(1, Mask))))
if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
// If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
/// Handle lowering of 4-lane 64-bit integer shuffles.
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
///
/// Lowering strategies are tried in the order written; the first helper that
/// yields a non-null SDValue wins, and the final return is the generic blend
/// fallback.
///
/// \param Mask      4-entry shuffle mask (asserted).
/// \param Zeroable  Per-element zeroable flags forwarded to the helpers.
/// \param V1,V2     v4i64 operands (asserted).
///
/// NOTE(review): block-closing lines are absent in this listing; confirm the
/// extent of the `V2.isUndef()` and `hasVLX()` sections against the upstream
/// file.
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
// First try the whole-128-bit-lane shuffle helper.
if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
// Single-input shuffles.
if (V2.isUndef()) {
// When the shuffle is mirrored between the 128-bit lanes of the unit, we
// can use lower latency instructions that will operate on both lanes.
SmallVector<int, 2> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
// Re-express the 2-entry i64 lane mask as a 4-entry i32 mask so it can be
// encoded as a PSHUFD immediate on the v8i32 bitcast of V1.
SmallVector<int, 4> PSHUFDMask;
narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
return DAG.getBitcast(
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
DAG.getBitcast(MVT::v8i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// AVX2 provides a direct instruction for permuting a single input across
// lanes.
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
// Try to use PALIGNR.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
return V;
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
// instruction so skip this pattern.
// NOTE(review): the in-place check a few statements above returns
// unconditionally, so this guard looks always true when reached — confirm.
if (!isShuffleMaskInputInPlace(0, Mask) &&
!isShuffleMaskInputInPlace(1, Mask))
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
/// Handle lowering of 8-lane 32-bit floating point shuffles.
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
///
/// Lowering strategies are tried in the order written; the first helper that
/// yields a non-null SDValue wins, and the final return is the generic
/// fallback.
///
/// \param Mask      8-entry shuffle mask (asserted).
/// \param Zeroable  Per-element zeroable flags forwarded to the helpers.
/// \param V1,V2     v8f32 operands (asserted).
///
/// NOTE(review): block-closing lines are absent in this listing; confirm the
/// extent of the lane-repeated and `V2.isUndef()` sections against the
/// upstream file.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 &&
"Repeated masks must be half the mask width!");
// Use even/odd duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
// Single input: encode the 4-entry repeated mask as a VPERMILPI immediate.
if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
return V;
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
// have already handled any direct blends.
return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return V;
// If we have a single input shuffle with different shuffle patterns in the
// two 128-bit lanes use the variable mask to VPERMILPS.
if (V2.isUndef()) {
if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
// The mask is materialised as a v8i32 constant vector operand.
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
if (Subtarget.hasAVX2()) {
// Note the operand order: VPERMV takes the mask first, then the source.
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
// Otherwise, fall back.
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
DAG, Subtarget);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return Result;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
// since after split we get a more efficient code using vpunpcklwd and
// vpunpckhwd instrs than vblend.
if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG))
return V;
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG);
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG);
/// Handle lowering of 8-lane 32-bit integer shuffles.
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
///
/// Lowering strategies are tried in the order written; the first helper that
/// yields a non-null SDValue wins, and the final return is the generic blend
/// fallback.
///
/// \param Mask      8-entry shuffle mask (asserted).
/// \param Zeroable  Per-element zeroable flags forwarded to the helpers.
/// \param V1,V2     v8i32 operands (asserted).
///
/// NOTE(review): block-closing lines are absent in this listing and one
/// condition is cut off (see below); confirm against the upstream file.
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
// since after split we get a more efficient code than vblend by using
// vpunpcklwd and vpunpckhwd instrs.
// NOTE(review): the condition below ends in `&&` with no following operand in
// this listing; the comment above suggests an AVX512 check — confirm.
if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane we can use more
// efficient instructions that mirror the shuffles across the two 128-bit
// lanes.
SmallVector<int, 4> RepeatedMask;
// Kept in a named flag because it is consulted again for the SHUFPS path
// further down.
bool Is128BitLaneRepeatedShuffle =
is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
if (Is128BitLaneRepeatedShuffle) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
// Single input: encode the 4-entry repeated mask as a PSHUFD immediate.
if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or EXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
return V;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return V;
// Single-input shuffles that were not handled above.
if (V2.isUndef()) {
// Try to produce a fixed cross-128-bit lane permute followed by unpack
// because that should be faster than the variable permute alternatives.
if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
return V;
// If the shuffle patterns aren't repeated but it's a single input, directly
// generate a cross-lane VPERMD instruction.
// Note the operand order: VPERMV takes the mask first, then the source.
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
// Perform the shuffle on v8f32 bitcasts and cast the result back.
SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v8i32, ShufPS);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG);
/// Handle lowering of 16-lane 16-bit integer shuffles.
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
///
/// Lowering strategies are tried in the order written; the first helper that
/// yields a non-null SDValue wins, and the final return is the generic
/// fallback.
///
/// \param Mask      16-entry shuffle mask (asserted).
/// \param Zeroable  Per-element zeroable flags forwarded to the helpers.
/// \param V1,V2     v16i16 operands (asserted).
///
/// NOTE(review): block-closing lines are absent in this listing and the
/// lowerShuffleWithPACK call is cut off; confirm against the upstream file.
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
// NOTE(review): the argument list below is truncated in this listing.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return V;
// Single-input shuffles.
if (V2.isUndef()) {
// Try to use bit rotation instructions.
if (SDValue Rotate =
lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
return Rotate;
// Try to produce a fixed cross-128-bit lane permute followed by unpack
// because that should be faster than the variable permute alternatives.
if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
return V;
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
DAG, Subtarget);
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v16 case.
return lowerV8I16GeneralSingleInputShuffle(
DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512BWVL can lower to VPERMW.
if (Subtarget.hasBWI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG);
/// Handle lowering of 32-lane 8-bit integer shuffles.
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
///
/// Lowering strategies are tried in the order written; the first helper that
/// yields a non-null SDValue wins, and the final return is the generic
/// fallback.
///
/// \param Mask      32-entry shuffle mask (asserted).
/// \param Zeroable  Per-element zeroable flags forwarded to the helpers.
/// \param V1,V2     v32i8 operands (asserted).
///
/// NOTE(review): block-closing lines are absent in this listing and the
/// lowerShuffleWithPACK call is cut off; confirm against the upstream file.
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG))
return Broadcast;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
return V;
// Use dedicated pack instructions for masks that match their pattern.
// NOTE(review): the argument list below is truncated in this listing.
if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
// Try to use bit rotation instructions.
if (V2.isUndef())
if (SDValue Rotate =
lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return V;
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
// Try to produce a fixed cross-128-bit lane permute followed by unpack
// because that should be faster than the variable permute alternatives.
if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
return V;
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
DAG, Subtarget);
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512VBMIVL can lower to VPERMB.
if (Subtarget.hasVBMI() && Subtarget.hasVLX())
return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return Result;
// Try to permute the lanes and then use a per-lane permute.
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
// by zeroable elements in the remaining 24 elements. Turn this into two
// vmovqb instructions shuffled together.
if (Subtarget.hasVLX())
if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
Mask, Zeroable, DAG))
return V;
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG);
/// High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
///
/// \param DL        Debug location attached to every node created here.
/// \param Mask      Shuffle mask; indices >= the element count refer to V2.
/// \param VT        The 256-bit vector type being shuffled.
/// \param V1, V2    Shuffle inputs.
/// \param Zeroable  Per-element bits forwarded to the lowering helpers.
/// \returns the node produced by the first strategy that applies.
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
                                  SDValue V1, SDValue V2, const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumElts = VT.getVectorNumElements();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;

  // Handle special cases where the lower or upper half is UNDEF.
  if (SDValue V =
          lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
  // can check for those subtargets here and avoid much of the subtarget
  // querying in the per-vector-type lowering routines. With AVX1 we have
  // essentially *zero* ability to manipulate a 256-bit vector with integer
  // types. Since we'll use floating point types there eventually, just
  // immediately cast everything to a float and operate entirely in that domain.
  if (VT.isInteger() && !Subtarget.hasAVX2()) {
    int ElementBits = VT.getScalarSizeInBits();
    if (ElementBits < 32) {
      // No floating point type available, if we can't use the bit operations
      // for masking/blending then decompose into 128-bit vectors.
      if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
        return V;
      if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
        return V;
      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
    }

    // Same element count and width, but in the floating point domain.
    MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
                                VT.getVectorNumElements());
    V1 = DAG.getBitcast(FpVT, V1);
    V2 = DAG.getBitcast(FpVT, V2);
    return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
  }

  switch (VT.SimpleTy) {
  case MVT::v4f64:
    return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i64:
    return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8f32:
    return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i32:
    return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i16:
    return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i8:
    return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 256-bit x86 vector type!");
  }
}
/// Try to lower a 512-bit vector shuffle of 64-bit elements as a shuffle of
/// its four 128-bit lanes.
///
/// The mask is first widened to one index per 128-bit lane. The routine then
/// tries, in order: an insert into a zero vector, a single 256-bit subvector
/// insert, an insert of the low 128 bits of V2 into V1, and finally an
/// X86ISD::SHUF128 node.
///
/// \returns an empty SDValue when the mask cannot be widened to 128-bit lanes
/// or when one half of the result would need lanes from both inputs.
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(VT.getScalarSizeInBits() == 64 &&
         "Unexpected element type size for 128bit shuffle.");

  // To handle 256 bit vector requires VLX and most probably
  // function lowerV2X128VectorShuffle() is better solution.
  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

  // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
  SmallVector<int, 4> Widened128Mask;
  if (!canWidenShuffleElements(Mask, Widened128Mask))
    return SDValue();
  assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");

  // Try to use an insert into a zero vector.
  if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
      (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
    // Keep only the low 128 bits (2 elements) when elements 2-3 are zeroable
    // as well, otherwise keep the low 256 bits (4 elements).
    unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                              DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL), LoV,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Check for patterns which can be matched with a single insert of a 256-bit
  // subvector.
  bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3});
  if (OnlyUsesV1 ||
      isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
    SDValue SubVec =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
                    DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
                       DAG.getIntPtrConstant(4, DL));
  }

  // See if this is an insertion of the lower 128-bits of V2 into V1.
  bool IsInsert = true;
  int V2Index = -1;
  for (int i = 0; i < 4; ++i) {
    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
    if (Widened128Mask[i] < 0)
      continue;

    // Make sure all V1 subvectors are in place.
    if (Widened128Mask[i] < 4) {
      if (Widened128Mask[i] != i) {
        IsInsert = false;
        break;
      }
    } else {
      // Make sure we only have a single V2 index and it's the lowest 128-bits.
      if (V2Index >= 0 || Widened128Mask[i] != 4) {
        IsInsert = false;
        break;
      }
      V2Index = i;
    }
  }
  if (IsInsert && V2Index >= 0) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
                                 DAG.getIntPtrConstant(0, DL));
    // V2Index counts 128-bit lanes; the insert position is in elements.
    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
  }

  // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
  // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
  // possible we at least ensure the lanes stay sequential to help later
  // combines.
  SmallVector<int, 2> Widened256Mask;
  if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
    // Reset before narrowing back so the mask holds exactly four entries.
    Widened128Mask.clear();
    narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
  }

  // Try to lower to vshuf64x2/vshuf32x4.
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
  unsigned PermMask = 0;
  // Ensure elements came from the same Op.
  for (int i = 0; i < 4; ++i) {
    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
    if (Widened128Mask[i] < 0)
      continue;

    SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
    // Result lanes 0-1 read from Ops[0], lanes 2-3 from Ops[1].
    unsigned OpIndex = i / 2;
    if (Ops[OpIndex].isUndef())
      Ops[OpIndex] = Op;
    else if (Ops[OpIndex] != Op)
      return SDValue();

    // Convert the 128-bit shuffle mask selection values into 128-bit selection
    // bits defined by a vshuf64x2 instruction's immediate control byte.
    PermMask |= (Widened128Mask[i] % 4) << (i * 2);
  }

  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
                     DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
/// Handle lowering of 8-lane 64-bit floating point shuffles.
///
/// Single-input masks are tried first against MOVDDUP, an immediate VPERMILPI
/// (when no element crosses a 128-bit lane) and VPERMI (when the mask repeats
/// per 256-bit half). Remaining masks go through the 128-bit lane shuffle,
/// UNPCK, SHUFPD, EXPAND and blend helpers before falling back to PERMV.
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation. Bit i of the immediate is set when result
      // element i takes the odd element of its own 128-bit lane.
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
                         DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
    }

    SmallVector<int, 4> RepeatedMask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
  }

  if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
                                           V2, Subtarget, DAG))
    return Shuf128;

  if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Unpck;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Op;

  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
                                       DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit floating point shuffles.
///
/// Masks that repeat in every 128-bit lane are handled entirely inside the
/// first block (MOVSLDUP/MOVSHDUP, VPERMILPI, UNPCK, blend, SHUFPS). All other
/// masks try a repeated-mask-plus-lane-permute, a variable VPERMILPV for
/// in-lane single-input masks, and EXPAND before falling back to PERMV.
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
      return V;

    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
      return Blend;

    // Otherwise, fall back to a SHUFPS sequence.
    return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
  }

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
  if (V2.isUndef() &&
      !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
    SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
  }

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
                                       V1, V2, DAG, Subtarget))
    return V;

  return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// Handle lowering of 8-lane 64-bit integer shuffles.
///
/// Single-input masks that repeat per 128-bit lane are lowered as a PSHUFD on
/// the v16i32 view of V1; masks that repeat per 256-bit half use VPERMI.
/// Remaining masks go through the 128-bit lane shuffle, shift, VALIGN, byte
/// rotate, UNPCK, EXPAND and blend helpers before falling back to PERMV.
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower latency instructions that will operate on all four
    // 128-bit lanes.
    SmallVector<int, 2> Repeated128Mask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
      // Split each 64-bit index into two 32-bit indices for PSHUFD.
      SmallVector<int, 4> PSHUFDMask;
      narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
      return DAG.getBitcast(
          MVT::v8i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
                      DAG.getBitcast(MVT::v16i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }

    SmallVector<int, 4> Repeated256Mask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
  }

  if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
                                           V2, Subtarget, DAG))
    return Shuf128;

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
                                            Subtarget, DAG))
    return Rotate;

  // Try to use PALIGNR.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
    return Unpck;

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
                                       DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// Handle lowering of 16-lane 32-bit integer shuffles.
///
/// Strategies are tried in order: zero/any-extend, PSHUFD or UNPCK for masks
/// repeated per 128-bit lane, shift, VALIGN, byte rotate (BWI only), a single
/// SHUFPS in the float domain, repeated-mask-plus-lane-permute, EXPAND and
/// blend. PERMV is the final fallback.
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the four 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
                                            Subtarget, DAG))
    return Rotate;

  // Try to use byte rotation instructions.
  if (Subtarget.hasBWI())
    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Rotate;

  // Assume that a single SHUFPS is faster than using a permv shuffle.
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
    SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
                                            CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v16i32, ShufPS);
  }

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
                                       DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// Handle lowering of 32-lane 16-bit integer shuffles.
///
/// Requires AVX-512 BWI (asserted). Strategies are tried in order:
/// zero/any-extend, UNPCK, PACK, shift, byte rotate, then for single-input
/// masks bit rotate and the v8i16 single-input lowering when the mask repeats
/// per 128-bit lane, then blend and PSHUFB. PERMV is the final fallback.
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V =
          lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  if (V2.isUndef()) {
    // Try to use bit rotation instructions.
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
      return Rotate;

    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
      // As this is a single-input shuffle, the repeated mask should be
      // a strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v32 case.
      return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
                                                 RepeatedMask, Subtarget, DAG);
    }
  }

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
                                              Zeroable, Subtarget, DAG))
    return PSHUFB;

  return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// Handle lowering of 64-lane 8-bit integer shuffles.
///
/// Requires AVX-512 BWI (asserted). Strategies are tried in order:
/// zero/any-extend, UNPCK, PACK, shift, byte rotate, bit rotate (single input
/// only), AND mask, PSHUFB, PERMV (VBMI only), repeated-mask-plus-lane-permute,
/// blend, and lane-permute-plus-repeated-mask (two inputs only). Anything left
/// is split into two halves and lowered recursively.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
                                       Subtarget))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Try to use bit rotation instructions.
  if (V2.isUndef())
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
      return Rotate;

  // Lower as AND if possible.
  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
                                             Zeroable, Subtarget, DAG))
    return Masked;

  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
                                              Zeroable, Subtarget, DAG))
    return PSHUFB;

  // VBMI can use VPERMV/VPERMV3 byte shuffles.
  if (Subtarget.hasVBMI())
    return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (!V2.isUndef())
    if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
            DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // FIXME: Implement direct support for this type!
  return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
/// High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
///
/// Requires AVX-512 (asserted). Element insertion, undef-half and broadcast
/// patterns are tried for every type before dispatching on \p VT.
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  MVT VT, SDValue V1, SDValue V2,
                                  const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/ basic ISA!");

  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumElts = Mask.size();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;

  // Handle special cases where the lower or upper half is UNDEF.
  if (SDValue V =
          lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
    // Try using bit ops for masking and blending before falling back to
    // splitting.
    if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
                                          Subtarget, DAG))
      return V;
    if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
      return V;

    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
  }

  // Dispatch to each element type for lowering. If we don't have support for
  // specific element type shuffles at 512 bits, immediately split them and
  // lower them. Each lowering routine of a given type is allowed to assume that
  // the requisite ISA extensions for that element type are available.
  switch (VT.SimpleTy) {
  case MVT::v8f64:
    return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16f32:
    return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i64:
    return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i32:
    return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i16:
    return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v64i8:
    return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 512-bit x86 vector type!");
  }
}
/// Try to lower a unary vXi1 shuffle as a single KSHIFTR.
///
/// Matches masks in which every defined element i reads element i + ShiftAmt
/// for one common ShiftAmt > 0; undef elements are ignored. The shift is done
/// in a wider mask type when \p VT has fewer than 8 elements, or exactly 8
/// without DQI, and the original width is extracted afterwards.
///
/// \returns an empty SDValue if V2 is not undef or the mask is not a uniform
/// right shift.
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
                                         MVT VT, SDValue V1, SDValue V2,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  // Shuffle should be unary.
  if (!V2.isUndef())
    return SDValue();

  int ShiftAmt = -1;
  int NumElts = Mask.size();
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // The first non-undef element determines our shift amount.
    if (ShiftAmt < 0) {
      ShiftAmt = M - i;
      // Need to be shifting right.
      if (ShiftAmt <= 0)
        return SDValue();
    }
    // All non-undef elements must shift by the same amount.
    if (ShiftAmt != M - i)
      return SDValue();
  }
  assert(ShiftAmt >= 0 && "All undef?");

  // Great we found a shift right.
  MVT WideVT = VT;
  if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
    WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
  // The bits above the original width are undef; this is fine because the
  // matched mask never demands zeros in the shifted-in positions.
  SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
                            DAG.getUNDEF(WideVT), V1,
                            DAG.getIntPtrConstant(0, DL));
  Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
                    DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                     DAG.getIntPtrConstant(0, DL));
}
// Determine if this shuffle can be implemented with a KSHIFT instruction.
// Returns the shift amount if possible or -1 if not. This is a simplified
// version of matchShuffleAsShift.
//
// On success Opcode is set to X86ISD::KSHIFTL or X86ISD::KSHIFTR; on failure
// it is left untouched. MaskOffset is added to the expected source indices, so
// the caller can match against the second shuffle input by passing the element
// count. Smaller shift amounts are tried first, and for each amount a left
// shift is tried before a right shift.
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
                                    int MaskOffset, const APInt &Zeroable) {
  int Size = Mask.size();

  // The Shift elements vacated by the shift (low end for a left shift, high
  // end for a right shift) must all be zeroable.
  auto CheckZeros = [&](int Shift, bool Left) {
    for (int j = 0; j < Shift; ++j)
      if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
        return false;

    return true;
  };

  // The remaining Size - Shift elements must be sequential (or undef) source
  // elements displaced by Shift.
  auto MatchShift = [&](int Shift, bool Left) {
    unsigned Pos = Left ? Shift : 0;
    unsigned Low = Left ? 0 : Shift;
    unsigned Len = Size - Shift;
    return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
  };

  for (int Shift = 1; Shift != Size; ++Shift)
    for (bool Left : {true, false})
      if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
        Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
        return Shift;
      }

  return -1;
}
// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to SIMD
// vector, shuffle and then truncate it back.
//
// Before resorting to that, three cheaper forms are tried: a low subvector
// padded with zeros, a right shift with undef fill, and a KSHIFTL/KSHIFTR of
// either input with zero fill. Returns an empty SDValue only for v64i1 when
// the subtarget does not use BWI registers.
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                MVT VT, SDValue V1, SDValue V2,
                                const APInt &Zeroable,
                                const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/o basic ISA!");

  int NumElts = Mask.size();

  // Try to recognize shuffles that are just padding a subvector with zeros.
  // SubvecElts counts the leading elements that are undef or in place from a
  // single source.
  int SubvecElts = 0;
  int Src = -1;
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] >= 0) {
      // Grab the source from the first valid mask. All subsequent elements need
      // to use this same source.
      if (Src < 0)
        Src = Mask[i] / NumElts;
      if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
        break;
    }

    ++SubvecElts;
  }
  assert(SubvecElts != NumElts && "Identity shuffle?");

  // Clip to a power 2.
  SubvecElts = PowerOf2Floor(SubvecElts);

  // Make sure the number of zeroable bits in the top at least covers the bits
  // not covered by the subvector.
  if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
    assert(Src >= 0 && "Expected a source!");
    MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
                                  Src == 0 ? V1 : V2,
                                  DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       DAG.getConstant(0, DL, VT),
                       Extract, DAG.getIntPtrConstant(0, DL));
  }

  // Try a simple shift right with undef elements. Later we'll try with zeros.
  if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
                                                DAG))
    return Shift;

  // Try to match KSHIFTs.
  unsigned Offset = 0;
  for (SDValue V : { V1, V2 }) {
    unsigned Opcode;
    int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
    if (ShiftAmt >= 0) {
      MVT WideVT = VT;
      if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
        WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
                                DAG.getUNDEF(WideVT), V,
                                DAG.getIntPtrConstant(0, DL));
      // Widened right shifts need two shifts to ensure we shift in zeroes.
      if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
        int WideElts = WideVT.getVectorNumElements();
        // Shift left to put the original vector in the MSBs of the new size.
        Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
                          DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
        // Increase the shift amount to account for the left shift.
        ShiftAmt += WideElts - NumElts;
      }

      Res = DAG.getNode(Opcode, DL, WideVT, Res,
                        DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                         DAG.getIntPtrConstant(0, DL));
    }
    Offset += NumElts; // Increment for next iteration.
  }

  // Pick the SIMD type to sign-extend the mask into for the generic path.
  MVT ExtVT;
  switch (VT.SimpleTy) {
  default:
    llvm_unreachable("Expected a vector of i1 elements");
  case MVT::v2i1:
    ExtVT = MVT::v2i64;
    break;
  case MVT::v4i1:
    ExtVT = MVT::v4i32;
    break;
  case MVT::v8i1:
    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
    // shuffle.
    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
    break;
  case MVT::v16i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
    break;
  case MVT::v32i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
    ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
    break;
  case MVT::v64i1:
    // Fall back to scalarization. FIXME: We can do better if the shuffle
    // can be partitioned cleanly.
    if (!Subtarget.useBWIRegs())
      return SDValue();
    ExtVT = MVT::v64i8;
    break;
  }

  V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
  V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);

  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
  // i1 was sign extended we can use X86ISD::CVT2MASK.
  int NumElems = VT.getVectorNumElements();
  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
      (Subtarget.hasDQI() && (NumElems < 32)))
    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
                       Shuffle, ISD::SETGT);

  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
/// Helper function that returns true if the shuffle mask should be
/// commuted to improve canonicalization.
///
/// Mask indices below Mask.size() refer to V1, indices at or above it refer to
/// V2, and negative indices are undef and ignored.
static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
  int NumElements = Mask.size();

  int NumV1Elements = 0, NumV2Elements = 0;
  for (int M : Mask)
    if (M < 0)
      continue;
    else if (M < NumElements)
      ++NumV1Elements;
    else
      ++NumV2Elements;

  // Commute the shuffle as needed such that more elements come from V1 than
  // V2. This allows us to match the shuffle pattern strictly on how many
  // elements come from V1 without handling the symmetric cases.
  if (NumV2Elements > NumV1Elements)
    return true;

  assert(NumV1Elements > 0 && "No V1 indices");

  if (NumV2Elements == 0)
    return false;

  // When the number of V1 and V2 elements are the same, try to minimize the
  // number of uses of V2 in the low half of the vector. When that is tied,
  // ensure that the sum of indices for V1 is equal to or lower than the sum
  // indices for V2. When those are equal, try to ensure that the number of odd
  // indices for V1 is lower than the number of odd indices for V2.
  if (NumV1Elements == NumV2Elements) {
    int LowV1Elements = 0, LowV2Elements = 0;
    for (int M : Mask.slice(0, NumElements / 2))
      if (M >= NumElements)
        ++LowV2Elements;
      else if (M >= 0)
        ++LowV1Elements;
    if (LowV2Elements > LowV1Elements)
      return true;
    if (LowV2Elements == LowV1Elements) {
      // These sum the result positions (i) fed by each input, not the mask
      // values themselves.
      int SumV1Indices = 0, SumV2Indices = 0;
      for (int i = 0, Size = Mask.size(); i < Size; ++i)
        if (Mask[i] >= NumElements)
          SumV2Indices += i;
        else if (Mask[i] >= 0)
          SumV1Indices += i;
      if (SumV2Indices < SumV1Indices)
        return true;
      if (SumV2Indices == SumV1Indices) {
        int NumV1OddIndices = 0, NumV2OddIndices = 0;
        for (int i = 0, Size = Mask.size(); i < Size; ++i)
          if (Mask[i] >= NumElements)
            NumV2OddIndices += i % 2;
          else if (Mask[i] >= 0)
            NumV1OddIndices += i % 2;
        if (NumV2OddIndices < NumV1OddIndices)
          return true;
      }
    }
  }

  return false;
}
/// Top-level lowering for x86 vector shuffles.
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
///
/// \param Op the shuffle node; it is cast to ShuffleVectorSDNode and its two
///        operands are used as the shuffle inputs V1 and V2.
/// \param Subtarget forwarded to the helper lowering routines.
/// \param DAG used to create every replacement node.
/// \returns the lowered (or re-canonicalized) value produced by one of the
///          paths below.
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> OrigMask = SVOp->getMask();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
MVT VT = Op.getSimpleValueType();
int NumElements = VT.getVectorNumElements();
SDLoc DL(Op);
bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
// 64-bit vectors are only accepted here when they are vXi1 mask vectors.
assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
"Can't lower MMX shuffles");
bool V1IsUndef = V1.isUndef();
bool V2IsUndef = V2.isUndef();
// A shuffle of two undef inputs is itself undef.
if (V1IsUndef && V2IsUndef)
return DAG.getUNDEF(VT);
// When we create a shuffle node we put the UNDEF node to second operand,
// but in some cases the first operand may be transformed to UNDEF.
// In this case we should just commute the node.
if (V1IsUndef)
return DAG.getCommutedVectorShuffle(*SVOp);
// Check for non-undef masks pointing at an undef vector and make the masks
// undef as well. This makes it easier to match the shuffle based solely on
// the mask.
if (V2IsUndef &&
any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
for (int &M : NewMask)
if (M >= NumElements)
M = -1;
return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
// Check for illegal shuffle mask element index values.
int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
// NOTE(review): the opening of the assertion that wraps the predicate below
// is not present in this extract -- confirm against the upstream file.
[&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
"Out of bounds shuffle index");
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
APInt KnownUndef, KnownZero;
computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
// An element is "zeroable" when it is known undef or known zero.
APInt Zeroable = KnownUndef | KnownZero;
if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
SmallVector<int, 16> WidenedMask;
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
// Shuffle mask widening should not interfere with a broadcast opportunity
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
// a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
Subtarget, DAG))
return Broadcast;
// The widened type has elements twice as wide and half as many of them.
MVT NewEltVT = VT.isFloatingPoint()
? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
: MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
int NewNumElts = NumElements / 2;
MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
// Make sure that the new vector type is legal. For example, v2f64 isn't
// legal on SSE1.
if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
if (V2IsZero) {
// Modify the new Mask to take all zeros from the all-zero vector.
// Choose indices that are blend-friendly.
bool UsedZeroVector = false;
assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
"V2's non-undef elements are used?!");
for (int i = 0; i != NewNumElts; ++i)
if (WidenedMask[i] == SM_SentinelZero) {
// Refer to the element of V2 at the same position as the result lane.
WidenedMask[i] = i + NewNumElts;
UsedZeroVector = true;
// Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
// some elements to be undef.
if (UsedZeroVector)
V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
// Re-issue the shuffle on the widened type and cast the result back to VT.
V1 = DAG.getBitcast(NewVT, V1);
V2 = DAG.getBitcast(NewVT, V2);
return DAG.getBitcast(
VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
// Commute the shuffle if it will improve canonicalization.
SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
// NOTE(review): only the operand swap is visible here; the matching update of
// Mask is presumably on a line missing from this extract -- confirm.
if (canonicalizeShuffleMaskWithCommute(Mask)) {
std::swap(V1, V2);
if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
return V;
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is256BitVector())
return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (VT.is512BitVector())
return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
if (Is1BitVector)
return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
/// Try to lower a VSELECT instruction to a vector shuffle.
///
/// \param Op the VSELECT node; operands 0, 1 and 2 are read as the condition,
///        the LHS and the RHS respectively.
/// \param Subtarget not referenced in the visible body.
/// \param DAG used to build the replacement shuffle node.
/// \returns a vector shuffle of LHS and RHS when createShuffleMaskFromVSELECT
///          succeeds on the condition, otherwise a null SDValue.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
// Only non-legal VSELECTs reach this lowering, convert those into generic
// shuffles and re-use the shuffle lowering path for blends.
SmallVector<int, 32> Mask;
if (createShuffleMaskFromVSELECT(Mask, Cond))
return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
// No shuffle mask could be derived from the condition.
return SDValue();
/// Custom lowering for VSELECT.
///
/// \param Op the VSELECT node (operands: condition, LHS, RHS).
/// \param DAG used to build replacement nodes.
/// \returns Op itself when the node can be matched as-is, a replacement node
///          when the select is rewritten (blend shuffle, i1-mask select,
///          resized condition, or a vXi8 select for 16-bit element types), or
///          a null SDValue when the node should be expanded instead.
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
// A vselect where all conditions and data are constants can be optimized into
// a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
// NOTE(review): the condition below ends in a dangling '&&'; its last operand
// (presumably the same check on RHS) is missing from this extract.
if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
return SDValue();
// Try to lower this to a blend-style vector shuffle. This can handle all
// constant condition cases.
if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
return BlendOp;
// If this VSELECT has a vector of i1 as a mask, it will be directly matched
// with patterns on the mask registers on AVX-512.
MVT CondVT = Cond.getSimpleValueType();
unsigned CondEltSize = Cond.getScalarValueSizeInBits();
if (CondEltSize == 1)
return Op;
// Variable blends are only legal from SSE4.1 onward.
if (!Subtarget.hasSSE41())
return SDValue();
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
unsigned EltSize = VT.getScalarSizeInBits();
unsigned NumElts = VT.getVectorNumElements();
// Expand v32i16/v64i8 without BWI.
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
return SDValue();
// If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
// into an i1 condition so that we can use the mask-based 512-bit blend
// instructions.
if (VT.getSizeInBits() == 512) {
// Build a mask by testing the condition against zero.
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
// NOTE(review): the final argument of this getSetCC call (the condition
// code) is not visible in this extract.
SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
DAG.getConstant(0, dl, CondVT),
// Now return a new VSELECT using the mask.
return DAG.getSelect(dl, VT, Mask, LHS, RHS);
// SEXT/TRUNC cases where the mask doesn't match the destination size.
if (CondEltSize != EltSize) {
// If we don't have a sign splat, rely on the expansion.
if (CondEltSize != DAG.ComputeNumSignBits(Cond))
return SDValue();
// Resize the condition elements to the width of the data elements.
MVT NewCondSVT = MVT::getIntegerVT(EltSize);
MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
// Only some types will be legal on some subtargets. If we can emit a legal
// VSELECT-matching blend, return Op, but if we need to expand, return
// a null value.
switch (VT.SimpleTy) {
// Most of the vector types have blends past SSE4.1.
return Op;
case MVT::v32i8:
// The byte blends for AVX vectors were introduced only in AVX2.
if (Subtarget.hasAVX2())
return Op;
return SDValue();
case MVT::v8i16:
case MVT::v16i16: {
// Bitcast everything to the vXi8 type and use a vXi8 vselect.
MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
Cond = DAG.getBitcast(CastVT, Cond);
LHS = DAG.getBitcast(CastVT, LHS);
RHS = DAG.getBitcast(CastVT, RHS);
SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
return DAG.getBitcast(VT, Select);
/// Lower an EXTRACT_VECTOR_ELT with a constant index from a 128-bit vector.
/// The visible caller only invokes this when the subtarget has SSE4.1.
///
/// \param Op the extract node (operand 0: source vector, operand 1: index).
/// \param DAG used to build replacement nodes.
/// \returns a replacement value for 8-bit and f32 results, Op unchanged for
///          i32/i64 results, or a null SDValue when this routine does not
///          handle the node.
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
SDLoc dl(Op);
// Only 128-bit source vectors are handled here.
if (!Vec.getSimpleValueType().is128BitVector())
return SDValue();
if (VT.getSizeInBits() == 8) {
// If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
// we're going to zero extend the register or fold the store.
// NOTE(review): the end of this condition and the start of the nested
// extract expression are missing from this extract.
if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
// General case: PEXTRB yields an i32 which is truncated to the result type.
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
if (VT == MVT::f32) {
// EXTRACTPS outputs to a GPR32 register which will require a movd to copy
// the result back to FR32 register. It's only worth matching if the
// result has a single use which is a store or a bitcast to i32. And in
// the case of a store, it's not worth it if the index is a constant 0,
// because a MOVSSmr can be used instead, which is smaller and faster.
if (!Op.hasOneUse())
return SDValue();
SDNode *User = *Op.getNode()->use_begin();
if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
(User->getOpcode() != ISD::BITCAST ||
User->getValueType(0) != MVT::i32))
return SDValue();
// Extract the element as an i32 from the v4i32 view, then cast back to f32.
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx);
return DAG.getBitcast(MVT::f32, Extract);
// i32 and i64 extracts are returned unchanged.
if (VT == MVT::i32 || VT == MVT::i64)
return Op;
return SDValue();
/// Extract one bit from mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
///
/// \param Op the extract node (operand 0: mask vector, operand 1: index).
/// \param DAG used to build replacement nodes.
/// \param Subtarget queried for BWI and DQI support.
/// \returns Op unchanged for a constant index of 0; otherwise a replacement
///          sequence (sign-extend/extract/truncate for a variable index,
///          KSHIFTR followed by an extract of element 0 for a constant one).
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Vec = Op.getOperand(0);
SDLoc dl(Vec);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
// IdxC is null when the index is not a compile-time constant.
auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
MVT EltVT = Op.getSimpleValueType();
assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
"Unexpected vector type in ExtractBitFromMaskVector");
// variable index can't be handled in mask registers,
// extend vector to VR512/128
if (!IdxC) {
unsigned NumElts = VecVT.getVectorNumElements();
// Extending v8i1/v16i1 to 512-bit get better performance on KNL
// than extending to 128/256bit.
// For up to 8 elements the extended vector totals 128 bits; otherwise each
// element becomes an i8.
MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
unsigned IdxVal = IdxC->getZExtValue();
if (IdxVal == 0) // the operation is legal
return Op;
// Extend to natively supported kshift.
unsigned NumElems = VecVT.getVectorNumElements();
MVT WideVecVT = VecVT;
// Widen to v8i1 when DQI is available, otherwise to v16i1.
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
DAG.getUNDEF(WideVecVT), Vec,
DAG.getIntPtrConstant(0, dl));
// Use kshiftr instruction to move to the lower element.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
// The requested bit is now element 0 of the shifted vector.
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(0, dl));
// Custom lowering for EXTRACT_VECTOR_ELT.
//
// Op is the extract node (operand 0: source vector, operand 1: index) and DAG
// is used to build replacement nodes. vXi1 sources are delegated to
// ExtractBitFromMaskVector; a non-constant index yields a null SDValue; the
// remaining cases return Op unchanged, a replacement node sequence, or a null
// SDValue when nothing below applies.
// NOTE(review): the return-type line of this definition is not visible in
// this extract.
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
// IdxC is null when the index is not a compile-time constant.
auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG, Subtarget);
if (!IdxC) {
// It's more profitable to go through memory (1 cycle throughput)
// than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
// IACA tool was used to get performance estimation
// (
// example : extractelement <16 x i8> %a, i32 %i
// Block Throughput: 3.00 Cycles
// Throughput Bottleneck: Port5
// | Num Of | Ports pressure in cycles | |
// | Uops | 0 - DV | 5 | 6 | 7 | |
// ---------------------------------------------
// | 1 | | 1.0 | | | CP | vmovd xmm1, edi
// | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
// | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
// Total Num Of Uops: 4
// Block Throughput: 1.00 Cycles
// Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
// | | Ports pressure in cycles | |
// |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
// ---------------------------------------------------------
// |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
// |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
// |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
// Total Num Of Uops: 4
return SDValue();
unsigned IdxVal = IdxC->getZExtValue();
// If the source is a 256-bit or 512-bit vector, first extract the 128-bit
// chunk and then extract the element from that 128-bit vector.
if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
// Get the 128-bit vector.
Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
MVT EltVT = VecVT.getVectorElementType();
unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
// this can be done with a mask.
IdxVal &= ElemsPerChunk - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(IdxVal, dl));
assert(VecVT.is128BitVector() && "Unexpected vector length");
MVT VT = Op.getSimpleValueType();
if (VT.getSizeInBits() == 16) {
// If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
// we're going to zero extend the register or fold the store (SSE41 only).
// NOTE(review): the start of the nested extract expression in the return
// below is missing from this extract.
if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
!(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
// General case: PEXTRW yields an i32 which is truncated to the result type.
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
// With SSE4.1, try the dedicated helper first and fall through if it
// returns a null value.
if (Subtarget.hasSSE41())
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
// TODO: We only extract a single element from v16i8, we can probably afford
// to be more aggressive here before using the default approach of spilling to
// stack.
if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
// Extract either the lowest i32 or any i16, and extract the sub-byte.
int DWordIdx = IdxVal / 4;
if (DWordIdx == 0) {
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec),
DAG.getIntPtrConstant(DWordIdx, dl));
// Shift the wanted byte down to bit 0 of the dword.
int ShiftVal = (IdxVal % 4) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
int WordIdx = IdxVal / 2;
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
DAG.getBitcast(MVT::v8i16, Vec),
DAG.getIntPtrConstant(WordIdx, dl));
// Shift the wanted byte down to bit 0 of the word.
int ShiftVal = (IdxVal % 2) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
DAG.getConstant(ShiftVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
if (VT.getSizeInBits() == 32) {
if (IdxVal == 0)
return Op;
// SHUFPS the element to the lowest double word, then movss.
int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
if (VT.getSizeInBits() == 64) {
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
// to match extract_elt for f64.
if (IdxVal == 0)
return Op;
// UNPCKHPD the element to the lowest double word, then movsd.
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
int Mask[2] = { 1, -1 };
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
return SDValue();
/// Insert one bit to mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
///
/// \param Op the insert node (operand 0: mask vector, operand 1: element,
///        operand 2: index).
/// \param DAG used to build replacement nodes.
/// \param Subtarget not referenced in the visible body.
/// \returns a sign-extend/insert/truncate sequence for a non-constant index,
///          otherwise an INSERT_SUBVECTOR of a v1i1 built from the element.
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue Elt = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
MVT VecVT = Vec.getSimpleValueType();
if (!isa<ConstantSDNode>(Idx)) {
// Non constant index. Extend source and destination,
// insert element and then truncate the result.
unsigned NumElts = VecVT.getVectorNumElements();
// For up to 8 elements the extended vector totals 128 bits; otherwise each
// element becomes an i8.
MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
// Copy into a k-register, extract to v1i1 and insert_subvector.
SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
/// Custom lowering for INSERT_VECTOR_ELT.
///
/// \param Op the insert node (operand 0: vector, operand 1: element,
///        operand 2: index).
/// \param DAG used to build replacement nodes.
/// \returns a replacement node for the handled cases, Op unchanged for i32 and
///          i64 elements when SSE4.1 is available, or a null SDValue when the
///          index is not a constant below the element count or no strategy
///          below applies.
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
// vXi1 mask vectors have their own lowering.
if (EltVT == MVT::i1)
return InsertBitToMaskVector(Op, DAG, Subtarget);
SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
// Only constant, in-range indices are handled below.
auto *N2C = dyn_cast<ConstantSDNode>(N2);
if (!N2C || N2C->getAPIntValue().uge(NumElts))
return SDValue();
uint64_t IdxVal = N2C->getZExtValue();
bool IsZeroElt = X86::isZeroNode(N1);
bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
// If we are inserting an element, see if we can do this more efficiently with
// a blend shuffle with a rematerializable vector than a costly integer
// insertion.
if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
16 <= EltVT.getSizeInBits()) {
// Identity mask except at IdxVal, which selects the same lane of the
// constant vector.
SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
BlendMask.push_back(i == IdxVal ? i + NumElts : i);
SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
: getOnesVector(VT, DAG, dl);
return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
// into that, and then insert the subvector back into the result.
if (VT.is256BitVector() || VT.is512BitVector()) {
// With a 256-bit vector, we can insert into the zero element efficiently
// using a blend if we have AVX or AVX2 and the right data type.
if (VT.is256BitVector() && IdxVal == 0) {
// TODO: It is worthwhile to cast integer to floating point and back
// and incur a domain crossing penalty if that's what we'll end up
// doing anyway after extracting to a 128-bit vector.
if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
(Subtarget.hasAVX2() && EltVT == MVT::i32)) {
SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
DAG.getTargetConstant(1, dl, MVT::i8));
// Get the desired 128-bit vector chunk.
SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired chunk.
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
// Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
DAG.getIntPtrConstant(IdxIn128, dl));
// Insert the changed part back into the bigger vector
return insert128BitVector(N0, V, IdxVal, DAG, dl);
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
// This will be just movd/movq/movss/movsd.
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
EltVT == MVT::i64) {
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
// We can't directly insert an i8 or i16 into a vector, so zero extend
// it to i32 first.
if (EltVT == MVT::i16 || EltVT == MVT::i8) {
N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
return DAG.getBitcast(VT, N1);
// Transform it so it matches pinsr{b,w} which expects a GR32 as its second
// argument. SSE41 required for pinsrb.
if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
// NOTE(review): no assignment to Opc is visible in either branch below;
// presumably each branch selects the matching PINSR* opcode on a line
// missing from this extract -- confirm against the upstream file.
unsigned Opc;
if (VT == MVT::v8i16) {
assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
} else {
assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
if (N1.getValueType() != MVT::i32)
N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
if (N2.getValueType() != MVT::i32)
N2 = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
if (Subtarget.hasSSE41()) {
if (EltVT == MVT::f32) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into
// these bits. For example (insert (extract, 3), 2) could be matched by
// putting the '3' into bits [7:6] of X86ISD::INSERTPS.
// Bits [5:4] of the constant are the destination select. This is the
// value of the incoming immediate.
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
// than an insertps. Blends are simpler operations in hardware and so
// will always have equal or better performance than insertps.
// But if optimizing for size and there's a load folding opportunity,
// generate insertps because blendps does not have a 32-bit memory
// operand form.
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
DAG.getTargetConstant(1, dl, MVT::i8));
// Create this as a scalar to vector..
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
// IdxVal << 4 places the index in the destination-select bits [5:4].
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
// PINSR* works with constant index.
if (EltVT == MVT::i32 || EltVT == MVT::i64)
return Op;
return SDValue();
/// Custom lowering for SCALAR_TO_VECTOR.
///
/// \param Op the SCALAR_TO_VECTOR node; operand 0 is the scalar.
/// \param Subtarget forwarded to getZeroVector.
/// \param DAG used to build replacement nodes.
/// \returns a zero vector when the scalar is a zero node, a 128-bit
///          SCALAR_TO_VECTOR inserted into an undef wide vector for non-128-bit
///          results, Op unchanged for v4i32, and otherwise a v4i32
///          SCALAR_TO_VECTOR of the any-extended scalar bitcast to the result
///          type.
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT OpVT = Op.getSimpleValueType();
// It's always cheaper to replace a xor+movd with xorps and simplifies further
// combines.
if (X86::isZeroNode(Op.getOperand(0)))
return getZeroVector(OpVT, Subtarget, DAG, dl);
// If the result is not a 128-bit vector, first build a 128-bit vector and
// then insert it into the wider type.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
unsigned SizeFactor = OpVT.getSizeInBits() / 128;
MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
OpVT.getVectorNumElements() / SizeFactor);
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
// Insert the 128-bit vector.
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
"Expected an SSE type!");
// Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
if (OpVT == MVT::v4i32)
return Op;
// Remaining integer types: widen the scalar to i32, build a v4i32 and cast.
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
return DAG.getBitcast(
OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
//
// Only vXi1 result types are expected here (asserted below); the work is
// delegated entirely to insert1BitVector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
return insert1BitVector(Op, DAG, Subtarget);
/// Custom lowering for EXTRACT_SUBVECTOR on vXi1 mask vectors.
///
/// \param Op the extract node (operand 0: source vector, operand 1: constant
///        start index).
/// \param Subtarget queried for DQI support.
/// \param DAG used to build replacement nodes.
/// \returns Op unchanged when the start index is 0; otherwise the source
///          shifted right by the index with KSHIFTR, followed by an
///          EXTRACT_SUBVECTOR at index 0.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
"Only vXi1 extract_subvectors need custom lowering");
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
uint64_t IdxVal = Op.getConstantOperandVal(1);
if (IdxVal == 0) // the operation is legal
return Op;
MVT VecVT = Vec.getSimpleValueType();
unsigned NumElems = VecVT.getVectorNumElements();
// Extend to natively supported kshift.
MVT WideVecVT = VecVT;
// Widen to v8i1 when DQI is available, otherwise to v16i1.
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
DAG.getUNDEF(WideVecVT), Vec,
DAG.getIntPtrConstant(0, dl));
// Shift to the LSB.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(0, dl));
// Returns the appropriate wrapper opcode for a global reference.
//
// GV may be null. The result is X86ISD::WrapperRIP for RIP-relative PIC with
// the small or kernel code model, or when OpFlags is MO_GOTPCREL; it is
// X86ISD::Wrapper for absolute symbol references and for everything else.
unsigned X86TargetLowering::getGlobalWrapperKind(
const GlobalValue *GV, const unsigned char OpFlags) const {
// References to absolute symbols are never PC-relative.
if (GV && GV->isAbsoluteSymbolRef())
return X86ISD::Wrapper;
CodeModel::Model M = getTargetMachine().getCodeModel();
if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
return X86ISD::WrapperRIP;
// GOTPCREL references must always use RIP.
if (OpFlags == X86II::MO_GOTPCREL)
return X86ISD::WrapperRIP;
return X86ISD::Wrapper;
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
//
// LowerConstantPool builds the target constant-pool node for Op, wraps it, and
// adds the global base register when the reference flag is non-zero.
// NOTE(review): the return-type line and the declaration of DL used below are
// not visible in this extract.
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetConstantPool(
CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag) {
Result =
DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
return Result;
/// Lower a JumpTable node: build the target jump-table node, wrap it, and add
/// the global base register when the reference flag is non-zero.
///
/// \param Op the JumpTable node.
/// \param DAG used to build replacement nodes.
/// \returns the wrapped (and possibly base-register-relative) address.
// NOTE(review): the declaration of DL used below is not visible in this
// extract.
SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag)
Result =
DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
return Result;
/// Lower an ExternalSymbol node by forwarding to LowerGlobalOrExternal with
/// ForCall set to false.
SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
SelectionDAG &DAG) const {
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
// Lower a BlockAddress node: build the target block-address node, wrap it,
// and add the global base register when the flags are PIC-base relative.
// NOTE(review): the return-type line of this definition is not visible in
// this extract.
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
// Create the TargetBlockAddressAddress node.
// NOTE(review): the initializer of OpFlags is missing from this extract.
unsigned char OpFlags =
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
Result = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
return Result;
/// Creates target global address or external symbol nodes for calls or
/// other uses.
///
/// \param Op a GlobalAddress node, or otherwise an ExternalSymbol node.
/// \param DAG used to build replacement nodes.
/// \param ForCall selects the function-reference classifier and allows the
///        unwrapped node to be returned for direct calls.
/// \returns the target address node, wrapped and adjusted (PIC base add, stub
///          load, explicit offset add) as required by the reference flags.
SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
bool ForCall) const {
// Unpack the global address or external symbol.
const SDLoc &dl = SDLoc(Op);
const GlobalValue *GV = nullptr;
int64_t Offset = 0;
const char *ExternalSym = nullptr;
if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
GV = G->getGlobal();
Offset = G->getOffset();
} else {
const auto *ES = cast<ExternalSymbolSDNode>(Op);
ExternalSym = ES->getSymbol();
// Calculate some flags for address lowering.
const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
unsigned char OpFlags;
// NOTE(review): no 'else' is visible between the two assignments below, so as
// written the second would always overwrite the first; presumably an 'else'
// line is missing from this extract -- confirm against the upstream file.
if (ForCall)
OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
bool NeedsLoad = isGlobalStubReference(OpFlags);
CodeModel::Model M = DAG.getTarget().getCodeModel();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (GV) {
// Create a target global address if this is a global. If possible, fold the
// offset into the global address reference. Otherwise, ADD it on later.
int64_t GlobalOffset = 0;
if (OpFlags == X86II::MO_NO_FLAG &&
X86::isOffsetSuitableForCodeModel(Offset, M)) {
// Move the offset into the node; Offset becomes 0 so no ADD is emitted later.
std::swap(GlobalOffset, Offset);
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
} else {
// If this is not a global address, this must be an external symbol.
Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
// If this is a direct call, avoid the wrapper if we don't need to do any
// loads or adds. This allows SDAG ISel to match direct calls.
if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
return Result;
Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (HasPICReg) {
Result = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
// For globals that require a load from a stub to get the address, emit the
// load.
// NOTE(review): the remaining arguments of this getLoad call are missing from
// this extract.
if (NeedsLoad)
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
// If there was a non-zero offset that we didn't fold, create an explicit
// addition for it.
if (Offset != 0)
Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
DAG.getConstant(Offset, dl, PtrVT));
return Result;
// Lower a GlobalAddress node by forwarding to LowerGlobalOrExternal with
// ForCall set to false.
// NOTE(review): the return-type line of this definition is not visible in
// this extract.
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
/// Emit a TLS address computation node and copy its result out of ReturnReg.
///
/// \param DAG used to build the nodes.
/// \param Chain incoming chain for the TLS node.
/// \param GA the global whose TLS address is requested.
/// \param InFlag optional incoming glue; when non-null it is appended to the
///        node operands.
/// \param PtrVT pointer value type of the result.
/// \param ReturnReg register the result is copied from.
/// \param OperandFlags target flags for the global address operand
///        (presumably passed to getTargetGlobalAddress on a line not visible
///        here -- confirm).
/// \param LocalDynamic selects X86ISD::TLSBASEADDR as the node type; the
///        alternative is on a line missing from this extract.
/// \returns a CopyFromReg of ReturnReg glued to the TLS node.
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
unsigned char OperandFlags, bool LocalDynamic = false) {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
// The node produces a chain and a glue result.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDLoc dl(GA);
// NOTE(review): the next two statements are truncated in this extract.
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
if (InFlag) {
SDValue Ops[] = { Chain, TGA, *InFlag };
Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
} else {
SDValue Ops[] = { Chain, TGA };
Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
// NOTE(review): the statements that update MFI are not visible in this
// extract.
SDValue Flag = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
//
// Copies a value into EBX, then emits the TLS call through GetTLSADDR with the
// MO_TLSGD operand flag, glued to that copy, and reads the result from EAX.
// NOTE(review): the value copied into EBX is cut off in this extract;
// presumably the X86ISD::GlobalBaseReg node, as in the 32-bit path of
// LowerToTLSLocalDynamicModel — confirm. The closing brace is also missing.
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
SDValue InFlag;
SDLoc dl(GA); // ? function entry point might be better
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
SDLoc(), PtrVT), InFlag);
// Glue result of the copy, handed to GetTLSADDR.
InFlag = Chain.getValue(1);
return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
//
// Calls GetTLSADDR on the entry chain with no incoming glue.
// NOTE(review): the remaining arguments (return register and operand flags)
// and the closing brace are cut off in this extract.
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
// Lowers a TLS address for GA in two steps: obtain the module's TLS base via
// GetTLSADDR (LocalDynamic=true), then add the wrapped x@dtpoff offset of GA.
//
// is64Bit selects the 64-bit path (RAX, MO_TLSLD, no glue) or the 32-bit path
// (GlobalBaseReg copied into EBX, result in EAX, MO_TLSLDM).
// NOTE(review): the MFI initializer, one argument line of the
// getTargetGlobalAddress call and the closing braces are missing in this
// extract.
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG,
const EVT PtrVT,
bool is64Bit) {
SDLoc dl(GA);
// Get the start address of the TLS block for this module.
X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
SDValue Base;
if (is64Bit) {
Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
X86II::MO_TLSLD, /*LocalDynamic=*/true);
} else {
// 32-bit: put the global base register in EBX and glue the copy to the call.
SDValue InFlag;
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
X86II::MO_TLSLDM, /*LocalDynamic=*/true);
// Note: the CleanupLocalDynamicTLSPass will remove redundant computations
// of Base.
// Build x@dtpoff.
unsigned char OperandFlags = X86II::MO_DTPOFF;
unsigned WrapperKind = X86ISD::Wrapper;
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
// Add x@dtpoff with the base.
return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
//
// Loads the thread pointer from offset 0 of address space 257 (64-bit) or 256
// (32-bit), builds the wrapped TLS offset of GA with model-specific operand
// flags, and returns ThreadPointer + Offset. For InitialExec the offset is
// itself loaded, and on 32-bit PIC the global base register is added first.
// Any model other than LocalExec / InitialExec hits llvm_unreachable.
// NOTE(review): several statements are truncated (the thread-pointer load, the
// 32-bit InitialExec flag selection, the PIC add, the offset load) and the
// closing braces are missing in this extract.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT, TLSModel::Model model,
bool is64Bit, bool isPIC) {
SDLoc dl(GA);
// Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
is64Bit ? 257 : 256));
SDValue ThreadPointer =
DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
unsigned char OperandFlags = 0;
// Most TLS accesses are not RIP relative, even on x86-64. One exception is
// initialexec.
unsigned WrapperKind = X86ISD::Wrapper;
if (model == TLSModel::LocalExec) {
OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
} else if (model == TLSModel::InitialExec) {
if (is64Bit) {
OperandFlags = X86II::MO_GOTTPOFF;
WrapperKind = X86ISD::WrapperRIP;
} else {
// NOTE(review): the body of this 32-bit branch is not visible here.
} else {
llvm_unreachable("Unexpected model");
// emit "addl x@ntpoff,%eax" (local exec)
// or "addl x@indntpoff,%eax" (initial exec)
// or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
SDValue TGA =
DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
if (model == TLSModel::InitialExec) {
if (isPIC && !is64Bit) {
Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
// Lowers a thread-local global address. Dispatch order as visible below:
//   1. emulated TLS      -> LowerToTLSEmulatedModel
//   2. ELF targets       -> one helper per TLSModel
//   3. Darwin targets    -> X86ISD::TLSCALL wrapped in CALLSEQ_START/END
//   4. Windows targets   -> implicit TLS via the TLS array and _tls_index
// Any other target hits llvm_unreachable.
// NOTE(review): the return-type line, several call continuations and all
// closing braces are missing in this extract.
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
const GlobalValue *GV = GA->getGlobal();
auto PtrVT = getPointerTy(DAG.getDataLayout());
bool PositionIndependent = isPositionIndependent();
if (Subtarget.isTargetELF()) {
// ELF: pick the lowering helper from the TLS model the target chose for GV.
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
if (Subtarget.is64Bit())
return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
case TLSModel::LocalDynamic:
return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
case TLSModel::InitialExec:
case TLSModel::LocalExec:
return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
llvm_unreachable("Unknown TLS model.");
if (Subtarget.isTargetDarwin()) {
// Darwin only has one model of TLS. Lower to that.
unsigned char OpFlag = 0;
unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
X86ISD::WrapperRIP : X86ISD::Wrapper;
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
if (PIC32)
OpFlag = X86II::MO_TLVP;
// NOTE(review): only the PIC32 assignment of OpFlag is visible; an `else`
// branch may have been dropped from this extract.
SDLoc DL(Op);
SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
GA->getOffset(), OpFlag);
SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC32, the address is actually $g + Offset.
if (PIC32)
Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
// Lowering the machine isd will make sure everything is in the right
// location.
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// Bracket the TLSCALL node with a zero-sized call sequence.
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
SDValue Args[] = { Chain, Offset };
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
DAG.getIntPtrConstant(0, DL, true),
Chain.getValue(1), DL);
// TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
// NOTE(review): the statement that uses MFI is not visible here.
// And our return value (tls address) is in the standard call return value
// location.
unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
if (Subtarget.isOSWindows()) {
// Just use the implicit TLS architecture
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
// ; from TEB
// mov ecx, dword [rel _tls_index]: Load index (from C runtime)
// mov rcx, qword [rdx+rcx*8]
// mov eax, .tls$:tlsvar
// [rax+rcx] contains the address
// Windows 64bit: gs:0x58
// Windows 32bit: fs:__tls_array
SDLoc dl(GA);
SDValue Chain = DAG.getEntryNode();
// Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
// %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
// use its literal value of 0x2C.
Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
? Type::getInt8PtrTy(*DAG.getContext(),
: Type::getInt32PtrTy(*DAG.getContext(),
SDValue TlsArray = Subtarget.is64Bit()
? DAG.getIntPtrConstant(0x58, dl)
: (Subtarget.isTargetWindowsGNU()
? DAG.getIntPtrConstant(0x2C, dl)
: DAG.getExternalSymbol("_tls_array", PtrVT));
SDValue ThreadPointer =
DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
SDValue res;
// Local-exec variables use the loaded thread pointer directly; other modes
// index the TLS array with _tls_index.
if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
res = ThreadPointer;
} else {
// Load the _tls_index variable
SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
// NOTE(review): an `else` between the two loads below appears to have been
// dropped from this extract — confirm against the upstream file.
if (Subtarget.is64Bit())
IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
MachinePointerInfo(), MVT::i32);
IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
// Scale the index by the pointer size (as a shift amount) to address the
// slot inside the TLS array.
auto &DL = DAG.getDataLayout();
SDValue Scale =
DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
// Get the offset of start of .tls section
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getOffset(), X86II::MO_SECREL);
SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
llvm_unreachable("TLS not implemented for this target.");
/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
/// TODO: Can this be moved to general expansion code?
///
/// The operands of \p Op are (low part, high part, shift amount). The result
/// is the merged pair {Lo, Hi}.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  MVT VT = Op.getSimpleValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);

  // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
  // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away
  // during isel.
  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                  DAG.getConstant(VTBits - 1, dl, MVT::i8));

  // Fill value used when the shift amount reaches the part width: the high
  // part shifted right arithmetically by VTBits - 1 for SRA, zero otherwise.
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, dl, MVT::i8))
                       : DAG.getConstant(0, dl, VT);

  // Tmp2 is the funnel-shifted part, Tmp3 the plain shift of a single part.
  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
  } else {
    Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
  }

  // If the shift amount is larger or equal than the width of a part we can't
  // rely on the results of shld/shrd. Insert a test and select the appropriate
  // values for large shift amounts.
  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, dl, MVT::i8));
  SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
                              DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);

  SDValue Hi, Lo;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
    Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
  } else {
    Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
    Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
  }

  return DAG.getMergeValues({ Lo, Hi }, dl);
}
// Custom-lowers ISD::FSHL / ISD::FSHR (asserted below).
//
// Vector types (VBMI2 asserted): a constant-splat amount becomes
// X86ISD::VSHLD / VSHRD with the amount reduced modulo the element width;
// otherwise X86ISD::VSHLDV / VSHRDV is used. Operands are swapped for FSHR.
//
// Scalar types: i8 — and i16 when SHLD is slow and we are not optimizing for
// size — with a non-constant amount are expanded through i32 shifts and an OR.
// Remaining i8 / slow-SHLD cases return an empty SDValue. For i16 the amount
// is masked to 15 before X86ISD::FSHL / FSHR is built.
// NOTE(review): the `assert(` opener of the scalar type check and all closing
// braces are missing in this extract, so which paths reach the final
// `return Op;` cannot be determined from here.
static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
"Unexpected funnel shift opcode!");
SDLoc DL(Op);
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Amt = Op.getOperand(2);
bool IsFSHR = Op.getOpcode() == ISD::FSHR;
if (VT.isVector()) {
assert(Subtarget.hasVBMI2() && "Expected VBMI2");
if (IsFSHR)
std::swap(Op0, Op1);
APInt APIntShiftAmt;
if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
// Reduce the splat amount modulo the element width before encoding it as an
// immediate.
uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0,
Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
Op0, Op1, Amt);
(VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
"Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
bool OptForSize = DAG.shouldOptForSize();
bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
// fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
// fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
!isa<ConstantSDNode>(Amt)) {
unsigned EltSizeInBits = VT.getScalarSizeInBits();
SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
// Concatenate x:y inside an i32, then shift by the masked amount.
Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
if (IsFSHR) {
Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
} else {
Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
return DAG.getZExtOrTrunc(Res, DL, VT);
// Returning an empty SDValue leaves the node to the caller's default handling.
if (VT == MVT::i8 || ExpandFunnel)
return SDValue();
// i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
if (VT == MVT::i16) {
Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
DAG.getConstant(15, DL, Amt.getValueType()));
unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
return Op;
// Try to use a packed vector operation to handle i64 on 32-bit targets when
// AVX512DQ is enabled.
//
// Op must be a (STRICT_)SINT_TO_FP or (STRICT_)UINT_TO_FP node (asserted).
// Returns an empty SDValue unless the subtarget has DQI, is not 64-bit, the
// source is i64 and the result is f32 or f64. For strict opcodes the result
// is merged with the chain of the vector conversion.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
          Op.getOpcode() == ISD::UINT_TO_FP) &&
         "Unexpected opcode!");
  bool IsStrict = Op->isStrictFPOpcode();
  // Strict nodes carry the chain as operand 0, so the source is operand 1.
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Src = Op.getOperand(OpNo);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();

  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
      (VT != MVT::f32 && VT != MVT::f64))
    return SDValue();

  // Pack the i64 into a vector, do the operation and extract.

  // Using 256-bit to ensure result is 128-bits for f32 case.
  unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
  MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
  MVT VecVT = MVT::getVectorVT(VT, NumElts);

  SDLoc dl(Op);
  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
  if (IsStrict) {
    // Thread the incoming chain through the vector conversion and return it
    // alongside the extracted scalar.
    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
                                 {Op.getOperand(0), InVec});
    SDValue Chain = CvtVec.getValue(1);
    SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                                DAG.getIntPtrConstant(0, dl));
    return DAG.getMergeValues({Value, Chain}, dl);
  }

  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                     DAG.getIntPtrConstant(0, dl));
}
// Reports whether a vector cast from FromVT to ToVT should be used for
// Opcode on this subtarget. Both visible checks require FromVT == v4i32:
//   - first:  needs SSE2; accepts v4f32, or v4f64 when AVX is available.
//   - second: needs AVX512; accepts v4f32 or v4f64.
// NOTE(review): the `case`/`default` labels of the switch and the closing
// braces are missing in this extract, so which opcode selects which check
// cannot be read from here — confirm against the upstream file.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
const X86Subtarget &Subtarget) {
switch (Opcode) {
// TODO: Handle wider types with AVX/AVX512.
if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
return false;
return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
// TODO: Handle wider types and i64 elements.
if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
return false;
return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
return false;
/// Given a scalar cast operation that is extracted from a vector, try to
/// vectorize the cast op followed by extraction. This will avoid an expensive
/// round-trip between XMM and GPR.
///
/// Returns an empty SDValue when the operand of \p Cast is not an
/// EXTRACT_VECTOR_ELT or when useVectorCast rejects the 128-bit form of the
/// cast. Otherwise returns element 0 of the vectorized cast.
/// NOTE(review): the second half of the first `if` condition and the closing
/// braces are missing in this extract.
static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// TODO: This could be enhanced to handle smaller integer types by peeking
// through an extend.
SDValue Extract = Cast.getOperand(0);
MVT DestVT = Cast.getSimpleValueType();
if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
return SDValue();
// See if we have a 128-bit vector cast op for this type of cast.
SDValue VecOp = Extract.getOperand(0);
MVT FromVT = VecOp.getSimpleValueType();
// Element counts are derived from the source element width so that both the
// source and destination vector types have the same number of lanes.
unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
return SDValue();
// If we are extracting from a non-zero element, first shuffle the source
// vector to allow extracting from element zero.
SDLoc DL(Cast);
if (!isNullConstant(Extract.getOperand(1))) {
// Only lane 0 of the shuffle is defined; every other lane is left undef (-1).
SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
Mask[0] = Extract.getConstantOperandVal(1);
VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
// If the source vector is wider than 128-bits, extract the low part. Do not
// create an unnecessarily wide vector cast op.
if (FromVT != Vec128VT)
VecOp = extract128BitVector(VecOp, 0, DAG, DL);
// cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
// cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
DAG.getIntPtrConstant(0, DL));
/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
/// try to vectorize the cast ops. This will avoid an expensive round-trip
/// between XMM and GPR.
///
/// Returns an empty SDValue unless \p CastToFP is a scalar f32/f64 cast whose
/// operand is an FP_TO_SINT producing i32 from an f32/f64 value and the
/// subtarget has SSE2. Otherwise returns element 0 of the vectorized
/// fp -> int -> fp sequence.
static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  // TODO: Allow FP_TO_UINT.
  SDValue CastToInt = CastToFP.getOperand(0);
  MVT VT = CastToFP.getSimpleValueType();
  if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
    return SDValue();

  MVT IntVT = CastToInt.getSimpleValueType();
  SDValue X = CastToInt.getOperand(0);
  MVT SrcVT = X.getSimpleValueType();
  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
    return SDValue();

  // See if we have 128-bit vector cast instructions for this type of cast.
  // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
  if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
      IntVT != MVT::i32)
    return SDValue();

  // Each scalar type is widened to the 128-bit vector type with that element.
  unsigned SrcSize = SrcVT.getSizeInBits();
  unsigned IntSize = IntVT.getSizeInBits();
  unsigned VTSize = VT.getSizeInBits();
  MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
  MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
  MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);

  // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
  unsigned ToIntOpcode =
      SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
  unsigned ToFPOpcode =
      IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;

  // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
  //
  // We are not defining the high elements (for example, zero them) because
  // that could nullify any performance advantage that we hoped to gain from
  // this vector op hack. We do not expect any adverse effects (like denorm
  // penalties) with cast ops.
  SDLoc DL(CastToFP);
  SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
  SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
  SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
  SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
}
// Lowers a (strict or non-strict) integer-to-FP conversion whose source is a
// vector of i64 elements.
//
// With AVX512DQ (VLX asserted absent) the v2i64/v4i64 source is inserted into
// a v8i64 (zero-filled for strict FP, undef otherwise) and converted at the
// wider type. Without DQI only an unsigned conversion to v4f32 is handled:
// inputs with the sign bit set are halved with the low bit OR-ed back in,
// each element is converted with a signed scalar conversion, and the halved
// lanes are doubled again with an FADD. All other cases return an empty
// SDValue. For strict opcodes the result is merged with the output chain.
// NOTE(review): the statements that narrow the wide result back to VT and all
// closing braces are missing in this extract.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
bool IsStrict = Op->isStrictFPOpcode();
MVT VT = Op->getSimpleValueType(0);
// Strict nodes carry the chain as operand 0, so the source is operand 1.
SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
if (Subtarget.hasDQI()) {
assert(!Subtarget.hasVLX() && "Unexpected features");
assert((Src.getSimpleValueType() == MVT::v2i64 ||
Src.getSimpleValueType() == MVT::v4i64) &&
"Unsupported custom type");
// With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
"Unexpected VT!");
MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
: DAG.getUNDEF(MVT::v8i64);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
DAG.getIntPtrConstant(0, DL));
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
{Op->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
DAG.getIntPtrConstant(0, DL));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, DL);
return Res;
bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
// Only the unsigned v4f32 case is expanded manually below.
if (VT != MVT::v4f32 || IsSigned)
return SDValue();
SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
// (Src >> 1) | (Src & 1): halve the value while keeping the low bit sticky.
SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
// Lanes that are negative when read as signed are the ones that need halving.
SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
SmallVector<SDValue, 4> SignCvts(4);
SmallVector<SDValue, 4> Chains(4);
// Convert each lane with a scalar signed conversion.
for (int i = 0; i != 4; ++i) {
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
DAG.getIntPtrConstant(i, DL));
if (IsStrict) {
SignCvts[i] =
DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
{Op.getOperand(0), Elt});
Chains[i] = SignCvts[i].getValue(1);
} else {
SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
// Slow = SignCvt + SignCvt undoes the halving for the lanes selected by IsNeg.
SDValue Slow, Chain;
if (IsStrict) {
// Join the per-lane chains before the strict add.
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
{Chain, SignCvt, SignCvt});
Chain = Slow.getValue(1);
} else {
Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
if (IsStrict)
return DAG.getMergeValues({Cvt, Chain}, DL);
return Cvt;
// Custom-lowers a (strict or non-strict) signed integer to FP conversion.
//
// Order of attempts as visible below:
//   1. vectorizeExtractedCast / lowerFPToIntToFP shortcuts.
//   2. Vector sources: v2i32 -> v2f64 via (STRICT_)CVTSI2P on a widened
//      v4i32, v2i64 / v4i64 via lowerINT_TO_FP_vXi64, otherwise an empty
//      SDValue.
//   3. Scalar i32 (and i64 on 64-bit targets) into an SSE register: return Op
//      unchanged.
//   4. LowerI64IntToFP_AVX512DQ.
//   5. i16 into an SSE register or f128: sign-extend to i32 and convert again.
//   6. f128 results: library call.
//   7. Otherwise store the integer to a fresh stack slot and load it through
//      BuildFILD.
// NOTE(review): the second operands of the two CONCAT_VECTORS calls and all
// closing braces are missing in this extract.
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
// Strict nodes carry the chain as operand 0, so the source is operand 1.
unsigned OpNo = IsStrict ? 1 : 0;
SDValue Src = Op.getOperand(OpNo);
SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
return R;
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
// Note: Since v2f64 is a legal type. We don't need to zero extend the
// source for strict FP.
if (IsStrict)
return DAG.getNode(
X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
{Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
return SDValue();
assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
// These are really Legal; return the operand so the caller accepts it as
// Legal.
if (SrcVT == MVT::i32 && UseSSEReg)
return Op;
if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
return Op;
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
// SSE doesn't have an i16 conversion so we need to promote.
if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
{Chain, Ext});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
if (VT == MVT::f128)
return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
SDValue ValueToStore = Src;
if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
// Spill the integer to a stack slot sized and aligned to its store size,
// then load it back through BuildFILD.
unsigned Size = SrcVT.getStoreSize();
Align Alignment(Size);
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
std::pair<SDValue, SDValue> Tmp =
BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
// Strict callers also need the output chain.
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
return Tmp.first;
/// Builds an X86ISD::FILD that loads an integer of type \p SrcVT from
/// \p Pointer and yields it as \p DstVT.
///
/// When \p DstVT lives in an SSE register the FILD produces f80; that value
/// is then written to a fresh stack slot with X86ISD::FST (memory type
/// \p DstVT) and reloaded as \p DstVT.
///
/// \param DstVT     Floating-point type of the result.
/// \param SrcVT     Memory type read by the FILD.
/// \param DL        Debug location for the created nodes.
/// \param Chain     Incoming chain.
/// \param Pointer   Address the integer is loaded from.
/// \param PtrInfo   Pointer info describing \p Pointer.
/// \param Alignment Alignment of the load.
/// \returns {result value, output chain}.
std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
    EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
    MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
  // Build the FILD
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(DstVT);
  // The two result-type lists are alternatives: f80 when the value is later
  // moved to an SSE register through memory, DstVT otherwise.
  if (useSSE)
    Tys = DAG.getVTList(MVT::f80, MVT::Other);
  else
    Tys = DAG.getVTList(DstVT, MVT::Other);

  SDValue FILDOps[] = {Chain, Pointer};
  SDValue Result =
      DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
                              Alignment, MachineMemOperand::MOLoad);
  Chain = Result.getValue(1);

  if (useSSE) {
    // Round-trip through a stack slot sized and aligned to DstVT's store size.
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned SSFISize = DstVT.getStoreSize();
    int SSFI =
        MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
    auto PtrVT = getPointerTy(MF.getDataLayout());
    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
    Tys = DAG.getVTList(MVT::Other);
    SDValue FSTOps[] = {Chain, Result, StackSlot};
    MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOStore, SSFISize, Align(SSFISize));

    Chain =
        DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
    Result = DAG.getLoad(
        DstVT, DL, Chain, StackSlot,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
    Chain = Result.getValue(1);
  }

  return { Result, Chain };
}
/// Horizontal vector math instructions may be slower than normal math with
/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
/// implementation, and likely shuffle complexity of the alternate sequence.
///
/// \returns true when the op does not use a single source, when optimizing
/// for size, or when the subtarget reports fast horizontal ops.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  bool IsOptimizingSize = DAG.shouldOptForSize();
  bool HasFastHOps = Subtarget.hasFastHorizontalOps();
  return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}
/// 64-bit unsigned integer to double expansion.
///
/// Interleaves the two 32-bit halves of the input with the constants
/// 0x43300000 / 0x45300000, reinterprets the result as v2f64, subtracts the
/// constant vector C1 and adds the two lanes together (FHADD when SSE3 and
/// shouldUseHorizontalOp allow it and the node is not strict, otherwise
/// shuffle + FADD). Strict nodes also return the output chain.
/// NOTE(review): the block-comment delimiters and the `#else` / `#endif` lines
/// around the assembly sketch, the `CV1.push_back(` openers, part of the two
/// constant-pool loads and all closing braces are missing in this extract.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// This algorithm is not obvious. Here it is what we're trying to output:
movq %rax, %xmm0
punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
#ifdef __SSE3__
haddpd %xmm0, %xmm0
pshufd $0x4e, %xmm0, %xmm1
addpd %xmm1, %xmm0
bool IsStrict = Op->isStrictFPOpcode();
// Strict nodes carry the chain as operand 0, so the source is operand 1.
unsigned OpNo = IsStrict ? 1 : 0;
SDLoc dl(Op);
LLVMContext *Context = DAG.getContext();
// Build some magic constants.
static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
Constant *C0 = ConstantDataVector::get(*Context, CV0);
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
SmallVector<Constant*,2> CV1;
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4330000000000000ULL))));
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
// Load the 64-bit value into an XMM register.
SDValue XR1 =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo));
SDValue CLod0 =
DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
/* Alignment = */ 16);
SDValue Unpck1 =
getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
// The second constant load is chained after the first.
SDValue CLod1 =
DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
/* Alignment = */ 16);
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
SDValue Sub;
SDValue Chain;
// TODO: Are there any fast-math-flags to propagate here?
if (IsStrict) {
Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
{Op.getOperand(0), XR2F, CLod1});
Chain = Sub.getValue(1);
} else
Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
if (!IsStrict && Subtarget.hasSSE3() &&
shouldUseHorizontalOp(true, DAG, Subtarget)) {
// FIXME: Do we need a STRICT version of FHADD?
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
// Move lane 1 into lane 0 so a plain add sums the two lanes.
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
if (IsStrict) {
Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other},
{Chain, Shuffle, Sub});
Chain = Result.getValue(1);
} else
Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Result, Chain}, dl);
return Result;
/// 32-bit unsigned integer to float expansion.
///
/// Places the i32 in a zeroed vector register, ORs in the bit pattern of the
/// bias constant 0x4330000000000000, subtracts the bias as an f64 and finally
/// extends or rounds to the result type of \p Op. Strict nodes thread the
/// chain through STRICT_FSUB and the final rounding.
/// NOTE(review): the tail of the Bias constant, the opening of the bitcast
/// around the bias vector, the opening of the lane-0 extraction and all
/// closing braces are missing in this extract.
static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Strict nodes carry the chain as operand 0, so the source is operand 1.
unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
SDLoc dl(Op);
// FP constant to bias correct the final result.
SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
// Load the 32-bit value into an XMM register.
SDValue Load =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
// Zero out the upper parts of the register.
Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
// Or the load with the bias.
SDValue Or = DAG.getNode(
ISD::OR, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, Load),
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
Or =
DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
if (Op.getNode()->isStrictFPOpcode()) {
// Subtract the bias.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Chain = Op.getOperand(0);
SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
{Chain, Or, Bias});
// No rounding step is needed when the result type already matches.
if (Op.getValueType() == Sub.getValueType())
return Sub;
// Handle final rounding.
std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
// Subtract the bias.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
// Handle final rounding.
return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
// Lowers an unsigned v2i32 -> v2f64 conversion (strict or non-strict).
// Returns an empty SDValue when the result type is not v2f64.
//
// With AVX512:
//   - without VLX, non-strict nodes return an empty SDValue so generic
//     legalization widens them; strict nodes are zero-padded to v4i32,
//     converted to v4f64 and the low v2f64 is extracted.
//   - otherwise the input is widened to v4i32 and (STRICT_)CVTUI2P is used.
// Without AVX512 the input is zero-extended to v2i64, OR-ed with the bit
// pattern of 2^52 and 2^52 is subtracted in floating point.
// NOTE(review): the second operand of one CONCAT_VECTORS call and all closing
// braces are missing in this extract.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (Op.getSimpleValueType() != MVT::v2f64)
return SDValue();
bool IsStrict = Op->isStrictFPOpcode();
// Strict nodes carry the chain as operand 0, so the source is operand 1.
SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
if (Subtarget.hasAVX512()) {
if (!Subtarget.hasVLX()) {
// Let generic type legalization widen this.
if (!IsStrict)
return SDValue();
// Otherwise pad the integer input with 0s and widen the operation.
N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
DAG.getConstant(0, DL, MVT::v2i32));
SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
{Op.getOperand(0), N0});
SDValue Chain = Res.getValue(1);
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
DAG.getIntPtrConstant(0, DL));
return DAG.getMergeValues({Res, Chain}, DL);
// Legalize to v4i32 type.
N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
if (IsStrict)
return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
{Op.getOperand(0), N0});
return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
// Zero extend to 2i64, OR with the floating point representation of 2^52.
// This gives us the floating point equivalent of 2^52 + the i32 integer
// since double has 52-bits of mantissa. Then subtract 2^52 in floating
// point leaving just our i32 integers in double format.
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
SDValue VBias =
DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
if (IsStrict)
return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
{Op.getOperand(0), Or, VBias});
return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
/// Lower a (possibly strict) unsigned conversion whose input is v4i32 or
/// v8i32 (asserted below).
///
/// Visible strategies:
///  - AVX512 (asserted to be without VLX): v8f64 results are returned
///    unchanged; other results are computed with a conversion at a 512-bit
///    type (v8f64 or v16f32).
///  - AVX, v4i32 -> v4f64: zero-extend to v4i64, OR in the bit pattern of 2^52
///    and subtract 2^52 in floating point.
///  - Otherwise, for a v4f32/v8f32 result only: split each lane into 16-bit
///    halves, plant them in the mantissas of 2^23 and 2^39, and recombine
///    with one fsub and one fadd. Any other result type yields an empty
///    SDValue.
///
/// NOTE(review): closing braces and several continuation lines are absent
/// from this extract; see the inline notes and confirm against upstream.
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
bool IsStrict = Op->isStrictFPOpcode();
// Strict nodes carry the chain as operand 0, so the input is operand 1.
SDValue V = Op->getOperand(IsStrict ? 1 : 0);
MVT VecIntVT = V.getSimpleValueType();
assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
"Unsupported custom type");
if (Subtarget.hasAVX512()) {
// With AVX512, but not VLX we need to widen to get a 512-bit result type.
assert(!Subtarget.hasVLX() && "Unexpected features");
MVT VT = Op->getSimpleValueType(0);
// v8i32->v8f64 is legal with AVX512 so just return it.
if (VT == MVT::v8f64)
return Op;
assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
"Unexpected VT!");
MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
SDValue Tmp =
IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
// NOTE(review): the next line is the tail of a call whose beginning is not
// present in this extract (presumably the step that widens V to WideIntVT
// using Tmp) — confirm against upstream.
DAG.getIntPtrConstant(0, DL));
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
{Op->getOperand(0), V});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
// NOTE(review): the next line is the tail of a call whose beginning is not
// present in this extract (presumably the step that narrows Res back to VT)
// — confirm against upstream.
DAG.getIntPtrConstant(0, DL));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, DL);
return Res;
if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
Op->getSimpleValueType(0) == MVT::v4f64) {
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
// 0x4330000000000000 is the IEEE double bit pattern of 2^52; it is placed
// in the constant pool and loaded as the v4f64 bias vector.
Constant *Bias = ConstantFP::get(
APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
// NOTE(review): the getMemIntrinsicNode call below is missing its leading
// arguments (opcode, DL, Tys, Ops, memory type) and its trailing arguments
// in this extract — confirm against upstream.
SDValue VBias = DAG.getMemIntrinsicNode(
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
DAG.getBitcast(MVT::v4i64, VBias));
Or = DAG.getBitcast(MVT::v4f64, Or);
// The strict form threads the incoming chain through the subtract.
if (IsStrict)
return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
{Op.getOperand(0), Or, VBias});
return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
// The algorithm is the following:
// #ifdef __SSE4_1__
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
// #else
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
// uint4 hi = (v >> 16) | (uint4) 0x53000000;
// #endif
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// return (float4) lo + fhi;
bool Is128 = VecIntVT == MVT::v4i32;
MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
// If we convert to something else than the supported type, e.g., to v4f64,
// abort early.
if (VecFloatVT != Op->getSimpleValueType(0))
return SDValue();
// In the #ifdef/#else code, we have in common:
// - The vector of constants:
// -- 0x4b000000
// -- 0x53000000
// - A shift:
// -- v >> 16
// Create the splat vector for 0x4b000000.
// (0x4b000000 is the IEEE single bit pattern of 2^23.)
SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
// Create the splat vector for 0x53000000.
// (0x53000000 is the IEEE single bit pattern of 2^39.)
SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
// Create the right shift.
SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
SDValue Low, High;
if (Subtarget.hasSSE41()) {
// The blends operate on 16-bit elements, so view everything as vXi16.
MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
// Low will be bitcasted right away, so do not bother bitcasting back to its
// original type.
Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
// High will be bitcasted right away, so do not bother bitcasting back to
// its original type.
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
} else {
SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
// uint4 hi = (v >> 16) | (uint4) 0x53000000;
High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
// Create the vector constant for (0x1.0p39f + 0x1.0p23f).
// (0x53000080 is the IEEE single bit pattern of 2^39 + 2^23.)
SDValue VecCstFSub = DAG.getConstantFP(
APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// NOTE: By using fsub of a positive constant instead of fadd of a negative
// constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
// enabled. See PR24512.
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
// TODO: Are there any fast-math-flags to propagate here?
// (float4) lo;
SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
// return (float4) lo + fhi;
if (IsStrict) {
// Chain the strict subtract after the incoming chain, and the strict add
// after the subtract.
SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
{Op.getOperand(0), HighBitcast, VecCstFSub});
return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
{FHigh.getValue(1), LowBitcast, FHigh});
SDValue FHigh =
DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
/// Dispatch a vector UINT_TO_FP (or its strict variant) on the type of the
/// integer source operand:
///  - v2i32        -> lowerUINT_TO_FP_v2i32
///  - v4i32, v8i32 -> lowerUINT_TO_FP_vXi32
///  - v2i64, v4i64 -> lowerINT_TO_FP_vXi64
static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Strict nodes carry the chain as operand 0, so the input is operand 1.
unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
SDValue N0 = Op.getOperand(OpNo);
MVT SrcVT = N0.getSimpleValueType();
SDLoc dl(Op);
switch (SrcVT.SimpleTy) {
// NOTE(review): no case/default label is visible ahead of the following
// statement in this extract (presumably a `default:` label was dropped) —
// confirm against upstream.
llvm_unreachable("Custom UINT_TO_FP is not supported!");
case MVT::v2i32:
return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
case MVT::v4i32:
case MVT::v8i32:
return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
case MVT::v2i64:
case MVT::v4i64:
return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
/// Custom lowering for UINT_TO_FP and its strict variant.
///
/// The visible code tries, in order:
///  - f128 results: a runtime library call.
///  - vector results: lowerUINT_TO_FP_vec.
///  - vectorizeExtractedCast.
///  - AVX512 with an SSE-register result and an i32 source (or an i64 source
///    in 64-bit mode): the node is returned as-is.
///  - i32 source on 64-bit targets: zero-extend to i64 and convert as signed.
///  - LowerI64IntToFP_AVX512DQ, LowerUINT_TO_FP_i64, LowerUINT_TO_FP_i32.
///  - i64 -> f32 on 64-bit targets: an empty SDValue is returned.
///  - Otherwise: store the value to a 64-bit stack slot and load it with an
///    x87 FILD; for an i64 source a correction constant is added when the
///    sign bit of the input was set.
///
/// NOTE(review): closing braces and at least one continuation line are
/// absent from this extract; confirm the nesting against upstream.
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
// Strict nodes carry the chain as operand 0, so the input is operand 1.
unsigned OpNo = IsStrict ? 1 : 0;
SDValue Src = Op.getOperand(OpNo);
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
MVT SrcVT = Src.getSimpleValueType();
MVT DstVT = Op->getSimpleValueType(0);
// Non-strict nodes have no incoming chain; start from the entry node.
SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
if (DstVT == MVT::f128)
return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
if (DstVT.isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
(SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
// Conversions from unsigned i32 to f32/f64 are legal,
// using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
return Op;
// Promote i32 to i64 and use a signed conversion on 64-bit targets.
if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
{Chain, Src});
return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
if (SrcVT == MVT::i32) {
// Store the i32 at offset 0 and a zero i32 at offset 4, then read the
// slot back as a 64-bit integer via FILD.
SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
SDValue Store1 =
DAG.getStore(Chain, dl, Src, StackSlot, MPI, 8 /*Align*/);
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
OffsetSlot, MPI.getWithOffset(4), 4);
std::pair<SDValue, SDValue> Tmp =
BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, Align(8), DAG);
// Strict callers receive both elements of the pair; non-strict callers
// only the first.
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
return Tmp.first;
assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
SDValue ValueToStore = Src;
if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
SDValue Store =
DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Align(8));
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
// DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
// we must be careful to do the computation in x87 extended precision, not
// in SSE. (The generic code can't know it's OK to do this, or how to.)
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot };
SDValue Fild =
DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
Align(8), MachineMemOperand::MOLoad);
Chain = Fild.getValue(1);
// Check whether the sign bit is set.
SDValue SignSet = DAG.getSetCC(
dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
// Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
// (The high 32 bits, 0x5F800000, are the IEEE single bit pattern of 2^64.)
APInt FF(64, 0x5F80000000000000ULL);
SDValue FudgePtr = DAG.getConstantPool(
ConstantInt::get(*DAG.getContext(), FF), PtrVT);
Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
// NOTE(review): the final argument(s) and closing parenthesis of the
// getExtLoad call below are not present in this extract (CPAlignment,
// declared above, is otherwise unused here) — confirm against upstream.
SDValue Fudge = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
Chain = Fudge.getValue(1);
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
if (IsStrict) {
SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
{Chain, Fild, Fudge});
// STRICT_FP_ROUND can't handle equal types.
if (DstVT == MVT::f80)
return Add;
return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
{Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
DAG.getIntPtrConstant(0, dl));
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an SDValue().
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence and return the
// result.
//
// The lowering stores the converted integer to a fresh stack slot with an
// X86ISD::FP_TO_INT_IN_MEM node and loads it back. The chain following that
// load is written to the Chain out-parameter. For strict nodes the incoming
// chain is operand 0 and the FP value is operand 1.
//
// NOTE(review): the return type line of this definition, closing braces and
// several continuation lines are absent from this extract; see the inline
// notes and confirm against upstream.
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
bool IsSigned, SDValue &Chain) const {
bool IsStrict = Op->isStrictFPOpcode();
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
EVT TheVT = Value.getValueType();
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
// f16 must be promoted before using the lowering in this routine.
// fp128 does not use this lowering.
return SDValue();
// If using FIST to compute an unsigned i64, we'll need some fixup
// to handle values above the maximum signed i64. A FIST is always
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
// FIXME: This does not generate an invalid exception if the input does not
// fit in i32. PR44019
if (!IsSigned && DstTy != MVT::i64) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
DstTy = MVT::i64;
assert(DstTy.getSimpleVT() <= MVT::i64 &&
DstTy.getSimpleVT() >= MVT::i16 &&
"Unknown FP_TO_INT to lower!");
// We lower FP->int64 into FISTP64 followed by a load from a temporary
// stack slot.
MachineFunction &MF = DAG.getMachineFunction();
// The slot is sized and aligned to the store size of the (possibly
// widened) destination type.
unsigned MemSize = DstTy.getStoreSize();
int SSFI =
MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
if (UnsignedFixup) {
// Conversion to unsigned i64 is implemented with a select,
// depending on whether the source value fits in the range
// of a signed i64. Let Thresh be the FP equivalent of
// 0x8000000000000000ULL.
// Adjust = (Value < Thresh) ? 0 : 0x80000000;
// FltOfs = (Value < Thresh) ? 0 : 0x80000000;
// FistSrc = (Value - FltOfs);
// Fist-to-mem64 FistSrc
// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
// to XOR'ing the high 32 bits with Adjust.
// Being a power of 2, Thresh is exactly representable in all FP formats.
// For X87 we'd like to use the smallest FP type for this constant, but
// for DAG type consistency we have to match the FP operand type.
// (0x5f000000 is the IEEE single bit pattern of 2^63.)
APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
bool LosesInfo = false;
if (TheVT == MVT::f64)
// The rounding mode is irrelevant as the conversion should be exact.
// NOTE(review): the final argument and closing parenthesis of the convert()
// call below are not present in this extract — confirm against upstream.
Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
else if (TheVT == MVT::f80)
Status = Thresh.convert(APFloat::x87DoubleExtended(),
APFloat::rmNearestTiesToEven, &LosesInfo);
assert(Status == APFloat::opOK && !LosesInfo &&
"FP conversion should have been exact");
SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT);
SDValue Cmp;
if (IsStrict) {
// The strict compare is signaling and is threaded onto the chain.
Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT,
Chain, /*IsSignaling*/ true);
Chain = Cmp.getValue(1);
} else {
Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT);
// NOTE(review): the false-value operand of the select below is only
// partially present in this extract — confirm against upstream.
Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
DAG.getConstant(0, DL, MVT::i64),
DL, MVT::i64));
// NOTE(review): the false-value operand and closing parenthesis of the
// select below are not present in this extract — confirm against upstream.
SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,
DAG.getConstantFP(0.0, DL, TheVT),
if (IsStrict) {
Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
{ Chain, Value, FltOfs });
Chain = Value.getValue(1);
} else
Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
// FIXME This causes a redundant load/store if the SSE-class value is already
// in memory, such as if it is on the callstack.
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
// Spill the SSE value to the stack slot and reload it with X86ISD::FLD as
// an f80 value.
Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Chain, StackSlot };
unsigned FLDSize = TheVT.getStoreSize();
assert(FLDSize <= MemSize && "Stack slot not big enough");
MachineMemOperand *MMO = MF.getMachineMemOperand(
MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
Chain = Value.getValue(1);
// Build the FP_TO_INT*_IN_MEM
MachineMemOperand *MMO = MF.getMachineMemOperand(
MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
SDValue Ops[] = { Chain, Value, StackSlot };
// NOTE(review): the value-type-list argument of the getMemIntrinsicNode
// call below is not present in this extract — confirm against upstream.
SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
Ops, DstTy, MMO);
// Load the result back using the node's original result type, which may be
// narrower than the DstTy that was stored.
SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
Chain = Res.getValue(1);
// If we need an unsigned fixup, XOR the result with adjust.
if (UnsignedFixup)
Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
return Res;
/// Lower a vector ANY_EXTEND or ZERO_EXTEND (the only opcodes accepted by the
/// asserts below).
///
///  - v32i16 results without BWI are split with splitVectorIntUnary.
///  - With Int256 support the node is returned unchanged.
///  - Otherwise the low half is produced with the matching
///    *_EXTEND_VECTOR_INREG opcode, the high half by unpacking the high input
///    elements against zero (ZERO_EXTEND) or undef (ANY_EXTEND), and the two
///    halves are concatenated.
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
"Unexpected extension opcode");
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
VT.getVectorElementType() == MVT::i64) &&
"Unexpected element type");
assert((InVT.getVectorElementType() == MVT::i8 ||
InVT.getVectorElementType() == MVT::i16 ||
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
assert(InVT == MVT::v32i8 && "Unexpected VT!");
return splitVectorIntUnary(Op, DAG);
if (Subtarget.hasInt256())
return Op;
// Optimize vectors in AVX mode:
// v8i16 -> v8i32
// Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
// Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
// Concat upper and lower parts.
// v4i32 -> v4i64
// Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
// Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
// Concat upper and lower parts.
MVT HalfVT = VT.getHalfNumVectorElementsVT();
SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
// Short-circuit if we can determine that each 128-bit half is the same value.
// Otherwise, this is difficult to match and optimize.
if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
SDValue Undef = DAG.getUNDEF(InVT);
// Only ZERO_EXTEND requires the interleaved upper bits to be zero.
bool NeedZero = Opc == ISD::ZERO_EXTEND;
SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
OpHi = DAG.getBitcast(HalfVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
//
// The input is split into two v8i1 halves (elements 0-7 and 8-15), each half
// is extended to v8i16 with ExtOpc, the halves are concatenated to v16i16,
// and the result is truncated to VT.
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
const SDLoc &dl, SelectionDAG &DAG) {
assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
DAG.getIntPtrConstant(0, dl));
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
DAG.getIntPtrConstant(8, dl));
Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
// The TRUNCATE is emitted unconditionally, also when VT is v16i16.
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
/// Lower a ZERO_EXTEND whose input is a vXi1 mask vector (asserted below).
///
/// For every result element type except i8, the mask is sign-extended and
/// logically shifted right by (element bits - 1). For i8 results the value is
/// built as a select between splats of 1 and 0, possibly at a wider type,
/// then truncated and/or extracted back down to VT.
///
/// NOTE(review): closing braces, the declaration of ExtVT and some
/// continuation lines are absent from this extract; see the inline notes and
/// confirm against upstream.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
// For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
// avoids a constant pool load.
if (VT.getVectorElementType() != MVT::i8) {
SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
return DAG.getNode(ISD::SRL, DL, VT, Extend,
DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
// Extend VT if BWI is not supported.
// NOTE(review): ExtVT is used below, but its declaration is not present in
// this extract — confirm against upstream.
if (!Subtarget.hasBWI()) {
// If v16i32 is to be avoided, we'll need to split and concatenate.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;
if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
// Scale the element count by the ratio of 512 bits to ExtVT's size.
NumElts *= 512 / ExtVT.getSizeInBits();
InVT = MVT::getVectorVT(MVT::i1, NumElts);
// NOTE(review): the next line is the tail of a call whose beginning is not
// present in this extract (presumably the step that widens In to the new
// InVT) — confirm against upstream.
In, DAG.getIntPtrConstant(0, DL));
// NOTE(review): the final argument and closing parenthesis of the
// getVectorVT call below are not present in this extract.
WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
SDValue One = DAG.getConstant(1, DL, WideVT);
SDValue Zero = DAG.getConstant(0, DL, WideVT);
SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
// Truncate if we had to extend above.
if (VT != ExtVT) {
WideVT = MVT::getVectorVT(MVT::i8, NumElts);
SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
// Extract back to 128/256-bit if we widened.
if (WideVT != VT)
SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
DAG.getIntPtrConstant(0, DL));
return SelectedVal;
/// Entry point for custom ZERO_EXTEND lowering.
///
/// Inputs with i1 elements are routed to LowerZERO_EXTEND_Mask; every other
/// input is asserted to be on an AVX-capable subtarget and is handled by
/// LowerAVXExtend.
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op.getOperand(0);
MVT SVT = In.getSimpleValueType();
if (SVT.getVectorElementType() == MVT::i1)
return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
assert(Subtarget.hasAVX() && "Expected AVX support");
return LowerAVXExtend(Op, DAG, Subtarget);
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
/// It makes use of the fact that vectors with enough leading sign/zero bits
/// prevent the PACKSS/PACKUS from saturating the results.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
/// within each 128-bit lane.
///
/// \p Opcode must be X86ISD::PACKSS or X86ISD::PACKUS and \p DstVT must be a
/// vector type (both asserted). \p In is returned unchanged when its type
/// already equals \p DstVT. An empty SDValue is returned when the subtarget
/// lacks SSE2, when the destination size is not a multiple of 64 bits, when
/// the source size is not a multiple of 128 bits, or when the source element
/// count is not a power of two.
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
"Unexpected PACK opcode");
assert(DstVT.isVector() && "VT not a vector?");
// Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
if (!Subtarget.hasSSE2())
return SDValue();
EVT SrcVT = In.getValueType();
// No truncation required, we might get here due to recursive calls.
if (SrcVT == DstVT)
return In;
// We only support vector truncation to 64bits or greater from a
// 128bits or greater source.
unsigned DstSizeInBits = DstVT.getSizeInBits();
unsigned SrcSizeInBits = SrcVT.getSizeInBits();
if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
return SDValue();
unsigned NumElems = SrcVT.getVectorNumElements();
if (!isPowerOf2_32(NumElems))
return SDValue();
LLVMContext &Ctx = *DAG.getContext();
assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
// Each PACK stage halves the element width.
EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
// Pack to the largest type possible:
// vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
EVT InVT = MVT::i16, OutVT = MVT::i8;
if (SrcVT.getScalarSizeInBits() > 16 &&
(Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
InVT = MVT::i32;
OutVT = MVT::i16;
// 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
if (SrcVT.is128BitVector()) {
InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
In = DAG.getBitcast(InVT, In);
// The second PACK operand is undef; only the low 64 bits are kept.
SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
Res = extractSubVector(Res, 0, DAG, DL, 64);
return DAG.getBitcast(DstVT, Res);
// Split lower/upper subvectors.
SDValue Lo, Hi;
std::tie(Lo, Hi) = splitVector(In, DAG, DL);
unsigned SubSizeInBits = SrcSizeInBits / 2;
InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
// 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
Lo = DAG.getBitcast(InVT, Lo);
Hi = DAG.getBitcast(InVT, Hi);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
return DAG.getBitcast(DstVT, Res);
// AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
// AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
Lo = DAG.getBitcast(InVT, Lo);
Hi = DAG.getBitcast(InVT, Hi);
SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
// 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
// so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
// Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
SmallVector<int, 64> Mask;
int Scale = 64 / OutVT.getScalarSizeInBits();
narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
if (DstVT.is256BitVector())
return DAG.getBitcast(DstVT, Res);
// If 512bit -> 128bit truncate another stage.
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
Res = DAG.getBitcast(PackedVT, Res);
return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
// Recursively pack lower/upper subvectors, concat result and pack again.
assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
/// Lower a TRUNCATE whose result is a vXi1 mask vector (asserted below).
///
/// The general scheme is to move each element's least significant bit into
/// the sign position (skipped when ComputeNumSignBits shows every bit is
/// already a copy of the sign bit) and then emit a SETCC against zero.
/// Inputs with elements of 16 bits or fewer are either handled directly
/// with BWI, or first sign-extended to a wider element type; 16-element
/// inputs are split in two when 512-bit extension is to be avoided.
///
/// NOTE(review): closing braces and some lines are absent from this extract;
/// see the inline notes and confirm against upstream.
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
// Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
if (InVT.getScalarSizeInBits() <= 16) {
if (Subtarget.hasBWI()) {
// legal, will go to VPMOVB2M, VPMOVW2M
if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
// We need to shift to get the lsb into sign position.
// Shift packed bytes not supported natively, bitcast to word
MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
In = DAG.getNode(ISD::SHL, DL, ExtVT,
DAG.getBitcast(ExtVT, In),
DAG.getConstant(ShiftInx, DL, ExtVT));
In = DAG.getBitcast(InVT, In);
// NOTE(review): the remaining arguments and closing parenthesis of the
// getSetCC call below are not present in this extract.
return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
// Use TESTD/Q, extended vector to packed dword/qword.
assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
"Unexpected vector type.");
unsigned NumElts = InVT.getVectorNumElements();
assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
// We need to change to a wider element type that we have support for.
// For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
// For 16 element vectors we extend to v16i32 unless we are explicitly
// trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
// we need to split into two 8 element vectors which we can extend to v8i32,
// truncate and concat the results. There's an additional complication if
// the original type is v16i8. In that case we can't split the v16i8
// directly, so we need to shuffle high elements to low and use
// sign_extend_vector_inreg.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
SDValue Lo, Hi;
if (InVT == MVT::v16i8) {
// NOTE(review): in this extract Lo is never assigned on this branch and
// the sign_extend_vector_inreg mentioned in the comment above is not
// visible; lines appear to be missing — confirm against upstream.
Hi = DAG.getVectorShuffle(
InVT, DL, In, In,
{8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
} else {
assert(InVT == MVT::v16i16 && "Unexpected VT!");
Lo = extract128BitVector(In, 0, DAG, DL);
Hi = extract128BitVector(In, 8, DAG, DL);
// We're split now, just emit two truncates and a concat. The two
// truncates will trigger legalization to come back to this function.
Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
// We either have 8 elements or we're allowed to use 512-bit vectors.
// If we have VLX, we want to use the narrowest vector that can get the
// job done so we use vXi32.
MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
InVT = ExtVT;
// The element width changed, so recompute the shift that moves the LSB
// into the sign position.
ShiftInx = InVT.getScalarSizeInBits() - 1;
if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
// We need to shift to get the lsb into sign position.
In = DAG.getNode(ISD::SHL, DL, InVT, In,
DAG.getConstant(ShiftInx, DL, InVT));
// If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
if (Subtarget.hasDQI())
return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
unsigned InNumEltBits = InVT.getScalarSizeInBits();
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
// If we're called by the type legalizer, handle a few cases.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(InVT)) {
if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
VT.is128BitVector()) {
assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
"Unexpected subtarget!");
// The default behavior is to truncate one step, concatenate, and then
// truncate the remainder. We'd rather produce two 64-bit results and
// concatenate those.
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
// Otherwise let default legalization handle it.
return SDValue();
if (VT.getVectorElementType() == MVT::i1)
return LowerTruncateVecI1(Op, DAG, Subtarget);
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {
if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
assert(VT == MVT::v32i8 && "Unexpected VT!");
return splitVectorIntUnary(Op, DAG);
// word to byte only under BWI. Otherwise we have to promoted to v16i32
// and then truncate that. But we should only do that if we haven't been
// asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
// handled by isel patterns.
if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
return Op;
unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Truncate with PACKUS if we are truncating a vector with leading zero bits
// that extend all the way to the packed/truncated value.
// Pre-SSE41 we can only use PACKUSWB.
KnownBits Known = DAG.computeKnownBits(In);
if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
return V;
// Truncate with PACKSS if we are truncating a vector with sign-bits that
// extend all the way to the packed/truncated value.
if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
return V;
// Handle truncation of V256 to V128 using shuffles.
assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
In = DAG.getBitcast(MVT::v8i32, In);
In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
DAG.getIntPtrConstant(0, DL));
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(2, DL));
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
static const int ShufMask[] = {0, 2, 4, 6};
return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
In = DAG.getBitcast(MVT::v32i8, In);
// The PSHUFB mask:
static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1,
16, 17, 20, 21, 24, 25, 28, 29,
-1, -1, -1, -1, -1, -1, -1, -1 };
In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
static const int ShufMask2[] = {0, 2, -1, -1};
In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
return DAG.getBitcast(VT, In);
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(4, DL));
OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
// The PSHUFB mask:
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1};
OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
// The MOVLHPS Mask:
static const int ShufMask2[] = {0, 1, 4, 5};
SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
return DAG.getBitcast(MVT::v8i16, res);
if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
// Use an AND to zero upper bits for PACKUS.
In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
DAG.getIntPtrConstant(0, DL));
SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
DAG.getIntPtrConstant(8, DL));
return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
llvm_unreachable("All 256->128 cases should have been handled above!");
/// Custom lowering for FP_TO_SINT / FP_TO_UINT and their STRICT_ variants.
///
/// Vector result types are dispatched first (widening narrow sources to a
/// 512-bit operation and extracting the low subvector where needed). Scalar
/// conversions are then either left as-is, promoted to a wider signed
/// conversion followed by a TRUNCATE, turned into a libcall for f128 sources,
/// or handed to FP_TO_INTHelper.
///
/// For strict opcodes the incoming chain is operand 0 and the FP source is
/// operand 1; results are returned merged with the outgoing chain.
///
/// NOTE(review): this listing has lost lines (closing braces, some `else`
/// arms and statement continuations) — confirm structure against upstream
/// before relying on the exact nesting.
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
MVT VT = Op->getSimpleValueType(0);
// Strict nodes carry the chain as operand 0, so the FP source is operand 1.
SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
MVT SrcVT = Src.getSimpleValueType();
SDLoc dl(Op);
if (VT.isVector()) {
// v2f64 -> v2i1: convert to a wider integer vector, truncate to i1 elements
// and extract the low two lanes.
if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
MVT TruncVT = MVT::v4i1;
unsigned Opc;
if (IsStrict)
Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
if (!IsSigned && !Subtarget.hasVLX()) {
assert(Subtarget.useAVX512Regs() && "Unexpected features!");
// Widen to 512-bits.
ResVT = MVT::v8i32;
TruncVT = MVT::v8i1;
Opc = Op.getOpcode();
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
// TODO: Should we just do this for non-strict as well?
SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
: DAG.getUNDEF(MVT::v8f64);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
DAG.getIntPtrConstant(0, dl));
SDValue Res, Chain;
if (IsStrict) {
Res =
DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Opc, dl, ResVT, Src);
Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
// v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
assert(!IsSigned && "Expected unsigned conversion!");
assert(Subtarget.useAVX512Regs() && "Requires avx512f");
return Op;
// Widen vXi32 fp_to_uint with avx512f to 512-bit source.
if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
(SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
assert(!IsSigned && "Expected unsigned conversion!");
assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
"Unexpected features!");
MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
// TODO: Should we just do this for non-strict as well?
SDValue Tmp =
IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
DAG.getIntPtrConstant(0, dl));
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
{Op->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
// Only the low lanes correspond to the original (narrow) source.
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
// Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
(SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
!Subtarget.hasVLX() && "Unexpected features!");
MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
// Need to concat with zero vector for strict fp to avoid spurious
// exceptions.
// TODO: Should we just do this for non-strict as well?
SDValue Tmp =
IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
DAG.getIntPtrConstant(0, dl));
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
{Op->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
if (!Subtarget.hasVLX()) {
// Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
// legalizer and then widened again by vector op legalization.
if (!IsStrict)
return SDValue();
// Strict: pad the v2f32 source with zeros up to v8f32 so the widened
// conversion only sees known values in the extra lanes.
SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
{Src, Zero, Zero, Zero});
Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
{Op->getOperand(0), Tmp});
SDValue Chain = Tmp.getValue(1);
Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Tmp, Chain}, dl);
return Tmp;
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
// NOTE(review): the CONCAT_VECTORS call and the strict opcode selection below
// are visibly cut short in this listing — confirm against upstream.
SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
if (IsStrict) {
unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
return DAG.getNode(Opc, dl, VT, Tmp);
return SDValue();
// Scalar conversions from here on.
bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
if (!IsSigned && UseSSEReg) {
// Conversions from f32/f64 with AVX512 should be legal.
if (Subtarget.hasAVX512())
return Op;
// Use default expansion for i64.
if (VT == MVT::i64)
return SDValue();
assert(VT == MVT::i32 && "Unexpected VT!");
// Promote i32 to i64 and use a signed operation on 64-bit targets.
// FIXME: This does not generate an invalid exception if the input does not
// fit in i32. PR44019
if (Subtarget.is64Bit()) {
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
{ Op.getOperand(0), Src });
Chain = Res.getValue(1);
} else
Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
if (IsStrict)
return DAG.getMergeValues({ Res, Chain }, dl);
return Res;
// Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
// use fisttp which will be handled later.
if (!Subtarget.hasSSE3())
return SDValue();
// Promote i16 to i32 if we can use a SSE operation or the type is f128.
// FIXME: This does not generate an invalid exception if the input does not
// fit in i16. PR44019
if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
SDValue Res, Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
{ Op.getOperand(0), Src });
Chain = Res.getValue(1);
} else
Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
if (IsStrict)
return DAG.getMergeValues({ Res, Chain }, dl);
return Res;
// If this is a FP_TO_SINT using SSEReg we're done.
if (UseSSEReg && IsSigned)
return Op;
// fp128 needs to use a libcall.
if (SrcVT == MVT::f128) {
RTLIB::Libcall LC;
// NOTE(review): the assignments to LC are missing from this listing; as
// shown, LC would be passed to makeLibCall uninitialized — confirm upstream.
if (IsSigned)
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
SDLoc(Op), Chain);
// makeLibCall returns {result, output chain}.
if (IsStrict)
return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
return Tmp.first;
// Fall back to X87.
SDValue Chain;
if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
if (IsStrict)
return DAG.getMergeValues({V, Chain}, dl);
return V;
llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
/// Lowers an LRINT/LLRINT node. The node is returned unchanged when its source
/// type lives in an SSE register; every other source type is delegated to
/// LRINT_LLRINTHelper.
SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
// If the source is in an SSE register, the node is Legal.
if (isScalarFPTypeInSSEReg(SrcVT))
return Op;
return LRINT_LLRINTHelper(Op.getNode(), DAG);
/// Lowers LRINT/LLRINT through a stack temporary: the value is stored to the
/// slot with X86ISD::FIST and the integer result is loaded back from it.
/// Only f32/f64/f80 sources are handled; for anything else an empty SDValue is
/// returned. A source held in an SSE register is first spilled to the slot and
/// reloaded as f80 with X86ISD::FLD.
///
/// \param N the LRINT/LLRINT node (result type is value 0, source is operand 0).
/// \returns a load of the result type from the stack slot, or SDValue().
SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
SelectionDAG &DAG) const {
EVT DstVT = N->getValueType(0);
SDValue Src = N->getOperand(0);
EVT SrcVT = Src.getValueType();
if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
// f16 must be promoted before using the lowering in this routine.
// fp128 does not use this lowering.
return SDValue();
SDLoc DL(N);
SDValue Chain = DAG.getEntryNode();
bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
// If we're converting from SSE, the stack slot needs to hold both types.
// Otherwise it only needs to hold the DstVT.
EVT OtherVT = UseSSE ? SrcVT : DstVT;
SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
if (UseSSE) {
assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
// Spill the SSE value, then reload it from the slot as an f80 value.
Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Chain, StackPtr };
Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
/*Align*/ None, MachineMemOperand::MOLoad);
Chain = Src.getValue(1);
SDValue StoreOps[] = { Chain, Src, StackPtr };
// NOTE(review): the final argument of this call is missing from the listing
// (the statement is cut before its closing parenthesis) — confirm upstream.
Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
StoreOps, DstVT, MPI, /*Align*/ None,
return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
/// Custom lowering for FP_EXTEND / STRICT_FP_EXTEND.
/// An f128 result is routed to LowerF128Call; otherwise the source is asserted
/// to be v2f32 and the node becomes X86ISD::VFPEXT (or X86ISD::STRICT_VFPEXT
/// with the incoming chain for strict nodes).
///
/// NOTE(review): the declaration of LC and the initializer of Res are missing
/// from this listing — confirm against upstream.
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
// Strict nodes carry the chain as operand 0, so the input is operand 1.
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
if (VT == MVT::f128) {
return LowerF128Call(Op, DAG, LC);
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
SDValue Res =
if (IsStrict)
return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
{Op->getOperand(0), Res});
return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
/// Custom lowering for FP_ROUND / STRICT_FP_ROUND.
/// The node is returned unchanged unless the source type is f128, in which
/// case it is replaced by a libcall made through makeLibCall. Strict nodes
/// return the call result merged with the call's output chain.
///
/// NOTE(review): the declaration/selection of LC is missing from this
/// listing — confirm against upstream.
SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
MVT VT = Op.getSimpleValueType();
// Strict nodes carry the chain as operand 0, so the input is operand 1.
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
// It's legal except when f128 is involved
if (SVT != MVT::f128)
return Op;
// FP_ROUND node has a second operand indicating whether it is known to be
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
SDLoc dl(Op);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, In, CallOptions,
dl, Chain);
// Tmp is {call result, output chain}.
if (IsStrict)
return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
return Tmp.first;
/// Lowers an i16 -> f32 half-precision conversion (strict or non-strict).
/// The i16 source is inserted into lane 0 of an all-zero v8i16, converted with
/// X86ISD::CVTPH2PS (or X86ISD::STRICT_CVTPH2PS) to v4f32, and lane 0 of the
/// result is extracted as the f32 value. Strict nodes additionally return the
/// conversion's output chain.
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
bool IsStrict = Op->isStrictFPOpcode();
// Strict nodes carry the chain as operand 0, so the source is operand 1.
SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
"Unexpected VT!");
SDLoc dl(Op);
// Place the scalar in element 0 of a zeroed vector.
SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
DAG.getConstant(0, dl, MVT::v8i16), Src,
DAG.getIntPtrConstant(0, dl));
SDValue Chain;
if (IsStrict) {
Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
{Op.getOperand(0), Res});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
// Pull the converted scalar back out of element 0.
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
/// Lowers an f32 -> i16 half-precision conversion (strict or non-strict).
/// The f32 source is placed in a v4f32 (zero-filled via INSERT_VECTOR_ELT for
/// strict nodes, SCALAR_TO_VECTOR otherwise), converted with
/// X86ISD::CVTPS2PH / X86ISD::STRICT_CVTPS2PH using the immediate 4, and
/// element 0 of the v8i16 result is extracted. Strict nodes additionally
/// return the conversion's output chain.
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
bool IsStrict = Op->isStrictFPOpcode();
// Strict nodes carry the chain as operand 0, so the source is operand 1.
SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
"Unexpected VT!");
SDLoc dl(Op);
SDValue Res, Chain;
if (IsStrict) {
// Strict: the unused lanes are explicit zeros rather than undefined.
Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
DAG.getConstantFP(0, dl, MVT::v4f32), Src,
DAG.getIntPtrConstant(0, dl));
Res = DAG.getNode(
X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
{Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
Chain = Res.getValue(1);
} else {
// FIXME: Should we use zeros for upper elements for non-strict?
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
DAG.getTargetConstant(4, dl, MVT::i32));
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
DAG.getIntPtrConstant(0, dl));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, dl);
return Res;
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
///
/// Matches a scalar ADD/SUB/FADD/FSUB whose operands are constant-index
/// EXTRACT_VECTOR_ELTs of adjacent elements (even index, then index + 1) of the
/// same vector, and rewrites it as an element extract from a horizontal op on
/// that vector. \p Op is returned unchanged whenever the pattern or the
/// subtarget requirements are not met.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// If both operands have other uses, this is probably not profitable.
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
if (!LHS.hasOneUse() && !RHS.hasOneUse())
return Op;
// FP horizontal add/sub were added with SSE3. Integer with SSSE3.
bool IsFP = Op.getSimpleValueType().isFloatingPoint();
if (IsFP && !Subtarget.hasSSE3())
return Op;
if (!IsFP && !Subtarget.hasSSSE3())
return Op;
// Extract from a common vector.
// NOTE(review): no opcode check on RHS is visible in this listing even though
// RHS.getOperand(0)/(1) are read below — a line may be missing; confirm.
if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
LHS.getOperand(0) != RHS.getOperand(0) ||
!isa<ConstantSDNode>(LHS.getOperand(1)) ||
!isa<ConstantSDNode>(RHS.getOperand(1)) ||
!shouldUseHorizontalOp(true, DAG, Subtarget))
return Op;
// Allow commuted 'hadd' ops.
// TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
unsigned HOpcode;
switch (Op.getOpcode()) {
case ISD::ADD: HOpcode = X86ISD::HADD; break;
case ISD::SUB: HOpcode = X86ISD::HSUB; break;
case ISD::FADD: HOpcode = X86ISD::FHADD; break;
case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
unsigned LExtIndex = LHS.getConstantOperandVal(1);
unsigned RExtIndex = RHS.getConstantOperandVal(1);
// For the commutative (add) opcodes, normalize (odd, even) to (even, odd).
if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
(HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
std::swap(LExtIndex, RExtIndex);
// Require an even left index immediately followed by the right index.
if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
return Op;
SDValue X = LHS.getOperand(0);
EVT VecVT = X.getValueType();
unsigned BitWidth = VecVT.getSizeInBits();
unsigned NumLanes = BitWidth / 128;
unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
"Not expecting illegal vector widths here");
// Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
// equivalent, so extract the 256/512-bit source op to 128-bit if we can.
SDLoc DL(Op);
if (BitWidth == 256 || BitWidth == 512) {
// Narrow to the 128-bit lane holding the pair and rebase the index into it.
unsigned LaneIdx = LExtIndex / NumEltsPerLane;
X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
LExtIndex %= NumEltsPerLane;
// add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
// sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
DAG.getIntPtrConstant(LExtIndex / 2, DL));
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
///
/// Entry point for scalar f32/f64 FADD/FSUB: asserts the value type and
/// forwards to lowerAddSubToHorizontalOp with this lowering's Subtarget.
SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
"Only expecting float/double");
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
/// This mode isn't supported in hardware on X86. But as long as we aren't
/// compiling with trapping math, we can emulate this with
/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
/// (The code below emits ISD::FTRUNC, not a floor, for the final step.)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
// N0 += copysign(nextafter(0.5, 0.0), N0)
const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
bool Ignored;
APFloat Point5Pred = APFloat(0.5f);
// NOTE(review): the statement below carries a stray tail after the convert()
// call; presumably a separate call stepping Point5Pred down to the value just
// below 0.5 was lost in this listing — confirm against upstream.
Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);*nextDown*/true);
// Give the constant the sign of the input before adding it.
SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
DAG.getConstantFP(Point5Pred, dl, VT), N0);
N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
// Truncate the result to remove fraction.
return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
///
/// Lowers FABS/FNEG to an X86 FP logic node applied with a constant mask.
/// Scalar f32/f64 values are wrapped into a 128-bit vector for the logic op
/// and element 0 is extracted afterwards; vector and f128 types are operated
/// on directly.
///
/// NOTE(review): the FNEG arms of the MaskElt and LogicOp selections are cut
/// off in this listing — confirm against upstream.
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
"Wrong opcode for lowering FABS or FNEG.");
bool IsFABS = (Op.getOpcode() == ISD::FABS);
// If this is a FABS and it has an FNEG user, bail out to fold the combination
// into an FNABS. We'll lower the FABS after that if it is still in use.
if (IsFABS)
for (SDNode *User : Op->uses())
if (User->getOpcode() == ISD::FNEG)
return Op;
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
bool IsF128 = (VT == MVT::f128);
assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
"Unexpected type in LowerFABSorFNEG");
// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
// decide if we should generate a 16-byte constant mask when we only need 4 or
// 8 bytes for the scalar case.
// There are no scalar bitwise logical SSE/AVX instructions, so we
// generate a 16-byte vector constant and logic op even for the scalar case.
// Using a 16-byte mask allows folding the load of the mask with
// the logic op, so it can save (~4 bytes) on code size.
bool IsFakeVector = !VT.isVector() && !IsF128;
MVT LogicVT = VT;
if (IsFakeVector)
LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
unsigned EltBits = VT.getScalarSizeInBits();
// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
SDValue Op0 = Op.getOperand(0);
// FNEG(FABS(x)) is recognized here so the logic op can act on x directly.
bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
unsigned LogicOp = IsFABS ? X86ISD::FAND :
SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
if (VT.isVector() || IsF128)
return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
// For the scalar case extend to a 128-bit vector, perform the logic op,
// and extract the scalar result back out.
Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
DAG.getIntPtrConstant(0, dl));
/// Lowers FCOPYSIGN(Mag, Sign) to X86 FP logic nodes: the sign bit is isolated
/// from Sign with an FAND, the sign bit is cleared from Mag with another FAND,
/// and the two are combined with an FOR. The sign operand is first
/// extended/rounded to the result type when its width differs. Scalar f32/f64
/// are processed as 128-bit vectors and element 0 is extracted at the end.
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue Mag = Op.getOperand(0);
SDValue Sign = Op.getOperand(1);
SDLoc dl(Op);
// If the sign operand is smaller, extend it first.
MVT VT = Op.getSimpleValueType();
if (Sign.getSimpleValueType().bitsLT(VT))
Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
// And if it is bigger, shrink it first.
if (Sign.getSimpleValueType().bitsGT(VT))
Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
// At this point the operands and the result should have the same
// type, and that won't be f80 since that is not custom lowered.
bool IsF128 = (VT == MVT::f128);
assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
"Unexpected type in LowerFCOPYSIGN");
const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
// Perform all scalar logic operations as 16-byte vectors because there are no
// scalar FP logic instructions in SSE.
// TODO: This isn't necessary. If we used scalar types, we might avoid some
// unnecessary splats, but we might miss load folding opportunities. Should
// this decision be based on OptimizeForSize?
bool IsFakeVector = !VT.isVector() && !IsF128;
MVT LogicVT = VT;
if (IsFakeVector)
LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
// The mask constants are automatically splatted for vector types.
unsigned EltSizeInBits = VT.getScalarSizeInBits();
// SignMask keeps only the top bit of each element; MagMask keeps the rest.
SDValue SignMask = DAG.getConstantFP(
APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
SDValue MagMask = DAG.getConstantFP(
APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
// First, clear all bits but the sign bit from the second operand (sign).
if (IsFakeVector)
Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
// Next, clear the sign bit from the first operand (magnitude).
// TODO: If we had general constant folding for FP logic ops, this check
// wouldn't be necessary.
SDValue MagBits;
if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
// NOTE(review): nothing visible here clears the sign of the constant before
// it is re-materialized — a line may be missing from this listing; confirm.
APFloat APF = Op0CN->getValueAPF();
MagBits = DAG.getConstantFP(APF, dl, LogicVT);
} else {
// If the magnitude operand wasn't a constant, we need to AND out the sign.
if (IsFakeVector)
Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
// OR the magnitude value with the sign bit.
SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
// Scalars were widened above, so pull element 0 back out for them.
return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
DAG.getIntPtrConstant(0, dl));
/// Lowers ISD::FGETSIGN of an f32/f64 value: the scalar is placed in a
/// v4f32/v2f64 vector, X86ISD::MOVMSK produces an i32 from it, and the result
/// is zero-extended or truncated to the node's value type and ANDed with 1.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT OpVT = N0.getSimpleValueType();
assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
"Unexpected type for FGETSIGN");
// Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
Res = DAG.getZExtOrTrunc(Res, dl, VT);
// Keep only bit 0, which corresponds to the element holding the input.
Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
return Res;
/// Helper for creating a X86ISD::SETCC node.
/// \param Cond   X86 condition code, emitted as an i8 target constant.
/// \param EFLAGS the flag-producing value the condition is evaluated on.
/// \returns an i8 X86ISD::SETCC node with operands (Cond, EFLAGS).
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
SelectionDAG &DAG) {
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
/// style scalarized (associative) reduction patterns. Partial reductions
/// are supported when the pointer SrcMask is non-null.
/// TODO - move this to SelectionDAG?
///
/// \param Op      root of the reduction tree; must have opcode \p BinOp.
/// \param BinOp   the binary opcode forming the tree.
/// \param SrcOps  receives the source vectors feeding the reduction.
/// \param SrcMask optional; when null, every element of every source vector
///                must be used for the match to succeed.
/// \returns true on a successful match. The match fails on a leaf that is not
/// a constant-index EXTRACT_VECTOR_ELT, on source vectors of differing types,
/// or when the same element is referenced twice.
///
/// NOTE(review): the statements that seed and grow Opnds, record used
/// elements, and fill SrcOps/SrcMask are missing from this listing — confirm
/// against upstream.
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
SmallVectorImpl<SDValue> &SrcOps,
SmallVectorImpl<APInt> *SrcMask = nullptr) {
SmallVector<SDValue, 8> Opnds;
// Per-source-vector bitmask of the elements seen so far.
DenseMap<SDValue, APInt> SrcOpMap;
EVT VT = MVT::Other;
// Recognize a special case where a vector is casted into wide integer to
// test all 0s.
assert(Op.getOpcode() == unsigned(BinOp) &&
"Unexpected bit reduction opcode");
for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
// BFS traverse all BinOp operands.
if (I->getOpcode() == unsigned(BinOp)) {
// Re-evaluate the number of nodes to be traversed.
e += 2; // 2 more nodes (LHS and RHS) are pushed.
// Quit if a non-EXTRACT_VECTOR_ELT
if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return false;
// Quit if without a constant index.
auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
if (!Idx)
return false;
SDValue Src = I->getOperand(0);
DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
if (M == SrcOpMap.end()) {
// First time this source vector is seen: register it with an empty mask.
VT = Src.getValueType();
// Quit if not the same type.
if (SrcOpMap.begin() != SrcOpMap.end() &&
VT != SrcOpMap.begin()->first.getValueType())
return false;
unsigned NumElts = VT.getVectorNumElements();
APInt EltCount = APInt::getNullValue(NumElts);
M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
// Quit if element already used.
unsigned CIdx = Idx->getZExtValue();
if (M->second[CIdx])
return false;
if (SrcMask) {
// Collect the source partial masks.
for (SDValue &SrcOp : SrcOps)
} else {
// Quit if not all elements are used.
for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
E = SrcOpMap.end();
I != E; ++I) {
if (!I->second.isAllOnesValue())
return false;
return true;
// Helper function for comparing all bits of a vector against zero.
//
// V     - the vector to test.
// CC    - must be ISD::SETEQ or ISD::SETNE; mapped to COND_E / COND_NE.
// Mask  - per-element bit mask (same width as a vector element) selecting the
//         bits that take part in the test; all-ones means "test every bit".
// X86CC - out-parameter receiving the X86 condition code to check.
//
// Returns an i32 flag-producing node (CMP or PTEST), or an empty SDValue when
// the vector shape is not handled. Vectors narrower than 128 bits are compared
// as a scalar integer; wider vectors are OR-folded in halves down to 128 bits
// (256 with AVX) and then tested with PTEST (SSE4.1) or with
// PCMPEQ + MOVMSK + CMP against 0xFFFF.
static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
const APInt &Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG, X86::CondCode &X86CC) {
EVT VT = V.getValueType();
assert(Mask.getBitWidth() == VT.getScalarSizeInBits() &&
"Element Mask vs Vector bitwidth mismatch");
assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
// Applies Mask to Src with an AND; a no-op when the mask is all-ones.
auto MaskBits = [&](SDValue Src) {
if (Mask.isAllOnesValue())
return Src;
EVT SrcVT = Src.getValueType();
SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
// For sub-128-bit vector, cast to (legal) integer and compare with zero.
if (VT.getSizeInBits() < 128) {
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
return SDValue();
return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
DAG.getBitcast(IntVT, MaskBits(V)),
DAG.getConstant(0, DL, IntVT));
// Quit if not splittable to 128/256-bit vector.
if (!isPowerOf2_32(VT.getSizeInBits()))
return SDValue();
// Split down to 128/256-bit vector.
unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
while (VT.getSizeInBits() > TestSize) {
// OR the two halves together: the result is zero iff both halves are zero.
auto Split = DAG.SplitVector(V, DL);
VT = Split.first.getValueType();
V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
bool UsePTEST = Subtarget.hasSSE41();
if (UsePTEST) {
MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
V = DAG.getBitcast(TestVT, MaskBits(V));
return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
// Without PTEST, a masked v2i64 or-reduction is not faster than
// scalarization.
if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
return SDValue();
// Compare every byte with zero, gather the per-byte results with MOVMSK and
// check that all 16 result bits are set.
V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
DAG.getConstant(0xFFFF, DL, MVT::i32));
// Check whether an OR'd reduction tree is PTEST-able, or if we can fall back to
// the non-PTEST lowering provided by LowerVectorAllZero.
//
// Op    - the value being compared for (in)equality; must have a single use.
// CC    - must be ISD::SETEQ or ISD::SETNE.
// X86CC - out-parameter set to an i8 target constant holding the X86
//         condition code when a lowering is produced.
//
// Returns the flag-producing node from LowerVectorAllZero, or an empty SDValue
// when no OR-reduction is recognized or SSE2 is unavailable.
//
// NOTE(review): the case labels/bodies of the switch and the start of the
// assert on VecIns are cut off in this listing — confirm against upstream.
static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &X86CC) {
assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
if (!Subtarget.hasSSE2() || !Op->hasOneUse())
return SDValue();
// Check whether we're masking/truncating an OR-reduction result, in which
// case track the masked bits.
APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
switch (Op.getOpcode()) {
SDValue Src = Op.getOperand(0);
Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
Op = Src;
case ISD::AND: {
if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
Mask = Cst->getAPIntValue();
Op = Op.getOperand(0);
// Form 1: a scalarized OR tree of vector element extracts.
SmallVector<SDValue, 8> VecIns;
if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
EVT VT = VecIns[0].getValueType();
[VT](SDValue V) { return VT == V.getValueType(); }) &&
"Reduction source vector mismatch");
// Quit if less than 128-bits or not splittable to 128/256-bit vector.
if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
return SDValue();
// If more than one full vector is evaluated, OR them first before PTEST.
for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
Slot += 2, e += 1) {
// Each iteration will OR 2 nodes and append the result until there is
// only 1 node left, i.e. the final OR'd value of all vectors.
SDValue LHS = VecIns[Slot];
SDValue RHS = VecIns[Slot + 1];
VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
X86::CondCode CCode;
if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
DAG, CCode)) {
X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
return V;
// Form 2: an element extract that DAG.matchBinOpReduction recognizes as an
// OR reduction.
if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ISD::NodeType BinOp;
if (SDValue Match =
DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
X86::CondCode CCode;
if (SDValue V =
LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
return V;
return SDValue();
/// Return true if \c Op has a use that doesn't just read flags.
/// A use counts as flags-only when the user is a BRCOND, a SETCC, or a SELECT
/// that takes the value as operand 0. A TRUNCATE user with exactly one use is
/// looked through, and its own user is classified instead.
static bool hasNonFlagsUse(SDValue Op) {
for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
++UI) {
SDNode *User = *UI;
unsigned UOpNo = UI.getOperandNo();
if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
// Look past the truncate.
UOpNo = User->use_begin().getOperandNo();
User = *User->use_begin();
if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
!(User->getOpcode() == ISD::SELECT && UOpNo == 0))
return true;
return false;
// Transform to an x86-specific ALU node with flags if there is a chance of
// using an RMW op or only the flags are used. Otherwise, leave
// the node alone and emit a 'cmp' or 'test' instruction.
//
// Returns true only when every user of Op is a CopyToReg, SETCC or STORE node
// (which is also the case when Op has no users at all).
static bool isProfitableToUseFlagOp(SDValue Op) {
for (SDNode *U : Op->uses())
if (U->getOpcode() != ISD::CopyToReg &&
U->getOpcode() != ISD::SETCC &&
U->getOpcode() != ISD::STORE)
return false;
return true;
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
///
/// \param Op        Value being compared against zero.
/// \param X86CC     X86 condition code the caller will test; used to decide
///                  whether the carry and/or overflow flag is needed.
/// \param dl        Debug location for any new nodes.
/// \param DAG       DAG in which nodes are created.
/// \param Subtarget Not referenced in the visible body.
/// \returns A value carrying the flags: an X86ISD::CMP against zero, result
///          #1 of an existing flag-producing node, or result #1 of a newly
///          created X86ISD arithmetic node that replaces \p Op.
///
/// NOTE(review): this listing appears to have dropped lines (closing braces,
/// `break;`, `default:` labels and a statement continuation). Verify the
/// exact control flow against the upstream file before relying on it.
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// CF and OF aren't always set the way we want. Determine which
// of these we need.
bool NeedCF = false;
bool NeedOF = false;
switch (X86CC) {
default: break;
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
NeedCF = true;
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
case X86::COND_O: case X86::COND_NO: {
// Check if we really need to set the
// Overflow flag. If NoSignedWrap is present
// that is not actually needed.
switch (Op->getOpcode()) {
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::SHL:
if (Op.getNode()->getFlags().hasNoSignedWrap())
NeedOF = true;
// See if we can use the EFLAGS value from the operand instead of
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
if (Op.getResNo() != 0 || NeedOF || NeedCF) {
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
// Opcode stays 0 unless one of the cases below selects a flag-setting
// X86ISD opcode; NumOperands is how many operands of Op are forwarded to it.
unsigned Opcode = 0;
unsigned NumOperands = 0;
SDValue ArithOp = Op;
// NOTICE: In the code below we use ArithOp to hold the arithmetic operation
// which may be the result of a CAST. We use the variable 'Op', which is the
// non-casted variable when we check for possible users.
switch (ArithOp.getOpcode()) {
case ISD::AND:
// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
// because a TEST instruction will be better.
if (!hasNonFlagsUse(Op))
case ISD::ADD:
case ISD::SUB:
case ISD::OR:
case ISD::XOR:
if (!isProfitableToUseFlagOp(Op))
// Otherwise use a regular EFLAGS-setting instruction.
switch (ArithOp.getOpcode()) {
default: llvm_unreachable("unexpected operator!");
case ISD::ADD: Opcode = X86ISD::ADD; break;
case ISD::SUB: Opcode = X86ISD::SUB; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: Opcode = X86ISD::OR; break;
NumOperands = 2;
// Op is already an X86ISD arithmetic node: hand back its result #1 rather
// than emitting a new compare.
case X86ISD::ADD:
case X86ISD::SUB:
case X86ISD::OR:
case X86ISD::XOR:
case X86ISD::AND:
return SDValue(Op.getNode(), 1);
case ISD::SSUBO:
case ISD::USUBO: {
// USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
if (Opcode == 0) {
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
// Build the flag-setting node from the first NumOperands operands of Op,
// redirect every use of Op's result #0 to it, and return its result #1.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
return SDValue(New.getNode(), 1);
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
///
/// \param Op0,Op1   Scalar integer operands; the assert below requires their
///                  type to be i8, i16, i32 or i64.
/// \param X86CC     X86 condition code the caller will test.
/// \param dl        Debug location for any new nodes.
/// \param DAG       DAG in which nodes are created.
/// \param Subtarget Queried for isAtom() when deciding on i16 promotion.
/// \returns Result #1 of an X86ISD::SUB or X86ISD::ADD node, or whatever
///          EmitTest returns when \p Op1 is the constant zero.
///
/// NOTE(review): this listing appears to have dropped lines (closing braces
/// and the initializer/assignments of ExtendOp). Verify against the upstream
/// file before relying on the exact control flow.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
const SDLoc &dl, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// A compare against zero is delegated to EmitTest.
if (isNullConstant(Op1))
return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
EVT CmpVT = Op0.getValueType();
assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
// Only promote the compare up to I32 if it is a 16 bit operation
// with an immediate. 16 bit immediates are to be avoided.
if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
!DAG.getMachineFunction().getFunction().hasMinSize()) {
ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
// Don't do this if the immediate can fit in 8-bits.
if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
(COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
unsigned ExtendOp =
if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
// For equality comparisons try to use SIGN_EXTEND if the input was
// truncate from something with enough sign bits.
if (Op0.getOpcode() == ISD::TRUNCATE) {
SDValue In = Op0.getOperand(0);
unsigned EffBits =
In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
if (EffBits <= 16)
} else if (Op1.getOpcode() == ISD::TRUNCATE) {
SDValue In = Op1.getOperand(0);
unsigned EffBits =
In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
if (EffBits <= 16)
// Widen both operands to i32 using the chosen extension opcode.
CmpVT = MVT::i32;
Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
// Try to shrink i64 compares if the input has enough zero bits.
// FIXME: Do this for non-constant compares for constant on LHS?
if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
CmpVT = MVT::i32;
Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
// 0-x == y --> x+y == 0
// 0-x != y --> x+y != 0
if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
return Add.getValue(1);
// x == 0-y --> x+y == 0
// x != 0-y --> x+y != 0
if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
return Add.getValue(1);
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
return Sub.getValue(1);
/// Check if replacement of SQRT with RSQRT should be disabled.
///
/// Returns false when an X86ISD::FRSQRT node of the same type already exists
/// for \p Op in \p DAG. Otherwise the answer comes from the subtarget:
/// hasFastVectorFSQRT() for vector types, hasFastScalarFSQRT() for scalars.
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
// We never want to use both SQRT and RSQRT instructions for the same input.
if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
return false;
if (VT.isVector())
return Subtarget.hasFastVectorFSQRT();
return Subtarget.hasFastScalarFSQRT();
/// Build a reciprocal-square-root estimate node for \p Op, if supported.
///
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
///
/// \param Op              Input value; its type selects whether an estimate
///                        is available.
/// \param Enabled         Not referenced in the visible body.
/// \param RefinementSteps In/out: set to 1 when it arrives as
///                        ReciprocalEstimate::Unspecified.
/// \param UseOneConstNR   Out: set to false whenever an estimate is produced.
/// \param Reciprocal      Affects only the v4f32 feature check (SSE1 when
///                        true, SSE2 when false).
/// \returns An X86ISD::FRSQRT node (X86ISD::RSQRT14 for v16f32), or an empty
///          SDValue when the type/subtarget combination is not supported.
SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
bool &UseOneConstNR,
bool Reciprocal) const {
EVT VT = Op.getValueType();
// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
// It is likely not profitable to do this for f64 because a double-precision
// rsqrt estimate with refinement on x86 prior to FMA requires at least 16
// instructions: convert to single, rsqrtss, convert back to double, refine
// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
// TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
// after legalize types.
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
(VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
(VT == MVT::v8f32 && Subtarget.hasAVX()) ||
(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
UseOneConstNR = false;
// FRSQRT is not used for 512-bit vectors; RSQRT14 is used instead.
unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
return SDValue();
/// Build a reciprocal estimate node for \p Op, if supported.
///
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
///
/// \param Op              Input value; its type selects whether an estimate
///                        is available.
/// \param Enabled         When ReciprocalEstimate::Unspecified, scalar f32
///                        estimates are declined.
/// \param RefinementSteps In/out: set to 1 when it arrives as
///                        ReciprocalEstimate::Unspecified.
/// \returns An X86ISD::FRCP node (X86ISD::RCP14 for v16f32), or an empty
///          SDValue when no estimate should be used.
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
int Enabled,
int &RefinementSteps) const {
EVT VT = Op.getValueType();
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
// It is likely not profitable to do this for f64 because a double-precision
// reciprocal estimate with refinement on x86 prior to FMA requires
// 15 instructions: convert to single, rcpss, convert back to double, refine
// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v8f32 && Subtarget.hasAVX()) ||
(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
// Enable estimate codegen with 1 refinement step for vector division.
// Scalar division estimates are disabled because they break too much
// real-world code. These defaults are intended to match GCC behavior.
if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
return SDValue();
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
// FRCP is not used for 512-bit vectors; RCP14 is used instead.
unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
return SDValue();
/// If we have at least two divisions that use the same divisor, convert to
/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
/// original divisions.
///
/// \returns The threshold number of divisions, which is the constant 2.
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
return 2;
/// Expand a signed division of \p N by \p Divisor, where the divisor (or its
/// negation) is a power of two, into compare + add + select + arithmetic
/// shift, followed by a negation when the divisor is negative.
///
/// \param N       The division node; operand 0 is the dividend.
/// \param Divisor The constant divisor (asserted to be +/- a power of two).
/// \param DAG     DAG in which nodes are created.
/// \param Created Not written to in the visible body.
/// \returns SDValue(N, 0) when integer division is reported as cheap; an
///          empty SDValue when CMOV is unavailable, the type is unsupported,
///          or the divisor has exactly one trailing zero bit; otherwise the
///          expanded value.
///
/// NOTE(review): the return-type line of this definition is not visible in
/// this listing (the body returns SDValue) - confirm against upstream.
X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const {
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
"Unexpected divisor!");
// Only perform this transform if CMOV is supported otherwise the select
// below will become a branch.
if (!Subtarget.hasCMov())
return SDValue();
// fold (sdiv X, pow2)
EVT VT = N->getValueType(0);
// FIXME: Support i8.
if (VT != MVT::i16 && VT != MVT::i32 &&
!(Subtarget.is64Bit() && VT == MVT::i64))
return SDValue();
// Lg2 is log2 of the divisor's magnitude (number of trailing zero bits).
unsigned Lg2 = Divisor.countTrailingZeros();
// If the divisor is 2 or -2, the default expansion is better.
if (Lg2 == 1)
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue Zero = DAG.getConstant(0, DL, VT);
// Mask with the low Lg2 bits set, i.e. the value (Pow2 - 1).
APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
// If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
// Divide by pow2.
SDValue SRA =
DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
// If we're dividing by a positive value, we're done. Otherwise, we must
// negate the result.
if (Divisor.isNonNegative())
return SRA;
return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
/// Result of 'and' is compared against zero. Change to a BT node if possible.
/// Returns the BT node and the condition code needed to use it.
///
/// \param And   The AND node (asserted to have opcode ISD::AND).
/// \param CC    The comparison against zero; SETEQ selects COND_AE, any other
///              value selects COND_B.
/// \param dl    Debug location for any new nodes.
/// \param DAG   DAG in which nodes are created.
/// \param X86CC Out: target constant holding the X86 condition code; written
///              only when a BT node is produced.
/// \returns An X86ISD::BT node, or an empty SDValue if no pattern matched.
///
/// NOTE(review): this listing appears to have dropped lines (closing braces
/// and the tail of one getConstant call). Verify against the upstream file.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
SDValue &X86CC) {
assert(And.getOpcode() == ISD::AND && "Expected AND node!");
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
// Strip a truncate from either operand; the SHL case below re-checks that
// only known-zero bits were dropped.
if (Op0.getOpcode() == ISD::TRUNCATE)
Op0 = Op0.getOperand(0);
if (Op1.getOpcode() == ISD::TRUNCATE)
Op1 = Op1.getOperand(0);
SDValue Src, BitNo;
// Canonicalize so that a shift-left, if any, is in Op0.
if (Op1.getOpcode() == ISD::SHL)
std::swap(Op0, Op1);
if (Op0.getOpcode() == ISD::SHL) {
if (isOneConstant(Op0.getOperand(0))) {
// If we looked past a truncate, check that it's only truncating away
// known zeros.
unsigned BitWidth = Op0.getValueSizeInBits();
unsigned AndBitWidth = And.getValueSizeInBits();
if (BitWidth > AndBitWidth) {
KnownBits Known = DAG.computeKnownBits(Op0);
if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
return SDValue();
Src = Op1;
BitNo = Op0.getOperand(1);
} else if (Op1.getOpcode() == ISD::Constant) {
ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
uint64_t AndRHSVal = AndRHS->getZExtValue();
SDValue AndLHS = Op0;
if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
Src = AndLHS.getOperand(0);
BitNo = AndLHS.getOperand(1);
} else {
// Use BT if the immediate can't be encoded in a TEST instruction or we
// are optimizing for size and the immediate won't fit in a byte.
bool OptForSize = DAG.shouldOptForSize();
if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
isPowerOf2_64(AndRHSVal)) {
Src = AndLHS;
BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
// No patterns found, give up.
if (!Src.getNode())
return SDValue();
// If Src is i8, promote it to i32 with any_extend. There is no i8 BT
// instruction. Since the shift amount is in-range-or-undefined, we know
// that doing a bittest on the i32 value is ok. We extend to i32 because
// the encoding for the i16 version is larger than the i32 version.
// Also promote i16 to i32 for performance / code size reason.
if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
// See if we can use the 32-bit instruction instead of the 64-bit one for a
// shorter encoding. Since the former takes the modulo 32 of BitNo and the
// latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
// known to be zero.
if (Src.getValueType() == MVT::i64 &&
DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
// If the operand types disagree, extend the shift amount to match. Since
// BT ignores high bits (like shifts) we can use anyextend.
if (Src.getValueType() != BitNo.getValueType())
BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
dl, MVT::i8);
return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
///
/// \param SetCCOpcode       The generic condition code to translate.
/// \param Op0,Op1           Compare operands; exchanged in place when the
///                          local Swap flag is set.
/// \param IsAlwaysSignaling Out: set to false for SETEQ, SETNE, SETO and
///                          SETUO, and to true by the other visible path.
/// \returns The SSE predicate number. Values 0-7 follow the table below; the
///          values 8 (SETUEQ) and 12 (SETONE) lie outside that table.
///
/// NOTE(review): in this listing Swap is never assigned true and the second
/// switch has no visible `default:`/`break;` lines; several case labels and
/// closing braces appear to have been dropped. Verify against upstream.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
SDValue &Op1, bool &IsAlwaysSignaling) {
unsigned SSECC;
bool Swap = false;
// SSE Condition code mapping:
// 0 - EQ
// 1 - LT
// 2 - LE
// 3 - UNORD
// 4 - NEQ
// 5 - NLT
// 6 - NLE
// 7 - ORD
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETEQ: SSECC = 0; break;
case ISD::SETLT:
case ISD::SETOLT: SSECC = 1; break;
case ISD::SETLE:
case ISD::SETOLE: SSECC = 2; break;
case ISD::SETUO: SSECC = 3; break;
case ISD::SETNE: SSECC = 4; break;
case ISD::SETUGE: SSECC = 5; break;
case ISD::SETUGT: SSECC = 6; break;
case ISD::SETO: SSECC = 7; break;
case ISD::SETUEQ: SSECC = 8; break;
case ISD::SETONE: SSECC = 12; break;
if (Swap)
std::swap(Op0, Op1);
// Classify the predicate as signaling or quiet for the caller.
switch (SetCCOpcode) {
IsAlwaysSignaling = true;
case ISD::SETEQ:
case ISD::SETNE:
case ISD::SETO:
case ISD::SETUO:
IsAlwaysSignaling = false;
return SSECC;
/// Break an integer vector SETCC into two half-width SETCCs and then
/// concatenate the results back together.
///
/// \param Op  An ISD::SETCC node whose result type equals the (integer) type
///            of operand 0, as the asserts below require.
/// \param DAG DAG in which nodes are created.
/// \returns A CONCAT_VECTORS of the low-half and high-half SETCC results.
///
/// NOTE(review): the declaration of LoVT/HiVT is not visible in this listing;
/// presumably a line was dropped - confirm against upstream.
static SDValue splitIntVSETCC(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
assert(Op.getOpcode() == ISD::SETCC && "Unsupported operation");
assert(Op.getOperand(0).getValueType().isInteger() &&
VT == Op.getOperand(0).getValueType() && "Unsupported VTs!");
SDLoc dl(Op);
SDValue CC = Op.getOperand(2);
// Extract the LHS Lo/Hi vectors
SDValue LHS1, LHS2;
std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
// Extract the RHS Lo/Hi vectors
SDValue RHS1, RHS2;
std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
/// Lower an integer vector SETCC whose result has i1 elements (asserted
/// below). The only rewrite performed is canonicalizing SETLT: the condition
/// is replaced by its swapped-operand form and the operands are exchanged.
/// The result is a fresh SETCC built with DAG.getSetCC.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
assert(VT.getVectorElementType() == MVT::i1 &&
"Cannot set masked compare for this operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
// Prefer SETGT over SETLT.
if (SetCCOpcode == ISD::SETLT) {
SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
std::swap(Op0, Op1);
return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
/// Given a buildvector constant, return a new vector constant with each element
/// incremented or decremented. If incrementing or decrementing would result in
/// unsigned overflow or underflow or this is not a simple vector constant,
/// return an empty value.
///
/// \param V     Candidate vector; must be a BuildVectorSDNode whose operands
///              are all non-opaque ConstantSDNodes of the vector element type.
/// \param DAG   DAG in which the new constants are created.
/// \param IsInc True to add one to every element, false to subtract one.
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
if (!BV)
return SDValue();
MVT VT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
SmallVector<SDValue, 8> NewVecC;
SDLoc DL(V);
for (unsigned i = 0; i < NumElts; ++i) {
auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
// Reject undef/non-constant, opaque, or implicitly-typed elements.
if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
return SDValue();
// Avoid overflow/underflow.
const APInt &EltC = Elt->getAPIntValue();
if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
return SDValue();
NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
return DAG.getBuildVector(VT, DL, NewVecC);
/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
/// Op0 u<= Op1:
/// t = psubus Op0, Op1
/// pcmpeq t, <0..0>
///
/// \param Op0,Op1   Compare operands (taken by value; adjusted locally).
/// \param VT        Vector type; only i8 and i16 element types are handled.
/// \param Cond      Condition code being lowered.
/// \param dl        Debug location for any new nodes.
/// \param Subtarget Must have SSE2; AVX disables the SETULT rewrite.
/// \param DAG       DAG in which nodes are created.
/// \returns PCMPEQ(USUBSAT(Op0, Op1), 0), or an empty SDValue when the
///          transform does not apply.
///
/// NOTE(review): this listing appears to have dropped lines (`default:`,
/// `break;`, further case labels and closing braces). Verify against upstream.
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
ISD::CondCode Cond, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (!Subtarget.hasSSE2())
return SDValue();
MVT VET = VT.getVectorElementType();
if (VET != MVT::i8 && VET != MVT::i16)
return SDValue();
switch (Cond) {
return SDValue();
case ISD::SETULT: {
// If the comparison is against a constant we can turn this into a
// setule. With psubus, setule does not require a swap. This is
// beneficial because the constant in the register is no longer
// destructed as the destination so it can be hoisted out of a loop.
// Only do this pre-AVX since vpcmp* is no longer destructive.
if (Subtarget.hasAVX())
return SDValue();
SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
if (!ULEOp1)
return SDValue();
Op1 = ULEOp1;
case ISD::SETUGT: {
// If the comparison is against a constant, we can turn this into a setuge.
// This is beneficial because materializing a constant 0 for the PCMPEQ is
// probably cheaper than XOR+PCMPGT using 2 different vector constants:
// cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
if (!UGEOp1)
return SDValue();
Op1 = Op0;
Op0 = UGEOp1;
// Psubus is better than flip-sign because it requires no inversion.
std::swap(Op0, Op1);
// Saturating subtract, then test the difference for zero.
SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
DAG.getConstant(0, dl, VT));
/// Lower a vector SETCC, including the strict floating-point forms
/// STRICT_FSETCC and STRICT_FSETCCS.
///
/// For strict nodes operand 0 is the chain and the compare operands and
/// condition code follow at indices 1..3; otherwise they are at 0..2.
///
/// Floating-point compares are lowered to X86ISD::CMPM / CMPP (or their
/// STRICT_ variants) using the predicate from translateX86FSETCC. Integer
/// compares go through, in order: the AVX-512 mask path, XOP VPCOM/VPCOMU,
/// two AND-with-power-of-2 rewrites, splitting of wide vectors, unsigned
/// min/max, PSUBUS, and finally PCMPEQ/PCMPGT with optional operand swap,
/// sign-bit flip and inversion.
///
/// \returns The lowered compare; for strict nodes a merge of {value, chain};
///          an empty SDValue in the pre-AVX strict case that is left to
///          scalar fallback.
///
/// NOTE(review): this listing appears to have dropped many lines (closing
/// braces, `#endif`, `break;`/`default:` labels and some statement
/// continuations). Verify the exact control flow against the upstream file.
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
MVT VT = Op->getSimpleValueType(0);
ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
bool isFP = Op1.getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
// ---- Floating-point compares ----
if (isFP) {
#ifndef NDEBUG
MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
// If we have a strict compare with a vXi1 result and the input is 128/256
// bits we can't use a masked compare unless we have VLX. If we use a wider
// compare like we do for non-strict, we might trigger spurious exceptions
// from the upper elements. Instead emit a AVX compare and convert to mask.
unsigned Opc;
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
(!IsStrict || Subtarget.hasVLX() ||
Op0.getSimpleValueType().is512BitVector())) {
assert(VT.getVectorNumElements() <= 16);
Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
} else {
Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
// The SSE/AVX packed FP comparison nodes are defined with a
// floating-point vector result that matches the operand type. This allows
// them to work with an SSE1 target (integer vector types are not legal).
VT = Op0.getSimpleValueType();
SDValue Cmp;
bool IsAlwaysSignaling;
unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
if (!Subtarget.hasAVX()) {
// TODO: We could use following steps to handle a quiet compare with
// signaling encodings.
// 1. Get ordered masks from a quiet ISD::SETO
// 2. Use the masks to mask potential unordered elements in operand A, B
// 3. Get the compare results of masked A, B
// 4. Calculating final result using the mask and result from 3
// But currently, we just fall back to scalar operations.
if (IsStrict && IsAlwaysSignaling && !IsSignaling)
return SDValue();
// Insert an extra signaling instruction to raise exception.
if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
SDValue SignalCmp = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
// FIXME: It seems we need to update the flags of all new strict nodes.
// Otherwise, mayRaiseFPException in MI will return false due to
// NoFPExcept = false by default. However, I didn't find it in other
// patches.
Chain = SignalCmp.getValue(1);
// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
// emit two comparisons and a logic op to tie them together.
if (SSECC >= 8) {
// LLVM predicate is SETUEQ or SETONE.
unsigned CC0, CC1;
unsigned CombineOpc;
if (Cond == ISD::SETUEQ) {
CC0 = 3; // UNORD
CC1 = 0; // EQ
CombineOpc = X86ISD::FOR;
} else {
assert(Cond == ISD::SETONE);
CC0 = 7; // ORD
CC1 = 4; // NEQ
CombineOpc = X86ISD::FAND;
SDValue Cmp0, Cmp1;
if (IsStrict) {
Cmp0 = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
Cmp1 = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
} else {
Cmp0 = DAG.getNode(
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
Cmp1 = DAG.getNode(
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
if (IsStrict) {
Cmp = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
Chain = Cmp.getValue(1);
} else
Cmp = DAG.getNode(
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
} else {
// Handle all other FP comparisons here.
if (IsStrict) {
// Make a flip on already signaling CCs before setting bit 4 of AVX CC.
SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
Cmp = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
Chain = Cmp.getValue(1);
} else
Cmp = DAG.getNode(
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
if (VT.getSizeInBits() > Op.getSimpleValueType().getSizeInBits()) {
// We emitted a compare with an XMM/YMM result. Finish converting to a
// mask register using a vptestm.
EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
Cmp = DAG.getBitcast(CastVT, Cmp);
Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
DAG.getConstant(0, dl, CastVT), ISD::SETNE);
} else {
// If this is SSE/AVX CMPP, bitcast the result back to integer to match
// the result type of SETCC. The bitcast is expected to be optimized
// away during combining/isel.
Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
if (IsStrict)
return DAG.getMergeValues({Cmp, Chain}, dl);
return Cmp;
// ---- Integer compares ----
assert(!IsStrict && "Strict SETCC only handles FP operands.");
MVT VTOp0 = Op0.getSimpleValueType();
assert(VTOp0 == Op1.getSimpleValueType() &&
"Expected operands with same type!");
assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
"Invalid number of packed elements for source and destination!");
// The non-AVX512 code below works under the assumption that source and
// destination types are the same.
assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
"Value types for source and destination must be the same!");
// The result is boolean, but operands are int/float
if (VT.getVectorElementType() == MVT::i1) {
// In AVX-512 architecture setcc returns mask with i1 elements,
// But there is no compare instruction for i8 and i16 elements in KNL.
assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
"Unexpected operand type");
return LowerIntVSETCC_AVX512(Op, DAG);
// Lower using XOP integer comparisons.
if (VT.is128BitVector() && Subtarget.hasXOP()) {
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
switch (Cond) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETLT: CmpMode = 0x00; break;
case ISD::SETLE: CmpMode = 0x01; break;
case ISD::SETGT: CmpMode = 0x02; break;
case ISD::SETGE: CmpMode = 0x03; break;
case ISD::SETEQ: CmpMode = 0x04; break;
case ISD::SETNE: CmpMode = 0x05; break;
// Are we comparing unsigned or signed integers?
unsigned Opc =
ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getTargetConstant(CmpMode, dl, MVT::i8));
// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
// Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
SDValue BC0 = peekThroughBitcasts(Op0);
if (BC0.getOpcode() == ISD::AND) {
APInt UndefElts;
SmallVector<APInt, 64> EltBits;
if (getTargetConstantBitsFromNode(BC0.getOperand(1),
VT.getScalarSizeInBits(), UndefElts,
EltBits, false, false)) {
if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
Cond = ISD::SETEQ;
Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
// ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
ConstantSDNode *C1 = isConstOrConstSplat(Op1);
if (C1 && C1->getAPIntValue().isPowerOf2()) {
unsigned BitWidth = VT.getScalarSizeInBits();
// Shift the tested bit into the sign position, then smear it across the
// element with an arithmetic shift right.
unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
SDValue Result = Op0.getOperand(0);
Result = DAG.getNode(ISD::SHL, dl, VT, Result,
DAG.getConstant(ShiftAmt, dl, VT));
Result = DAG.getNode(ISD::SRA, dl, VT, Result,
DAG.getConstant(BitWidth - 1, dl, VT));
return Result;
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitIntVSETCC(Op, DAG);
if (VT == MVT::v32i16 || VT == MVT::v64i8) {
assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
return splitIntVSETCC(Op, DAG);
// If this is a SETNE against the signed minimum value, change it to SETGT.
// If this is a SETNE against the signed maximum value, change it to SETLT.
// which will be swapped to SETGT.
// Otherwise we use PCMPEQ+invert.
APInt ConstValue;
if (Cond == ISD::SETNE &&
ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
if (ConstValue.isMinSignedValue())
Cond = ISD::SETGT;
else if (ConstValue.isMaxSignedValue())
Cond = ISD::SETLT;
// If both operands are known non-negative, then an unsigned compare is the
// same as a signed compare and there's no need to flip signbits.
// TODO: We could check for more general simplifications here since we're
// computing known bits.
bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
// Special case: Use min/max operations for unsigned compares.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (ISD::isUnsignedIntSetCC(Cond) &&
(FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
TLI.isOperationLegal(ISD::UMIN, VT)) {
// If we have a constant operand, increment/decrement it and change the
// condition to avoid an invert.
if (Cond == ISD::SETUGT) {
// X > C --> X >= (C+1) --> X == umax(X, C+1)
if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
Op1 = UGTOp1;
if (Cond == ISD::SETULT) {
// X < C --> X <= (C-1) --> X == umin(X, C-1)
if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
Op1 = ULTOp1;
bool Invert = false;
unsigned Opc;
switch (Cond) {
default: llvm_unreachable("Unexpected condition code");
case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETULE: Opc = ISD::UMIN; break;
case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETUGE: Opc = ISD::UMAX; break;
// The compare is "Op0 == min/max(Op0, Op1)", inverted when required.
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
return Result;
// Try to use SUBUS and PCMPEQ.
if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
return V;
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
Cond == ISD::SETGE || Cond == ISD::SETUGE;
bool Invert = Cond == ISD::SETNE ||
(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
if (Swap)
std::swap(Op0, Op1);
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
if (VT == MVT::v2i64) {
if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
assert(Subtarget.hasSSE2() && "Don't know how to lower!");
// Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
// the odd elements over the even elements.
if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
Op0 = DAG.getConstant(0, dl, MVT::v4i32);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
static const int MaskHi[] = { 1, 1, 3, 3 };
SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
return DAG.getBitcast(VT, Result);
// Same shuffle trick for a compare against an all-ones right-hand side.
if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
static const int MaskHi[] = { 1, 1, 3, 3 };
SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
return DAG.getBitcast(VT, Result);
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations. The lower
// compare is always unsigned.
SDValue SB;
if (FlipSigns) {
SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
} else {
SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
// Cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
// Create masks for only the low parts/high parts of the 64 bit integers.
static const int MaskHi[] = { 1, 1, 3, 3 };
static const int MaskLo[] = { 0, 0, 2, 2 };
SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
return DAG.getBitcast(VT, Result);
if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
// pcmpeqd + pshufd + pand.
assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
// First cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Do the compare.
SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
// Make sure the lower and upper halves are both all-ones.
static const int Mask[] = { 1, 0, 3, 2 };
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
return DAG.getBitcast(VT, Result);
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
if (FlipSigns) {
MVT EltVT = VT.getVectorElementType();
SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
return Result;
// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
//
// Only SETEQ/SETNE comparisons are handled, and only when Op0 is a bitcast
// from a vXi1 mask type supported by the subtarget and Op1 is the constant
// zero or all-ones. On success the X86 condition code is stored in X86CC and
// a KTEST or KORTEST node of type MVT::i32 is returned; in every other case an
// empty SDValue is returned and X86CC is left untouched.
// NOTE(review): the closing braces of this function are not present in this
// excerpt, so block nesting has to be read from the statements themselves —
// confirm against the upstream file.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SDValue &X86CC) {
// Only support equality comparisons.
if (CC != ISD::SETEQ && CC != ISD::SETNE)
return SDValue();
// Must be a bitcast from vXi1.
if (Op0.getOpcode() != ISD::BITCAST)
return SDValue();
// From here on Op0 refers to the mask vector underneath the bitcast.
Op0 = Op0.getOperand(0);
MVT VT = Op0.getSimpleValueType();
// Accept v16i1 with AVX512, v8i1 with DQI, and v32i1/v64i1 with BWI.
if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
!(Subtarget.hasDQI() && VT == MVT::v8i1) &&
!(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
return SDValue();
// Pick the condition code from the constant being compared against.
X86::CondCode X86Cond;
if (isNullConstant(Op1)) {
X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
} else if (isAllOnesConstant(Op1)) {
// C flag is set for all ones.
X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
} else
return SDValue();
// If the input is an AND, we can combine its operands into the KTEST.
// KTEST is considered only for v8i1/v16i1 with DQI or v32i1/v64i1 with BWI,
// and only when the comparison is against zero.
bool KTestable = false;
if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
KTestable = true;
if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
KTestable = true;
if (!isNullConstant(Op1))
KTestable = false;
if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
SDValue LHS = Op0.getOperand(0);
SDValue RHS = Op0.getOperand(1);
X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
// If the input is an OR, we can combine its operands into the KORTEST.
// Otherwise both KORTEST operands are Op0 itself.
SDValue LHS = Op0;
SDValue RHS = Op0;
if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
LHS = Op0.getOperand(0);
RHS = Op0.getOperand(1);
X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
/// Emit flags for the given setcc condition and operands. Also returns the
/// corresponding X86 condition code constant in X86CC.
///
/// Several special-case lowerings are attempted in order (BT, vector all-zero
/// test, KORTEST/KTEST, reuse of an existing X86ISD::SETCC, carry flag of an
/// add); if none applies, the condition is translated with TranslateX86CC and
/// a compare is emitted through EmitCmp.
/// NOTE(review): closing braces are not present in this excerpt — confirm the
/// nesting against the upstream file.
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
ISD::CondCode CC, const SDLoc &dl,
SelectionDAG &DAG,
SDValue &X86CC) const {
// Optimize to BT if possible.
// Lower (X & (1 << N)) == 0 to BT(X, N).
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
// Lower ((X >>s N) & 1) != 0 to BT(X, N).
if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
return BT;
// Try to use PTEST/PMOVMSKB for a tree of ORs equality compared with 0.
// TODO: We could do AND tree with all 1s as well by using the C flag.
if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
if (SDValue CmpZ =
MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
return CmpZ;
// Try to lower using KORTEST or KTEST.
if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
return Test;
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
// these.
if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// If the input is a setcc, then reuse the input setcc or use a new one with
// the inverted condition.
if (Op0.getOpcode() == X86ISD::SETCC) {
// Inversion is needed for (setcc == 0) and (setcc != 1).
bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
X86CC = Op0.getOperand(0);
if (Invert) {
X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
// Operand 1 of the existing SETCC is returned as the flags value.
return Op0.getOperand(1);
// Try to use the carry flag from the add in place of a separate CMP for:
// (seteq (add X, -1), -1). Similar for setne.
if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (isProfitableToUseFlagOp(Op0)) {
SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
// NOTE(review): the argument list of the X86ISD::ADD node below is not
// closed in this excerpt (a continuation line appears to be missing) —
// confirm against the upstream file.
SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
// Result 1 of the new node (typed MVT::i32) is returned as the flags value.
return SDValue(New.getNode(), 1);
// Generic path: translate the condition code and emit a compare.
X86::CondCode CondCode =
TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
return EFLAGS;
/// Lower a SETCC / STRICT_FSETCC / STRICT_FSETCCS node.
///
/// Vector results are forwarded to LowerVSETCC. Scalar results must be i8.
/// For the strict opcodes operand 0 is the chain, so the compared operands
/// and the condition code are read from indices shifted by one, and the
/// result is merged with the chain before being returned.
/// NOTE(review): closing braces are not present in this excerpt — confirm the
/// nesting against the upstream file.
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
Op.getOpcode() == ISD::STRICT_FSETCCS;
MVT VT = Op->getSimpleValueType(0);
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
SDLoc dl(Op);
ISD::CondCode CC =
cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets handled by emitFlagsForSetcc.
if (Op0.getValueType() == MVT::f128) {
// Op0, Op1, CC and Chain are passed in a way that lets the callee update
// them; the code below inspects Op1 and Op0 afterwards.
softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
Op.getOpcode() == ISD::STRICT_FSETCCS);
// If softenSetCCOperands returned a scalar, use it.
if (!Op1.getNode()) {
assert(Op0.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
if (IsStrict)
return DAG.getMergeValues({Op0, Chain}, dl);
return Op0;
// Integer comparisons: compute flags plus condition code, then build SETCC.
if (Op0.getSimpleValueType().isInteger()) {
SDValue X86CC;
SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
// Handle floating point.
X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
if (CondCode == X86::COND_INVALID)
return SDValue();
// NOTE(review): no declaration of EFLAGS is visible in this excerpt, and the
// strict branch below shows only the trailing arguments of a node-creation
// call; IsSignaling is computed but not used in any visible line. Lines
// appear to be missing — confirm against the upstream file.
if (IsStrict) {
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
// The strict compare also produces a chain as its second result.
Chain = EFLAGS.getValue(1);
} else {
EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
/// Lower a SETCCCARRY node: operands are (LHS, RHS, Carry, Cond). Integer
/// operands only. The result is a setcc built by getSETCC on the flags result
/// of an X86ISD::SBB of LHS and RHS that consumes the recreated carry.
/// NOTE(review): the closing brace is not present in this excerpt.
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Carry = Op.getOperand(2);
SDValue Cond = Op.getOperand(3);
SDLoc DL(Op);
assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
// Recreate the carry if needed.
// The incoming carry value is added to all-ones with a flag-producing
// X86ISD::ADD; result 1 (MVT::i32) of that node feeds the SBB below.
// Presumably this sets the carry flag exactly when Carry is nonzero — TODO
// confirm.
EVT CarryVT = Carry.getValueType();
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
Carry, DAG.getAllOnesConstant(DL, CarryVT));
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
// Only the flags result (value 1) of the SBB is used.
return getSETCC(CC, Cmp.getValue(1), DL, DAG);
// This function returns three things: the arithmetic computation itself
// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
// flag and the condition code define the case in which the arithmetic
// computation overflows.
//
// Value and Overflow are returned as a pair; Cond is returned through the
// reference parameter. Op must be result 0 of one of the [SU]ADDO, [SU]SUBO or
// [SU]MULO nodes handled by the switch below.
// NOTE(review): no `break` statements and no closing braces are visible in
// this excerpt. As shown, every case would fall through to the UMULO
// assignments; this is most likely an artifact of the extraction — confirm
// against the upstream file.
static std::pair<SDValue, SDValue>
getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
assert(Op.getResNo() == 0 && "Unexpected result number!");
SDValue Value, Overflow;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
unsigned BaseOp = 0;
SDLoc DL(Op);
// Select the flag-producing X86 opcode and the condition that signals
// overflow for each generic overflow opcode.
switch (Op.getOpcode()) {
default: llvm_unreachable("Unknown ovf instruction!");
case ISD::SADDO:
BaseOp = X86ISD::ADD;
Cond = X86::COND_O;
case ISD::UADDO:
BaseOp = X86ISD::ADD;
// An unsigned add of the constant 1 is checked with COND_E instead of
// COND_B.
Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
case ISD::SSUBO:
BaseOp = X86ISD::SUB;
Cond = X86::COND_O;
case ISD::USUBO:
BaseOp = X86ISD::SUB;
Cond = X86::COND_B;
case ISD::SMULO:
BaseOp = X86ISD::SMUL;
Cond = X86::COND_O;
case ISD::UMULO:
BaseOp = X86ISD::UMUL;
Cond = X86::COND_O;
if (BaseOp) {
// Also sets EFLAGS.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
// Result 1 (MVT::i32) of the node is the flags value.
Overflow = Value.getValue(1);
return std::make_pair(Value, Overflow);
/// Lower an overflow-reporting arithmetic node into a MERGE_VALUES of the
/// arithmetic result from getX86XALUOOp and a setcc on its flags result.
/// The second result type of Op is asserted to be MVT::i8.
/// NOTE(review): the closing brace is not present in this excerpt.
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Lower the "add/sub/mul with overflow" instruction into a regular ins plus
// a "setcc" instruction that checks the overflow flag. The "brcond" lowering
// looks for this combo and may remove the "setcc" instruction if the "setcc"
// has only one use.
SDLoc DL(Op);
X86::CondCode Cond;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
/// Return true if opcode is a X86 logical comparison.
///
/// That is: Op is a CMP, COMI, UCOMI or FCMP node, or Op is result number 1
/// of one of the X86ISD arithmetic/logic nodes listed below.
/// NOTE(review): the closing brace is not present in this excerpt.
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getOpcode();
// Pure compare nodes qualify regardless of the result number.
if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
Opc == X86ISD::FCMP)
return true;
// For the nodes below only result 1 counts (elsewhere in this file result 1
// of such nodes is created with type MVT::i32 and used as the flags value).
if (Op.getResNo() == 1 &&
(Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
return true;
return false;
/// Return true if V is an ISD::TRUNCATE whose source operand has all of the
/// bits that the truncate discards (the top InBits - Bits bits) known to be
/// zero according to DAG.MaskedValueIsZero.
/// NOTE(review): the closing brace is not present in this excerpt.
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
if (V.getOpcode() != ISD::TRUNCATE)
return false;
SDValue VOp0 = V.getOperand(0);
unsigned InBits = VOp0.getValueSizeInBits();
unsigned Bits = V.getValueSizeInBits();
return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
/// Lower a scalar SELECT node: operand 0 is the condition, operands 1 and 2
/// are the values selected when the condition is true and false respectively.
///
/// Scalar FP selects in SSE registers are lowered first (mask compare plus
/// SELECTS, VSELECT, or an FAND/FANDN/FOR sequence). Integer selects then go
/// through a series of pattern-specific rewrites; whatever remains becomes an
/// X86ISD::CMOV on a condition code (CC) and a flags-producing node (Cond).
/// AddTest tracks whether a flags-producing test still has to be emitted for
/// Cond.
/// NOTE(review): closing braces are not present in this excerpt and several
/// statements below are visibly truncated; each is flagged where it occurs.
/// Confirm against the upstream file before relying on the exact nesting.
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
bool AddTest = true;
SDValue Cond = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);
SDLoc DL(Op);
MVT VT = Op1.getSimpleValueType();
SDValue CC;
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
bool IsAlwaysSignaling;
// NOTE(review): the initializer of SSECC is truncated in this excerpt; only
// the trailing arguments of a call are visible — confirm against upstream.
unsigned SSECC =
CondOp0, CondOp1, IsAlwaysSignaling);
if (Subtarget.hasAVX512()) {
SDValue Cmp =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
DAG.getTargetConstant(SSECC, DL, MVT::i8));
assert(!VT.isVector() && "Not a scalar type?");
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
if (SSECC < 8 || Subtarget.hasAVX()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
DAG.getTargetConstant(SSECC, DL, MVT::i8));
// If we have AVX, we can use a variable vector select (VBLENDV) instead
// of 3 logic instructions for size savings and potentially speed.
// Unfortunately, there is no scalar form of VBLENDV.
// If either operand is a +0.0 constant, don't try this. We can expect to
// optimize away at least one of the logic instructions later in that
// case, so that sequence would be faster than a variable blend.
// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
// uses XMM0 as the selection register. That may need just as many
// instructions as the AND/ANDN/OR sequence due to register moves, so
// don't bother.
if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
!isNullFPConstant(Op2)) {
// Convert to vectors, do a VSELECT, and convert back to scalar.
// All of the conversions should be optimized away.
MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
// The select condition is bitcast to the same-width integer vector type.
MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
VCmp = DAG.getBitcast(VCmpVT, VCmp);
SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
// NOTE(review): the statement consuming VSel is truncated here; only its
// trailing arguments are visible — confirm against upstream.
VSel, DAG.getIntPtrConstant(0, DL));
// Logic-op fallback: (Cmp & Op1) | (~Cmp & Op2), assuming FANDN complements
// its first operand — TODO confirm.
SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
// AVX512 fallback is to lower selects of scalar floats to masked moves.
if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
// Lower a generic SETCC condition first so the patterns below can match the
// X86-specific nodes it produces.
if (Cond.getOpcode() == ISD::SETCC) {
if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
// If the condition was updated, it's possible that the operands of the
// select were also updated (for example, EmitTest has a RAUW). Refresh
// the local references to the select operands in case they got stale.
Op1 = Op.getOperand(1);
Op2 = Op.getOperand(2);
// (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
// (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
// (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
// (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
if (Cond.getOpcode() == X86ISD::SETCC &&
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
SDValue CmpOp0 = Cmp.getOperand(0);
unsigned CondCode = Cond.getConstantOperandVal(0);
// Special handling for __builtin_ffs(X) - 1 pattern which looks like
// (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
// handle to keep the CMP with 0. This should be removed by
// optimizeCompareInst by using the flags from the BSR/TZCNT used for the
// cttz_zero_undef.
auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
(CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
// Keep Cmp.
} else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
// Y is whichever select operand is not the all-ones constant.
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
// (select (x == 0), 0, -1) -> neg & sbb
if (isNullConstant(Y) &&
(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
Zero = DAG.getConstant(0, DL, Op.getValueType());
return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
// General case: compute x - 1 and feed its flags result into SBB 0, 0.
Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
SDValue Res = // Res = 0 or -1.
DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
// Invert the mask when the all-ones operand is on the opposite side.
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
Res = DAG.getNOT(DL, Res, Res.getValueType());
return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
Cmp.getOperand(0).getOpcode() == ISD::AND &&
isOneConstant(Cmp.getOperand(0).getOperand(1))) {
SDValue Src1, Src2;
// true if Op2 is XOR or OR operator and one of its operands
// is equal to Op1
// ( a , a op b) || ( b , a op b)
auto isOrXorPattern = [&]() {
if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
(Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
// Src1 is the operand of Op2 that differs from Op1; Src2 is Op1.
Src1 =
Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
Src2 = Op1;
return true;
return false;
if (isOrXorPattern()) {
SDValue Neg;
unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
// we need mask of all zeros or ones with same size of the other
// operands.
// NOTE(review): as shown, `Neg = CmpOp0;` follows the if/else-if chain
// unconditionally and would overwrite both earlier assignments; an `else`
// line appears to be missing — confirm against upstream.
if (CmpSz > VT.getSizeInBits())
Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
else if (CmpSz < VT.getSizeInBits())
Neg = DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
DAG.getConstant(1, DL, VT));
Neg = CmpOp0;
SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Neg); // -(and (x, 0x1))
SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
// Look past (and (setcc_carry (cmp ...)), 1).
// NOTE(review): the condition below ends with `&&` and is never closed in
// this excerpt; a line appears to be missing — confirm against upstream.
if (Cond.getOpcode() == ISD::AND &&
Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
Cond = Cond.getOperand(0);
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
unsigned CondOpcode = Cond.getOpcode();
if (CondOpcode == X86ISD::SETCC ||
CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
// For scalar FP types not held in SSE registers, check via hasFPCMov whether
// this condition code can be used.
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
!isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
Cmp.getOpcode() == X86ISD::BT) { // FIXME
// Use the compare directly as the flags source; no extra test is needed.
Cond = Cmp;
AddTest = false;
} else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
// The condition is the overflow result of an arithmetic node: use the flags
// produced by the X86 arithmetic op itself.
SDValue Value;
X86::CondCode X86Cond;
std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
AddTest = false;
if (AddTest) {
// Look past the truncate if the high bits are known zero.
if (isTruncWithZeroHighBitsInput(Cond, DAG))
Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
// NOTE(review): BTCC is used below without a visible declaration, and CC is
// not updated in any visible line after a successful match — lines appear to
// be missing; confirm against upstream.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
Cond = BT;
AddTest = false;
// Still no flags producer: test the condition value itself with COND_NE.
if (AddTest) {
CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
// a < b ? -1 : 0 -> RES = ~setcc_carry
// a < b ? 0 : -1 -> RES = setcc_carry
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
if (Cond.getOpcode() == X86ISD::SUB) {
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
// Requires one select operand to be all-ones and the other to be zero.
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
(isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(isNullConstant(Op1) || isNullConstant(Op2))) {
SDValue Res =
DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
return DAG.getNOT(DL, Res, Res.getValueType());
return Res;
// X86 doesn't have an i8 cmov. If both operands are the result of a truncate
// widen the cmov and push the truncate through. This avoids introducing a new
// branch during isel and doesn't add any extensions.
if (Op.getValueType() == MVT::i8 &&
Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
if (T1.getValueType() == T2.getValueType() &&
// Exclude CopyFromReg to avoid partial register stalls.
T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
CC, Cond);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
// Or finally, promote i8 cmovs if we have CMOV,
// or i16 cmovs if it won't prevent folding a load.
// FIXME: we should not limit promotion of i8 case to only when the CMOV is
// legal, but EmitLoweredSelect() can not deal with these extensions
// being inserted between two CMOV's. (in i16 case too TBN)
if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
(Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
!MayFoldLoad(Op2))) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
SDValue Ops[] = { Op2, Op1, CC, Cond };
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
// condition is true.
// Note the operand order: the false value (Op2) comes first.
SDValue Ops[] = { Op2, Op1, CC, Cond };
return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
/// Lower an extension whose input is a vector of i1 (asserted below) to the
/// vector type of Op's first result.
///
/// The visible steps are: optionally use i32 elements when BWI is unavailable
/// and the element type is i8/i16, optionally widen to 512 bits when VLX is
/// unavailable, perform the extension (directly or as a select between -1 and
/// 0), then truncate and/or extract back to the requested type.
/// NOTE(review): closing braces are not present in this excerpt, ExtVT is used
/// without a visible declaration, and two node-creation statements are
/// truncated (flagged below) — confirm against the upstream file.
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
MVT VTElt = VT.getVectorElementType();
SDLoc dl(Op);
unsigned NumElts = VT.getVectorNumElements();
// Extend VT if the scalar type is i8/i16 and BWI is not supported.
if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
// If v16i32 is to be avoided, we'll need to split and concatenate.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
// Widen to 512-bits if VLX is not supported.
MVT WideVT = ExtVT;
if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
// Scale the element count so the widened vector is 512 bits wide.
NumElts *= 512 / ExtVT.getSizeInBits();
InVT = MVT::getVectorVT(MVT::i1, NumElts);
// NOTE(review): the statement that widens In is truncated; only its trailing
// arguments are visible — confirm against upstream.
In, DAG.getIntPtrConstant(0, dl));
WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
SDValue V;
MVT WideEltVT = WideVT.getVectorElementType();
// Emit the extension node directly when DQI (elements >= 32 bits) or BWI
// (elements <= 16 bits) is available; otherwise select between all-ones and
// zero using the mask as the condition.
if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
(Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
} else {
SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
SDValue Zero = DAG.getConstant(0, dl, WideVT);
V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
// Truncate if we had to extend i16/i8 above.
if (VT != ExtVT) {
WideVT = MVT::getVectorVT(VTElt, NumElts);
V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
// Extract back to 128/256-bit if we widened.
// NOTE(review): the extraction statement is truncated; only its trailing
// argument is visible — confirm against upstream.
if (WideVT != VT)
DAG.getIntPtrConstant(0, dl));
return V;
/// Lower an ANY_EXTEND node. Inputs with i1 vector elements are handled by
/// LowerSIGN_EXTEND_Mask; everything else is forwarded to LowerAVXExtend,
/// which requires AVX (asserted).
/// NOTE(review): the closing brace is not present in this excerpt.
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
if (InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
assert(Subtarget.hasAVX() && "Expected AVX support");
return LowerAVXExtend(Op, DAG, Subtarget);
// For sign extend this needs to handle all vector sizes and SSE4.1 and
// non-SSE4.1 targets. For zero extend this should only handle inputs of
// MVT::v64i8 when BWI is not supported, but AVX512 is.
//
// Returns an empty SDValue when the element types or the vector width /
// subtarget combination are not among those checked below.
// NOTE(review): closing braces are not present in this excerpt and the
// initializer of ExtOpc is truncated (flagged below) — confirm against the
// upstream file.
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT VT = Op->getSimpleValueType(0);
MVT InVT = In.getSimpleValueType();
MVT SVT = VT.getVectorElementType();
MVT InSVT = InVT.getVectorElementType();
assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
// Only i16/i32/i64 destination elements and i8/i16/i32 source elements.
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
return SDValue();
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
// The result width must be supported: 128-bit with SSE2, 256-bit with AVX,
// 512-bit with AVX512.
if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
!(VT.is256BitVector() && Subtarget.hasAVX()) &&
!(VT.is512BitVector() && Subtarget.hasAVX512()))
return SDValue();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
unsigned NumElts = VT.getVectorNumElements();
// For 256-bit vectors, we only need the lower (128-bit) half of the input.
// For 512-bit vectors, we need 128-bits or 256-bits.
if (InVT.getSizeInBits() > 128) {
// Input needs to be at least the same number of elements as output, and
// at least 128-bits.
int InSize = InSVT.getSizeInBits() * NumElts;
In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
InVT = In.getSimpleValueType();
// SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
// so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
// need to be handled here for 256/512-bit results.
if (Subtarget.hasInt256()) {
assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
// Element counts differ: keep the original in-register extend opcode.
if (InVT.getVectorNumElements() != NumElts)
return DAG.getNode(Op.getOpcode(), dl, VT, In);
// FIXME: Apparently we create inreg operations that could be regular
// extends.
// NOTE(review): the initializer of ExtOpc is missing in this excerpt —
// confirm against upstream.
unsigned ExtOpc =
return DAG.getNode(ExtOpc, dl, VT, In);
// pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
if (Subtarget.hasAVX()) {
assert(VT.is256BitVector() && "256-bit vector expected");
MVT HalfVT = VT.getHalfNumVectorElementsVT();
int HalfNumElts = HalfVT.getVectorNumElements();
unsigned NumSrcElts = InVT.getVectorNumElements();
// Shuffle mask that moves source elements [HalfNumElts, 2*HalfNumElts) to
// the front; the remaining lanes stay undefined.
SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
for (int i = 0; i != HalfNumElts; ++i)
HiMask[i] = HalfNumElts + i;
SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
// We should only get here for sign extend.
assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
SDValue Curr = In;
SDValue SignExt = Curr;
// As SRAI is only available on i16/i32 types, we expand only up to i32
// and handle i64 separately.
if (InVT != MVT::v4i32) {
MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
unsigned DestWidth = DestVT.getScalarSizeInBits();
unsigned Scale = DestWidth / InSVT.getSizeInBits();
unsigned InNumElts = InVT.getVectorNumElements();
unsigned DestElts = DestVT.getVectorNumElements();
// Build a shuffle mask that takes each input element and places it in the
// MSBs of the new element size.
SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
for (unsigned i = 0; i != DestElts; ++i)
Mask[i * Scale + (Scale - 1)] = i;
Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
Curr = DAG.getBitcast(DestVT, Curr);
// Arithmetic right shift by the number of added bits moves each element
// back down while replicating its sign bit.
unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
if (VT == MVT::v2i64) {
assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
// Sign is the result of (0 > Curr) per i32 lane; it is interleaved with the
// low two lanes of SignExt (mask {0, 4, 1, 5}) to form the v2i64 result.
SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
SignExt = DAG.getBitcast(VT, SignExt);
return SignExt;
/// Lower a vector SIGN_EXTEND node.
///
/// i1-element inputs go to LowerSIGN_EXTEND_Mask. v32i16 results without BWI
/// are split with splitVectorIntUnary. With AVX2 (hasInt256) the node is
/// returned unchanged. Otherwise the input is handled as two halves that are
/// concatenated into the result type.
/// NOTE(review): closing braces are not present in this excerpt — confirm the
/// nesting against the upstream file.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
if (InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
VT.getVectorElementType() == MVT::i64) &&
"Unexpected element type");
assert((InVT.getVectorElementType() == MVT::i8 ||
InVT.getVectorElementType() == MVT::i16 ||
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
assert(InVT == MVT::v32i8 && "Unexpected VT!");
return splitVectorIntUnary(Op, DAG);
// With AVX2 the node is returned as-is.
if (Subtarget.hasInt256())
return Op;
// Optimize vectors in AVX mode
// Sign extend v8i16 to v8i32 and
// v4i32 to v4i64
// Divide input vector into two parts
// for v4i32 the high shuffle mask will be {2, 3, -1, -1}
// use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
// concat the vectors to original VT
MVT HalfVT = VT.getHalfNumVectorElementsVT();
SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
unsigned NumElems = InVT.getVectorNumElements();
// Move the upper half of the input elements to the front; other lanes are
// left undefined (-1).
SmallVector<int,8> ShufMask(NumElems, -1);
for (unsigned i = 0; i != NumElems/2; ++i)
ShufMask[i] = i + NumElems/2;
SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
// NOTE(review): as shown, OpHi is still the InVT-typed shuffle when it is
// concatenated with the HalfVT-typed OpLo; no SIGN_EXTEND_VECTOR_INREG of the
// high half is visible, although the comment above says both halves are
// extended. A line appears to be missing — confirm against upstream.
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
/// Change a vector store into a pair of half-size vector stores.
///
/// The stored value must be a 256-bit or 512-bit vector (asserted). Returns an
/// empty SDValue when the store is not simple; otherwise returns a TokenFactor
/// joining the chains of the two half stores.
/// NOTE(review): the closing brace is not present in this excerpt and both
/// getStore calls are truncated (flagged below) — confirm against upstream.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
SDValue StoredVal = Store->getValue();
assert((StoredVal.getValueType().is256BitVector() ||
StoredVal.getValueType().is512BitVector()) &&
"Expecting 256/512-bit op");
// Splitting volatile memory ops is not allowed unless the operation was not
// legal to begin with. Assume the input store is legal (this transform is
// only used for targets with AVX). Note: It is possible that we have an
// illegal type like v2i128, and so we could allow splitting a volatile store
// in that case if that is important.
if (!Store->isSimple())
return SDValue();
SDLoc DL(Store);
SDValue Value0, Value1;
std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
// The second half is stored at the base pointer plus the store size of the
// first half.
unsigned HalfOffset = Value0.getValueType().getStoreSize();
SDValue Ptr0 = Store->getBasePtr();
SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfOffset, DL);
// NOTE(review): the argument lists of both getStore calls below are cut off
// in this excerpt.
SDValue Ch0 =
DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
/// type.
///
/// Both StoreVT and the stored value must be 128-bit vectors (asserted).
/// Returns an empty SDValue when the store is not simple; otherwise returns a
/// TokenFactor over the collected per-element store chains.
/// NOTE(review): the parameter is named StoreVT although the comment above
/// says TargetVT. The closing braces are not present in this excerpt, the
/// getStore call is truncated, and no visible line appends to Stores —
/// confirm against the upstream file.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
SelectionDAG &DAG) {
SDValue StoredVal = Store->getValue();
assert(StoreVT.is128BitVector() &&
StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
StoredVal = DAG.getBitcast(StoreVT, StoredVal);
// Splitting volatile memory ops is not allowed unless the operation was not
// legal to begin with. We are assuming the input op is legal (this transform
// is only used for targets with AVX).
if (!Store->isSimple())
return SDValue();
MVT StoreSVT = StoreVT.getScalarType();
unsigned NumElems = StoreVT.getVectorNumElements();
unsigned ScalarSize = StoreSVT.getStoreSize();
SDLoc DL(Store);
SmallVector<SDValue, 4> Stores;
// One scalar store per element, each at byte offset i * ScalarSize from the
// base pointer.
for (unsigned i = 0; i != NumElems; ++i) {
unsigned Offset = i * ScalarSize;
SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
DAG.getIntPtrConstant(i, DL));
SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
/// Custom-lower a store node. The visible paths are:
///  - vXi1 stores (at most 8 elements): converted to an i8 store;
///  - truncating stores: not handled (empty SDValue);
///  - 256-bit vectors, or v32i16/v64i8 without BWI: split only when the
///    stored value is a single-use concatenation;
///  - 64-bit vectors: widened, then stored either as one 64-bit scalar (SSE2)
///    or via X86ISD::VEXTRACT_STORE (SSE1).
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
SDLoc dl(St);
SDValue StoredVal = St->getValue();
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
if (StoredVal.getValueType().isVector() &&
StoredVal.getValueType().getVectorElementType() == MVT::i1) {
assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
"Unexpected VT");
assert(!St->isTruncatingStore() && "Expected non-truncating store");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
// Insert the mask at element 0 of an undef v16i1, reinterpret that as i16,
// and truncate to i8 so the value can be stored as a single byte.
StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
DAG.getUNDEF(MVT::v16i1), StoredVal,
DAG.getIntPtrConstant(0, dl));
StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
// NOTE(review): the tail of this getStore argument list is not present in
// this listing.
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
// Truncating stores are not custom-lowered here.
if (St->isTruncatingStore())
return SDValue();
// If this is a 256-bit store of concatenated ops, we are better off splitting
// that store into two 128-bit stores. This avoids spurious use of 256-bit ops
// and each half can execute independently. Some cores would split the op into
// halves anyway, so the concat (vinsertf128) is purely an extra op.
MVT StoreVT = StoredVal.getSimpleValueType();
if (StoreVT.is256BitVector() ||
((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
!Subtarget.hasBWI())) {
SmallVector<SDValue, 4> CatOps;
// Only split when the stored value has one use and is recognised as a
// concatenation by collectConcatOps.
if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
return splitVectorStore(St, DAG);
return SDValue();
// From here on the stored value is asserted to be a 64-bit vector whose type
// action is "widen".
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
"Unexpected VT");
assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
TargetLowering::TypeWidenVector && "Unexpected type action!");
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
// NOTE(review): the second CONCAT_VECTORS operand is not present in this
// listing.
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
if (Subtarget.hasSSE2()) {
// Widen the vector, cast to a v2x64 type, extract the single 64-bit element
// and store it.
// The scalar type is i64 only for integer vectors on 64-bit targets;
// otherwise f64 is used.
MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
MVT CastVT = MVT::getVectorVT(StVT, 2);
StoredVal = DAG.getBitcast(CastVT, StoredVal);
StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
DAG.getIntPtrConstant(0, dl));
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
// SSE1-only fallback: emit a VEXTRACT_STORE memory intrinsic with an i64
// memory type.
assert(Subtarget.hasSSE1() && "Expected SSE");
SDVTList Tys = DAG.getVTList(MVT::Other);
SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
//
// In the code visible here, only integer vector loads with an i1 element type
// are rewritten (as an i8 load); every other load returns an empty SDValue.
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT RegVT = Op.getSimpleValueType();
assert(RegVT.isVector() && "We only custom lower vector loads.");
assert(RegVT.isInteger() &&
"We only custom lower integer vector loads.");
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
if (RegVT.getVectorElementType() == MVT::i1) {
assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
// Load the mask as a single byte from the original address.
// NOTE(review): the tail of this getLoad argument list is not present in
// this listing.
SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getOriginalAlign(),
// Replace chain users with the new chain.
assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
// Any-extend the byte to i16 so it can be bitcast to v16i1 below.
SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
// NOTE(review): the start of the statement that consumes the next two
// operand lines is not present in this listing.
DAG.getBitcast(MVT::v16i1, Val),
DAG.getIntPtrConstant(0, dl));
// Return both the loaded value and the new load's chain (result 1).
return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
return SDValue();
/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
/// each of which has no other use apart from the AND / OR.
///
/// \param Op        The node to inspect.
/// \param[out] Opc  Always set to the opcode of \p Op, even when the function
///                  returns false.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
Opc = Op.getOpcode();
if (Opc != ISD::OR && Opc != ISD::AND)
return false;
// NOTE(review): the final conjunct of this expression is not present in this
// listing.
return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
Op.getOperand(0).hasOneUse() &&
Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
/// Lower a conditional branch node to one or more X86ISD::BRCOND nodes.
///
/// The node's operands are read as (chain, condition, destination). Handled
/// shapes of the condition, in order:
///  - a SETCC whose operands are not f128: overflow-intrinsic comparisons,
///    integer comparisons, and floating-point comparisons (with two-branch
///    sequences for SETOEQ and SETUNE);
///  - a bare overflow-intrinsic result;
///  - anything else: tested as "(Cond & 1) != 0".
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Cond = Op.getOperand(1);
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
if (Cond.getOpcode() == ISD::SETCC &&
Cond.getOperand(0).getValueType() != MVT::f128) {
SDValue LHS = Cond.getOperand(0);
SDValue RHS = Cond.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Special case for
// setcc([su]{add,sub,mul}o == 0)
// setcc([su]{add,sub,mul}o != 1)
if (ISD::isOverflowIntrOpRes(LHS) &&
(CC == ISD::SETEQ || CC == ISD::SETNE) &&
(isNullConstant(RHS) || isOneConstant(RHS))) {
SDValue Value, Overflow;
X86::CondCode X86Cond;
std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
// "== 0" and "!= 1" both branch on the flag being clear, so use the
// opposite of the condition returned by getX86XALUOOp.
if ((CC == ISD::SETEQ) == isNullConstant(RHS))
X86Cond = X86::GetOppositeBranchCondition(X86Cond);
SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
// NOTE(review): the last operand of this BRCOND is not present in this
// listing.
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
// Integer comparison: emitFlagsForSetcc produces the flags value and fills in
// the condition-code operand.
if (LHS.getSimpleValueType().isInteger()) {
SDValue CCVal;
SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
if (CC == ISD::SETOEQ) {
// For FCMP_OEQ, we can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
// have a fall-through edge, because this requires an explicit
// jmp when the condition is false.
if (Op.getNode()->hasOneUse()) {
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
// to implement FCMP_OEQ.
if (User->getOpcode() == ISD::BR) {
// Swap the targets: the unconditional branch now goes to the original
// destination and the conditional branches below go to its old target.
SDValue FalseBB = User->getOperand(1);
SDNode *NewBR =
DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
assert(NewBR == User);
Dest = FalseBB;
// Two chained branches on the same FCMP: first on COND_NE, then on
// COND_P.
SDValue Cmp =
DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
CCVal, Cmp);
CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
} else if (CC == ISD::SETUNE) {
// For FCMP_UNE, we can emit
// two branches instead of an explicit OR instruction with a
// separate test.
SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
Chain =
DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
} else {
// Any other floating-point condition: translate it to a single X86
// condition code and branch on one FCMP.
X86::CondCode X86Cond =
TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
// The condition is itself the result of an overflow intrinsic: branch on the
// condition code reported by getX86XALUOOp.
if (ISD::isOverflowIntrOpRes(Cond)) {
SDValue Value, Overflow;
X86::CondCode X86Cond;
std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
// Look past the truncate if the high bits are known zero.
if (isTruncWithZeroHighBitsInput(Cond, DAG))
Cond = Cond.getOperand(0);
EVT CondVT = Cond.getValueType();
// Add an AND with 1 if we don't already have one.
if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
Cond =
DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
// Generic fallback: branch when the (masked) condition compares not-equal to
// zero.
SDValue LHS = Cond;
SDValue RHS = DAG.getConstant(0, dl, CondVT);
SDValue CCVal;
SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
//
// Operands are read as (chain, size, alignment). The allocation is wrapped in
// a CALLSEQ_START/CALLSEQ_END pair, and the function returns the merged pair
// (allocated address, output chain). Which sequence is emitted depends on the
// target, split-stack mode and stack-probe settings computed below.
// NOTE(review): the line carrying the return type of this definition is not
// present in this listing.
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
bool EmitStackProbeCall = hasStackProbeSymbol(MF);
// Special lowering is required on Windows (non-MachO) targets, with split
// stacks, or when a stack-probe symbol is present.
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
SplitStack || EmitStackProbeCall;
SDLoc dl(Op);
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
MaybeAlign Alignment(Op.getConstantOperandVal(2));
EVT VT = Node->getValueType(0);
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (!Lower) {
// No special lowering required: either a probed alloca node or a plain
// stack-pointer subtraction.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const Align StackAlign = TFI.getStackAlign();
if (hasInlineStackProbe(MF)) {
// Copy the size into a fresh virtual register and pass that register to
// PROBED_ALLOCA.
MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
Register Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
} else {
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
// Only realign when the requested alignment exceeds the stack alignment;
// the mask clears the low bits of the new stack pointer.
if (Alignment && *Alignment > StackAlign)
Result =
DAG.getNode(ISD::AND, dl, VT, Result,
DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
} else if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
if (Is64Bit) {
// The 64 bit implementation of segmented stacks needs to clobber both r10
// and r11. This makes it impossible to use it along with nested parameters.
const Function &F = MF.getFunction();
for (const auto &A : F.args()) {
if (A.hasNestAttr())
report_fatal_error("Cannot use segmented stacks with functions that "
"have nested arguments.");
// Copy the size into a fresh virtual register and pass that register to
// SEG_ALLOCA.
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
Register Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
} else {
// Remaining case: emit WIN_ALLOCA, then read the stack pointer back as the
// result.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Register SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
// If an alignment was given, mask the stack pointer down and write it back.
if (Alignment) {
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
Result = SP;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
// Return the allocated address together with the output chain.
SDValue Ops[2] = {Result, Chain};
return DAG.getMergeValues(Ops, dl);
/// Lower a va_start node.
///
/// For non-64-bit targets and for Win64 calling conventions, va_start is a
/// single store of the VarArgsFrameIndex address. Otherwise four fields of the
/// __va_list_tag structure are stored (gp_offset, fp_offset,
/// overflow_arg_area, reg_save_area) and the function returns a TokenFactor
/// built from \c MemOps.
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SDLoc DL(Op);
if (!Subtarget.is64Bit() ||
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// NOTE(review): the tail of this getStore argument list is not present in
// this listing.
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
// __va_list_tag:
// gp_offset (0 - 6 * 8)
// fp_offset (48 - 48 + 8 * 16)
// overflow_arg_area (point to parameters coming in memory).
// reg_save_area
SmallVector<SDValue, 8> MemOps;
// FIN tracks the address of the field currently being written; it starts at
// the va_list pointer operand.
SDValue FIN = Op.getOperand(1);
// Store gp_offset
SDValue Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
// Store fp_offset
FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
MachinePointerInfo(SV, 4));
// Store ptr to overflow_arg_area
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
Store =
DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
// Store ptr to reg_save_area.
// The step to this field (and its offset below) depends on whether the
// target is 64-bit LP64: 8 bytes (offset 16) if so, otherwise 4 (offset 12).
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
Store = DAG.getStore(
Op.getOperand(0), DL, RSFIN, FIN,
MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
/// Lower a va_arg node on 64-bit targets.
///
/// Win64 calling conventions use the generic DAG.expandVAArg expansion.
/// Otherwise an X86ISD::VAARG_64 memory-intrinsic node is created; it yields
/// the address of the next argument plus a chain, and the argument itself is
/// then loaded from that address.
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget.is64Bit() &&
"LowerVAARG only handles 64-bit va_arg!");
assert(Op.getNumOperands() == 4);
MachineFunction &MF = DAG.getMachineFunction();
if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
// The Win64 ABI uses char* instead of a structure.
return DAG.expandVAArg(Op.getNode());
// Operands are read as (chain, va_list pointer, source value, alignment).
SDValue Chain = Op.getOperand(0);
SDValue SrcPtr = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
unsigned Align = Op.getConstantOperandVal(3);
SDLoc dl(Op);
EVT ArgVT = Op.getNode()->getValueType(0);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
uint8_t ArgMode;
// Decide which area this value should be read from.
// TODO: Implement the AMD64 ABI in its entirety. This simple
// selection mechanism works only for the basic types.
assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
} else {
assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
"Unhandled argument type in LowerVAARG");
ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
// NOTE(review): the final conjunct/message of this assert is not present in
// this listing.
assert(!Subtarget.useSoftFloat() &&
!(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
// Insert VAARG_64 node into the DAG
// VAARG_64 returns two values: Variable Argument Address, Chain
SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
DAG.getConstant(ArgMode, dl, MVT::i8),
DAG.getConstant(Align, dl, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
// The node is flagged as both loading and storing through the va_list
// source value.
SDValue VAARG = DAG.getMemIntrinsicNode(
X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
/*Align=*/None, MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
Chain = VAARG.getValue(1);
// Load the next argument and return it
return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
/// Lower a va_copy node on 64-bit targets.
///
/// Win64 calling conventions use the generic DAG.expandVACopy expansion.
/// Otherwise the va_list is copied with a 24-byte, 8-byte-aligned memcpy from
/// the source pointer to the destination pointer.
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
// where a va_list is still an i8*.
assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
// NOTE(review): the argument of this isCallingConvWin64 call is not present
// in this listing.
if (Subtarget.isCallingConvWin64(
// Probably a Win64 va_copy.
return DAG.expandVACopy(Op.getNode());
// Operands are read as (chain, destination pointer, source pointer,
// destination source-value, source source-value).
SDValue Chain = Op.getOperand(0);
SDValue DstPtr = Op.getOperand(1);
SDValue SrcPtr = Op.getOperand(2);
const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
SDLoc DL(Op);
return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL),
Align(8), /*isVolatile*/ false, false, false,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
//
// Accepts the generic shift opcode (ISD::SHL/SRL/SRA) or either X86 form of
// the same shift, and returns the X86 variable-count opcode when IsVariable is
// true, or the immediate-count opcode otherwise. Any other opcode reaches
// llvm_unreachable.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
switch (Opc) {
// Left shifts.
case ISD::SHL:
case X86ISD::VSHL:
case X86ISD::VSHLI:
return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
// Logical right shifts.
case ISD::SRL:
case X86ISD::VSRL:
case X86ISD::VSRLI:
return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
// Arithmetic right shifts.
case ISD::SRA:
case X86ISD::VSRA:
case X86ISD::VSRAI:
return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
llvm_unreachable("Unknown target vector shift node");
/// Handle vector element shifts where the shift amount is a constant.
/// Takes immediate version of shift as input.
///
/// \param Opc      One of X86ISD::VSHLI, VSRLI or VSRAI (asserted below).
/// \param dl       Debug location for the created nodes.
/// \param VT       Result vector type; \p SrcOp is bitcast to it if needed.
/// \param SrcOp    The vector being shifted.
/// \param ShiftAmt The constant shift amount, in bits.
/// \param DAG      The DAG used to build nodes.
/// \returns \p SrcOp (after the bitcast) for a zero shift, a folded build
///          vector when \p SrcOp is a build vector of constants, or a shift
///          node with an i8 target-constant amount.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, uint64_t ShiftAmt,
SelectionDAG &DAG) {
MVT ElementType = VT.getVectorElementType();
// Bitcast the source vector to the output type, this is mainly necessary for
// vXi8/vXi64 shifts.
if (VT != SrcOp.getSimpleValueType())
SrcOp = DAG.getBitcast(VT, SrcOp);
// Fold this packed shift into its first operand if ShiftAmt is 0.
if (ShiftAmt == 0)
return SrcOp;
// Check for ShiftAmt >= element width
if (ShiftAmt >= ElementType.getSizeInBits()) {
// Arithmetic right shifts are clamped to (element width - 1).
if (Opc == X86ISD::VSRAI)
ShiftAmt = ElementType.getSizeInBits() - 1;
// NOTE(review): as listed, this return is not guarded by an `else`, which
// would make the clamp above dead code — confirm against the full source.
return DAG.getConstant(0, dl, VT);
assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
&& "Unknown target vector shift-by-constant node");
// Fold this packed vector shift into a build vector if SrcOp is a
// vector of Constants or UNDEFs.
if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
SmallVector<SDValue, 8> Elts;
unsigned NumElts = SrcOp->getNumOperands();
// Each case below folds the shift element by element; undef elements are
// replaced by the constant 0.
switch (Opc) {
default: llvm_unreachable("Unknown opcode!");
case X86ISD::VSHLI:
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
// Must produce 0s in the correct bits.
Elts.push_back(DAG.getConstant(0, dl, ElementType));
auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
case X86ISD::VSRLI:
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
// Must produce 0s in the correct bits.
Elts.push_back(DAG.getConstant(0, dl, ElementType));
auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
case X86ISD::VSRAI:
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
// All shifted in bits must be the same so use 0.
Elts.push_back(DAG.getConstant(0, dl, ElementType));
auto *ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
return DAG.getBuildVector(VT, dl, Elts);
// Not foldable: emit the shift node with the amount as an i8 target constant.
return DAG.getNode(Opc, dl, VT, SrcOp,
DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
/// Handle vector element shifts where the shift amount may or may not be a
/// constant. Takes immediate version of shift as input.
///
/// A constant amount is forwarded to getTargetVShiftByConstNode. Otherwise the
/// opcode is switched to its variable-count form and the scalar amount
/// (asserted to be i32 or i64) is placed into a 128-bit vector, which is then
/// bitcast to a 128-bit vector with the element type of \p VT.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, SDValue ShAmt,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT SVT = ShAmt.getSimpleValueType();
assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
// Catch shift-by-constant.
if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
CShAmt->getZExtValue(), DAG);
// Change opcode to non-immediate version.
Opc = getTargetVShiftUniformOpcode(Opc, true);
// Need to build a vector containing shift amount.
// SSE/AVX packed shifts only use the lower 64-bit of the shift count.
// +====================+============+=======================================+
// | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
// +====================+============+=======================================+
// | i64 | Yes, No | Use ShAmt as lowest elt |
// | i32 | Yes | zero-extend in-reg |
// | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
// | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
// | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
// +====================+============+=======================================+
if (SVT == MVT::i64)
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
(ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
// Zero-extended i8/i16 element extract: strip the zext and place the narrow
// scalar into a v16i8 or v8i16 vector.
ShAmt = ShAmt.getOperand(0);
MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
// NOTE(review): the first line of the statement under this `if` is not
// present in this listing; only its trailing operands are visible.
if (Subtarget.hasSSE41())
MVT::v2i64, ShAmt);
else {
// Without SSE4.1, the byte-shift count is the number of bytes above the
// lowest element: (128 - element bits) / 8.
SDValue ByteShift = DAG.getTargetConstant(
(128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
// NOTE(review): the final operand of both byte-shift nodes is not present
// in this listing.
ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
} else if (Subtarget.hasSSE41() &&
ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
MVT::v2i64, ShAmt);
} else {
// Fallback: build a v4i32 whose first two operands are the amount and zero.
SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
// The return type has to be a 128-bit type with the same element
// type as the input type.
MVT EltVT = VT.getVectorElementType();
MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
ShAmt = DAG.getBitcast(ShVT, ShAmt);
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
/// Return Mask with the necessary casting or extending
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
///
/// All-ones and zero masks are returned directly as constants of \p MaskVT.
/// An i64 mask on a 32-bit target is split into two i32 halves that are
/// bitcast to v32i1 and concatenated. Any other mask is bitcast to an i1
/// vector and the subvector starting at element 0 is extracted as \p MaskVT.
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl) {
if (isAllOnesConstant(Mask))
return DAG.getConstant(1, dl, MaskVT);
if (X86::isZeroNode(Mask))
return DAG.getConstant(0, dl, MaskVT);
assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
// In 32-bit mode a bitcast of i64 is illegal, so split the mask into two
// i32 halves and convert each half separately.
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
DAG.getConstant(0, dl, MVT::i32));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
DAG.getConstant(1, dl, MVT::i32));
Lo = DAG.getBitcast(MVT::v32i1, Lo);
Hi = DAG.getBitcast(MVT::v32i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
} else {
// NOTE(review): the element-count argument of this getVectorVT call is not
// present in this listing.
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
// In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
// are extracted by EXTRACT_SUBVECTOR.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
DAG.getBitcast(BitcastVT, Mask),
DAG.getIntPtrConstant(0, dl));
/// Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
///
/// In the code visible here, an all-ones mask returns \p Op unchanged, and an
/// undef \p PreservedSrc is replaced by a zero vector before the select is
/// built.
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
// The mask type has one i1 element per element of the result vector.
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
unsigned OpcodeSelect = ISD::VSELECT;
SDLoc dl(Op);
// An all-ones mask selects every element of Op, so no select is needed.
if (isAllOnesConstant(Mask))
return Op;
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
// With no pass-through value, masked-off elements become zero.
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
/// Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask is coming as MVT::i8 and it should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
///
/// A constant mask with bit 0 set returns \p Op unchanged. For FSETCCM,
/// FSETCCM_SAE and VFPCLASSS the result is ANDed with the mask rather than
/// selected.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// Only bit 0 of the mask matters for a scalar operation.
if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
if (MaskConst->getZExtValue() & 0x1)
return Op;
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
// Reinterpret the i8 mask as v8i1 and keep only element 0 as a v1i1.
SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
DAG.getBitcast(MVT::v8i1, Mask),
DAG.getIntPtrConstant(0, dl));
if (Op.getOpcode() == X86ISD::FSETCCM ||
Op.getOpcode() == X86ISD::FSETCCM_SAE ||
Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
// With no pass-through value, a masked-off result becomes zero.
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
/// Return the size in bytes of the EH registration node for \p Fn, based on
/// its personality function: 24 for MSVC_X86SEH and 16 for MSVC_CXX.
/// NOTE(review): the calls that consume the two diagnostic strings below are
/// not present in this listing — presumably fatal-error reports; confirm.
static int getSEHRegistrationNodeSize(const Function *Fn) {
if (!Fn->hasPersonalityFn())
"querying registration node size for function without personality");
// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
// WinEHStatePass for the full struct definition.
switch (classifyEHPersonality(Fn->getPersonalityFn())) {
case EHPersonality::MSVC_X86SEH: return 24;
case EHPersonality::MSVC_CXX: return 16;
default: break;
"can only recover FP for 32-bit MSVC EH personality functions");
/// When the MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
/// Here's the math:
/// RegNodeBase = EntryEBP - RegNodeSize
/// ParentFP = RegNodeBase - ParentFrameOffset
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
/// subtracting the offset (negative on x86) takes us back to the parent FP.
///
/// \param DAG      The DAG used to build nodes.
/// \param Fn       The parent function whose frame pointer is recovered.
/// \param EntryEBP The incoming frame-pointer value.
/// \returns \p EntryEBP unchanged if \p Fn has no personality function;
///          EntryEBP + ParentFrameOffset on 64-bit targets; otherwise
///          (EntryEBP - RegNodeSize) - ParentFrameOffset.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
SDValue EntryEBP) {
MachineFunction &MF = DAG.getMachineFunction();
SDLoc dl;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
// It's possible that the parent function no longer has a personality function
// if the exceptional code was optimized away, in which case we just return
// the incoming EBP.
if (!Fn->hasPersonalityFn())
return EntryEBP;
// Get an MCSymbol that will ultimately resolve to the frame offset of the EH
// registration, or the .set_setframe offset.
// NOTE(review): the initializer of OffsetSym is not present in this listing.
MCSymbol *OffsetSym =
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
SDValue ParentFrameOffset =
DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
// Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
// prologue to RBP in the parent function.
const X86Subtarget &Subtarget =
static_cast<const X86Subtarget &>(DAG.getSubtarget());
if (Subtarget.is64Bit())
return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
int RegNodeSize = getSEHRegistrationNodeSize(Fn);
// RegNodeBase = EntryEBP - RegNodeSize
// ParentFP = RegNodeBase - ParentFrameOffset
SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
DAG.getConstant(RegNodeSize, dl, PtrVT));
return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
// Helper to detect if the operand is CUR_DIRECTION rounding mode.
auto isRoundModeCurDirection = [](SDValue Rnd) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
return false;
auto isRoundModeSAE = [](SDValue Rnd) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
unsigned RC = C->getZExtValue();
// Clear the NO_EXC bit and check remaining bits.
// As a convenience we allow no other bits or explicitly
// current direction.
return false;
auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
RC = C->getZExtValue();
// Clear the NO_EXC bit and check remaining bits.
return false;
SDLoc dl(Op);
unsigned IntNo = Op.getConstantOperandVal(0);
MVT VT = Op.getSimpleValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
switch(IntrData->Type) {
case INTR_TYPE_1OP: {
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(2);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
SDValue Sae = Op.getOperand(2);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
return SDValue();
return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
case INTR_TYPE_2OP: {
SDValue Src2 = Op.getOperand(2);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(3);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Op.getOperand(1), Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Src2);
SDValue Sae = Op.getOperand(3);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
return SDValue();
return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
case INTR_TYPE_3OP_IMM8: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Src1, Src2, Src3,
DAG.getTargetConstant(RC, dl, MVT::i32));
if (!isRoundModeCurDirection(Rnd))
return SDValue();
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
{Src1, Src2, Src3});
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
// We add rounding mode to the Node when
// - RC Opcode is specified and
// - RC is not "current direction".
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return getVectorMaskingNode(
DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
Mask, PassThru, Subtarget, DAG);
if (!isRoundModeCurDirection(Rnd))
return SDValue();
return getVectorMaskingNode(
DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
Subtarget, DAG);
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Rnd = Op.getOperand(4);
unsigned Opc;
if (isRoundModeCurDirection(Rnd))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Rnd))
Opc = IntrData->Opc1;
return SDValue();
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
Subtarget, DAG);
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
// There are 2 kinds of intrinsics in this group:
// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
// (2) With rounding mode and sae - 7 operands.
bool HasRounding = IntrWithRoundingModeOpcode != 0;
if (Op.getNumOperands() == (5U + HasRounding)) {
if (HasRounding) {
SDValue Rnd = Op.getOperand(5);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
return getScalarMaskingNode(
DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32)),
Mask, passThru, Subtarget, DAG);
if (!isRoundModeCurDirection(Rnd))
return SDValue();
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
Mask, passThru, Subtarget, DAG);
assert(Op.getNumOperands() == (6U + HasRounding) &&
"Unexpected intrinsic form");
SDValue RoundingMode = Op.getOperand(5);
unsigned Opc = IntrData->Opc0;
if (HasRounding) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrWithRoundingModeOpcode;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
Src2, RoundingMode),
Mask, passThru, Subtarget, DAG);
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Rnd = Op.getOperand(5);
SDValue NewOp;
unsigned RC = 0;
if (isRoundModeCurDirection(Rnd))
NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
else if (isRoundModeSAEToX(Rnd, RC))
NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
return SDValue();
return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Sae = Op.getOperand(5);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
return SDValue();
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
Mask, passThru, Subtarget, DAG);
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue NewOp;
if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
unsigned RC = 0;
if (isRoundModeSAEToX(Rnd, RC))
NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
DAG.getTargetConstant(RC, dl, MVT::i32));
else if (!isRoundModeCurDirection(Rnd))
return SDValue();
if (!NewOp)
NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(5);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
Mask, PassThru, Subtarget, DAG);
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Sae = Op.getOperand(6);
unsigned Opc;
if (isRoundModeCurDirection(Sae))
Opc = IntrData->Opc0;
else if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
return SDValue();
return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
case BLENDV: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
Src3 = DAG.getBitcast(MaskVT, Src3);
// Reverse the operands to match VSELECT order.
return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
case VPERM_2OP : {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
// Swap Src1 and Src2 in the node creation
return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
case IFMA_OP:
// NOTE: We need to swizzle the operands to pass the multiply operands
// first.
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case FPCLASSS: {
SDValue Src1 = Op.getOperand(1);
SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
Subtarget, DAG);
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
DAG.getConstant(0, dl, MVT::v8i1),
FPclassMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(MVT::i8, Ins);
case CMP_MASK_CC: {
MVT MaskVT = Op.getSimpleValueType();
SDValue CC = Op.getOperand(3);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(4);
if (isRoundModeSAE(Sae))
return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC, Sae);
if (!isRoundModeCurDirection(Sae))
return SDValue();
//default rounding mode
return DAG.getNode(IntrData->Opc0, dl, MaskVT,
{Op.getOperand(1), Op.getOperand(2), CC});
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue CC = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Cmp;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(5);
if (isRoundModeSAE(Sae))
Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
else if (!isRoundModeCurDirection(Sae))
return SDValue();
//default rounding mode
if (!Cmp.getNode())
Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
DAG.getConstant(0, dl, MVT::v8i1),
CmpMask, DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(MVT::i8, Ins);
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
// Some conditions require the operands to be swapped.
if (CC == ISD::SETLT || CC == ISD::SETLE)
std::swap(LHS, RHS);
SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
SDValue SetCC;
switch (CC) {
case ISD::SETEQ: { // (ZF = 0 and PF = 0)
SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
case ISD::SETNE: { // (ZF = 1 or PF = 1)
SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
case ISD::SETGT: // (CF = 0 and ZF = 0)
case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
case ISD::SETGE: // CF = 0
case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
llvm_unreachable("Unexpected illegal condition!");
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
case COMI_RM: { // Comparison intrinsics with Sae
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
unsigned CondVal = Op.getConstantOperandVal(3);
SDValue Sae = Op.getOperand(4);
SDValue FCmp;
if (isRoundModeCurDirection(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
DAG.getTargetConstant(CondVal, dl, MVT::i8));
else if (isRoundModeSAE(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
return SDValue();
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
DAG.getConstant(0, dl, MVT::v16i1),
FCmp, DAG.getIntPtrConstant(0, dl));
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
DAG.getBitcast(MVT::i16, Ins));
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), Subtarget,
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
return Op.getOperand(1);
// Avoid false dependency.
if (PassThru.isUndef())
PassThru = DAG.getConstant(0, dl, VT);
return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue Imm = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Passthru = (IntrData->Type == FIXUPIMM)
? Src1
: getZeroVector(VT, Subtarget, DAG, dl);
unsigned Opc = IntrData->Opc0;
if (IntrData->Opc1 != 0) {
SDValue Sae = Op.getOperand(6);
if (isRoundModeSAE(Sae))
Opc = IntrData->Opc1;
else if (!isRoundModeCurDirection(Sae))
return SDValue();
SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
case ROUNDP: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
auto Round = cast<ConstantSDNode>(Op.getOperand(2));
SDValue RoundingMode =
DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), RoundingMode);
case ROUNDS: {
assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
auto Round = cast<ConstantSDNode>(Op.getOperand(3));
SDValue RoundingMode =
DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), RoundingMode);
case BEXTRI: {
assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode");
// The control is a TargetConstant, but we need to convert it to a
// ConstantSDNode.
uint64_t Imm = Op.getConstantOperandVal(2);
SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType());
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Control);
case ADX: {
SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
SDValue Res;
// If the carry in is zero, then we should just use ADD/SUB instead of
if (isNullConstant(Op.getOperand(1))) {
Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
} else {
SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
DAG.getConstant(-1, dl, MVT::i8));
Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
Op.getOperand(3), GenCF.getValue(1));
SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
SDValue Results[] = { SetCC, Res };
return DAG.getMergeValues(Results, dl);
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
if (isAllOnesConstant(Mask))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
{Src, PassThru, Mask});
SDValue Src = Op.getOperand(1);
SDValue Rnd = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
if (isAllOnesConstant(Mask))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
PassThru, Mask);
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
if (ISD::isBuildVectorAllOnes(Mask.getNode()))
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
// Break false dependency.
if (PassThru.isUndef())
PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
// or testp pattern and a setcc for the result.
case Intrinsic::x86_avx512_ktestc_b:
case Intrinsic::x86_avx512_ktestc_w:
case Intrinsic::x86_avx512_ktestc_d:
case Intrinsic::x86_avx512_ktestc_q:
case Intrinsic::x86_avx512_ktestz_b:
case Intrinsic::x86_avx512_ktestz_w:
case Intrinsic::x86_avx512_ktestz_d:
case Intrinsic::x86_avx512_ktestz_q:
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_sse41_ptestnzc:
case Intrinsic::x86_avx_ptestz_256:
case Intrinsic::x86_avx_ptestc_256:
case Intrinsic::x86_avx_ptestnzc_256:
case Intrinsic::x86_avx_vtestz_ps:
case Intrinsic::x86_avx_vtestc_ps:
case Intrinsic::x86_avx_vtestnzc_ps:
case Intrinsic::x86_avx_vtestz_pd:
case Intrinsic::x86_avx_vtestc_pd:
case Intrinsic::x86_avx_vtestnzc_pd:
case Intrinsic::x86_avx_vtestz_ps_256:
case Intrinsic::x86_avx_vtestc_ps_256:
case Intrinsic::x86_avx_vtestnzc_ps_256:
case Intrinsic::x86_avx_vtestz_pd_256:
case Intrinsic::x86_avx_vtestc_pd_256:
case Intrinsic::x86_avx_vtestnzc_pd_256: {
unsigned TestOpc = X86ISD::PTEST;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
case Intrinsic::x86_avx512_ktestc_b:
case Intrinsic::x86_avx512_ktestc_w:
case Intrinsic::x86_avx512_ktestc_d:
case Intrinsic::x86_avx512_ktestc_q:
// CF = 1
TestOpc = X86ISD::KTEST;
X86CC = X86::COND_B;
case Intrinsic::x86_avx512_ktestz_b:
case Intrinsic::x86_avx512_ktestz_w:
case Intrinsic::x86_avx512_ktestz_d:
case Intrinsic::x86_avx512_ktestz_q:
TestOpc = X86ISD::KTEST;
X86CC = X86::COND_E;
case Intrinsic::x86_avx_vtestz_ps:
case Intrinsic::x86_avx_vtestz_pd:
case Intrinsic::x86_avx_vtestz_ps_256:
case Intrinsic::x86_avx_vtestz_pd_256:
TestOpc = X86ISD::TESTP;
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_avx_ptestz_256:
// ZF = 1
X86CC = X86::COND_E;
case Intrinsic::x86_avx_vtestc_ps:
case Intrinsic::x86_avx_vtestc_pd:
case Intrinsic::x86_avx_vtestc_ps_256:
case Intrinsic::x86_avx_vtestc_pd_256:
TestOpc = X86ISD::TESTP;
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_avx_ptestc_256:
// CF = 1
X86CC = X86::COND_B;
case Intrinsic::x86_avx_vtestnzc_ps:
case Intrinsic::x86_avx_vtestnzc_pd:
case Intrinsic::x86_avx_vtestnzc_ps_256:
case Intrinsic::x86_avx_vtestnzc_pd_256:
TestOpc = X86ISD::TESTP;
case Intrinsic::x86_sse41_ptestnzc:
case Intrinsic::x86_avx_ptestnzc_256:
// ZF and CF = 0
X86CC = X86::COND_A;
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:
case Intrinsic::x86_sse42_pcmpestric128:
case Intrinsic::x86_sse42_pcmpistrio128:
case Intrinsic::x86_sse42_pcmpestrio128:
case Intrinsic::x86_sse42_pcmpistris128:
case Intrinsic::x86_sse42_pcmpestris128:
case Intrinsic::x86_sse42_pcmpistriz128:
case Intrinsic::x86_sse42_pcmpestriz128: {
unsigned Opcode;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_A;
case Intrinsic::x86_sse42_pcmpestria128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_A;
case Intrinsic::x86_sse42_pcmpistric128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_B;
case Intrinsic::x86_sse42_pcmpestric128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_B;
case Intrinsic::x86_sse42_pcmpistrio128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_O;
case Intrinsic::x86_sse42_pcmpestrio128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_O;
case Intrinsic::x86_sse42_pcmpistris128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_S;
case Intrinsic::x86_sse42_pcmpestris128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_S;
case Intrinsic::x86_sse42_pcmpistriz128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_E;
case Intrinsic::x86_sse42_pcmpestriz128:
Opcode = X86ISD::PCMPESTR;
X86CC = X86::COND_E;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
case Intrinsic::x86_sse42_pcmpistri128:
case Intrinsic::x86_sse42_pcmpestri128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
Opcode = X86ISD::PCMPISTR;
Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
case Intrinsic::x86_sse42_pcmpistrm128:
case Intrinsic::x86_sse42_pcmpestrm128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
Opcode = X86ISD::PCMPISTR;
Opcode = X86ISD::PCMPESTR;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
auto &Context = MF.getMMI().getContext();
MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
return DAG.getNode(getGlobalWrapperKind(), dl, VT,
DAG.getMCSymbol(S, PtrVT));
case Intrinsic::x86_seh_lsda: {
// Compute the symbol for the LSDA. We know it'll get emitted later.
MachineFunction &MF = DAG.getMachineFunction();
SDValue Op1 = Op.getOperand(1);
auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
// Generate a simple absolute symbol reference. This intrinsic is only
// supported on 32-bit Windows, which isn't PIC.
SDValue Result = DAG.getMCSymbol(LSDASym, VT);
return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
case Intrinsic::eh_recoverfp: {
SDValue FnOp = Op.getOperand(1);
SDValue IncomingFPOp = Op.getOperand(2);
GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
if (!Fn)
" must take a function as the first argument");
return recoverFramePointer(DAG, Fn, IncomingFPOp);
case Intrinsic::localaddress: {
// Returns one of the stack, base, or frame pointer registers, depending on
// which is used to reference local variables.
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned Reg;
if (RegInfo->hasBasePointer(MF))
Reg = RegInfo->getBaseRegister();
else { // Handles the SP or FP case.
bool CantUseFP = RegInfo->needsStackRealignment(MF);
if (CantUseFP)
Reg = RegInfo->getPtrSizedStackRegister(MF);
Reg = RegInfo->getPtrSizedFrameRegister(MF);
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
case Intrinsic::x86_avx512_vp2intersect_q_512:
case Intrinsic::x86_avx512_vp2intersect_q_256:
case Intrinsic::x86_avx512_vp2intersect_q_128:
case Intrinsic::x86_avx512_vp2intersect_d_512:
case Intrinsic::x86_avx512_vp2intersect_d_256:
case Intrinsic::x86_avx512_vp2intersect_d_128: {
MVT MaskVT = Op.getSimpleValueType();
SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
SDLoc DL(Op);
SDValue Operation =
Op->getOperand(1), Op->getOperand(2));
SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
MaskVT, Operation);
SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
MaskVT, Operation);
return DAG.getMergeValues({Result0, Result1}, DL);
case Intrinsic::x86_mmx_pslli_w:
case Intrinsic::x86_mmx_pslli_d:
case Intrinsic::x86_mmx_pslli_q:
case Intrinsic::x86_mmx_psrli_w:
case Intrinsic::x86_mmx_psrli_d:
case Intrinsic::x86_mmx_psrli_q:
case Intrinsic::x86_mmx_psrai_w:
case Intrinsic::x86_mmx_psrai_d: {
SDLoc DL(Op);
SDValue ShAmt = Op.getOperand(2);
// If the argument is a constant, convert it to a target constant.
if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
// Clamp out of bounds shift amounts since they will otherwise be masked
// to 8-bits which may make it no longer out of bounds.
unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
if (ShiftAmount == 0)
return Op.getOperand(1);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
Op.getOperand(0), Op.getOperand(1),
DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
unsigned NewIntrinsic;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_mmx_pslli_w:
NewIntrinsic = Intrinsic::x86_mmx_psll_w;
case Intrinsic::x86_mmx_pslli_d:
NewIntrinsic = Intrinsic::x86_mmx_psll_d;
case Intrinsic::x86_mmx_pslli_q:
NewIntrinsic = Intrinsic::x86_mmx_psll_q;
case Intrinsic::x86_mmx_psrli_w:
NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
case Intrinsic::x86_mmx_psrli_d:
NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
case Intrinsic::x86_mmx_psrli_q:
NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
case Intrinsic::x86_mmx_psrai_w:
NewIntrinsic = Intrinsic::x86_mmx_psra_w;
case Intrinsic::x86_mmx_psrai_d:
NewIntrinsic = Intrinsic::x86_mmx_psra_d;
// The vector shift intrinsics with scalars uses 32b shift amounts but
// the sse2/mmx shift instructions reads 64 bits. Copy the 32 bits to an
// MMX register.
ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
DAG.getConstant(NewIntrinsic, DL, MVT::i32),
Op.getOperand(1), ShAmt);
/// Lowers an AVX2 gather intrinsic \p Op to an X86ISD::MGATHER memory node.
///
/// Returns an empty SDValue when \p ScaleOp is not a ConstantSDNode. Otherwise
/// returns the merge of the new node's two results: the value of type
/// Op.getValueType() and the MVT::Other result.
/// \p Opc is not referenced anywhere in the visible body.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// NOTE(review): the getTargetConstant call below is cut off in this copy; its
// remaining argument(s) and closing ");" are missing and must be restored.
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
// The mask is re-typed to a vector with integer elements before being used as
// an operand (see the bitcast below).
EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
// The new node produces the gathered value plus an MVT::Other result.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
// Cast mask to an integer type.
Mask = DAG.getBitcast(MaskVT, Mask);
// Reuse the memory VT and memory operand of the original intrinsic node.
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
// Operand order of the new node: chain, source, mask, base, index, scale.
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
SDValue Res =
DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
// NOTE(review): the function's closing brace is not present in this copy.
/// Lowers a masked gather intrinsic \p Op to an X86ISD::MGATHER memory node.
///
/// The mask may arrive in a type other than the vXi1 type computed here; in
/// that case it is converted with getMaskNode. Returns an empty SDValue when
/// \p ScaleOp is not a ConstantSDNode. Otherwise returns the merge of the new
/// node's two results: the value of type Op.getValueType() and the MVT::Other
/// result.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
// NOTE(review): VT is not referenced on any complete line below; presumably it
// is used by the truncated std::min call — confirm.
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// NOTE(review): the next two statements are cut off in this copy; each is
// missing its trailing argument(s) and closing ");" and must be restored.
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
// The mask type has one i1 element per MinElts lane.
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
// We support two versions of the gather intrinsics. One with scalar mask and
// one with vXi1 mask. Convert scalar to vXi1 if necessary.
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
// The new node produces the gathered value plus an MVT::Other result.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
// Reuse the memory VT and memory operand of the original intrinsic node.
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
// Operand order of the new node: chain, source, mask, base, index, scale.
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
SDValue Res =
DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
// NOTE(review): the function's closing brace is not present in this copy.
/// Lowers a masked scatter intrinsic \p Op to an X86ISD::MSCATTER memory node.
///
/// The mask may arrive in a type other than the vXi1 type computed here; in
/// that case it is converted with getMaskNode. Returns an empty SDValue when
/// \p ScaleOp is not a ConstantSDNode; otherwise returns the new node, whose
/// only result type is MVT::Other.
/// \p Opc is not referenced anywhere in the visible body.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// NOTE(review): the next two statements are cut off in this copy; each is
// missing its trailing argument(s) and closing ");" and must be restored.
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
// The mask type has one i1 element per MinElts lane.
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
// We support two versions of the scatter intrinsics. One with scalar mask and
// one with vXi1 mask. Convert scalar to vXi1 if necessary.
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
// Reuse the memory VT and memory operand of the original intrinsic node.
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
// A scatter produces no data value, only an MVT::Other result.
SDVTList VTs = DAG.getVTList(MVT::Other);
// Operand order of the new node: chain, source, mask, base, index, scale.
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
SDValue Res =
DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return Res;
// NOTE(review): the function's closing brace is not present in this copy.
/// Builds the machine node \p Opc for a masked gather/scatter prefetch.
///
/// Returns an empty SDValue when \p ScaleOp is not a ConstantSDNode; otherwise
/// returns result 0 of the new machine node, whose only result type is
/// MVT::Other.
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Mask, SDValue Base, SDValue Index,
SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// NOTE(review): the getTargetConstant call below is cut off in this copy; its
// remaining argument(s) and closing ");" are missing and must be restored.
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
// The displacement is a zero constant and the segment is register 0.
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
// The mask is converted to a vector of i1 with one element per index lane.
MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
// Operand order of the machine node: mask, base, scale, index, displacement,
// segment, chain.
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
return SDValue(Res, 0);
// NOTE(review): the function's closing brace is not present in this copy.
/// Handles the lowering of builtin intrinsics with chain that return their
/// value into registers EDX:EAX.
/// If operand SrcReg is a valid register identifier, then operand 2 of N is
/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
/// TargetOpcode.
/// Returns a Glue value which can be used to add extra copy-from-reg if the
/// expanded intrinsics implicitly defines extra registers (i.e. not just
/// EDX:EAX).
/// On 64-bit subtargets the merged 64-bit value is appended to \p Results.
/// NOTE(review): this copy has lost several lines (closing braces, call tails,
/// and presumably further Results.push_back calls) — restore before relying on
/// the exact contents of \p Results.
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
unsigned TargetOpcode,
unsigned SrcReg,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
SDValue Chain = N->getOperand(0);
SDValue Glue;
// A non-zero SrcReg means operand 2 of N must be copied into that register
// before the target instruction is emitted.
if (SrcReg) {
assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
Glue = Chain.getValue(1);
// The machine node takes the chain and, only when a copy-to-reg was emitted
// above, the glue produced by that copy.
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue N1Ops[] = {Chain, Glue};
SDNode *N1 = DAG.getMachineNode(
TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
Chain = SDValue(N1, 0);
// Reads the content of XCR and returns it in registers EDX:EAX.
SDValue LO, HI;
// NOTE(review): both getCopyFromReg calls that assign HI below are cut off in
// this copy; each is missing its final argument and closing ");".
if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
} else {
LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
// Continue from the chain and glue results of the second copy-from-reg.
Chain = HI.getValue(1);
Glue = HI.getValue(2);
if (Subtarget.is64Bit()) {
// Merge the two 32-bit values into a 64-bit one.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
DAG.getConstant(32, DL, MVT::i8));
Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
return Glue;
// Use a buildpair to merge the two 32-bit values into a 64-bit one.
SDValue Ops[] = { LO, HI };
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
// NOTE(review): Pair is not used on any visible line; the statements that
// consume it appear to be missing from this copy.
return Glue;
/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also called from
/// LowerREADCYCLECOUNTER with X86::RDTSC.
/// The expansion itself is delegated to expandIntrinsicWChainHelper; for
/// X86::RDTSCP an additional copy from ECX is created and stored in
/// Results[1].
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
// The processor's time-stamp counter (a 64-bit MSR) is stored into the
// EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
// and the EAX register is loaded with the low-order 32 bits.
// NOTE(review): the call below is cut off in this copy; its last argument(s)
// and closing ");" are missing.
SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
/* NoRegister */0, Subtarget,
// NOTE(review): the statement guarded by this `if` is missing from this copy
// (presumably an early return for opcodes other than RDTSCP — confirm); as
// written the guard would attach to the declaration that follows.
if (Opcode != X86::RDTSCP)
SDValue Chain = Results[1];
// Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
// the ECX register. Add 'ecx' explicitly to the chain.
SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
Results[1] = ecx;
/// Custom-lowers a READCYCLECOUNTER node by emitting RDTSC through
/// getReadTimeStampCounter and merging the values it produced.
///
/// \param Op        The node to lower.
/// \param Subtarget Forwarded to getReadTimeStampCounter.
/// \param DAG       SelectionDAG in which the nodes are built.
/// \returns A MERGE_VALUES of everything collected in the result list.
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<SDValue, 3> Results;
SDLoc DL(Op);
getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
return DAG.getMergeValues(Results, DL);
/// Records the frame index of the EH registration node named by the
/// llvm.x86.seh.ehregnode intrinsic in the function's WinEHFuncInfo.
///
/// Operand 0 of \p Op is the chain and operand 2 is the registration node,
/// which must be a FrameIndexSDNode. Aborts via report_fatal_error when the
/// function has no WinEHFuncInfo or the operand is not a frame index.
///
/// \returns The incoming chain; no new DAG nodes are created.
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue RegNode = Op.getOperand(2);
WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
if (!EHInfo)
report_fatal_error("EH registrations only live in functions using WinEH");
// Cast the operand to an alloca, and remember the frame index.
auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
if (!FINode)
report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
// Return the chain operand without making any DAG nodes.
return Chain;
/// Records the frame index of the EH guard slot named by the
/// llvm.x86.seh.ehguard intrinsic in the function's WinEHFuncInfo.
///
/// Operand 0 of \p Op is the chain and operand 2 is the guard slot, which
/// must be a FrameIndexSDNode. Aborts via report_fatal_error when the function
/// has no WinEHFuncInfo or the operand is not a frame index.
///
/// \returns The incoming chain; no new DAG nodes are created.
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue EHGuard = Op.getOperand(2);
WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
if (!EHInfo)
report_fatal_error("EHGuard only live in functions using WinEH");
// Cast the operand to an alloca, and remember the frame index.
auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
if (!FINode)
report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
EHInfo->EHGuardFrameIndex = FINode->getIndex();
// Return the chain operand without making any DAG nodes.
return Chain;
/// Emit Truncating Store with signed or unsigned saturation.
///
/// \param SignedSat Selects X86ISD::VTRUNCSTORES when true and
///                  X86ISD::VTRUNCSTOREUS when false.
/// \param Chain     Incoming chain.
/// \param Dl        Debug location for the new node.
/// \param Val       Value to truncate and store.
/// \param Ptr       Destination address.
/// \param MemVT     Memory type of the store.
/// \param MMO       Memory operand attached to the node.
/// \returns The memory-intrinsic node, whose only result is the chain.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
SelectionDAG &DAG) {
SDVTList VTs = DAG.getVTList(MVT::Other);
// The fourth operand is an UNDEF of the pointer type.
SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
SDValue Ops[] = { Chain, Val, Ptr, Undef };
unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
/// Emit Masked Truncating Store with signed or unsigned saturation.
///
/// \param SignedSat Requests signed (true) or unsigned (false) saturation.
/// \param Chain     Incoming chain.
/// \param Dl        Debug location for the new node.
/// \param Val       Value to truncate and store.
/// \param Ptr       Destination address.
/// \param Mask      Mask operand passed as the fourth operand of the node.
/// \param MemVT     Memory type of the store.
/// \param MMO       Memory operand attached to the node.
/// \returns The memory-intrinsic node, whose only result is the chain.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
MachineMemOperand *MMO, SelectionDAG &DAG) {
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = { Chain, Val, Ptr, Mask };
// NOTE(review): the definition of Opc is not among the visible lines;
// presumably it is chosen from SignedSat - confirm against the full file.
return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
/// Custom-lowers INTRINSIC_W_CHAIN nodes.
///
/// Operand 0 of \p Op is the chain and operand 1 holds the intrinsic ID.
/// Intrinsics with no entry in the getIntrinsicWithChain table are dispatched
/// on the ID itself; all others are dispatched on the Type field of their
/// table entry, using the entry's Opc0/Opc1 as the opcode(s) to emit.
///
/// \returns The lowered value, or an empty SDValue on the path that performs
///          no custom lowering.
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
unsigned IntNo = Op.getConstantOperandVal(1);
const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
// Intrinsics without table data are handled one by one.
if (!IntrData) {
switch (IntNo) {
// The two SEH intrinsics only record a frame index and return the chain.
case llvm::Intrinsic::x86_seh_ehregnode:
return MarkEHRegistrationNode(Op, DAG);
case llvm::Intrinsic::x86_seh_ehguard:
return MarkEHGuard(Op, DAG);
case llvm::Intrinsic::x86_rdpkru: {
SDLoc dl(Op);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
// Create a RDPKRU node and pass 0 to the ECX parameter.
return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
DAG.getConstant(0, dl, MVT::i32));
case llvm::Intrinsic::x86_wrpkru: {
SDLoc dl(Op);
// Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
// to the EDX and ECX parameters.
return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
Op.getOperand(0), Op.getOperand(2),
DAG.getConstant(0, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
case llvm::Intrinsic::x86_flags_read_u32:
case llvm::Intrinsic::x86_flags_read_u64:
case llvm::Intrinsic::x86_flags_write_u32:
case llvm::Intrinsic::x86_flags_write_u64: {
// We need a frame pointer because this will get lowered to a PUSH/POP
// sequence.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
// Don't do anything here, we will expand these intrinsics out later
// during FinalizeISel in EmitInstrWithCustomInserter.
return Op;
// LWPINS / UMWAIT / TPAUSE: emit the target node with operands 2..4 and
// report its carry flag (COND_B) as the intrinsic's value.
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64:
case Intrinsic::x86_umwait:
case Intrinsic::x86_tpause: {
SDLoc dl(Op);
SDValue Chain = Op->getOperand(0);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
unsigned Opcode;
// Map the intrinsic ID to the X86ISD opcode.
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic");
case Intrinsic::x86_umwait:
Opcode = X86ISD::UMWAIT;
case Intrinsic::x86_tpause:
Opcode = X86ISD::TPAUSE;
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64:
Opcode = X86ISD::LWPINS;
SDValue Operation =
DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
Op->getOperand(3), Op->getOperand(4));
SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
// ENQCMD / ENQCMDS: same scheme, but the reported condition is COND_E.
case Intrinsic::x86_enqcmd:
case Intrinsic::x86_enqcmds: {
SDLoc dl(Op);
SDValue Chain = Op.getOperand(0);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
unsigned Opcode;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic!");
case Intrinsic::x86_enqcmd:
Opcode = X86ISD::ENQCMD;
case Intrinsic::x86_enqcmds:
Opcode = X86ISD::ENQCMDS;
SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
return SDValue();
// From here on the intrinsic has table data; dispatch on its Type.
SDLoc dl(Op);
switch(IntrData->Type) {
default: llvm_unreachable("Unknown Intrinsic Type");
case RDSEED:
case RDRAND: {
// Emit the node with the right value type.
// Its results are: the random value, an i32 (used as the CMOV flag input
// below), and the chain.
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, casted to i32.
SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
DAG.getConstant(1, dl, Op->getValueType(1)),
DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
SDValue(Result.getNode(), 1)};
SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
// Return { result, isValid, chain }.
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
// Gather/scatter/prefetch only unpack the operands in the order the
// intrinsic supplies them and forward to the matching builder.
case GATHER_AVX2: {
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
case GATHER: {
//gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
Chain, Subtarget);
case SCATTER: {
//scatter(base, mask, index, v1, scale);
SDValue Chain = Op.getOperand(0);
SDValue Base = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
case PREFETCH: {
// Operand 6 is the hint; hint 2 selects Opc1, hint 3 selects Opc0.
const APInt &HintVal = Op.getConstantOperandAPInt(6);
assert((HintVal == 2 || HintVal == 3) &&
"Wrong prefetch hint in intrinsic: should be 2 or 3");
unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
SDValue Chain = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
SDValue Index = Op.getOperand(3);
SDValue Base = Op.getOperand(4);
SDValue Scale = Op.getOperand(5);
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {
SmallVector<SDValue, 2> Results;
getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
return DAG.getMergeValues(Results, dl);
// Read Performance Monitoring Counters.
case RDPMC:
// GetExtended Control Register.
case XGETBV: {
SmallVector<SDValue, 2> Results;
// RDPMC uses ECX to select the index of the performance counter to read.
// XGETBV uses ECX to select the index of the XCR register to return.
// The result is stored into registers EDX:EAX.
expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
Subtarget, Results);
return DAG.getMergeValues(Results, dl);
// XTEST intrinsics.
case XTEST: {
// Turn the COND_NE condition of the XTEST node into a zero-extended value
// of the intrinsic's result type and return it together with the chain.
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
Ret, SDValue(InTrans.getNode(), 1));
// Truncating-store intrinsics: operand 2 is the address, operand 3 the data
// to truncate and operand 4 the mask. The memory type comes from the
// MemIntrinsicSDNode.
SDValue Mask = Op.getOperand(4);
SDValue DataToTruncate = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
assert(MemIntr && "Expected MemIntrinsicSDNode!");
EVT MemVT = MemIntr->getMemoryVT();
uint16_t TruncationOp = IntrData->Opc0;
switch (TruncationOp) {
case X86ISD::VTRUNC: {
if (isAllOnesConstant(Mask)) // return just a truncate store
return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
// Otherwise convert the mask to a vXi1 vector and emit a truncating
// masked store.
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDValue Offset = DAG.getUNDEF(VMask.getValueType());
return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
true /* truncating */);
case X86ISD::VTRUNCS: {
// IsSigned is true only for VTRUNCS; it selects signed vs. unsigned
// saturation in the store helpers below.
bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
if (isAllOnesConstant(Mask))
return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
MemIntr->getMemOperand(), DAG);
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
VMask, MemVT, MemIntr->getMemOperand(), DAG);
llvm_unreachable("Unsupported truncstore intrinsic");
/// Lowers a RETURNADDR node.
///
/// Operand 0 of \p Op is the constant frame depth. For a non-zero depth the
/// return address is loaded from the lowered frame address plus one slot
/// size; otherwise it is loaded from the return-address frame index.
///
/// \returns The load of the return address, or an empty SDValue when
///          verifyReturnAddressArgumentIsConstant reports a problem.
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
unsigned Depth = Op.getConstantOperandVal(0);
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
if (Depth > 0) {
// Compute the frame address via LowerFRAMEADDR and read the slot that
// lies one slot size above it.
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
// Just load the return address.
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
/// Lowers ADDROFRETURNADDR to the frame index of the return-address slot.
/// \p Op is not inspected.
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
return getReturnAddressFrameIndex(DAG);
/// Lowers a FRAMEADDR node.
///
/// On targets that use Windows CFI the result is a frame index for a fixed
/// object that is created on first use. Otherwise the frame register is
/// copied out and dereferenced once per requested depth (operand 0).
///
/// \returns The frame address as a value of the node's result type.
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
EVT VT = Op.getValueType();
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
// Depth > 0 makes no sense on targets which use Windows unwind codes. It
// is not possible to crawl up the stack without looking at the unwind codes
// simultaneously.
// A zero index from getFAIndex() is treated as "not created yet".
int FrameAddrIndex = FuncInfo->getFAIndex();
if (!FrameAddrIndex) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
return DAG.getFrameIndex(FrameAddrIndex, VT);
unsigned FrameReg =
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = Op.getConstantOperandVal(0);
// The frame register must be RBP for an i64 result and EBP for i32.
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
// Each iteration loads through the current frame address to reach the
// next frame up.
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
return FrameAddr;
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
/// Maps a register name to its register.
///
/// The visible table accepts "esp", "rsp", "ebp" and "rbp". Requesting EBP or
/// RBP in a function without a frame pointer is a fatal error, as is a name
/// that does not resolve to a register.
///
/// \param RegName Name to look up.
/// \param VT      Not referenced in the visible lines.
/// \param MF      Function used for the frame-pointer query.
Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
Register Reg = StringSwitch<unsigned>(RegName)
.Case("esp", X86::ESP)
.Case("rsp", X86::RSP)
.Case("ebp", X86::EBP)
.Case("rbp", X86::RBP)
if (Reg == X86::EBP || Reg == X86::RBP) {
if (!TFI.hasFP(MF))
report_fatal_error("register " + StringRef(RegName) +
" is allocatable: function has no frame pointer");
#ifndef NDEBUG
// Debug builds additionally check that the pointer-sized frame register is
// EBP or RBP.
else {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
"Invalid Frame Register!");
if (Reg)
return Reg;
report_fatal_error("Invalid register name global variable");
/// Lowers FRAME_TO_ARGS_OFFSET to a pointer-sized constant equal to twice the
/// register slot size.
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
/// Returns the register that carries the exception pointer.
///
/// For the CoreCLR personality this is RDX/EDX; for every other personality
/// it is RAX/EAX. The 64-bit register is used on LP64 targets.
Register X86TargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
/// Returns the register that carries the exception selector: RDX on LP64
/// targets and EDX otherwise.
Register X86TargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
// Funclet personalities don't use selectors (the runtime does the selection).
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
/// Returns true exactly when the subtarget is Win64.
bool X86TargetLowering::needsFixedCatchObjects() const {
return Subtarget.isTargetWin64();
/// Lowers an EH_RETURN node.
///
/// Operands of \p Op: 0 = chain, 1 = offset, 2 = handler address. The handler
/// is stored at an address derived from the frame register plus the offset,
/// that address is copied into RCX/ECX, and an X86ISD::EH_RETURN node taking
/// that register is returned.
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
SDLoc dl (Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
// The frame register must be RBP with 64-bit pointers and EBP with 32-bit.
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
// The store address travels in RCX for 64-bit pointers, ECX otherwise.
Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
DAG.getRegister(StoreAddrReg, PtrVT));
/// Lowers EH_SJLJ_SETJMP to an X86ISD::EH_SJLJ_SETJMP node that produces an
/// i32 value and a chain from operands 0 and 1 of \p Op.
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
// If the subtarget is not 64bit, we may need the global base reg
// after isel expand pseudo, i.e., after CGBR pass ran.
// Therefore, ask for the GlobalBaseReg now, so that the pass
// inserts the code for us in case we need it.
// Otherwise, we will end up in a situation where we will
// reference a virtual register that is not defined!
if (!Subtarget.is64Bit()) {
const X86InstrInfo *TII = Subtarget.getInstrInfo();
return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
DAG.getVTList(MVT::i32, MVT::Other),
Op.getOperand(0), Op.getOperand(1));
/// Lowers EH_SJLJ_LONGJMP to an X86ISD::EH_SJLJ_LONGJMP node with a chain
/// result, forwarding operands 0 and 1 of \p Op unchanged.
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
Op.getOperand(0), Op.getOperand(1));
/// Lowers EH_SJLJ_SETUP_DISPATCH to an X86ISD::EH_SJLJ_SETUP_DISPATCH node
/// with a chain result.
SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
/// Lowers ADJUST_TRAMPOLINE by returning operand 0 of \p Op unchanged; no
/// adjustment is applied and \p DAG is not used.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
return Op.getOperand(0);
/// Lowers INIT_TRAMPOLINE by storing the machine code of the trampoline into
/// the trampoline buffer.
///
/// Operands of \p Op: 0 = chain, 1 = trampoline address, 2 = nested function,
/// 3 = 'nest' parameter value, 4 = source value of the trampoline address.
///
/// 64-bit layout written below (byte offsets from the trampoline start):
///    0: REX.WB + movabsq opcode for r11 (i16)   2: nested function pointer
///   10: REX.WB + movabsq opcode for r10 (i16)  12: 'nest' value
///   20: REX.WB + 0xFF jmp opcode (i16)         22: ModRM byte selecting r11
///
/// 32-bit layout:
///    0: MOV32ri opcode for the nest register    1: 'nest' value
///    5: 0xE9 jmp opcode                         6: displacement to the nested
///                                                  function, relative to
///                                                  trampoline + 10
///
/// \returns A TokenFactor joining all the stores.
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SelectionDAG &DAG) const {
SDValue Root = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (Subtarget.is64Bit()) {
SDValue OutChains[6];
// Large code-model.
const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
// Only the low three bits of each register encoding go into the opcode or
// ModRM byte.
const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
// Load the pointer to the nested function into R11.
// The i16 constant has the REX prefix in its low byte and the opcode in its
// high byte.
unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
SDValue Addr = Trmp;
OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(2, dl, MVT::i64));
OutChains[1] =
DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
/* Alignment = */ 2);
// Load the 'nest' parameter value into R10.
// R10 is specified in
OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(10, dl, MVT::i64));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 10));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(12, dl, MVT::i64));
OutChains[3] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
/* Alignment = */ 2);
// Jump to the nested function.
OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(20, dl, MVT::i64));
OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 20));
unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(22, dl, MVT::i64));
OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 22));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
// 32-bit: the register that carries 'nest' depends on the nested
// function's calling convention.
const Function *Func =
CallingConv::ID CC = Func->getCallingConv();
unsigned NestReg;
switch (CC) {
llvm_unreachable("Unsupported calling convention");
case CallingConv::C:
case CallingConv::X86_StdCall: {
// Pass 'nest' parameter in ECX.
// Must be kept in sync with
NestReg = X86::ECX;
// Check that ECX wasn't needed by an 'inreg' parameter.
FunctionType *FTy = Func->getFunctionType();
const AttributeList &Attrs = Func->getAttributes();
if (!Attrs.isEmpty() && !Func->isVarArg()) {
// Count the 32-bit registers consumed by 'inreg' parameters; attribute
// indices start at 1 for the first parameter.
unsigned InRegCount = 0;
unsigned Idx = 1;
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
auto &DL = DAG.getDataLayout();
// FIXME: should only count parameters that are lowered to integers.
InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
if (InRegCount > 2) {
report_fatal_error("Nest register in use - reduce number of inreg"
" parameters!");
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::Fast:
case CallingConv::Tail:
// Pass 'nest' parameter in EAX.
// Must be kept in sync with
NestReg = X86::EAX;
SDValue OutChains[4];
SDValue Addr, Disp;
// The jmp displacement is relative to the end of the 10-byte trampoline.
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(10, dl, MVT::i32));
Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
OutChains[0] =
DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
Trmp, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(1, dl, MVT::i32));
OutChains[1] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
/* Alignment = */ 1);
const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(5, dl, MVT::i32));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 5),
/* Alignment = */ 1);
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(6, dl, MVT::i32));
OutChains[3] =
DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
/* Alignment = */ 1);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
/// Lowers FLT_ROUNDS_ by reading the x87 control word and translating its
/// rounding-control field into the FLT_ROUNDS encoding.
///
/// The control word is written to a fresh 2-byte stack slot with FNSTCW16m
/// and loaded back as an i16. Bits 11:10 are turned into a shift amount that
/// indexes the packed table 0x2d, two bits per entry.
///
/// \returns A merge of the rounding-mode value (in the node's result type)
///          and the updated chain.
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SelectionDAG &DAG) const {
// NOTE(review): the table below reads as a block comment, but no /* */
// delimiters are present in this view - confirm against the full file.
The rounding mode is in bits 11:10 of FPSR, and has the following
00 Round to nearest
01 Round to -inf
10 Round to +inf
11 Round to 0
FLT_ROUNDS, on the other hand, expects the following:
-1 Undefined
0 Round to 0
1 Round to nearest
2 Round to +inf
3 Round to -inf
To perform the conversion, we use a packed lookup table of the four 2-bit
values that we can index by FPSP[11:10]
0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
(0x2d >> ((FPSR & 0xc00) >> 9)) & 3
MachineFunction &MF = DAG.getMachineFunction();
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Save FP Control Word to stack slot
int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
SDValue StackSlot =
DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
SDValue Chain = Op.getOperand(0);
SDValue Ops[] = {Chain, StackSlot};
Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
Align(2), MachineMemOperand::MOStore);
// Load FP Control Word from stack slot
SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
Chain = CWD.getValue(1);
// Mask and turn the control bits into a shift for the lookup table.
// Shifting right by 9 rather than 10 leaves the 2-bit field multiplied by
// two, which is the bit offset of the matching table entry.
SDValue Shift =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
DAG.getConstant(9, DL, MVT::i8));
Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
// Select the table entry and keep its two bits.
SDValue RetVal =
DAG.getNode(ISD::AND, DL, MVT::i32,
DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
DAG.getConstant(3, DL, MVT::i32));
RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
return DAG.getMergeValues({RetVal, Chain}, DL);
/// Lower a vector CTLZ using native supported vector CTLZ instruction.
//
// i8/i16 vector implemented using dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
//
// Only i8 and i16 element types are accepted (asserted below). Vectors with
// more than 16 elements, or with exactly 16 when the subtarget cannot extend
// to 512 bits, are split via splitVectorIntUnary.
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
"Unsupported element type");
// Split vector, its Lo and Hi parts will be handled in next iteration.
if (NumElems > 16 ||
(NumElems == 16 && !Subtarget.canExtendTo512DQ()))
return splitVectorIntUnary(Op, DAG);
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
"Unsupported value type for operation");
// Use native supported vector instruction vplzcntd.
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
// Zero-extending to 32 bits adds (32 - element width) leading zeros to
// every lane; subtract them again after truncating.
SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
// Lower CTLZ using a PSHUFB lookup table implementation.
//
// The input is viewed as a vector of bytes. A 16-entry table holding the
// leading-zero count of each nibble value is looked up with PSHUFB for the
// low and the high nibble of every byte, and the two counts are combined into
// a per-byte result. That result is then widened step by step, doubling the
// element width each time, until the original vector type is reached.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
int NumElts = VT.getVectorNumElements();
int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
// Per-nibble leading zero PSHUFB lookup table.
const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
/* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
/* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
// Repeat the 16-entry table across the whole byte vector.
SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumBytes; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
// Begin by bitcasting the input to byte vector, then split those bytes
// into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
// If the hi input nibble is zero then we add both results together, otherwise
// we just take the hi result (by masking the lo result to zero before the
// add).
SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
SDValue Zero = DAG.getConstant(0, DL, CurrVT);
SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
SDValue Lo = Op0;
SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
SDValue HiZ;
// For 512-bit vectors the compare yields a vXi1 mask, which is sign-extended
// back to the byte vector type; narrower vectors compare directly in CurrVT.
if (CurrVT.is512BitVector()) {
MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
} else {
HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
// Merge result back from vXi8 back to VT, working on the lo/hi halves
// of the current vector width in the same way we did for the nibbles.
// If the upper half of the input element is zero then add the halves'
// leading zero counts together, otherwise just use the upper half's.
// Double the width of the result until we are at target width.
while (CurrVT != VT) {
int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
int CurrNumElts = CurrVT.getVectorNumElements();
MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
// Check if the upper half of the input element is zero.
if (CurrVT.is512BitVector()) {
MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
} else {
HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
HiZ = DAG.getBitcast(NextVT, HiZ);
// Move the upper/lower halves to the lower bits as we'll be extending to
// NextVT. Mask the lower result to zero if HiZ is true and add the results
// together.
SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
CurrVT = NextVT;
return Res;
/// Chooses a lowering strategy for a vector CTLZ.
///
/// In order: use the AVX512CDI path when the subtarget has CDI and either can
/// extend to 512 bits or the element type is not i8; split 256-bit vectors
/// without Int256 and 512-bit vectors without BWI; otherwise use the PSHUFB
/// lookup-table lowering, which requires SSSE3 (asserted).
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (Subtarget.hasCDI() &&
// vXi8 vectors need to be promoted to 512-bits for vXi32.
(Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG);
assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
/// Lowers a count-leading-zeros node.
///
/// Vector types are delegated to LowerVectorCTLZ. Scalars are lowered to a
/// BSR whose result is XORed with NumBits-1; i8 inputs are zero-extended to
/// i32 first and the result truncated back. For ISD::CTLZ a CMOV on the zero
/// flag supplies the value for a zero input.
static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
if (VT.isVector())
return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
Op = Op.getOperand(0);
if (VT == MVT::i8) {
// Zero extend to i32 since there is not an i8 bsr.
OpVT = MVT::i32;
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
// Issue a bsr (scan bits in reverse) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
if (Opc == ISD::CTLZ) {
// If src is zero (i.e. bsr sets ZF), returns NumBits.
// The CMOV substitutes 2*NumBits-1 so that the XOR with NumBits-1 below
// turns it into NumBits (NumBits being a power of two).
SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
// Finally xor with NumBits-1.
Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
DAG.getConstant(NumBits - 1, dl, OpVT));
if (VT == MVT::i8)
Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
return Op;
/// Lowers a scalar CTTZ to a BSF followed by a CMOV that yields NumBits when
/// BSF reports a zero source. Vector types and other opcodes are rejected by
/// the assertion below. \p Subtarget is not used in the visible lines.
static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
unsigned NumBits = VT.getScalarSizeInBits();
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
"Only scalar CTTZ requires custom lowering");
// Issue a bsf (scan bits forward) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
// If src is zero (i.e. bsf sets ZF), returns NumBits.
SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
/// Custom-lowers integer ADD/SUB nodes.
///
/// - i16 and i32 scalars go to lowerAddSubToHorizontalOp.
/// - Types whose scalar type is i1 become an XOR of the two operands.
/// - v32i16, v64i8 and (asserted) 256-bit integer vectors are split in half
///   with splitVectorIntBinary.
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
if (VT == MVT::i16 || VT == MVT::i32)
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
if (VT == MVT::v32i16 || VT == MVT::v64i8)
return splitVectorIntBinary(Op, DAG);
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return splitVectorIntBinary(Op, DAG);
/// Custom-lowers saturating add/sub nodes.
///
/// - i1 element types reduce to boolean logic (OR for add, AND-NOT for sub).
/// - 128-bit vectors get a compare+select expansion for UADDSAT/USUBSAT when
///   UMIN/UMAX is not legal for the type, and otherwise return an empty
///   SDValue to request the default expansion.
/// - v32i16, v64i8 and (asserted) 256-bit integer vectors are split in half.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
if (VT.getScalarType() == MVT::i1) {
SDLoc dl(Op);
switch (Opcode) {
default: llvm_unreachable("Expected saturated arithmetic opcode");
// *addsat i1 X, Y --> X | Y
return DAG.getNode(ISD::OR, dl, VT, X, Y);
// *subsat i1 X, Y --> X & ~Y
return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
if (VT.is128BitVector()) {
// Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), VT);
SDLoc DL(Op);
if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
// uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
// X >u X+Y holds exactly when the addition wrapped around.
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
// usubsat X, Y --> (X >u Y) ? X - Y : 0
SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
// Use default expansion.
return SDValue();
if (VT == MVT::v32i16 || VT == MVT::v64i8)
return splitVectorIntBinary(Op, DAG);
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return splitVectorIntBinary(Op, DAG);
/// Custom lowering for an integer absolute-value node.
///
/// - Scalar i16/i32/i64: computes 0 - X with X86ISD::SUB (which also yields an
///   i32 flags result) and feeds X, the negation and that flags result to
///   X86ISD::CMOV with condition COND_GE.
/// - v2i64/v4i64 with SSE4.1: blends X and 0 - X using X itself as the
///   selector.
/// - 256-bit vectors without Int256, and v32i16/v64i8 without BWI, are split.
/// - Everything else returns an empty SDValue (default expansion).
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
// Since X86 does not have CMOV for 8-bit integer, we don't convert
// 8-bit integer abs to NEG and CMOV.
SDLoc DL(Op);
SDValue N0 = Op.getOperand(0);
// Neg has two results: value 0 is 0 - N0, value 1 is the i32 flags result.
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
DAG.getConstant(0, DL, VT), N0);
SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
SDValue(Neg.getNode(), 1)};
return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
// ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
SDLoc DL(Op);
SDValue Src = Op.getOperand(0);
SDValue Sub =
DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
if (VT.is256BitVector() && !Subtarget.hasInt256()) {
assert(VT.isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return splitVectorIntUnary(Op, DAG);
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG);
// Default to expand.
return SDValue();
/// Custom lowering for vector SMIN/SMAX/UMIN/UMAX nodes.
///
/// Non-i64 256-bit vectors and v32i16/v64i8 are split in half. v8i16 (only
/// expected with UMIN/UMAX, per the assert) is mapped onto the signed
/// min/max by flipping the sign bit of the inputs and of the result. All
/// remaining types are expanded to a setcc + select.
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
// For AVX1 cases, split to use legal ops (everything but v4i64).
if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
return splitVectorIntBinary(Op, DAG);
if (VT == MVT::v32i16 || VT == MVT::v64i8)
return splitVectorIntBinary(Op, DAG);
SDLoc DL(Op);
unsigned Opcode = Op.getOpcode();
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
// For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
// using the SMIN/SMAX instructions and flipping the signbit back.
if (VT == MVT::v8i16) {
assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
"Unexpected MIN/MAX opcode");
// XOR with 0x8000 maps unsigned order onto signed order.
SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
// Else, expand to a compare/select.
// The condition code is chosen so that N0 is selected when it is the
// min (resp. max) of the pair.
ISD::CondCode CC;
switch (Opcode) {
case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
default: llvm_unreachable("Unknown MINMAX opcode");
SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
return DAG.getSelect(DL, VT, Cond, N0, N1);
/// Custom lowering for vector integer multiplies.
///
/// Strategy by type:
/// - i1 elements: multiply is a bitwise AND.
/// - 256-bit without Int256, v32i16/v64i8 without BWI: split in half.
/// - vXi8: widen to i16 (via extend or unpack), multiply, keep the low byte
///   of each product and pack back.
/// - v4i32: two PMULUDQ (even and odd lanes) merged with a shuffle.
/// - vXi64: built from 32x32->64 PMULUDQ partial products, skipping partial
///   products whose inputs are known to be zero.
///
/// NOTE(review): this copy is missing closing braces and parts of a few
/// multi-line calls (e.g. the outer getNode opcode in the vXi8 extend path and
/// the tail of the getAnyExtOrTrunc calls); verify against upstream.
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
// For i1 elements, X * Y == X & Y.
if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntBinary(Op, DAG);
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
return splitVectorIntBinary(Op, DAG);
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
// Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
// vector pairs, multiply and truncate.
if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
// If a single vector with twice-as-wide elements is available, any-extend
// both operands into it and multiply there.
if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
return DAG.getNode(
DAG.getNode(ISD::MUL, dl, ExVT,
DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Extract the lo/hi parts to any extend to i16.
// We're going to mask off the low byte of each result element of the
// pmullw, so it doesn't matter what's in the high byte of each 16-bit
// element.
SDValue Undef = DAG.getUNDEF(VT);
SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
SDValue BLo, BHi;
if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
// If B (the second operand) is a constant build vector, manually
// unpackl/unpackh it: within each group of 16 elements the first 8 go to
// the lo vector and the last 8 to the hi vector.
SmallVector<SDValue, 16> LoOps, HiOps;
for (unsigned i = 0; i != NumElts; i += 16) {
for (unsigned j = 0; j != 8; ++j) {
LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
BLo = DAG.getBuildVector(ExVT, dl, LoOps);
BHi = DAG.getBuildVector(ExVT, dl, HiOps);
} else {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
// Multiply, mask the lower 8bits of the lo/hi results and pack.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
if (VT == MVT::v4i32) {
assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
"Should not custom lower when pmulld is available!");
// Extract the odd parts.
// Elements 1 and 3 are moved to positions 0 and 2.
static const int UnpackMask[] = { 1, -1, 3, -1 };
SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
// Multiply the even parts.
SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, A),
DAG.getBitcast(MVT::v2i64, B));
// Now multiply odd parts.
SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, Aodds),
DAG.getBitcast(MVT::v2i64, Bodds));
Evens = DAG.getBitcast(VT, Evens);
Odds = DAG.getBitcast(VT, Odds);
// Merge the two vectors back together with a shuffle. This expands into 2
// shuffles.
// Picks i32 elements 0 and 2 of each product vector, interleaved.
static const int ShufMask[] = { 0, 4, 2, 6 };
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
// AloBlo = pmuludq(a, b);
// AloBhi = pmuludq(a, Bhi);
// AhiBlo = pmuludq(Ahi, b);
// Hi = psllqi(AloBhi + AhiBlo, 32);
// return AloBlo + Hi;
// Use known-bits to find 32-bit halves of A/B that are provably zero.
KnownBits AKnown = DAG.computeKnownBits(A);
KnownBits BKnown = DAG.computeKnownBits(B);
APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
SDValue Zero = DAG.getConstant(0, dl, VT);
// Only multiply lo/hi halves that aren't known to be zero.
SDValue AloBlo = Zero;
if (!ALoIsZero && !BLoIsZero)
AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
SDValue AloBhi = Zero;
if (!ALoIsZero && !BHiIsZero) {
SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
SDValue AhiBlo = Zero;
if (!AHiIsZero && !BLoIsZero) {
SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
// Combine: the cross terms contribute to the upper 32 bits only.
SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
/// Custom lowering for vector high-half multiplies. ISD::MULHS is treated as
/// signed; any other opcode is treated as unsigned (presumably ISD::MULHU).
///
/// - 256-bit without Int256, v32i16/v64i8 without BWI: split in half.
/// - vXi32: two widening PMUL(U)DQ multiplies (even and odd lanes) whose high
///   halves are gathered with a shuffle; a signed result computed with the
///   unsigned multiply is corrected afterwards.
/// - vXi8: widen to i16, multiply, shift the high byte down and pack.
///
/// NOTE(review): this copy is missing closing braces and some statements
/// (e.g. the ALo/BLo assignments in the SSE4.1 signed v16i8 branches and the
/// push_back calls filling LoOps/HiOps); verify against upstream.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
bool IsSigned = Op->getOpcode() == ISD::MULHS;
unsigned NumElts = VT.getVectorNumElements();
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntBinary(Op, DAG);
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
return splitVectorIntBinary(Op, DAG);
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
(VT == MVT::v8i32 && Subtarget.hasInt256()) ||
(VT == MVT::v16i32 && Subtarget.hasAVX512()));
// PMULxD operations multiply each even value (starting at 0) of LHS with
// the related value of RHS and produce a widen result.
// E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
// In other words, to have all the results, we need to perform two PMULxD:
// 1. one with the even values.
// 2. one with the odd values.
// To achieve #2, we need to place the odd values at an even position.
// Place the odd value at an even position (basically, shift all values 1
// step to the left):
// The mask is sized for the widest case (16 elements); only the first
// NumElts entries are used.
const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
9, -1, 11, -1, 13, -1, 15, -1};
// <a|b|c|d> => <b|undef|d|undef>
SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
makeArrayRef(&Mask[0], NumElts));
// <e|f|g|h> => <f|undef|h|undef>
SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
makeArrayRef(&Mask[0], NumElts));
// Emit two multiplies, one for the lower 2 ints and one for the higher 2
// ints.
MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
// The signed widening multiply is only used when SSE4.1 is available.
unsigned Opcode =
(IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, A),
DAG.getBitcast(MulVT, B)));
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
// => <2 x i64> <bf|dh>
SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, Odd0),
DAG.getBitcast(MulVT, Odd1)));
// Shuffle it back into the right order.
// For even i this selects element i + 1 of Mul1, for odd i element i of
// Mul2 (index i + NumElts): in both cases the odd i32 element, i.e. the
// upper half of the corresponding 64-bit product.
SmallVector<int, 16> ShufMask(NumElts);
for (int i = 0; i != (int)NumElts; ++i)
ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
// If we have a signed multiply but no PMULDQ fix up the result of an
// unsigned multiply.
if (IsSigned && !Subtarget.hasSSE41()) {
// Subtract B where A is negative and A where B is negative.
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
return Res;
// Only i8 vectors should need custom lowering after this.
assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) &&
"Unsupported vector type");
// Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
// logical shift down the upper half and pack back to i8.
// With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
// and then ashr/lshr the upper bits down to the lower bits before multiply.
unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
// If a single vector with twice-as-wide elements is available, extend,
// multiply, shift the high byte down and truncate.
if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
// For vXi8 we will unpack the low and high half of each 128 bit lane to widen
// to a vXi16 type. Do the multiplies, shift the results and pack the half
// lane results back together.
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Moves bytes 8..15 into the low 8 positions; the rest is undefined.
static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1};
// Extract the lo parts and zero/sign extend to i16.
// Only use SSE4.1 instructions for signed v16i8 where using unpack requires
// shifts to sign extend. Using unpack for unsigned only requires an xor to
// create zeros and a copy due to tied registers constraints pre-avx. But using
// zero_extend_vector_inreg would require an additional pshufd for the high
// part.
SDValue ALo, AHi;
if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
} else if (IsSigned) {
// Unpacking with A as the second source puts each byte of A in the upper
// byte of an i16; the arithmetic shift by 8 then sign-extends it.
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));
ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
} else {
// Unsigned: interleave with zero bytes, which zero-extends to i16.
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
DAG.getConstant(0, dl, VT)));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
DAG.getConstant(0, dl, VT)));
SDValue BLo, BHi;
if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
// If B (the second operand) is a constant build vector, manually
// unpackl/unpackh and extend.
SmallVector<SDValue, 16> LoOps, HiOps;
for (unsigned i = 0; i != NumElts; i += 16) {
for (unsigned j = 0; j != 8; ++j) {
SDValue LoOp = B.getOperand(i + j);
SDValue HiOp = B.getOperand(i + j + 8);
if (IsSigned) {
LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
} else {
LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
BLo = DAG.getBuildVector(ExVT, dl, LoOps);
BHi = DAG.getBuildVector(ExVT, dl, HiOps);
} else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
} else if (IsSigned) {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));
BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
} else {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
DAG.getConstant(0, dl, VT)));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
DAG.getConstant(0, dl, VT)));
// Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
// pack back to vXi8.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
// Bitcast back to VT and then pack all the even elements from Lo and Hi.
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
/// Lowers a 128-bit integer division/remainder node to a runtime library call
/// on Win64 targets.
///
/// The libcall is chosen from the opcode (SDIV/UDIV/SREM/UREM/SDIVREM/
/// UDIVREM). Each i128 operand is stored to its own 16-byte-aligned stack
/// temporary and the pointer to that slot is what is recorded in the argument
/// entry. The call's first result is bitcast to the node's value type.
///
/// NOTE(review): this copy is missing several lines (closing braces, the
/// Args.push_back, the remainder of the getExternalSymbol call and the
/// CallLoweringInfo setup); verify against upstream.
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget.isTargetWin64() && "Unexpected target");
EVT VT = Op.getValueType();
assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
"Unexpected return type for lowering");
RTLIB::Libcall LC;
// NOTE(review): isSigned is assigned below but not read in the lines visible
// here; it may be used by the missing call-setup lines.
bool isSigned;
switch (Op->getOpcode()) {
default: llvm_unreachable("Unexpected request for libcall!");
case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
SDLoc dl(Op);
SDValue InChain = DAG.getEntryNode();
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
EVT ArgVT = Op->getOperand(i).getValueType();
assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
"Unexpected argument type for lowering");
// Spill the operand to a 16-byte-aligned stack slot; the stores are chained
// through InChain.
SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
Entry.Node = StackPtr;
InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
MPI, /* Alignment = */ 16);
// The argument is described as a pointer to the i128 type, with no
// sign/zero extension.
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
Entry.IsSExt = false;
Entry.IsZExt = false;
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
TargetLowering::CallLoweringInfo CLI(DAG);
static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return DAG.getBitcast(VT, CallInfo.first);
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
//
// Rules implemented below:
//  - element sizes below 16 bits are never supported;
//  - 512-bit vectors need AVX512, and additionally BWI for 16-bit elements;
//  - logical shifts need SSE2 for 128-bit and Int256 for 256-bit vectors;
//  - arithmetic shifts (ISD::SRA) additionally exclude v2i64/v4i64 unless
//    AVX512 is available.
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (VT.getScalarSizeInBits() < 16)
return false;
if (VT.is512BitVector() && Subtarget.hasAVX512() &&
(VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
return true;
bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256());
bool AShift = LShift && (Subtarget.hasAVX512() ||
(VT != MVT::v2i64 && VT != MVT::v4i64));
// Any opcode other than SRA is answered with the logical-shift result.
return (Opcode == ISD::SRA) ? AShift : LShift;
// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
//
// Returns exactly what SupportedVectorShiftWithImm returns for the same
// arguments.
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget
//
// Rules implemented below:
//  - requires Int256 and elements of at least 16 bits;
//  - 16-bit elements additionally require BWI;
//  - with AVX512 everything that passed the checks above is supported;
//  - otherwise only 128/256-bit vectors, and arithmetic shifts (ISD::SRA)
//    exclude v2i64/v4i64.
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
return false;
// vXi16 supported only on AVX-512, BWI
if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
return false;
if (Subtarget.hasAVX512())
return true;
bool LShift = VT.is128BitVector() || VT.is256BitVector();
bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
return (Opcode == ISD::SRA) ? AShift : LShift;
/// Lowers a vector shift whose amount is a constant splat (the same immediate
/// for every lane).
///
/// Returns an empty SDValue when the amount is not a constant splat or when
/// no strategy here applies, UNDEF when the amount is >= the element width,
/// and otherwise the lowered shift:
///  - the native shift-by-immediate node when supported;
///  - v2i64/v4i64 SRA built from 32-bit shifts (ArithmeticShiftRight64);
///  - vXi8 shifts built from vXi16 shifts plus masking.
///
/// NOTE(review): closing braces are not present in this copy, so the exact
/// extent of the lambda and of the vXi8 block should be confirmed upstream.
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
// Helper: emulate a 64-bit arithmetic shift right by ShiftAmt by working on
// the value reinterpreted as twice as many i32 elements.
auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
SDValue Ex = DAG.getBitcast(ExVT, R);
// ashr(R, 63) === cmp_slt(R, 0)
if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
"Unsupported PCMPGT op");
return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
if (ShiftAmt >= 32) {
// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
SDValue Upper =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
ShiftAmt - 32, DAG);
// Each result i64 takes its low i32 from the odd element of Lower and its
// high i32 from the odd element of Upper.
if (VT == MVT::v2i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
if (VT == MVT::v4i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
{9, 1, 11, 3, 13, 5, 15, 7});
} else {
// SRA upper i32, SRL whole i64 and select lower i32.
SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
ShiftAmt, DAG);
SDValue Lower =
getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
Lower = DAG.getBitcast(ExVT, Lower);
// Even (low) i32 elements come from Lower, odd (high) ones from Upper.
if (VT == MVT::v2i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
if (VT == MVT::v4i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
{8, 1, 10, 3, 12, 5, 14, 7});
return DAG.getBitcast(VT, Ex);
// Optimize shl/srl/sra with constant shift amount.
APInt APIntShiftAmt;
if (!X86::isConstantSplat(Amt, APIntShiftAmt))
return SDValue();
// If the shift amount is out of range, return undef.
if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
return DAG.getUNDEF(VT);
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
// i64 SRA needs to be performed as partial shifts.
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
(Subtarget.hasInt256() && VT == MVT::v4i64)) &&
Op.getOpcode() == ISD::SRA)
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
(Subtarget.hasBWI() && VT == MVT::v64i8)) {
unsigned NumElts = VT.getVectorNumElements();
// i8 shifts are performed on the same bits viewed as i16 elements.
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Simple i8 add case
if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
return DAG.getNode(ISD::ADD, dl, VT, R, R);
// ashr(R, 7) === cmp_slt(R, 0)
if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
SDValue Zeros = DAG.getConstant(0, dl, VT);
if (VT.is512BitVector()) {
// 512-bit: compare into a v64i1 mask and sign-extend it back.
assert(VT == MVT::v64i8 && "Unexpected element type!");
SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
// XOP can shift v16i8 directly instead of as shift v8i16 + mask.
if (VT == MVT::v16i8 && Subtarget.hasXOP())
return SDValue();
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
ShiftAmt, DAG);
SHL = DAG.getBitcast(VT, SHL);
// Zero out the rightmost bits.
// (Clears the bits shifted in from the neighbouring byte.)
APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
ShiftAmt, DAG);
SRL = DAG.getBitcast(VT, SRL);
// Zero out the leftmost bits.
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
if (Op.getOpcode() == ISD::SRA) {
// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
// Mask is the position the sign bit (0x80) lands on after the shift.
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
llvm_unreachable("Unknown shift opcode.");
return SDValue();
/// Lowers a vector shift whose (non-constant) amount is the same in every
/// lane, i.e. Amt is a splat of one scalar value.
///
/// Returns the lowered shift, or an empty SDValue if no strategy applies:
///  - if the uniform-amount shift is supported for VT, the splatted scalar is
///    passed to getTargetVShiftNode;
///  - selected vXi8 types without XOP are shifted as vXi16 and masked;
///  - for v2i64 whose amount is a bitcast BUILD_VECTOR of narrower elements,
///    the per-i64 groups are checked for equality before using the
///    vector-amount opcode.
///
/// NOTE(review): closing braces are not present in this copy; confirm block
/// nesting against upstream.
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
MVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
// Widen the scalar amount: element types between i32 and i64 (exclusive)
// go to i64, element types narrower than i32 go to i32.
if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
else if (EltVT.bitsLT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
// vXi8 shifts - shift as v8i16 + mask result.
if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
(VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
VT == MVT::v64i8) &&
!Subtarget.hasXOP()) {
unsigned NumElts = VT.getVectorNumElements();
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
// SRA is performed as a logical right shift and fixed up afterwards.
unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
// Create the mask using vXi16 shifts. For shift-rights we need to move
// the upper byte down before splatting the vXi8 mask.
SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
BaseShAmt, Subtarget, DAG);
if (Opcode != ISD::SHL)
BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
8, DAG);
BitMask = DAG.getBitcast(VT, BitMask);
// Broadcast byte 0 of the mask to every byte.
BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
SmallVector<int, 64>(NumElts, 0));
// Shift as i16 and clear the bits that crossed a byte boundary.
SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
DAG.getBitcast(ExtVT, R), BaseShAmt,
Subtarget, DAG);
Res = DAG.getBitcast(VT, Res);
Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
if (Opcode == ISD::SRA) {
// ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
// SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
BaseShAmt, Subtarget, DAG);
SignMask = DAG.getBitcast(VT, SignMask);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
return Res;
// Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
// Ratio = number of BUILD_VECTOR operands that make up one i64 amount.
unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
std::vector<SDValue> Vals(Ratio);
for (unsigned i = 0; i != Ratio; ++i)
Vals[i] = Amt.getOperand(i);
// Every following group of Ratio operands must match the first group,
// otherwise the amount is not uniform and we give up.
for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
for (unsigned j = 0; j != Ratio; ++j)
if (Vals[j] != Amt.getOperand(i + j))
return SDValue();
// Note: the original (bitcast) amount operand is used for the node.
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
return SDValue();
// Convert a shift/rotate left amount to a multiplication scale factor.
//
// Given a vector of shift amounts Amt, builds a vector whose elements are
// (1 << Amt[i]), so that a left shift can be performed as a multiply.
// Returns an empty SDValue when the type is not handled: only v8i16, v4i32,
// v16i16 (with Int256) and v16i8 (without AVX512) pass the initial check.
//
// NOTE(review): this copy is missing closing braces and some loop statements
// (the bodies of the `isUndef` and `ShAmt >= SVTBits` branches are not
// visible); verify against upstream.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Amt.getSimpleValueType();
if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
(Subtarget.hasInt256() && VT == MVT::v16i16) ||
(!Subtarget.hasAVX512() && VT == MVT::v16i8)))
return SDValue();
// Constant amounts: compute each (1 << amount) directly.
if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
SmallVector<SDValue, 8> Elts;
MVT SVT = VT.getVectorElementType();
unsigned SVTBits = SVT.getSizeInBits();
APInt One(SVTBits, 1);
unsigned NumElems = VT.getVectorNumElements();
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Op = Amt->getOperand(i);
if (Op->isUndef()) {
ConstantSDNode *ND = cast<ConstantSDNode>(Op);
APInt C(SVTBits, ND->getZExtValue());
uint64_t ShAmt = C.getZExtValue();
if (ShAmt >= SVTBits) {
Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
return DAG.getBuildVector(VT, dl, Elts);
// If the target doesn't support variable shifts, use either FP conversion
// or integer multiplication to avoid shifting each element individually.
if (VT == MVT::v4i32) {
// Build the float 2^Amt by placing Amt in the exponent field (bit 23) of
// 1.0f (0x3f800000), then convert that float back to an integer.
Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
DAG.getConstant(0x3f800000U, dl, VT));
Amt = DAG.getBitcast(MVT::v4f32, Amt);
return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
// AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
// Zero-extend the two halves to v4i32 by interleaving with zeros, recurse
// on each half, then narrow the results back to v8i16.
SDValue Z = DAG.getConstant(0, dl, VT);
SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
if (Subtarget.hasSSE41())
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
// Without SSE4.1, pick the even i16 elements (the low halves of each i32).
return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
DAG.getBitcast(VT, Hi),
{0, 2, 4, 6, 8, 10, 12, 14});
return SDValue();
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
unsigned Opc = Op.getOpcode();
unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
assert(VT.isVector() && "Custom lowering only for vector shifts!");
assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
return V;
if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
return V;
if (SupportedVectorVarShift(VT, Subtarget, Opc))
return Op;
// XOP has 128-bit variable logical/arithmetic shifts.
// +ve/-ve Amt = shift left/right.
if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
VT == MVT::v8i16 || VT == MVT::v16i8)) {
if (Opc == ISD::SRL || Opc == ISD::SRA) {
SDValue Zero = DAG.getConstant(0, dl, VT);
Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
if (Opc == ISD::SHL || Opc == ISD::SRL)
return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
if (Opc == ISD::SRA)
return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
// 2i64 vector logical shifts can efficiently avoid scalarization - do the
// shifts per-lane and then shuffle the partial results back together.
if (VT == MVT::v2i64 && Opc != ISD::SRA) {
// Splat the shift amounts so the scalar shifts above will catch it.
SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
// i64 vector arithmetic shift can be emulated with the transform:
// M = lshr(SIGN_MASK, Amt)
// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
Opc == ISD::SRA) {
SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
R = DAG.getNode(ISD::XOR, dl, VT, R, M);
R = DAG.getNode(ISD::SUB, dl, VT, R, M);
return R;
// If possible, lower this shift as a sequence of two shifts by
// constant plus a BLENDing shuffle instead of scalarizing it.
// Example:
// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
// Could be rewritten as:
// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
// The advantage is that the two shifts from the example would be
// lowered as X86ISD::VSRLI nodes in parallel before blending.
if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
(VT == MVT::v16i16 && Subtarget.hasInt256()))) {
SDValue Amt1, Amt2;
unsigned NumElts = VT.getVectorNumElements();
SmallVector<int, 8> ShuffleMask;
for (unsigned i = 0; i != NumElts; ++i) {
SDValue A = Amt->getOperand(i);
if (A.isUndef()) {
if (!Amt1 || Amt1 == A) {
Amt1 = A;
if (!Amt2 || Amt2 == A) {
ShuffleMask.push_back(i + NumElts);
Amt2 = A;
// Only perform this blend if we can perform it without loading a mask.
if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
(VT != MVT::v16i16 ||
is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
(VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
canWidenShuffleElements(ShuffleMask))) {
auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
Cst2->getAPIntValue().ult(EltSizeInBits)) {
SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
Cst1->getZExtValue(), DAG);
SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
Cst2->getZExtValue(), DAG);
return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
if (Opc == ISD::SHL)
if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
// Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
// can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
if (Opc == ISD::SRL && ConstantAmt &&
(VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
SDValue Zero = DAG.getConstant(0, dl, VT);
SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
return DAG.getSelect(dl, VT, ZAmt, R, Res);
// Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
// can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
// TODO: Special case handling for shift by 0/1, really we can afford either
// of these cases in pre-SSE41/XOP/AVX512 but not both.
if (Opc == ISD::SRA && ConstantAmt &&
(VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
!Subtarget.hasAVX512()) ||
DAG.isKnownNeverZero(Amt))) {
SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
SDValue Amt0 =
DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
SDValue Amt1 =
DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
SDValue Sra1 =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
Res = DAG.getSelect(dl, VT, Amt0, R, Res);
return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
// v4i32 Non Uniform Shifts.
// If the shift amount is constant we can shift each lane using the SSE2
// immediate shifts, else we need to zero-extend each lane to the lower i64
// and shift using the SSE2 variable shifts.
// The separate results can then be blended together.
if (VT == MVT::v4i32) {
SDValue Amt0, Amt1, Amt2, Amt3;
if (ConstantAmt) {
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
} else {
// The SSE2 shifts use the lower i64 as the same shift amount for
// all lanes and the upper i64 is ignored. On AVX we're better off
// just zero-extending, but for SSE just duplicating the top 16-bits is
// cheaper and has the same effect for out of range values.
if (Subtarget.hasAVX()) {
SDValue Z = DAG.getConstant(0, dl, VT);
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
} else {
SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{4, 5, 6, 7, -1, -1, -1, -1});
Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{0, 1, 1, 1, -1, -1, -1, -1});
Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
{2, 3, 3, 3, -1, -1, -1, -1});
Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
{0, 1, 1, 1, -1, -1, -1, -1});
Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
{2, 3, 3, 3, -1, -1, -1, -1});
unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
// Merge the shifted lane results optimally with/without PBLENDW.
// TODO - ideally shuffle combining would handle this.
if (Subtarget.hasSSE41()) {
SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
// It's worth extending once and using the vXi16/vXi32 shifts for smaller
// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
// make the existing SSE solution better.
// NOTE: We honor the preferred vector width before promoting to 512-bits.
if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
(Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
(Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
(Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
"Unexpected vector type");
MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
R = DAG.getNode(ExtOpc, dl, ExtVT, R);
Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(Opc, dl, ExtVT, R, Amt));
// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
(VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) &&
!Subtarget.hasXOP()) {
int NumElts = VT.getVectorNumElements();
SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
// Extend constant shift amount to vXi16 (it doesn't matter if the type
// isn't legal).
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
"Constant build vector expected");
if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
: DAG.getZExtOrTrunc(R, dl, ExVT);
R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
return DAG.getZExtOrTrunc(R, dl, VT);
SmallVector<SDValue, 16> LoAmt, HiAmt;
for (int i = 0; i != NumElts; i += 16) {
for (int j = 0; j != 8; ++j) {
LoAmt.push_back(Amt.getOperand(i + j));
HiAmt.push_back(Amt.getOperand(i + j + 8));
MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
if (VT == MVT::v16i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (VT.is512BitVector()) {
// On AVX512BW targets we make use of the fact that VSELECT lowers
// to a masked blend which selects bytes based just on the sign bit
// extracted to a mask.
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
// On SSE41 targets we can use PBLENDVB which selects bytes based just
// on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
return DAG.getBitcast(SelVT,
DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = DAG.getConstant(0, dl, SelVT);
SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
return DAG.getSelect(dl, SelVT, C, V0, V1);
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
Amt = DAG.getBitcast(ExtVT, Amt);
Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
Amt = DAG.getBitcast(VT, Amt);
if (Opc == ISD::SHL || Opc == ISD::SRL) {
// r = VSELECT(r, shift(r, 4), a);
SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 2), a);
M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// return VSELECT(r, shift(r, 1), a);
M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
return R;
if (Opc == ISD::SRA) {
// For SRA we need to unpack each byte to the higher byte of a i16 vector
// so we can correctly sign extend. We don't care what happens to the
// lower byte.
SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
RHi = DAG.getBitcast(ExtVT, RHi);
// r = VSELECT(r, shift(r, 4), a);
SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// a += a
ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
// r = VSELECT(r, shift(r, 2), a);
MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// a += a
ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
// r = VSELECT(r, shift(r, 1), a);
MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// Logical shift the result back to the lower byte, leaving a zero upper
// byte meaning that we can safely pack with PACKUSWB.
RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
MVT ExtVT = MVT::v8i32;
SDValue Z = DAG.getConstant(0, dl, VT);
SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
RHi = DAG.getBitcast(ExtVT, RHi);
SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
if (VT == MVT::v8i16) {
// If we have a constant shift amount, the non-SSE41 path is best as
// avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
bool UseSSE41 = Subtarget.hasSSE41() &&
auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
// On SSE41 targets we can use PBLENDVB which selects bytes based just on
// the sign bit.
if (UseSSE41) {
MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
// its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue C =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
return DAG.getSelect(dl, VT, C, V0, V1);
// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
if (UseSSE41) {
// On SSE41 targets we need to replicate the shift mask in both
// bytes for PBLENDVB.
Amt = DAG.getNode(
ISD::OR, dl, VT,
getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
} else {
Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
// r = VSELECT(r, shift(r, 8), a);
SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 4), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 2), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// return VSELECT(r, shift(r, 1), a);
M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
R = SignBitSelect(Amt, M, R);
return R;
// Decompose 256-bit shifts into 128-bit shifts.
if (VT.is256BitVector())
return splitVectorIntBinary(Op, DAG);
if (VT == MVT::v32i16 || VT == MVT::v64i8)
return splitVectorIntBinary(Op, DAG);
return SDValue();
/// Custom-lower a vector rotate node.
///
/// \param Op        The rotate node. Operand 0 is the value being rotated and
///                  operand 1 is the rotate amount.
/// \param Subtarget Queried for AVX512 / XOP / AVX2 / SSE4.1 availability to
///                  pick a lowering strategy.
/// \param DAG       The DAG used to create the replacement nodes.
/// \returns the replacement value, \p Op itself on the paths that keep the
///          node unchanged, or an empty SDValue on the paths that decline to
///          custom-lower (constant splat amounts and constant vXi8 amounts).
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.isVector() && "Custom lowering only for vector rotates!");
SDLoc DL(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
int NumElts = VT.getVectorNumElements();
// Check for constant splat rotation amount.
APInt CstSplatValue;
bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
// Check for splat rotate by zero (modulo the element width): the input is
// returned unchanged.
if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
return R;
// AVX512 implicitly uses modulo rotation amounts.
// Only elements of 32 bits or wider take this path.
if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
if (IsCstSplat) {
unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
return DAG.getNode(RotOpc, DL, VT, R,
DAG.getTargetConstant(RotAmt, DL, MVT::i8));
// Else, fall-back on VPROLV/VPRORV.
return Op;
// Everything past this point handles ISD::ROTL only.
assert((Opcode == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
// +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
// XOP implicitly uses modulo rotation amounts.
if (Subtarget.hasXOP()) {
if (VT.is256BitVector())
return splitVectorIntBinary(Op, DAG);
assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
// Attempt to rotate by immediate.
if (IsCstSplat) {
uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
DAG.getTargetConstant(RotAmt, DL, MVT::i8));
// Use general rotate by variable (per-element).
return Op;
// Split 256-bit integers on pre-AVX2 targets.
if (VT.is256BitVector() && !Subtarget.hasAVX2())
return splitVectorIntBinary(Op, DAG);
assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
Subtarget.hasAVX2())) &&
"Only vXi32/vXi16/vXi8 vector rotates supported");
// Rotate by a uniform constant - expand back to shifts (signalled by
// returning an empty SDValue).
if (IsCstSplat)
return SDValue();
bool IsSplatAmt = DAG.isSplatValue(Amt);
// v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
// the amount bit.
if (EltSizeInBits == 8 && !IsSplatAmt) {
// Constant (non-splat) amounts are not handled by this staged lowering.
if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
return SDValue();
// We don't need ModuloAmt here as we just peek at individual bits.
MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Helper: pick V0 or V1 per element based on the sign bit of Sel.
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (Subtarget.hasSSE41()) {
// On SSE41 targets we can use PBLENDVB which selects bytes based just
// on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
return DAG.getBitcast(SelVT,
DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = DAG.getConstant(0, DL, SelVT);
SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
return DAG.getSelect(DL, SelVT, C, V0, V1);
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
Amt = DAG.getBitcast(ExtVT, Amt);
Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
Amt = DAG.getBitcast(VT, Amt);
// r = VSELECT(r, rot(r, 4), a);
// NOTE(review): in the three getNode calls below, the leading opcode/type
// arguments are not visible in this excerpt; presumably the SHL and SRL
// halves are OR'd together to form the rotate - confirm.
SDValue M;
M = DAG.getNode(
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a (moves the next amount bit into the sign-bit position)
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// r = VSELECT(r, rot(r, 2), a);
M = DAG.getNode(
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
// return VSELECT(r, rot(r, 1), a);
M = DAG.getNode(
DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
return SignBitSelect(VT, Amt, M, R);
// ISD::ROT* uses modulo rotate amounts.
Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
DAG.getConstant(EltSizeInBits - 1, DL, VT));
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
// Fallback for splats + all supported variable shifts.
// Fallback for non-constants AVX2 vXi16 as well.
// The rotate is built as (R << Amt) | (R >> (EltSizeInBits - Amt)).
if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
// As with shifts, convert the rotation amount to a multiplication factor.
SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
assert(Scale && "Failed to convert ROTL amount to scale");
// v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
if (EltSizeInBits == 16) {
SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
// v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
// to v2i64 results at a time. The upper 32-bits contain the wrapped bits
// that can then be OR'd with the lower 32-bits.
assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
// Shuffle the odd lanes (1 and 3) into the even positions for the second
// PMULUDQ.
static const int OddMask[] = {1, -1, 3, -1};
SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, R),
DAG.getBitcast(MVT::v2i64, Scale));
SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
DAG.getBitcast(MVT::v2i64, R13),
DAG.getBitcast(MVT::v2i64, Scale13));
Res02 = DAG.getBitcast(VT, Res02);
Res13 = DAG.getBitcast(VT, Res13);
// Gather the low 32-bit halves and the high 32-bit halves of the four
// products into lane order, then OR them.
return DAG.getNode(ISD::OR, DL, VT,
DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
///
/// \param MemType The type of the memory operand; only its primitive size in
///                bits is inspected.
/// \returns true for a 64-bit type when cmpxchg8b is available and the target
///          is not 64-bit, true for a 128-bit type when cmpxchg16b is
///          available, and false for every other width.
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
// 64-bit operands only need cmpxchg8b when 64 bits is wider than native.
if (OpWidth == 64)
return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
if (OpWidth == 128)
return Subtarget.hasCmpxchg16b();
return false;
/// Decide whether an atomic store should be expanded at the IR level.
///
/// \param SI The store instruction; only the type of its value operand is
///           inspected here.
/// \returns false for a 64-bit store on a non-64-bit target when soft-float
///          is off, NoImplicitFloatOps is false and SSE1 or X87 is available;
///          otherwise the result of needsCmpXchgNb() for the stored type.
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
Type *MemType = SI->getValueOperand()->getType();
// NOTE(review): the initializer of NoImplicitFloatOps is not visible in this
// excerpt; presumably it reflects a "no implicit float" function attribute -
// confirm against the full source.
bool NoImplicitFloatOps =
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
(Subtarget.hasSSE1() || Subtarget.hasX87()))
return false;
return needsCmpXchgNb(MemType);
// Note: this turns large loads into lock cmpxchg8b/16b.
// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
//
// Returns AtomicExpansionKind::None for a 64-bit load on a non-64-bit target
// when soft-float is off, NoImplicitFloatOps is false and SSE1 or X87 is
// available. Otherwise returns CmpXChg when needsCmpXchgNb() holds for the
// loaded type and None when it does not.
// NOTE(review): the return-type line of this definition is not visible in
// this excerpt.
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
Type *MemType = LI->getType();
// If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
// can use movq to do the load. If we have X87 we can load into an 80-bit
// X87 register and store it to a stack temporary.
// NOTE(review): the comment above says SSE2, but the condition below tests
// hasSSE1() - confirm which is intended.
// NOTE(review): the initializer of NoImplicitFloatOps is not visible in this
// excerpt.
bool NoImplicitFloatOps =
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
(Subtarget.hasSSE1() || Subtarget.hasX87()))
return AtomicExpansionKind::None;
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
// Choose how an atomicrmw instruction is expanded at the IR level.
//
// Operands wider than the native width (64 bits on 64-bit targets, else 32)
// use a cmpxchg loop when needsCmpXchgNb() holds and are otherwise left
// alone. For native-width operands the decision depends on the operation;
// see the switch below.
// NOTE(review): the return-type line of this definition is not visible in
// this excerpt.
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
// and default to library calls otherwise.
if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
AtomicRMWInst::BinOp Op = AI->getOperation();
switch (Op) {
// NOTE(review): the label guarding this llvm_unreachable is not visible in
// this excerpt; presumably it is the default case - confirm.
llvm_unreachable("Unknown atomic operation");
case AtomicRMWInst::Xchg:
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
// It's better to use xadd, xsub or xchg for these in all cases.
return AtomicExpansionKind::None;
case AtomicRMWInst::Or:
case AtomicRMWInst::And:
case AtomicRMWInst::Xor:
// If the atomicrmw's result isn't actually used, we can just add a "lock"
// prefix to a normal instruction for these operations.
// When the result is used, fall back to a cmpxchg loop.
return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
case AtomicRMWInst::Nand:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
case AtomicRMWInst::FAdd:
case AtomicRMWInst::FSub:
// These always require a non-trivial set of data operations on x86. We must
// use a cmpxchg loop.
return AtomicExpansionKind::CmpXChg;
/// Replace an idempotent atomicrmw with an mfence followed by an atomic load.
///
/// \param AI The idempotent atomicrmw instruction.
/// \returns the newly created atomic load, or nullptr when the transformation
///          is not applied (operand wider than the native width, the
///          canonical "or 0" form handled elsewhere, single-thread sync
///          scope, or no mfence on the subtarget).
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
// harmful as it introduces a mfence.
if (MemType->getPrimitiveSizeInBits() > NativeWidth)
return nullptr;
// If this is a canonical idempotent atomicrmw w/no uses, we have a better
// lowering available in lowerAtomicArith.
// TODO: push more cases through this path.
// NOTE(review): the last operand of the condition below is not visible in
// this excerpt; going by the comment above it presumably checks that AI has
// no uses - confirm.
if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
return nullptr;
IRBuilder<> Builder(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
// ReleaseAcquire orderings.
auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
// Before the load we need a fence. Here is an example lifted from
// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
// is required:
// Thread 0:
//   x.store(1, relaxed);
//   r1 = y.fetch_add(0, release);
// Thread 1:
//   y.fetch_add(42, acquire);
//   r2 = x.load(relaxed);
// r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
// lowered to just a load without a fence. A mfence flushes the store buffer,
// making the optimization clearly correct.
// FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
// otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
if (SSID == SyncScope::SingleThread)
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
if (!Subtarget.hasMFence())
// FIXME: it might make sense to use a locked operation here but on a
// different cache-line to prevent cache-line bouncing. In practice it
// is probably a small win, and x86 processors without mfence are rare
// enough that we do not bother.
return nullptr;
// Emit the mfence via the x86_sse2_mfence intrinsic, ahead of the load.
Function *MFence =
llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
Builder.CreateCall(MFence, {});
// Finally we can emit the atomic load.
// NOTE(review): the trailing argument(s) of the CreateAlignedLoad call are
// not visible in this excerpt (presumably the alignment) - confirm.
LoadInst *Loaded =
Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
Loaded->setAtomic(Order, SSID);
return Loaded;
/// Whether an atomic store may be lowered as an ordinary store node.
///
/// \param SI The store instruction being lowered.
/// \returns false unless \p SI is unordered; for unordered stores, the value
///          of ExperimentalUnorderedISEL.
bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
if (!SI.isUnordered())
return false;
return ExperimentalUnorderedISEL;
/// Whether an atomic load may be lowered as an ordinary load node.
///
/// \param LI The load instruction being lowered.
/// \returns false unless \p LI is unordered; for unordered loads, the value
///          of ExperimentalUnorderedISEL.
bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
if (!LI.isUnordered())
return false;
return ExperimentalUnorderedISEL;
/// Emit a locked operation on a stack location which does not change any
/// memory location, but does involve a lock prefix. Location is chosen to be
/// a) very likely accessed only by a single thread to minimize cache traffic,
/// and b) definitely dereferenceable. Returns the new Chain result.
///
/// \param DAG       The DAG used to create the machine node.
/// \param Subtarget Selects the 64-bit (RSP) or 32-bit (ESP) addressing form
///                  and supplies the frame lowering used for the red-zone
///                  query.
/// \param Chain     The incoming chain.
/// \param DL        Debug location for the new nodes.
static SDValue emitLockedStackOp(SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SDValue Chain, SDLoc DL) {
// Implementation notes:
// 1) LOCK prefix creates a full read/write reordering barrier for memory
// operations issued by the current processor. As such, the location
// referenced is not relevant for the ordering properties of the instruction.
// See: Intel 64 and IA-32 Architectures Software Developer's Manual,
// section "Loads and Stores Are Not Reordered with Locked Instructions".
// 2) Using an immediate operand appears to be the best encoding choice
// here since it doesn't require an extra register.
// 3) OR appears to be very slightly faster than ADD. (Though, the difference
// is small enough it might just be measurement noise.)
// 4) When choosing offsets, there are several contributing factors:
// a) If there's no redzone, we default to TOS. (We could allocate a cache
// line aligned stack object to improve this case.)
// b) To minimize our chances of introducing a false dependence, we prefer
// to offset the stack usage from TOS slightly.
// c) To minimize concerns about cross thread stack usage - in particular,
// the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
// captures state in the TOS frame and accesses it from many threads -
// we want to use an offset such that the offset is in a distinct cache
// line from the TOS frame.
// For a general discussion of the tradeoffs and benchmark results, see:
// https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
auto &MF = DAG.getMachineFunction();
auto &TFL = *Subtarget.getFrameLowering();
// NOTE(review): -64 is assigned to an unsigned here, so the stored value
// wraps; it is used below as an i32 displacement constant.
const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
if (Subtarget.is64Bit()) {
SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
// Memory operand pieces for the locked OR, addressed relative to RSP.
// NOTE(review): the tail of this initializer list is not visible in this
// excerpt; presumably Zero (the immediate) and Chain follow - confirm.
SDValue Ops[] = {
DAG.getRegister(X86::RSP, MVT::i64), // Base
DAG.getTargetConstant(1, DL, MVT::i8), // Scale
DAG.getRegister(0, MVT::i64), // Index
DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
DAG.getRegister(0, MVT::i16), // Segment.
SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
MVT::Other, Ops);
// Result 1 is the MVT::Other (chain) value of the machine node.
return SDValue(Res, 1);
// 32-bit targets: same operation, addressed relative to ESP.
SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
// NOTE(review): as above, the tail of this initializer list is not visible
// in this excerpt.
SDValue Ops[] = {
DAG.getRegister(X86::ESP, MVT::i32), // Base
DAG.getTargetConstant(1, DL, MVT::i8), // Scale
DAG.getRegister(0, MVT::i32), // Index
DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
DAG.getRegister(0, MVT::i16), // Segment.
SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
MVT::Other, Ops);
return SDValue(Res, 1);
/// Lower an atomic fence node.
///
/// A sequentially-consistent, system-scope fence becomes X86ISD::MFENCE when
/// the subtarget has mfence, and a locked stack operation otherwise. Every
/// other fence becomes X86ISD::MEMBARRIER.
///
/// \param Op        The fence node; operand 0 is used as the chain.
/// \param Subtarget Queried for mfence availability.
/// \param DAG       The DAG used to create the replacement node.
/// \returns the chain value of the replacement node.
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
// NOTE(review): the initializers of FenceOrdering and FenceSSID are not
// visible in this excerpt; presumably they are decoded from the operands of
// Op - confirm.
AtomicOrdering FenceOrdering =
SyncScope::ID FenceSSID =
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
FenceSSID == SyncScope::System) {
if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
// No mfence: fall back to a locked operation on the stack.
SDValue Chain = Op.getOperand(0);
return emitLockedStackOp(DAG, Subtarget, Chain, dl);
// MEMBARRIER is a compiler barrier; it codegens to a no-op.
return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
/// Lower a compare-and-swap node to X86ISD::LCMPXCHG_DAG.
///
/// Operand 2 of \p Op is copied into the accumulator register matching the
/// value type (AL/AX/EAX/RAX). After the exchange, that register and EFLAGS
/// are read back.
///
/// \param Op        The compare-and-swap node; it must be an AtomicSDNode,
///                  since its memory operand is reused.
/// \param Subtarget Only consulted to assert that i64 is used on 64-bit
///                  targets.
/// \param DAG       The DAG used to create the replacement nodes.
/// \returns a MERGE_VALUES node of the value read back from the accumulator,
///          the COND_E success flag, and the outgoing chain.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT T = Op.getSimpleValueType();
SDLoc DL(Op);
unsigned Reg = 0;
unsigned size = 0;
// Pick the accumulator register and operand size in bytes for the type.
switch(T.SimpleTy) {
default: llvm_unreachable("Invalid value type!");
case MVT::i8: Reg = X86::AL; size = 1; break;
case MVT::i16: Reg = X86::AX; size = 2; break;
case MVT::i32: Reg = X86::EAX; size = 4; break;
case MVT::i64:
assert(Subtarget.is64Bit() && "Node not type legal!");
Reg = X86::RAX; size = 8;
// NOTE(review): the end of the switch is not visible in this excerpt.
// Copy operand 2 into the accumulator; the glue result ties the copy to the
// exchange node.
SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
Op.getOperand(2), SDValue());
// NOTE(review): some entries of this operand list are not visible in this
// excerpt (presumably the remaining operands of Op) - confirm.
SDValue Ops[] = { cpIn.getValue(0),
DAG.getTargetConstant(size, DL, MVT::i8),
cpIn.getValue(1) };
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
Ops, T, MMO);
// Read the accumulator back after the exchange.
SDValue cpOut =
DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
// Read EFLAGS; the COND_E condition is used as the success flag.
SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
MVT::i32, cpOut.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
cpOut, Success, EFLAGS.getValue(1));
// Create MOVMSKB, taking into account whether we need to split for AVX1.
//
// - v64i8 input: split in half, recurse on each half, and combine the two
//   results into an i64 with the high half shifted left by 32.
// - v32i8 input without Int256: split in half, emit an i32 MOVMSK per half,
//   and combine with the high half shifted left by 16.
// - anything else: a single i32 X86ISD::MOVMSK of V.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT InVT = V.getSimpleValueType();
if (InVT == MVT::v64i8) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
// Lo is zero-extended so its upper 32 bits are clear before the OR. Hi is
// any-extended and then shifted left by 32.
Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
DAG.getConstant(32, DL, MVT::i8));
return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
// Place the high half's mask bits above the low half's before the OR.
Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
DAG.getConstant(16, DL, MVT::i8));
return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
/// Custom-lower an ISD::BITCAST node.
///
/// Three groups of conversions are handled, in this order:
///  1. i64 -> v64i1 (asserted to be 32-bit mode with BWI): split into two i32
///     halves, bitcast each to v32i1, and concatenate.
///  2. v16i1/v32i1 -> scalar integer (asserted to be a non-AVX512 target):
///     sign-extend to a byte vector, take its MOVMSK via getPMOVMSKB, then
///     zero-extend or truncate to the destination type.
///  3. v2i32/v4i16/v8i8/i64 sources (SSE2 asserted): only i64 -> f64 and
///     vector -> x86mmx are lowered here; every other destination returns an
///     empty SDValue so that the conversion gets expanded.
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
// Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
// half to v32i1 and concatenating the result.
if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
assert(Subtarget.hasBWI() && "Expected BWI target");
SDLoc dl(Op);
// EXTRACT_ELEMENT index 0 / 1 selects the two i32 halves of the i64 source.
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(0, dl));
Lo = DAG.getBitcast(MVT::v32i1, Lo);
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
DAG.getIntPtrConstant(1, dl));
Hi = DAG.getBitcast(MVT::v32i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
// Use MOVMSK for vector to scalar conversion to prevent scalarization.
if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
// Widen each i1 lane to a full byte so the byte-wise MOVMSK can read it.
MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
SDLoc DL(Op);
SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
V = getPMOVMSKB(DL, V, DAG, Subtarget);
return DAG.getZExtOrTrunc(V, DL, DstVT);
assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
SrcVT == MVT::i64) && "Unexpected VT!");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
!(DstVT == MVT::x86mmx && SrcVT.isVector()))
// This conversion needs to be expanded.
return SDValue();
SDLoc dl(Op);
if (SrcVT.isVector()) {
// Widen the vector in input in the case of MVT::v2i32.
// Example: from MVT::v2i32 to MVT::v4i32.
MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
SrcVT.getVectorNumElements() * 2);
// NOTE(review): the final argument of this CONCAT_VECTORS call is not
// visible in this listing; presumably the second half is undef - confirm
// against the full source.
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
} else {
assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
"Unexpected source type in LowerBITCAST");
Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
// View the widened source as a two-element 64-bit vector whose element kind
// (f64 vs. i64) matches the destination.
MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
if (DstVT == MVT::x86mmx)
return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
// Otherwise the result is element 0 of the widened vector.
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
DAG.getIntPtrConstant(0, dl));
/// Compute the horizontal sum of bytes in V for the elements of VT.
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
///
/// \param V          Byte vector to sum; asserted to have i8 elements and the
///                   same total bit width as VT.
/// \param VT         Result vector type; its element type must be i64, i32 or
///                   i16 (i8 is rejected by an assertion).
/// \param Subtarget  Not referenced by the code visible in this block.
/// \param DAG        DAG in which the new nodes are created.
/// \returns A value of type VT holding the per-element byte sums.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(V);
MVT ByteVecVT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
"Expected value to have byte element type.");
assert(EltVT != MVT::i8 &&
"Horizontal byte sum only makes sense for wider elements!");
unsigned VecSize = VT.getSizeInBits();
assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
// The PSADBW instruction horizontally adds all bytes and leaves the result
// in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
if (EltVT == MVT::i64) {
// PSADBW against an all-zero vector reduces to a plain sum of the bytes.
SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
return DAG.getBitcast(VT, V);
if (EltVT == MVT::i32) {
// We unpack the low half and high half into i32s interleaved with zeros so
// that we can use PSADBW to horizontally sum them. The most useful part of
// this is that it lines up the results of two PSADBW instructions to be
// two v2i64 vectors which concatenated are the 4 population counts. We can
// then use PACKUSWB to shrink and concatenate them into a v4i32 again.
SDValue Zeros = DAG.getConstant(0, DL, VT);
SDValue V32 = DAG.getBitcast(VT, V);
SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
// Do the horizontal sums into two v2i64s.
// Zeros is rebuilt as a byte vector because PSADBW takes byte operands.
Zeros = DAG.getConstant(0, DL, ByteVecVT);
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, Low), Zeros);
High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, High), Zeros);
// Merge them together.
MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
DAG.getBitcast(ShortVecVT, Low),
DAG.getBitcast(ShortVecVT, High));
return DAG.getBitcast(VT, V);
// The only element type left is i16.
assert(EltVT == MVT::i16 && "Unknown how to handle type");
// To obtain pop count for each i16 element starting from the pop count for
// i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
// right by 8. It is important to shift as i16s as i8 vector shift isn't
// directly supported.
SDValue ShifterV = DAG.getConstant(8, DL, VT);
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
// After the byte-wise add, the high byte of every i16 holds low + high.
V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
DAG.getBitcast(ByteVecVT, V));
return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
/// Lower a vXi8 population count with two PSHUFB lookups into a 16-entry
/// nibble pop-count table held in a vector.
///
/// \param Op         The byte vector whose per-byte pop count is wanted (its
///                   element type is asserted to be i8).
/// \param DL         Debug location for the created nodes.
/// \param Subtarget  Not referenced by the code visible in this block.
/// \param DAG        DAG in which the new nodes are created.
/// \returns A vector of Op's type with the pop count of each input byte.
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
int NumElts = VT.getVectorNumElements();
assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
// Implement a lookup table in register by using a nibble-indexed algorithm.
// The general idea is that every lower byte nibble in the input vector is an
// index into an in-register pre-computed pop count table. We then split up
// the input vector into two new ones: (1) a vector with only the
// shifted-right higher nibbles for each byte and (2) a vector with the lower
// nibbles (and masked out higher ones) for each byte. PSHUFB is used
// separately with both to index the in-register table. Next, both are added
// and the result is an i8 vector where each element contains the pop count
// for the input byte.
const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
// Repeat the 16-entry table across the whole vector (index taken modulo 16).
SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumElts; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
SDValue M0F = DAG.getConstant(0x0F, DL, VT);
// High nibbles
SDValue FourV = DAG.getConstant(4, DL, VT);
SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
// Low nibbles
SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
// The input vector is used as the shuffle mask that indexes elements of the
// LUT. After counting low and high nibbles, add the vectors to obtain the
// final pop count per i8 element.
SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
//
// Custom-lowers a 128/256/512-bit vector CTPOP. The strategies are tried in
// this order:
//  1. With VPOPCNTDQ, small-element vectors are zero-extended to i32 lanes,
//     counted there, and truncated back.
//  2. 256-bit vectors without Int256 and 512-bit vectors without BWI are
//     split in half.
//  3. Non-i8 element types are counted per byte and then summed horizontally.
//  4. i8 vectors use the in-register LUT when SSSE3 is available; otherwise an
//     empty SDValue is returned so that LegalizeDAG handles the node.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
"Unknown CTPOP type to handle");
SDLoc DL(Op.getNode());
SDValue Op0 = Op.getOperand(0);
// TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
if (Subtarget.hasVPOPCNTDQ()) {
unsigned NumElems = VT.getVectorNumElements();
assert((VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16) && "Unexpected type");
// Only widen when the i32 vector is used for fewer than 16 elements, or for
// exactly 16 when the subtarget reports it can extend to 512 bits.
if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG);
// For element types greater than i8, do vXi8 pop counts and a bytesum.
if (VT.getScalarType() != MVT::i8) {
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
// We can't use the fast LUT approach, so fall back on LegalizeDAG.
if (!Subtarget.hasSSSE3())
return SDValue();
return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
/// Entry point for custom CTPOP lowering. Only vector types are expected
/// here (asserted); the work is delegated to LowerVectorCTPOP.
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().isVector() &&
"We only do custom lowering for vector population count.");
return LowerVectorCTPOP(Op, Subtarget, DAG);
/// Lower BITREVERSE using the XOP VPPERM node.
///
/// Scalars are moved into a 128-bit vector, reversed there, and extracted
/// again. 256-bit vectors are split in half. 128-bit vectors are lowered to a
/// single v16i8 VPPERM whose mask both reverses the bits of each byte and
/// reverses the byte order within each element.
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
// For scalars, it's still beneficial to transfer to/from the SIMD unit to
// perform the BITREVERSE.
if (!VT.isVector()) {
MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
int NumElts = VT.getVectorNumElements();
int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector())
return splitVectorIntUnary(Op, DAG);
assert(VT.is128BitVector() &&
"Only 128-bit vector bitreverse lowering supported.");
// VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
// perform the BSWAP in the shuffle.
// It's best to shuffle using the second operand as this will implicitly allow
// memory folding for multiple vectors.
SmallVector<SDValue, 16> MaskElts;
for (int i = 0; i != NumElts; ++i) {
// Walk the bytes of element i from last to first so that the byte order is
// swapped within the element.
for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
// Byte indices start at 16 because the data is passed as the second
// VPPERM source operand below.
int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
int PermuteByte = SourceByte | (2 << 5);
MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
SDValue Res = DAG.getBitcast(MVT::v16i8, In);
Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
Res, Mask);
return DAG.getBitcast(VT, Res);
/// Custom-lower BITREVERSE.
///
/// XOP targets use LowerBITREVERSE_XOP for everything but 512-bit vectors.
/// Otherwise SSSE3 is required (asserted) and only byte vectors are handled
/// (asserted): v64i8 without BWI and 256-bit vectors without Int256 are split
/// in half; the rest is lowered to two PSHUFB nibble lookups OR'ed together.
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (Subtarget.hasXOP() && !VT.is512BitVector())
return LowerBITREVERSE_XOP(Op, DAG);
assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
// Split v64i8 without BWI so that we can still use the PSHUFB lowering.
if (VT == MVT::v64i8 && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG);
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarType() == MVT::i8 &&
"Only byte vector BITREVERSE supported");
// Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitVectorIntUnary(Op, DAG);
// Perform BITREVERSE using PSHUFB lookups. Each byte is split into
// two nibbles and a PSHUFB lookup to find the bitreverse of each
// 0-15 value (moved to the other nibble).
SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
// LoLUT maps a low nibble to its bit-reversed value placed in the high
// nibble of the result byte.
const int LoLUT[16] = {
/* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
/* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
/* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
/* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
// HiLUT maps a high nibble to its bit-reversed value placed in the low
// nibble of the result byte.
const int HiLUT[16] = {
/* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
/* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
/* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
/* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
// Repeat both 16-entry tables across the full vector width.
SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
for (unsigned i = 0; i < NumElts; ++i) {
LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
// The tables are the shuffled data; the nibble vectors act as the indices.
Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
/// Rebuild an atomic read-modify-write node N as one of the X86ISD locked
/// arithmetic nodes (LADD, LSUB, LOR, LXOR, LAND).
///
/// The new node takes N's first three operands and N's memory operand, and
/// produces the value types (i32, Other). Subtarget is not referenced by the
/// code visible in this block.
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NewOpc = 0;
// NOTE(review): the case labels and breaks of this switch are not visible in
// this listing. Presumably each ISD::ATOMIC_LOAD_* opcode selects the
// matching X86ISD::L* opcode and the llvm_unreachable is the default case -
// confirm against the full source.
switch (N->getOpcode()) {
NewOpc = X86ISD::LADD;
NewOpc = X86ISD::LSUB;
NewOpc = X86ISD::LOR;
NewOpc = X86ISD::LXOR;
NewOpc = X86ISD::LAND;
llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
return DAG.getMemIntrinsicNode(
NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
{N->getOperand(0), N->getOperand(1), N->getOperand(2)},
/*MemVT=*/N->getSimpleValueType(0), MMO);
/// Lower atomic_load_ops into LOCK-prefixed operations.
///
/// Behaviour depends on whether result 0 of N (the loaded value) is used:
///  - Used: ATOMIC_LOAD_SUB is rewritten as ATOMIC_LOAD_ADD of the negated
///    operand; ATOMIC_LOAD_ADD is returned unchanged; any other opcode trips
///    an assertion.
///  - Unused, and the op is "or 0": only the ordering effect is emitted (a
///    locked stack operation for seq_cst at system scope, otherwise a
///    MEMBARRIER node).
///  - Unused otherwise: a locked arithmetic node from
///    lowerAtomicArithWithLOCK replaces the chain, with undef as result 0.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
SDValue Chain = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
unsigned Opc = N->getOpcode();
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
// We can lower atomic_load_add into LXADD. However, any other atomicrmw op
// can only be lowered when the result is unused. They should have already
// been transformed into a cmpxchg loop in AtomicExpand.
if (N->hasAnyUseOfValue(0)) {
// Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
// select LXADD if LOCK_SUB can't be selected.
if (Opc == ISD::ATOMIC_LOAD_SUB) {
RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
RHS, AN->getMemOperand());
assert(Opc == ISD::ATOMIC_LOAD_ADD &&
"Used AtomicRMW ops other than Add should have been expanded!");
return N;
// Specialized lowering for the canonical form of an idempotent atomicrmw.
// The core idea here is that since the memory location isn't actually
// changing, all we need is a lowering for the *ordering* impacts of the
// atomicrmw. As such, we can choose a different operation and memory
// location to minimize impact on other code.
if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
// On X86, the only ordering which actually requires an instruction is
// seq_cst which isn't SingleThread, everything just needs to be preserved
// during codegen and then dropped. Note that we expect (but don't assume),
// that orderings other than seq_cst and acq_rel have been canonicalized to
// a store or load.
if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
AN->getSyncScopeID() == SyncScope::System) {
// Prefer a locked operation against a stack location to minimize cache
// traffic. This assumes that stack locations are very likely to be
// accessed only by the owning thread.
SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
// NOTE: The getUNDEF is needed to give something for the unused result 0.
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
DAG.getUNDEF(VT), NewChain);
// MEMBARRIER is a compiler barrier; it codegens to a no-op.
SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
// NOTE: The getUNDEF is needed to give something for the unused result 0.
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
DAG.getUNDEF(VT), NewChain);
// General case with an unused result: emit the LOCK-prefixed arithmetic node.
SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
// RAUW the chain, but don't worry about the result, as it's unused.
// NOTE: The getUNDEF is needed to give something for the unused result 0.
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
DAG.getUNDEF(VT), LockOp.getValue(1));
/// Custom-lower an ISD::ATOMIC_STORE node.
///
///  - A store that is not seq_cst and whose memory type is legal is returned
///    unchanged.
///  - An illegal i64 store is emitted as a single 64-bit store through SSE
///    (X86ISD::VEXTRACT_STORE) or x87 (FILD followed by FIST) when one of
///    those paths applies, followed by a locked stack operation if the store
///    is seq_cst.
///  - Everything else becomes an ISD::ATOMIC_SWAP whose chain result is
///    returned.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
auto *Node = cast<AtomicSDNode>(Op.getNode());
SDLoc dl(Node);
EVT VT = Node->getMemoryVT();
bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
// If this store is not sequentially consistent and the type is legal
// we can just keep it.
if (!IsSeqCst && IsTypeLegal)
return Op;
if (VT == MVT::i64 && !IsTypeLegal) {
// For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
// is enabled.
// NOTE(review): the initializer of NoImplicitFloatOps is not visible in
// this listing.
bool NoImplicitFloatOps =
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
SDValue Chain;
if (Subtarget.hasSSE1()) {
// NOTE(review): the remaining argument(s) of this SCALAR_TO_VECTOR call
// are not visible in this listing.
SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
// The vector is stored as v2i64 with SSE2 and as v4f32 with SSE1 only.
MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
SclToVec = DAG.getBitcast(StVT, SclToVec);
SDVTList Tys = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
MVT::i64, Node->getMemOperand());
} else if (Subtarget.hasX87()) {
// First load this into an 80-bit X87 register using a stack temporary.
// This will put the whole integer into the significand.
SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
// Spill the value being stored (operand 2) to the stack temporary.
Chain =
DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
MPI, /*Align*/ 0, MachineMemOperand::MOStore);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue LdOps[] = {Chain, StackPtr};
SDValue Value =
DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
/*Align*/ None, MachineMemOperand::MOLoad);
Chain = Value.getValue(1);
// Now use an FIST to do the atomic store.
SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
Chain =
DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
StoreOps, MVT::i64, Node->getMemOperand());
// Chain is only set when one of the two paths above emitted a store.
if (Chain) {
// If this is a sequentially consistent store, also emit an appropriate
// barrier.
if (IsSeqCst)
Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
return Chain;
// Convert seq_cst store -> xchg
// Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
// FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
// NOTE(review): some arguments of this getAtomic call (before and after the
// two operands shown) are not visible in this listing.
SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
Node->getOperand(1), Node->getOperand(2),
// Only the chain result of the swap is needed by the caller.
return Swap.getValue(1);
/// Custom-lower ISD::ADDCARRY (to X86ISD::ADC) and, for any other opcode
/// routed here, to X86ISD::SBB.
///
/// Returns an empty SDValue when the result type is not legal yet so that the
/// legalizer expands the node. Otherwise returns a MERGE_VALUES of the sum
/// and a carry-out computed with a COND_B setcc on the flags result.
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
MVT VT = N->getSimpleValueType(0);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
// Result 1 of the X86 nodes built below is the i32 flags value.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
SDLoc DL(N);
// Set the carry flag.
// Adding all-ones to the incoming carry value carries out exactly when that
// value is nonzero; only the flags result (value 1) of this ADD is used.
SDValue Carry = Op.getOperand(2);
EVT CarryVT = Carry.getValueType();
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
Carry, DAG.getAllOnesConstant(DL, CarryVT));
unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
Op.getOperand(1), Carry.getValue(1));
SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
// Narrow the carry-out when the original node's second result is i1.
if (N->getValueType(1) == MVT::i1)
SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
/// Lower FSINCOS to a single library call on 64-bit Darwin (asserted).
///
/// For f64 the call's result is returned directly. For other argument types
/// the call returns a 4-element vector of the argument type; elements 0 and 1
/// are extracted as the sine and cosine and returned as a MERGE_VALUES pair.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
// which returns the values as { float, float } (in XMM0) or
// { double, double } (which is returned in XMM0, XMM1).
SDLoc dl(Op);
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
// Describe the single call argument.
// NOTE(review): no statement adding Entry to Args is visible in this listing.
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
Entry.IsSExt = false;
Entry.IsZExt = false;
bool isF64 = ArgVT == MVT::f64;
// Only optimize x86_64 for now. i386 is a bit messy. For f32,
// the small struct {f32, f32} is returned in (eax, edx). For f64,
// the results are returned via SRet in memory.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// NOTE(review): the declaration of LC is not visible in this listing.
const char *LibcallName = TLI.getLibcallName(LC);
SDValue Callee =
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
// f64 returns a two-member struct; otherwise a 4-element vector is used.
Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
: (Type *)FixedVectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
// NOTE(review): the start of this builder chain on CLI is not visible in
// this listing.
.setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
if (isF64)
// Returned in xmm0 and xmm1.
return CallResult.first;
// Returned in bits 0:31 and 32:64 xmm0.
SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
CallResult.first, DAG.getIntPtrConstant(0, dl));
SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
CallResult.first, DAG.getIntPtrConstant(1, dl));
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
///
/// \param InOp            Vector to widen; returned as-is if it already has
///                        type NVT, and replaced by an undef of NVT if it is
///                        undef.
/// \param NVT             Target type. Its element count is asserted to be a
///                        larger multiple of InOp's element count.
/// \param DAG             DAG in which the new nodes are created.
/// \param FillWithZeroes  Selects zero as the fill value for the added
///                        elements. NOTE(review): the alternative fill value
///                        is not visible in this listing.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
bool FillWithZeroes = false) {
// Check if InOp already has the right width.
MVT InVT = InOp.getSimpleValueType();
if (InVT == NVT)
return InOp;
if (InOp.isUndef())
return DAG.getUNDEF(NVT);
assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
"input and widen element type must match");
unsigned InNumElts = InVT.getVectorNumElements();
unsigned WidenNumElts = NVT.getVectorNumElements();
assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
"Unexpected request for vector widening");
SDLoc dl(InOp);
// Look through a two-operand CONCAT_VECTORS whose second operand is undef
// (or all-zero when zero fill was requested): widen its first operand.
if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
InOp.getNumOperands() == 2) {
SDValue N1 = InOp.getOperand(1);
if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
N1.isUndef()) {
InOp = InOp.getOperand(0);
InVT = InOp.getSimpleValueType();
InNumElts = InVT.getVectorNumElements();
// Constant build vectors are rebuilt directly at the wider type.
// NOTE(review): the bodies of the two loops and the tail of the FillVal
// initializer below are not visible in this listing.
if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
SmallVector<SDValue, 16> Ops;
for (unsigned i = 0; i < InNumElts; ++i)
EVT EltVT = InOp.getOperand(0).getValueType();
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
return DAG.getBuildVector(NVT, dl, Ops);
// General case: insert InOp at index 0 of a fill vector of type NVT.
// NOTE(review): the tail of this FillVal initializer is not visible in this
// listing.
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
InOp, DAG.getIntPtrConstant(0, dl));
/// Custom-lower a masked scatter to X86ISD::MSCATTER (AVX-512 asserted).
///
///  - v2f32/v2i32 data: lowered only when the index is v2i64 and VLX is
///    available, by widening the data with undef; otherwise an empty SDValue
///    is returned.
///  - v2i32 index: an empty SDValue is returned so that default handling
///    takes over.
///  - Without VLX, when neither data nor index is 512 bits wide, data, index
///    and mask are widened first (the mask is filled with zeroes).
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
SDValue Src = N->getValue();
MVT VT = Src.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
SDLoc dl(Op);
SDValue Scale = N->getScale();
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue Chain = N->getChain();
SDValue BasePtr = N->getBasePtr();
if (VT == MVT::v2f32 || VT == MVT::v2i32) {
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
// If the index is v2i64 and we have VLX we can use xmm for data and index.
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
// Only the data is widened here; mask and index keep their types.
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
N->getMemoryVT(), N->getMemOperand());
return SDValue();
MVT IndexVT = Index.getSimpleValueType();
// If the index is v2i32, we're being called by type legalization and we
// should just let the default handling take care of it.
if (IndexVT == MVT::v2i32)
return SDValue();
// If we don't have VLX and neither the stored data nor the index is 512
// bits, we need to widen until one is.
if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
// Determine how much we need to widen by to get a 512-bit type.
// NOTE(review): the second argument of this std::min call is not visible in
// this listing.
unsigned Factor = std::min(512/VT.getSizeInBits(),
unsigned NumElts = VT.getVectorNumElements() * Factor;
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
Src = ExtendToType(Src, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
// The mask is widened with zeroes (FillWithZeroes = true).
Mask = ExtendToType(Mask, MaskVT, DAG, true);
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
N->getMemoryVT(), N->getMemOperand());
/// Custom-lower a masked load.
///
///  - Masks whose elements are not i1 (the AVX form): the node is kept when
///    the pass-through is undef or all zeroes; otherwise it is re-emitted
///    with a zero pass-through and blended with the original pass-through
///    through a VSELECT.
///  - i1 masks: asserted to be AVX-512 without VLX on a non-512-bit type. The
///    pass-through and mask are widened to 512 bits (mask filled with
///    zeroes), the load is done at the wide type, and the original-width
///    result is extracted from index 0.
///
/// Both paths return the loaded value merged with the new load's chain.
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
MVT VT = Op.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
MVT MaskVT = Mask.getSimpleValueType();
SDValue PassThru = N->getPassThru();
SDLoc dl(Op);
// Handle AVX masked loads which don't support passthru other than 0.
if (MaskVT.getVectorElementType() != MVT::i1) {
// We also allow undef in the isel pattern.
if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
return Op;
// NOTE(review): the final argument(s) of this getMaskedLoad call are not
// visible in this listing.
SDValue NewLoad = DAG.getMaskedLoad(
VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
// Emit a blend.
SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
"Expanding masked load is supported on AVX-512 target only!");
assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
"Expanding masked load is supported for 32 and 64-bit types only!");
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked load op.");
assert((ScalarVT.getSizeInBits() >= 32 ||
(Subtarget.hasBWI() &&
(ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
"Unsupported masked load op.");
// This operation is legal for targets with VLX, but without
// VLX the vector should be widened to 512 bits.
unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
PassThru = ExtendToType(PassThru, WideDataVT, DAG);
// Mask element has to be i1.
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
"Unexpected mask type");
MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
// The added mask lanes are zero-filled (FillWithZeroes = true).
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
SDValue NewLoad = DAG.getMaskedLoad(
WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
N->getExtensionType(), N->isExpandingLoad());
// Take the original-width subvector starting at element 0 of the wide load.
SDValue Extract =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
DAG.getIntPtrConstant(0, dl));
SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
/// Custom-lower a masked store on AVX-512 targets without VLX by widening
/// the stored value and its mask to 512 bits.
///
/// The assertions require AVX-512 without VLX and a non-512-bit data type,
/// an i1 mask element type, and elements of at least 32 bits (or i8/i16 with
/// BWI). Compressing stores are additionally required to have elements of at
/// least 32 bits.
///
/// \param Op         The ISD masked-store node to lower.
/// \param Subtarget  Queried for the AVX-512 / VLX / BWI features.
/// \param DAG        DAG in which the new nodes are created.
/// \returns A new masked store of the widened data under the widened mask,
///          with the original node's memory type, memory operand, addressing
///          mode, and truncating/compressing flags.
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
  SDValue DataToStore = N->getValue();
  MVT VT = DataToStore.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  // These two checks guard the *compressing store* form, so their messages
  // name that operation.
  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
         "Compressing masked store is supported on AVX-512 target only!");

  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
         "Compressing masked store is supported for 32 and 64-bit types "
         "only!");

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked store op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked store op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

  // Mask element has to be i1.
  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
         "Unexpected mask type");

  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);

  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
  // The added mask lanes are zero-filled (FillWithZeroes = true).
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                            N->getOffset(), Mask, N->getMemoryVT(),
                            N->getMemOperand(), N->getAddressingMode(),
                            N->isTruncatingStore(), N->isCompressingStore());
}
/// Custom-lower a masked gather to X86ISD::MGATHER (AVX2 asserted).
///
/// Returns an empty SDValue for a v2i32 index. On AVX-512 without VLX, when
/// neither the result type nor the index is 512 bits wide, the pass-through,
/// index and mask are widened first (the mask is filled with zeroes). The
/// original-width result is then extracted from index 0 of the new gather
/// and returned merged with the gather's chain.
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX2() &&
"MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue PassThru = N->getPassThru();
MVT IndexVT = Index.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
// If the index is v2i32, we're being called by type legalization.
if (IndexVT == MVT::v2i32)
return SDValue();
// If we don't have VLX and neither the passthru nor the index is 512 bits,
// we need to widen until one is.
// OrigVT remembers the requested result type for the final extract.
MVT OrigVT = VT;
if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
!IndexVT.is512BitVector()) {
// Determine how much we need to widen by to get a 512-bit type.
// NOTE(review): the second argument of this std::min call is not visible in
// this listing.
unsigned Factor = std::min(512/VT.getSizeInBits(),
unsigned NumElts = VT.getVectorNumElements() * Factor;
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
PassThru = ExtendToType(PassThru, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
// The added mask lanes are zero-filled (FillWithZeroes = true).
Mask = ExtendToType(Mask, MaskVT, DAG, true);
SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
N->getScale() };
// NOTE(review): the final argument(s) of this getMemIntrinsicNode call are
// not visible in this listing.
SDValue NewGather = DAG.getMemIntrinsicNode(
X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
NewGather, DAG.getIntPtrConstant(0, dl));
return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
/// Custom-lower an address-space cast between pointers of different widths.
///
/// The source and destination address spaces are asserted to differ. The
/// cast becomes:
///  - a zero-extension when the source space is X86AS::PTR32_UPTR and the
///    destination type is i64;
///  - a sign-extension for any other source space with an i64 destination;
///  - a truncation when the destination type is i32.
/// Any other destination type is reported as a fatal error.
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
SDValue Src = Op.getOperand(0);
MVT DstVT = Op.getSimpleValueType();
AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
unsigned SrcAS = N->getSrcAddressSpace();
assert(SrcAS != N->getDestAddressSpace() &&
"addrspacecast must be between different address spaces");
if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
} else if (DstVT == MVT::i64) {
Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
} else if (DstVT == MVT::i32) {
Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
} else {
report_fatal_error("Bad address space in addrspacecast");
return Op;
/// Lower a GC transition node to an X86::NOOP machine node producing
/// (Other, Glue), forwarding the incoming glue operand when there is one.
SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
SelectionDAG &DAG) const {
// TODO: Eventually, the lowering of these nodes should be informed by or
// deferred to the GC strategy for the function in which they appear. For
// now, however, they must be lowered to something. Since they are logically
// no-ops in the case of a null GC strategy (or a GC strategy which does not
// require special handling for these nodes), lower them as literal NOOPs for
// the time being.
SmallVector<SDValue, 2> Ops;
// If the node has a glued predecessor, its last operand is passed along.
// NOTE(review): no other operand is added to Ops in the code visible here -
// confirm against the full source whether the chain is pushed as well.
if (Op->getGluedNode())
Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
// NOTE(review): OpDL is not referenced below; the node is built with a
// fresh SDLoc(Op) instead.
SDLoc OpDL(Op);
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
return NOOP;
// Lowers Op to a call to the runtime library routine `Call`, requesting an
// f128 result.
//
// For strict FP opcodes operand 0 is used as the chain: it is left out of the
// argument list and passed to makeLibCall separately, and the function returns
// both members of the pair produced by makeLibCall merged into one value.
// For non-strict opcodes all operands are arguments and only the first member
// of the pair is returned.
SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
bool IsStrict = Op->isStrictFPOpcode();
// Skip the leading chain operand of strict nodes when collecting arguments.
unsigned Offset = IsStrict ? 1 : 0;
SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
SDLoc dl(Op);
// An empty SDValue is passed as the chain for non-strict nodes.
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, Call, MVT::f128, Ops,
CallOptions, dl, Chain);
// Presumably Tmp.first is the call result and Tmp.second the output chain --
// TODO confirm against makeLibCall.
if (IsStrict)
return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
return Tmp.first;
// Custom split CVTPS2PH with wide types.
//
// Operand 0 is split into a low and a high half, each half is converted with
// its own X86ISD::CVTPS2PH node using the matching half of the result type,
// and the two results are concatenated back into the original result type.
// Operand 1 is passed unchanged to both halves.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
EVT VT = Op.getValueType();
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
// NOTE(review): the declaration of LoVT/HiVT is not visible in this listing.
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
// RC is operand 1 of the original node (presumably the rounding control
// immediate -- TODO confirm).
SDValue RC = Op.getOperand(1);
Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
/// Provide custom lowering hooks for some operations.
///
/// Dispatches on the opcode of \p Op and returns whatever the matching
/// Lower* helper returns. Opcodes without a case here hit the default label,
/// which is marked unreachable. Consecutive case labels with no statement
/// between them share the helper named on the last label of the group.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
// Atomics and bit manipulation.
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
// NOTE(review): no case label is visible directly above this return in this
// listing -- confirm which opcode(s) route to LowerCMP_SWAP.
return LowerCMP_SWAP(Op, Subtarget, DAG);
case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
// Vector construction, shuffles and selects.
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
// Address-producing nodes.
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
// Multi-part and funnel shifts.
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::FSHL:
case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
// Integer/floating-point conversions, extensions and truncation.
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
// NOTE(review): no case label is visible directly above this return in this
// listing -- confirm which opcode(s) route to LowerEXTEND_VECTOR_INREG.
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
case ISD::FP16_TO_FP:
case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
case ISD::FP_TO_FP16:
case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
// Memory access.
case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
// Floating-point arithmetic and sign manipulation.
case ISD::FADD:
case ISD::FSUB: return lowerFaddFsub(Op, DAG);
case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::LRINT:
case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
// Comparisons, selects and control flow.
case ISD::SETCC:
case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
// Varargs, intrinsics, frame and exception-handling support.
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
// Integer bit counting, multiplication, rotates and shifts.
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
case ISD::MULHS:
case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
case ISD::ROTL:
case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
// Overflow-reporting arithmetic: all six opcodes share one helper.
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO: return LowerXALUO(Op, DAG);
case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
case ISD::ADD:
case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
case ISD::UMIN: return LowerMINMAX(Op, DAG);
case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
// Masked memory operations and target-specific nodes.
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
/// Runs LowerOperation on result 0 of \p N and reports the outcome through
/// \p Results.
///
/// Places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
void X86TargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
SDValue Res = LowerOperation(SDValue(N, 0), DAG);
// A null node from LowerOperation is treated as a separate case.
// NOTE(review): the statement guarded by this check is not visible in this
// listing; per the header comment, Results is presumably left empty.
if (!Res.getNode())
// If the original node has one result, take the return value from
// LowerOperation as is. It might not be result number 0.
if (N->getNumValues() == 1) {
// If the original node has multiple results, then the return node should
// have the same number of results.
assert((N->getNumValues() == Res->getNumValues()) &&
"Lowering returned the wrong number of results!");
// Place new result values based on N's result numbers.
for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SelectionDAG &DAG) const {
SDLoc dl(N);
switch (N->getOpcode()) {
#ifndef NDEBUG
dbgs() << "ReplaceNodeResults: ";
llvm_unreachable("Do not know how to custom type legalize this operation!");
case X86ISD::CVTPH2PS: {
EVT VT = N->getValueType(0);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
EVT VT = N->getValueType(0);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
{N->getOperand(0), Lo});
Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
{N->getOperand(0), Hi});
SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Lo.getValue(1), Hi.getValue(1));
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// Use a v2i64 if possible.
bool NoImplicitFloatOps =
if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
SDValue Wide =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
// Bit count should fit in 32-bits, extract it as that and then zero
// extend to i64. Otherwise we end up extracting bits 63:32 separately.
Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
DAG.getIntPtrConstant(0, dl));
Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
case ISD::MUL: {
EVT VT = N->getValueType(0);
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
// Pre-promote these to vXi16 to avoid op legalization thinking all 16
// elements are needed.
MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
unsigned NumConcats = 16 / VT.getVectorNumElements();
SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
case X86ISD::AVG: {
// Legalize types for X86ISD::AVG/VPMADDWD by widening.
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT VT = N->getValueType(0);
EVT InVT = N->getOperand(0).getValueType();
assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
"Expected a VT that divides into 128 bits.");
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
unsigned NumConcat = 128 / InVT.getSizeInBits();
EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
NumConcat * InVT.getVectorNumElements());
EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
NumConcat * VT.getVectorNumElements());
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
Ops[0] = N->getOperand(0);
SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
Ops[0] = N->getOperand(1);
SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
case ISD::ABS: {
assert(N->getValueType(0) == MVT::i64 &&
"Unexpected type (!= i64) on ABS.");
MVT HalfT = MVT::i32;
SDValue Lo, Hi, Tmp;
SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
DAG.getConstant(0, dl, HalfT));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
DAG.getConstant(1, dl, HalfT));
Tmp = DAG.getNode(
ISD::SRA, dl, HalfT, Hi,
DAG.getShiftAmountConstant(HalfT.getSizeInBits() - 1, HalfT, dl));
Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
SDValue(Lo.getNode(), 1));
Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi));
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
case X86ISD::FMINC:
case X86ISD::FMIN:
case X86ISD::FMAXC:
case X86ISD::FMAX: {
EVT VT = N->getValueType(0);
assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(0), UNDEF);
SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(1), UNDEF);
Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM: {
EVT VT = N->getValueType(0);
if (VT.isVector()) {
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
// If this RHS is a constant splat vector we can widen this and let
// division/remainder by constant optimize it.
// TODO: Can we do something for non-splat?
APInt SplatVal;
if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
unsigned NumConcats = 128 / VT.getSizeInBits();
SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
Ops0[0] = N->getOperand(0);
EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
case ISD::UDIVREM: {
SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
MVT VT = N->getSimpleValueType(0);
if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
// The generic legalizer will try to widen the input type to the same
// number of elements as the widened result type. But this isn't always
// the best thing so do some custom legalization to avoid some cases.
MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
unsigned InBits = InVT.getSizeInBits();
if (128 % InBits == 0) {
// 128 bit and smaller inputs should avoid truncate altogether and
// just use a build_vector that will become a shuffle.
// TODO: Widen and use a shuffle directly?
MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
EVT EltVT = VT.getVectorElementType();
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
// Use the original element count so we don't do more scalar opts than
// necessary.
unsigned MinElts = VT.getVectorNumElements();
for (unsigned i=0; i < MinElts; ++i) {
SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
DAG.getIntPtrConstant(i, dl));
Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
// With AVX512 there are some cases that can use a target specific
// truncate node to go from 256/512 to less than 128 with zeros in the
// upper elements of the 128 bit result.
if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
// We can use VTRUNC directly for 256 bits with VLX or for any 512.
if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
// There's one case we can widen to 512 bits and use VTRUNC.
if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
isTypeLegal(MVT::v4i64)) {
// Input needs to be split and output needs to widened. Let's use two
// VTRUNCs, and shuffle their results together into the wider type.
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
{ 0, 1, 2, 3, 16, 17, 18, 19,
-1, -1, -1, -1, -1, -1, -1, -1 });
// Right now, only MVT::v8i8 has Custom action for an illegal type.
// It's intended to custom handle the input type.
assert(N->getValueType(0) == MVT::v8i8 &&
"Do not know how to legalize this Node");
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
(InVT == MVT::v4i16 || InVT == MVT::v4i8)){
assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
"Unexpected type action!");
assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
// Custom split this so we can extend i8/i16->i32 invec. This is better
// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
// sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
// we allow the sra from the extend to i32 to be shared by the split.
In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
// Fill a vector with sign bits for each element.
SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
// Create an unpackl and unpackh to interleave the sign bits then bitcast
// to v2i64.
SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
{0, 4, 1, 5});
Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
{2, 6, 3, 7});
Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
if (VT == MVT::v16i32 || VT == MVT::v8i64) {
if (!InVT.is128BitVector()) {
// Not a 128 bit vector, but maybe type legalization will promote
// it to 128 bits.
if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
if (!InVT.is128BitVector())
// Promote the input to 128 bits. Type legalization will turn this into
// zext_inreg/sext_inreg.
In = DAG.getNode(N->getOpcode(), dl, InVT, In);
// Perform custom splitting instead of the two stage extend we would get
// by default.
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
assert(isTypeLegal(LoVT) && "Split VT not legal?");
SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);
// We need to shift the input over by half the number of elements.
unsigned NumElts = InVT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
for (unsigned i = 0; i != HalfNumElts; ++i)
ShufMask[i] = i + HalfNumElts;
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
bool IsStrict = N->isStrictFPOpcode();
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
N->getOpcode() == ISD::STRICT_FP_TO_SINT;
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
// Try to create a 128 bit vector, but don't exceed a 32 bit element.
unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
SDValue Res;
SDValue Chain;
if (IsStrict) {
Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
{N->getOperand(0), Src});
Chain = Res.getValue(1);
} else
Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
// Preserve what we know about the size of the original result. Except
// when the result is v2i32 since we can't widen the assert.
if (PromoteVT != MVT::v2i32)
Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
dl, PromoteVT, Res,
// Truncate back to the original width.
Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
// Now widen to 128 bits.
unsigned NumConcats = 128 / VT.getSizeInBits();
MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
VT.getVectorNumElements() * NumConcats);
SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
if (IsStrict)
if (VT == MVT::v2i32) {
assert((IsSigned || Subtarget.hasAVX512()) &&
"Can only handle signed conversion without AVX512");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
if (Src.getValueType() == MVT::v2f64) {
unsigned Opc;
if (IsStrict)
Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
// If we have VLX we can emit a target specific FP_TO_UINT node.
if (!IsSigned && !Subtarget.hasVLX()) {
// Otherwise we can defer to the generic legalizer which will widen
// the input as well. This will be further widened during op
// legalization to v8i32<-v8f64.
// For strict nodes we'll need to widen ourselves.
// FIXME: Fix the type legalizer to safely widen strict nodes?
if (!IsStrict)
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
DAG.getConstantFP(0.0, dl, MVT::v2f64));
Opc = N->getOpcode();
SDValue Res;
SDValue Chain;
if (IsStrict) {
Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
{N->getOperand(0), Src});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
if (IsStrict)
// Custom widen strict v2f32->v2i32 by padding with zeros.
// FIXME: Should generic type legalizer do this?
if (Src.getValueType() == MVT::v2f32 && IsStrict) {
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getConstantFP(0.0, dl, MVT::v2f32));
SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
{N->getOperand(0), Src});
// The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
// so early out here.
assert(!VT.isVector() && "Vectors should have been handled above!");
if (Subtarget.hasDQI() && VT == MVT::i64 &&
(SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
assert(!Subtarget.is64Bit() && "i64 should be legal");
unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
// If we use a 128-bit result we might need to use a target specific node.
unsigned SrcElts =
std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
unsigned Opc = N->getOpcode();
if (NumElts != SrcElts) {
if (IsStrict)
Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
DAG.getConstantFP(0.0, dl, VecInVT), Src,
SDValue Chain;
if (IsStrict) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
Chain = Res.getValue(1);
} else
Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
if (IsStrict)
SDValue Chain;
if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
if (IsStrict)
case ISD::LRINT:
case ISD::LLRINT: {
if (SDValue V = LRINT_LLRINTHelper(N, DAG))
bool IsStrict = N->isStrictFPOpcode();
bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
N->getOpcode() == ISD::STRICT_SINT_TO_FP;
EVT VT = N->getValueType(0);
if (VT != MVT::v2f32)
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
if (IsStrict) {
unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
{N->getOperand(0), Src});
} else {
unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
SDValue Zero = DAG.getConstant(0, dl, SrcVT);
SDValue One = DAG.getConstant(1, dl, SrcVT);
SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
for (int i = 0; i != 2; ++i) {
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
SignSrc, DAG.getIntPtrConstant(i, dl));
if (IsStrict)
SignCvts[i] =
DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
{N->getOperand(0), Elt});
SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
SDValue Slow, Chain;
if (IsStrict) {
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
SignCvts[0].getValue(1), SignCvts[1].getValue(1));
Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
{Chain, SignCvt, SignCvt});
Chain = Slow.getValue(1);
} else {
Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
IsNeg =
DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
if (IsStrict)
if (SrcVT != MVT::v2i32)
if (IsSigned || Subtarget.hasAVX512()) {
if (!IsStrict)
// Custom widen strict v2i32->v2f32 to avoid scalarization.
// FIXME: Should generic type legalizer do this?
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getConstant(0, dl, MVT::v2i32));
SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
{N->getOperand(0), Src});
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
SDValue VBias =
DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
if (IsStrict) {
SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
{N->getOperand(0), Or, VBias});
SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
{MVT::v4f32, MVT::Other},
{Sub.getValue(1), Sub});
} else {
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
case ISD::FP_ROUND: {
bool IsStrict = N->isStrictFPOpcode();
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
if (!isTypeLegal(Src.getValueType()))
SDValue V;
if (IsStrict)
V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
{N->getOperand(0), N->getOperand(1)});
V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
if (IsStrict)
// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
// No other ValueType for FP_EXTEND should reach this point.
assert(N->getValueType(0) == MVT::v2f32 &&
"Do not know how to legalize this Node");
unsigned IntNo = N->getConstantOperandVal(1);
switch (IntNo) {
default : llvm_unreachable("Do not know how to custom type "
"legalize this intrinsic operation!");
case Intrinsic::x86_rdtsc:
return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
case Intrinsic::x86_rdtscp:
return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
case Intrinsic::x86_rdpmc:
expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
case Intrinsic::x86_xgetbv:
expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
EVT T = N->getValueType(0);
assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
SDValue cpInL, cpInH;
cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(0, dl, HalfT));
cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(1, dl, HalfT));
cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
cpInL, SDValue());
cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
Regs64bit ? X86::RDX : X86::EDX,
cpInH, cpInL.getValue(1));
SDValue swapInL, swapInH;
swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(0, dl, HalfT));
swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(1, dl, HalfT));
swapInH =
DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
swapInH, cpInH.getValue(1));
// If the current function needs the base pointer, RBX,
// we shouldn't use cmpxchg directly.
// Indeed the lowering of that instruction will clobber
// that register and since RBX will be a reserved register
// the register allocator will not make sure its value will
// be properly saved and restored around this live-range.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
SDValue Result;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
Register BasePtr = TRI->getBaseRegister();
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
(BasePtr == X86::RBX || BasePtr == X86::EBX)) {
// ISel prefers the LCMPXCHG64 variant.
// If that assert breaks, that means it is not the case anymore,
// and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
// not just EBX. This is a matter of accepting i64 input for that
// pseudo, and restoring into the register of the right width
// in expand pseudo. Everything else should just work.
assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
"Saving only half of the RBX");
unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
Regs64bit ? X86::RBX : X86::EBX,
HalfT, swapInH.getValue(1));
SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
/*Glue*/ RBXSave.getValue(2)};
Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
} else {
unsigned Opcode =
swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
Regs64bit ? X86::RBX : X86::EBX, swapInL,
SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
Regs64bit ? X86::RDX : X86::EDX,
HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
MVT::i32, cpOutH.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
bool NoImplicitFloatOps =
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
auto *Node = cast<AtomicSDNode>(N);
if (Subtarget.hasSSE1()) {
// Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
// Then extract the lower 64-bits.
MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Node->getMemOperand());
if (Subtarget.hasSSE2()) {
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
DAG.getIntPtrConstant(0, dl));
// We use an alternative sequence for SSE1 that extracts as v2f32 and
// then casts to i64. This avoids a 128-bit stack temporary being
// created by type legalization if we were to cast v4f32->v2i64.
SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
DAG.getIntPtrConstant(0, dl));
Res = DAG.getBitcast(MVT::i64, Res);
if (Subtarget.hasX87()) {
// First load this into an 80-bit X87 register. This will put the whole
// integer into the significand.
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
dl, Tys, Ops, MVT::i64,
SDValue Chain = Result.getValue(1);
// Now store the X87 register to a stack temporary and convert to i64.
// This store is not atomic and doesn't need to be.
// FIXME: We don't need a stack temporary if the result of the load
// is already being stored. We could just directly store there.
SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
SDValue StoreOps[] = { Chain, Result, StackPtr };
Chain = DAG.getMemIntrinsicNode(
X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
MPI, None /*Align*/, MachineMemOperand::MOStore);
// Finally load the value back from the stack temporary and return it.
// This load is not atomic and doesn't need to be.
// This load will be further type legalized.
Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
// TODO: Use MOVLPS when SSE1 is available?
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
case ISD::BITCAST: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT DstVT = N->getValueType(0);
EVT SrcVT = N->getOperand(0).getValueType();
// If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
// we can split using the k-register rather than memory.
if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
Lo = DAG.getBitcast(MVT::i32, Lo);
Hi = DAG.getBitcast(MVT::i32, Hi);
SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
// FIXME: Use v4f32 for SSE1?
assert(Subtarget.hasSSE2() && "Requires SSE2");
assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
"Unexpected type action!");
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
Res = DAG.getBitcast(WideVT, Res);
case ISD::MGATHER: {
EVT VT = N->getValueType(0);
if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
(Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
auto *Gather = cast<MaskedGatherSDNode>(N);
SDValue Index = Gather->getIndex();
if (Index.getValueType() != MVT::v2i64)
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
SDValue Mask = Gather->getMask();
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
if (!Subtarget.hasVLX()) {
// We need to widen the mask, but the instruction will only use 2
// of its elements. So we can use undef.
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
SDValue Res = DAG.getMemIntrinsicNode(
X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
Gather->getMemoryVT(), Gather->getMemOperand());
case ISD::LOAD: {
// Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
// avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
// cast since type legalization will try to use an i64 load.
MVT VT = N->getSimpleValueType(0);
assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
if (!ISD::isNON_EXTLoad(N))
auto *Ld = cast<LoadSDNode>(N);
if (Subtarget.hasSSE2()) {
MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getOriginalAlign(),
SDValue Chain = Res.getValue(1);
MVT VecVT = MVT::getVectorVT(LdVT, 2);
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
Res = DAG.getBitcast(WideVT, Res);
assert(Subtarget.hasSSE1() && "Expected SSE");
SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Ld->getMemOperand());
SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
// Maps an X86ISD opcode to a printable name, or nullptr when no case returns.
// NOTE(review): no NODE_NAME_CASE(...) uses are visible between the macro
// definition and the final return in this view -- presumably the case list
// was lost in extraction; confirm against the original file.
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((X86ISD::NodeType)Opcode) {
case X86ISD::FIRST_NUMBER: break;
// Each expansion is a case label that returns "X86ISD::" followed by the
// stringified enumerator name.
#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
// Result for opcodes that did not hit a returning case.
return nullptr;
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
///
/// The checks performed, in order:
///  - the displacement (AM.BaseOffs) must be acceptable for the code model;
///  - a global base (AM.BaseGV) must not need a stub load, must not compete
///    with a base register for the PIC base, and must not be combined with an
///    offset or a scale > 1 when rip-relative addressing is required;
///  - the scale must be 0/1/2/4/8, or 3/5/9 when no base register is in use.
///
/// DL, Ty, AS and I are not consulted by this implementation.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS,
                                              Instruction *I) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;

    // If lower 4G is not available, then we must use rip-relative addressing.
    if ((M != CodeModel::Small || isPositionIndependent()) &&
        Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
      return false;
  }

  switch (AM.Scale) {
  case 0:
  case 1:
  case 2:
  case 4:
  case 8:
    // These scales always work.
    break;
  case 3:
  case 5:
  case 9:
    // These scales are formed with basereg+scalereg. Only accept if there is
    // no basereg yet.
    if (AM.HasBaseReg)
      return false;
    break;
  default: // Other stuff never works.
    return false;
  }

  return true;
}
/// Decide whether shifting a vector of type \p Ty by a uniform (scalar)
/// amount is meaningfully cheaper than shifting by a general vector amount.
/// The answer depends only on the element width and the subtarget features.
bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
  switch (Ty->getScalarSizeInBits()) {
  case 8:
    // Byte shifts are expensive either way; a scalar amount does not make
    // them noticeably cheaper.
    return false;
  case 16:
    // XOP has variable v8i16 shifts, and AVX512BW has vpsllvw and friends,
    // so a variable shift is already as cheap as a scalar one there.
    return !(Subtarget.hasXOP() || Subtarget.hasBWI());
  case 32:
  case 64:
    // XOP covers v4i32/v2i64, and AVX2 has vpsllv[dq] (and other shifts), so
    // the scalar form has no advantage on either.
    return !(Subtarget.hasXOP() || Subtarget.hasAVX2());
  default:
    // Everything else: shifting by a scalar amount is significantly cheaper
    // than by a fully general vector.
    return true;
  }
}
/// Report whether \p Opcode is a binary operation. A handful of
/// non-commutative X86ISD opcodes are recognised here; anything else is
/// answered by the generic TargetLoweringBase implementation.
bool X86TargetLowering::isBinOp(unsigned Opcode) const {
  // TODO: Add more X86ISD opcodes once we have test coverage.
  switch (Opcode) {
  case X86ISD::ANDNP:
  case X86ISD::PCMPGT:
  case X86ISD::FMAX:
  case X86ISD::FMIN:
  case X86ISD::FANDN:
    // Target-specific, non-commutative binops.
    return true;
  default:
    return TargetLoweringBase::isBinOp(Opcode);
  }
}
/// Report whether \p Opcode is a commutative binary operation. The X86ISD
/// opcodes listed below are known to commute; all other opcodes are deferred
/// to the generic TargetLoweringBase implementation.
bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
  // TODO: Add more X86ISD opcodes once we have test coverage.
  switch (Opcode) {
  case X86ISD::PCMPEQ:
  case X86ISD::PMULDQ:
  case X86ISD::FMAXC:
  case X86ISD::FMINC:
  case X86ISD::FAND:
  case X86ISD::FOR:
  case X86ISD::FXOR:
    return true;
  default:
    return TargetLoweringBase::isCommutativeBinOp(Opcode);
  }
}
/// IR-type overload: truncation counts as free when both types are integers
/// and the source (\p Ty1) is strictly wider than the destination (\p Ty2).
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
  if (!BothIntegers)
    return false;

  unsigned SrcBits = Ty1->getPrimitiveSizeInBits();
  unsigned DstBits = Ty2->getPrimitiveSizeInBits();
  return DstBits < SrcBits;
}
/// Return true when truncating \p Ty1 to \p Ty2 is acceptable for a tail
/// call: both must be integer types and \p Ty1 must be a legal type.
bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
  if (!BothIntegers || !isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Provided the caller has no zeroext/signext return parameter, truncating
  // all the way down to i1 is valid.
  return true;
}
/// A compare immediate is legal when it fits in a signed 32-bit field.
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  const bool FitsInSImm32 = isInt<32>(Imm);
  return FitsInSImm32;
}
/// An add immediate is legal when it fits in a signed 32-bit field.
bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Negated immediates can be handled too, by emitting a sub instead.
  const bool FitsInSImm32 = isInt<32>(Imm);
  return FitsInSImm32;
}
/// A store immediate is legal when it fits in a signed 32-bit field.
bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
  const bool FitsInSImm32 = isInt<32>(Imm);
  return FitsInSImm32;
}
/// Value-type overload: truncation counts as free when both types are scalar
/// integers and the source (\p VT1) is strictly wider than the destination
/// (\p VT2).
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  const bool BothScalarInts = VT1.isScalarInteger() && VT2.isScalarInteger();
  if (!BothScalarInts)
    return false;

  unsigned SrcBits = VT1.getSizeInBits();
  unsigned DstBits = VT2.getSizeInBits();
  return DstBits < SrcBits;
}
/// IR-type overload: only the i32 -> i64 zero extension on a 64-bit subtarget
/// is reported as free.
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  const bool IsI32ToI64 = Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64);
  if (!IsI32ToI64)
    return false;

  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Subtarget.is64Bit();
}
/// Value-type overload: only the i32 -> i64 zero extension on a 64-bit
/// subtarget is reported as free.
bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  const bool IsI32ToI64 = VT1 == MVT::i32 && VT2 == MVT::i64;
  if (!IsI32ToI64)
    return false;

  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Subtarget.is64Bit();
}
/// DAG-value overload: a zero extension of \p Val to \p VT2 is free either
/// when the plain type-based rule says so, or when \p Val is a load of a
/// simple i8/i16/i32 value being extended to a simple integer type.
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT SrcVT = Val.getValueType();

  // Type-only rule first.
  if (isZExtFree(SrcVT, VT2))
    return true;

  // Beyond that, only loads can absorb the extension.
  if (Val.getOpcode() != ISD::LOAD)
    return false;

  const bool BothSimpleInts = SrcVT.isSimple() && SrcVT.isInteger() &&
                              VT2.isSimple() && VT2.isInteger();
  if (!BothSimpleInts)
    return false;

  // X86 has 8, 16, and 32-bit zero-extending loads.
  const auto SrcTy = SrcVT.getSimpleVT().SimpleTy;
  return SrcTy == MVT::i8 || SrcTy == MVT::i16 || SrcTy == MVT::i32;
}
/// Decide whether an operand of \p I should be sunk next to \p I.
///
/// A uniform shift amount in a vector shift or funnel shift may be much
/// cheaper than a generic variable vector shift, so make that pattern visible
/// to SDAG by sinking the shuffle instruction next to the shift.
///
/// \param I   The candidate user: a shift instruction, or a call to the
///            fshl/fshr intrinsics.
/// \param Ops Out-parameter. On success the use of the shift-amount operand
///            is appended, so the caller knows which operand to sink.
/// \returns true if the shift amount is a splat shuffle and shifting by a
///          scalar is cheap for the type of \p I; false otherwise (in which
///          case \p Ops is left untouched).
bool X86TargetLowering::shouldSinkOperands(Instruction *I,
                                           SmallVectorImpl<Use *> &Ops) const {
  // Locate the shift-amount operand: operand 1 for plain shifts, operand 2
  // for funnel-shift intrinsics.
  int ShiftAmountOpNum = -1;
  if (I->isShift()) {
    ShiftAmountOpNum = 1;
  } else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
    if (II->getIntrinsicID() == Intrinsic::fshl ||
        II->getIntrinsicID() == Intrinsic::fshr)
      ShiftAmountOpNum = 2;
  }

  if (ShiftAmountOpNum == -1)
    return false;

  auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
  if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
      isVectorShiftByScalarCheap(I->getType())) {
    // Report the operand to sink; returning true with an empty Ops would
    // give the caller nothing to act on.
    Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
    return true;
  }

  return false;
}
/// PHI type conversion is never requested on non-64-bit subtargets; on 64-bit
/// subtargets the generic TargetLowering answer is used.
bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
  return Subtarget.is64Bit() &&
         TargetLowering::shouldConvertPhiType(From, To);
}
/// Return true if folding the extension \p ExtVal into the vector load that
/// feeds it is desirable. Masked loads and vXi1 sources are rejected.
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  SDValue Src = ExtVal.getOperand(0);

  if (isa<MaskedLoadSDNode>(Src))
    return false;

  // There is no extending load for vXi1.
  return Src.getValueType().getScalarType() != MVT::i1;
}
bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
if (!Subtarget.hasAnyFMA())
return false;
VT = VT.getScalarType();
if (!VT.isSimple())
return false;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
case MVT::f64:
return true;
return false;
/// Narrowing is considered profitable for every type pair except
/// i32 -> i16.
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  const bool IsI32ToI16 = VT1 == MVT::i32 && VT2 == MVT::i16;
  return !IsI32ToI16;
}
/// Targets can use this hook to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, i.e. those with specific masks. By default, if
/// a target supports the VECTOR_SHUFFLE node, all mask values are assumed to
/// be legal.
///
/// This implementation ignores \p Mask and decides purely on \p VT.
bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
  if (!VT.isSimple())
    return false;

  const MVT SimpleVT = VT.getSimpleVT();

  // Not for i1 vectors.
  if (SimpleVT.getScalarType() == MVT::i1)
    return false;

  // Very little shuffling can be done for 64-bit vectors right now.
  if (SimpleVT.getSizeInBits() == 64)
    return false;

  // Only the legality of the shuffled type matters; the lowering can handle
  // any possible shuffle mask that results.
  return isTypeLegal(SimpleVT);
}
/// Return true if an 'and' with a clear mask may be turned into a shuffle
/// with \p Mask on type \p VT.
bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
                                               EVT VT) const {
  // Don't convert an 'and' into a shuffle that we don't directly support:
  // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
  const bool Is256BitByteOrWordVec = VT == MVT::v32i8 || VT == MVT::v16i16;
  if (Is256BitByteOrWordVec && !Subtarget.hasAVX2())
    return false;

  // Clear masks aren't special; use the generic shuffle legality.
  return isShuffleMaskLegal(Mask, VT);
}
/// Jump tables are disallowed when the subtarget uses indirect-branch thunks;
/// otherwise the generic TargetLowering logic decides.
bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
  return !Subtarget.useIndirectThunkBranches() &&
         TargetLowering::areJTsAllowed(Fn);
}
// X86 Scheduler Hooks
// NOTE(review): every line of this helper carries a leading '+' diff marker
// (it is an added hunk); the markers are kept as found.
+// Returns true if EFLAGS is read after this iterator in the rest of the
+// basic block, or is live into any successor of the basic block.
// Itr: position after which to start scanning (the instruction at Itr itself
//      is not inspected).
// BB:  block whose remaining instructions and successors are examined.
+static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
+ MachineBasicBlock *BB) {
+ // Scan forward through BB for a use/def of EFLAGS.
+ for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
+ miI != miE; ++miI) {
+ const MachineInstr& mi = *miI;
// A read is checked before a def, so an instruction that both reads and
// defines EFLAGS counts as a use.
+ if (mi.readsRegister(X86::EFLAGS))
+ return true;
+ // If we found a def, we can stop searching.
+ if (mi.definesRegister(X86::EFLAGS))
+ return false;
+ }
+ // If we hit the end of the block, check whether EFLAGS is live into a
+ // successor.
+ for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+ sEnd = BB->succ_end();
+ sItr != sEnd; ++sItr) {
+ MachineBasicBlock* succ = *sItr;
+ if (succ->isLiveIn(X86::EFLAGS))
+ return true;
+ }
// No reader in the block tail and not live into any successor.
+ return false;
/// Utility function to emit xbegin specifying the start of an RTM region.
/// Splits MBB at MI into thisMBB/mainMBB/fallMBB/sinkMBB and returns sinkMBB,
/// which receives the instructions that followed MI.
// NOTE(review): several statements below end without their continuation
// lines (e.g. the COPY and PHI builders) -- presumably lost in extraction;
// confirm against the original file.
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
const TargetInstrInfo *TII) {
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
// For the v = xbegin(), we generate
// thisMBB:
// xbegin fallMBB
// mainMBB:
// s0 = -1
// fallBB:
// eax = # XABORT_DEF
// s1 = eax
// sinkMBB:
// v = phi(s0/mainBB, s1/fallBB)
MachineBasicBlock *thisMBB = MBB;
MachineFunction *MF = MBB->getParent();
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
// The new blocks are placed immediately after MBB, in this order.
MF->insert(I, mainMBB);
MF->insert(I, fallMBB);
MF->insert(I, sinkMBB);
// If EFLAGS is still needed after MI, it must be marked live-in to each of
// the newly created blocks.
+ if (isEFLAGSLiveAfter(MI, MBB)) {
+ mainMBB->addLiveIn(X86::EFLAGS);
+ fallMBB->addLiveIn(X86::EFLAGS);
+ sinkMBB->addLiveIn(X86::EFLAGS);
+ }
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
MachineRegisterInfo &MRI = MF->getRegInfo();
Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
// Per-path temporaries (same register class as DstReg) for the PHI inputs.
Register mainDstReg = MRI.createVirtualRegister(RC);
Register fallDstReg = MRI.createVirtualRegister(RC);
// thisMBB:
// xbegin fallMBB
// # fallthrough to mainMBB
// # abortion to fallMBB
BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
// mainMBB:
// mainDstReg := -1
BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
// fallMBB:
// ; pseudo instruction to model hardware's definition from XABORT
// fallDstReg := EAX
BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
// sinkMBB:
// DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
return sinkMBB;
// Custom inserter for the VAARG_64 pseudo: reads the va_list addressed by
// operands 1-5 and produces the address of the next argument in operand 0.
// When ArgMode selects gp_offset/fp_offset the block is split into a
// register-save-area path (offsetMBB) and an overflow-area path (overflowMBB)
// joined by a PHI in endMBB; otherwise everything stays in the original
// block. Returns the block in which insertion continues (endMBB).
// NOTE(review): many BuildMI(...) chains below end without their operand
// lines or terminating ';', and the successor/erase statements announced by
// the comments are not visible -- presumably lost in extraction; confirm
// against the original file.
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
// Operands to this pseudo-instruction:
// 0 ) Output : destination address (reg)
// 1-5) Input : va_list address (addr, i64mem)
// 6 ) ArgSize : Size (in bytes) of vararg type
// 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
// 8 ) Align : Alignment of type
// 9 ) EFLAGS (implicit-def)
assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
static_assert(X86::AddrNumOperands == 5,
"VAARG_64 assumes 5 address operands");
Register DestReg = MI.getOperand(0).getReg();
MachineOperand &Base = MI.getOperand(1);
MachineOperand &Scale = MI.getOperand(2);
MachineOperand &Index = MI.getOperand(3);
MachineOperand &Disp = MI.getOperand(4);
MachineOperand &Segment = MI.getOperand(5);
unsigned ArgSize = MI.getOperand(6).getImm();
unsigned ArgMode = MI.getOperand(7).getImm();
Align Alignment = Align(MI.getOperand(8).getImm());
MachineFunction *MF = MBB->getParent();
// Memory Reference
assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
MachineMemOperand *OldMMO = MI.memoperands().front();
// Clone the MMO into two separate MMOs for loading and storing
MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
// Machine Information
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
DebugLoc DL = MI.getDebugLoc();
// struct va_list {
// i32 gp_offset
// i32 fp_offset
// i64 overflow_area (address)
// i64 reg_save_area (address)
// }
// sizeof(va_list) = 24
// alignment(va_list) = 8
unsigned TotalNumIntRegs = 6;
unsigned TotalNumXMMRegs = 8;
bool UseGPOffset = (ArgMode == 1);
bool UseFPOffset = (ArgMode == 2);
// Upper bound for the selected offset field: 8 bytes per integer register,
// plus 16 bytes per XMM register when fp_offset is in use.
unsigned MaxOffset = TotalNumIntRegs * 8 +
(UseFPOffset ? TotalNumXMMRegs * 16 : 0);
/* Align ArgSize to a multiple of 8 */
unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
bool NeedsAlign = (Alignment > 8);
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *overflowMBB;
MachineBasicBlock *offsetMBB;
MachineBasicBlock *endMBB;
unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
unsigned OffsetReg = 0;
if (!UseGPOffset && !UseFPOffset) {
// If we only pull from the overflow region, we don't create a branch.
// We don't need to alter control flow.
OffsetDestReg = 0; // unused
OverflowDestReg = DestReg;
offsetMBB = nullptr;
overflowMBB = thisMBB;
endMBB = thisMBB;
} else {
// First emit code to check if gp_offset (or fp_offset) is below the bound.
// If so, pull the argument from reg_save_area. (branch to offsetMBB)
// If not, pull from overflow_area. (branch to overflowMBB)
// thisMBB
// | .
// | .
// offsetMBB overflowMBB
// | .
// | .
// endMBB
// Registers for the PHI in endMBB
OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator MBBIter = ++MBB->getIterator();
// Insert the new basic blocks
MF->insert(MBBIter, offsetMBB);
MF->insert(MBBIter, overflowMBB);
MF->insert(MBBIter, endMBB);
// Transfer the remainder of MBB and its successor edges to endMBB.
endMBB->splice(endMBB->begin(), thisMBB,
std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
// Make offsetMBB and overflowMBB successors of thisMBB
// endMBB is a successor of both offsetMBB and overflowMBB
// Load the offset value into a register
OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
// fp_offset lives 4 bytes into the va_list, gp_offset at offset 0.
BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
.addDisp(Disp, UseFPOffset ? 4 : 0)
// Check if there is enough room left to pull this argument.
BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
.addImm(MaxOffset + 8 - ArgSizeA8);
// Branch to "overflowMBB" if offset >= max
// Fall through to "offsetMBB" otherwise
BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
// In offsetMBB, emit code to use the reg_save_area.
if (offsetMBB) {
assert(OffsetReg != 0);
// Read the reg_save_area address.
Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
.addDisp(Disp, 16)
// Zero-extend the offset
Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
// Add the offset to the reg_save_area to get the final address.
BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
// Compute the offset for the next argument
Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
.addImm(UseFPOffset ? 16 : 8);
// Store it back into the va_list.
BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
.addDisp(Disp, UseFPOffset ? 4 : 0)
// Jump to endMBB
BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
// Emit code to use overflow area
// Load the overflow_area address into a register.
Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
.addDisp(Disp, 8)
// If we need to align it, do so. Otherwise, just copy the address
// to OverflowDestReg.
if (NeedsAlign) {
// Align the overflow address
Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
.addImm(Alignment.value() - 1);
BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
.addImm(~(uint64_t)(Alignment.value() - 1));
} else {
BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
// Compute the next overflow address after this argument.
// (the overflow address should be kept 8-byte aligned)
Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
// Store the new overflow address.
BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
.addDisp(Disp, 8)
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
BuildMI(*endMBB, endMBB->begin(), DL,
TII->get(X86::PHI), DestReg)
// Erase the pseudo instruction
return endMBB;
// Custom inserter for the pseudo that saves the XMM argument registers of a
// varargs function. Splits MBB into the original block, an XMM save block
// and an end block, erases the pseudo, and returns the end block.
// Operands read here: 0 = count register, 1 = register-save frame index,
// 2 = varargs FP offset, 3.. = registers to store (the last operand is
// excluded from the store loop).
// NOTE(review): the successor-edge statements and the operands of the store
// builder are not visible in this view -- presumably lost in extraction;
// confirm against the original file.
MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *MBB) const {
// Emit code to save XMM registers to the stack. The ABI says that the
// number of registers to save is given in %al, so it's theoretically
// possible to do an indirect jump trick to avoid saving all of them,
// however this code takes a simpler approach and just executes all
// of the stores if %al is non-zero. It's less code, and it's probably
// easier on the hardware branch predictor, and stores aren't all that
// expensive anyway.
// Create the new basic blocks. One block contains all the XMM stores,
// and one block is the final destination regardless of whether any
// stores were performed.
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
MachineFunction *F = MBB->getParent();
MachineFunction::iterator MBBIter = ++MBB->getIterator();
MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(MBBIter, XMMSaveMBB);
F->insert(MBBIter, EndMBB);
// Transfer the remainder of MBB and its successor edges to EndMBB.
EndMBB->splice(EndMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
// The original block will now fall through to the XMM save block.
// The XMMSaveMBB will fall through to the end block.
// Now add the instructions.
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
Register CountReg = MI.getOperand(0).getReg();
int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
// The count test and branch are only emitted for non-Win64 calling
// conventions.
if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
// Make sure the last operand is EFLAGS, which gets clobbered by the branch
// that was just emitted, but clearly shouldn't be "saved".
assert((MI.getNumOperands() <= 3 ||
!MI.getOperand(MI.getNumOperands() - 1).isReg() ||
MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
"Expected last argument to be EFLAGS");
// Use the VEX-encoded aligned store when AVX is available.
unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
// Each register gets a 16-byte, 16-byte-aligned slot starting at
// VarArgsFPOffset within the register save frame object.
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
MachineMemOperand *MMO = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
/*Size=*/16, Align(16));
BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
MI.eraseFromParent(); // The pseudo instruction is gone now.
return EndMBB;
// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
// Returns false (and leaves SelectItr untouched) when EFLAGS is still live
// after SelectItr; otherwise adds the kill flag and returns true.
// NOTE(review): this is a diff hunk -- lines starting with '-' are the
// removed open-coded scan, lines starting with '+' are its replacement by a
// call to isEFLAGSLiveAfter. The markers are kept as found.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
MachineBasicBlock* BB,
const TargetRegisterInfo* TRI) {
- // Scan forward through BB for a use/def of EFLAGS.
- MachineBasicBlock::iterator miI(std::next(SelectItr));
- for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
- const MachineInstr& mi = *miI;
- if (mi.readsRegister(X86::EFLAGS))
- return false;
- if (mi.definesRegister(X86::EFLAGS))
- break; // Should have kill-flag - update below.
- }
- // If we hit the end of the block, check whether EFLAGS is live into a
- // successor.
- if (miI == BB->end()) {
- for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
- sEnd = BB->succ_end();
- sItr != sEnd; ++sItr) {
- MachineBasicBlock* succ = *sItr;
- if (succ->isLiveIn(X86::EFLAGS))
- return false;
- }
- }
+ if (isEFLAGSLiveAfter(SelectItr, BB))
+ return false;
// We found a def, or hit the end of the basic block and EFLAGS wasn't live
// out. SelectMI should have a kill flag on EFLAGS.
SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
return true;
// Tells whether MI is one of the CMOV pseudo-opcodes that may be cascaded
// together with other CMOV pseudo-opcodes into a single basic block with a
// conditional jump around it.
static bool isCMOVPseudo(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  // Scalar floating-point register classes.
  case X86::CMOV_FR32:
  case X86::CMOV_FR32X:
  case X86::CMOV_FR64:
  case X86::CMOV_FR64X:
  // General-purpose register classes.
  case X86::CMOV_GR8:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  // x87 register classes.
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
  // Vector register classes.
  case X86::CMOV_VR64:
  case X86::CMOV_VR128:
  case X86::CMOV_VR128X:
  case X86::CMOV_VR256:
  case X86::CMOV_VR256X:
  case X86::CMOV_VR512:
  // Mask register classes.
  case X86::CMOV_VK1:
  case X86::CMOV_VK2:
  case X86::CMOV_VK4:
  case X86::CMOV_VK8:
  case X86::CMOV_VK16:
  case X86::CMOV_VK32:
  case X86::CMOV_VK64:
    return true;
  default:
    return false;
  }
}
// Helper function, which inserts PHI functions into SinkMBB:
// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
// the last PHI function inserted.
// NOTE(review): the operand chain of the PHI builder below (and its
// terminating ';') is not visible in this view -- presumably lost in
// extraction; confirm against the original file.
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
MachineBasicBlock *SinkMBB) {
MachineFunction *MF = TrueMBB->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
DebugLoc DL = MIItBegin->getDebugLoc();
// The first CMOV's condition is the reference; OppCC identifies CMOVs whose
// operands must be swapped.
X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
// All PHIs are inserted before the instruction that is first in SinkMBB on
// entry.
MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
// As we are creating the PHIs, we have to be careful if there is more than
// one. Later CMOVs may reference the results of earlier CMOVs, but later
// PHIs have to reference the individual true/false inputs from earlier PHIs.
// That also means that PHI construction must work forward from earlier to
// later, and that the code must maintain a mapping from earlier PHI's
// destination registers, and the registers that went into the PHI.
DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
MachineInstrBuilder MIB;
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
Register DestReg = MIIt->getOperand(0).getReg();
Register Op1Reg = MIIt->getOperand(1).getReg();
Register Op2Reg = MIIt->getOperand(2).getReg();
// If this CMOV we are generating is the opposite condition from
// the jump we generated, then we have to swap the operands for the
// PHI that is going to be generated.
if (MIIt->getOperand(3).getImm() == OppCC)
std::swap(Op1Reg, Op2Reg);
// If an operand is itself the result of an earlier PHI in this run,
// substitute that PHI's matching input (first for Op1, second for Op2).
if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
Op1Reg = RegRewriteTable[Op1Reg].first;
if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
Op2Reg = RegRewriteTable[Op2Reg].second;
MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
// Add this PHI to the rewrite table.
RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
return MIB;
// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
// Creates FirstInsertedMBB, SecondInsertedMBB and SinkMBB after ThisMBB,
// emits one conditional branch to SinkMBB per CMOV, and returns SinkMBB.
// NOTE(review): many statements announced by the comments below (successor
// edges, live-in updates, PHI operands, the final COPY and the CMOV erasure)
// are missing or cut short in this view -- presumably lost in extraction;
// confirm against the original file.
MachineBasicBlock *
X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
MachineInstr &SecondCascadedCMOV,
MachineBasicBlock *ThisMBB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = FirstCMOV.getDebugLoc();
// We lower cascaded CMOVs such as
// (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
// to two successive branches.
// Without this, we would add a PHI between the two jumps, which ends up
// creating a few copies all around. For instance, for
// (sitofp (zext (fcmp une)))
// we would generate:
// ucomiss %xmm1, %xmm0
// movss <1.0f>, %xmm0
// movaps %xmm0, %xmm1
// jne .LBB5_2
// xorps %xmm1, %xmm1
// .LBB5_2:
// jp .LBB5_4
// movaps %xmm1, %xmm0
// .LBB5_4:
// retq
// because this custom-inserter would have generated:
// A
// | \
// | B
// | /
// C
// | \
// | D
// | /
// E
// A: X = ...; Y = ...
// B: empty
// C: Z = PHI [X, A], [Y, B]
// D: empty
// E: PHI [X, C], [Z, D]
// If we lower both CMOVs in a single step, we can instead generate:
// A
// | \
// | C
// | /|
// |/ |
// | |
// | D
// | /
// E
// A: X = ...; Y = ...
// D: empty
// E: PHI [X, A], [X, C], [Y, D]
// Which, in our sitofp/fcmp example, gives us something like:
// ucomiss %xmm1, %xmm0
// movss <1.0f>, %xmm0
// jne .LBB5_4
// jp .LBB5_4
// xorps %xmm0, %xmm0
// .LBB5_4:
// retq
// We lower cascaded CMOV into two successive branches to the same block.
// EFLAGS is used by both, so mark it as live in the second.
const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
MachineFunction *F = ThisMBB->getParent();
MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
// The three new blocks are placed immediately after ThisMBB, in this order.
MachineFunction::iterator It = ++ThisMBB->getIterator();
F->insert(It, FirstInsertedMBB);
F->insert(It, SecondInsertedMBB);
F->insert(It, SinkMBB);
// For a cascaded CMOV, we lower it to two successive branches to
// the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
// the FirstInsertedMBB.
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
SinkMBB->splice(SinkMBB->begin(), ThisMBB,
// Fallthrough block for ThisMBB.
// The true block target of the first branch is always SinkMBB.
// Fallthrough block for FirstInsertedMBB.
// The true block for the branch of FirstInsertedMBB.
// This is fallthrough.
// Create the conditional branch instructions.
X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
X86::CondCode SecondCC =
BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
// SinkMBB:
// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
Register DestReg = FirstCMOV.getOperand(0).getReg();
Register Op1Reg = FirstCMOV.getOperand(1).getReg();
Register Op2Reg = FirstCMOV.getOperand(2).getReg();
MachineInstrBuilder MIB =
BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
// The second SecondInsertedMBB provides the same incoming value as the
// FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
// Copy the PHI result to the register defined by the second CMOV.
BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
// Now remove the CMOVs.
return SinkMBB;
/// Lower the pseudo-CMOV instruction \p MI found in \p ThisMBB into explicit
/// control flow.
///
/// Two new blocks (FalseMBB and SinkMBB) are inserted after \p ThisMBB, a
/// JCC_1 on MI's condition code (operand 3) branches to SinkMBB, and the
/// select results are materialized by createPHIsForCMOVsInSinkBB. A run of
/// consecutive CMOV pseudos using the same or the opposite condition is
/// lowered together; a cascaded pair is handed to EmitLoweredCascadedSelect.
///
/// \param MI The first pseudo-CMOV to lower.
/// \param ThisMBB The block that currently contains \p MI.
/// \return SinkMBB (or the result of EmitLoweredCascadedSelect for case 2).
///
/// NOTE(review): in this excerpt several statements stop mid-expression (the
/// splice call, the MIItEnd initializer, the debug-instruction loop body) and
/// closing braces are absent; the text looks truncated -- confirm against the
/// complete file before relying on it.
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
MachineBasicBlock *ThisMBB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between and a branch opcode to use.
// ThisMBB:
// ...
// TrueVal = ...
// cmpTY ccX, r1, r2
// bCC copy1MBB
// fallthrough --> FalseMBB
// This code lowers all pseudo-CMOV instructions. Generally it lowers these
// as described above, by inserting a BB, and then making a PHI at the join
// point to select the true and false operands of the CMOV in the PHI.
// The code also handles two different cases of multiple CMOV opcodes
// in a row.
// Case 1:
// In this case, there are multiple CMOVs in a row, all which are based on
// the same condition setting (or the exact opposite condition setting).
// In this case we can lower all the CMOVs using a single inserted BB, and
// then make a number of PHIs at the join point to model the CMOVs. The only
// trickiness here, is that in a case like:
// t2 = CMOV cond1 t1, f1
// t3 = CMOV cond1 t2, f2
// when rewriting this into PHIs, we have to perform some renaming on the
// temps since you cannot have a PHI operand refer to a PHI result earlier
// in the same block. The "simple" but wrong lowering would be:
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t2(BB1), f2(BB2)
// but clearly t2 is not defined in BB1, so that is incorrect. The proper
// renaming is to note that on the path through BB1, t2 is really just a
// copy of t1, and do that renaming, properly generating:
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t1(BB1), f2(BB2)
// Case 2:
// CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
// function - EmitLoweredCascadedSelect.
// Operand 3 of the pseudo carries the condition code; OppCC is its inverse,
// so CMOVs on either polarity can share one branch.
X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
// LastCMOV tracks the final CMOV of the run; it stays equal to &MI when no
// run of same-condition CMOVs is found.
MachineInstr *LastCMOV = &MI;
MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
// Check for case 1, where there are multiple CMOVs with the same condition
// first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
// number of jumps the most.
if (isCMOVPseudo(MI)) {
// See if we have a string of CMOVS with the same condition. Skip over
// intervening debug insts.
while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
// This checks for case 2, but only do this if we didn't already find
// case 1, as indicated by LastCMOV == MI.
// The pattern requires the next instruction to have the same opcode, the same
// operand-2 register, and to consume (and kill) MI's result as its operand 1.
if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
NextMIIt->getOpcode() == MI.getOpcode() &&
NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
NextMIIt->getOperand(1).isKill()) {
return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
// Create the false-path block and the join block directly after ThisMBB.
const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
MachineFunction *F = ThisMBB->getParent();
MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = ++ThisMBB->getIterator();
F->insert(It, FalseMBB);
F->insert(It, SinkMBB);
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (!LastCMOV->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
// Transfer any debug instructions inside the CMOV sequence to the sunk block.
auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
auto DbgIt = MachineBasicBlock::iterator(MI);
while (DbgIt != DbgEnd) {
// Next is captured before the current instruction is handled so iteration
// can continue from it afterwards.
auto Next = std::next(DbgIt);
if (DbgIt->isDebugInstr())
DbgIt = Next;
// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
SinkMBB->splice(SinkMBB->end(), ThisMBB,
// Fallthrough block for ThisMBB.
// The true block target of the first (or only) branch is always a SinkMBB.
// Fallthrough block for FalseMBB.
// Create the conditional branch instruction.
BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
// SinkMBB:
// %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
// ...
MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
MachineBasicBlock::iterator MIItEnd =
createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
// Now remove the CMOV(s).
ThisMBB->erase(MIItBegin, MIItEnd);
return SinkMBB;
/// Pick the X86 "SUB register, immediate" opcode for the given pointer width
/// and immediate.
///
/// \param IsLP64 When true the 64-bit opcodes (SUB64ri8 / SUB64ri32) are used,
///        otherwise the 32-bit ones (SUB32ri8 / SUB32ri).
/// \param Imm The immediate that will be subtracted. The compact imm8
///        encoding is selected whenever the value fits in a signed 8-bit
///        field.
/// \return The selected opcode.
static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
  // The imm8 forms sign-extend an 8-bit immediate, so they are only usable
  // when Imm is representable as a signed 8-bit value.
  const bool FitsInImm8 = isInt<8>(Imm);
  if (IsLP64)
    return FitsInImm8 ? X86::SUB64ri8 : X86::SUB64ri32;
  return FitsInImm8 ? X86::SUB32ri8 : X86::SUB32ri;
}
/// Expand a probed dynamic stack allocation pseudo \p MI into a probing loop.
///
/// Three blocks are inserted after \p MBB, in layout order testMBB, blockMBB,
/// tailMBB. blockMBB stores through the stack pointer (offset 0), subtracts
/// the stack probe size from the stack pointer and jumps back to testMBB.
/// Everything that followed \p MI in \p MBB is moved to the end of tailMBB.
///
/// \param MI The pseudo instruction; operand 0 is the result register and
///        operand 1 the requested size register.
/// \param MBB The block containing \p MI.
/// \return tailMBB.
///
/// NOTE(review): most BuildMI operand chains below end without their operands
/// or terminating ';' in this excerpt; the text looks truncated -- confirm
/// against the complete file.
MachineBasicBlock *
X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
MachineBasicBlock *MBB) const {
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
const unsigned ProbeSize = getStackProbeSize(*MF);
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
// Layout order after MBB: testMBB, blockMBB, tailMBB.
MachineFunction::iterator MBBIter = ++MBB->getIterator();
MF->insert(MBBIter, testMBB);
MF->insert(MBBIter, blockMBB);
MF->insert(MBBIter, tailMBB);
Register sizeVReg = MI.getOperand(1).getReg();
// Register width follows the frame pointer width reported by frame lowering.
Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
Register TmpStackPtr = MRI.createVirtualRegister(
TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
Register FinalStackPtr = MRI.createVirtualRegister(
TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
// Both instructions below are inserted in front of MI inside MBB.
BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
// test rsp size
BuildMI(testMBB, DL,
TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
BuildMI(testMBB, DL, TII->get(X86::JCC_1))
// Touch the block then extend it. This is done on the opposite side of
// static probe where we allocate then touch, to avoid the need of probing the
// tail of the static alloca. Possible scenarios are:
// + ---- <- ------------ <- ------------- <- ------------ +
// | |
// [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
// | |
// + <- ----------- <- ------------ <- ----------- <- ------------ +
// The property we want to enforce is to never have more than [page alloc] between two probes.
const unsigned MovMIOpc =
TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi;
// Store through [physSPReg + 0]: this is the probe that touches the page.
addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0)
// Move the stack pointer down by one probe interval, then loop back to the
// test block.
BuildMI(blockMBB, DL,
TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
// Replace original instruction by the expected stack ptr
BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
// Move everything after MI out of MBB into the tail block.
tailMBB->splice(tailMBB->end(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
// Delete the original pseudo instruction.
// And we're done.
return tailMBB;
/// Expand a segmented-stack dynamic allocation pseudo \p MI.
///
/// The block is split into a stack-limit check (kept in \p BB), a bumpMBB
/// that allocates by moving the stack pointer, a mallocMBB that allocates by
/// calling into the runtime, and a continueMBB that receives the remainder of
/// the original block and merges both results with a PHI.
///
/// \param MI The pseudo instruction; operand 1 is the requested size register.
/// \param BB The block containing \p MI.
/// \return continueMBB.
///
/// NOTE(review): several statements (AddrRegClass initializer, the SUB/CMP
/// operand chains, the call targets, the PHI operands) end abruptly in this
/// excerpt and closing braces are absent; the text looks truncated -- confirm
/// against the complete file.
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
const bool Is64Bit = Subtarget.is64Bit();
const bool IsLP64 = Subtarget.isTarget64BitLP64();
// Segment register and offset used for the stack-limit comparison; the
// offset differs between LP64, other 64-bit and 32-bit targets.
const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
// BB:
// ... [Till the alloca]
// If stacklet is not large enough, jump to mallocMBB
// bumpMBB:
// Allocate by subtracting from RSP
// Jump to continueMBB
// mallocMBB:
// Allocate by call to runtime
// continueMBB:
// ...
// [rest of original BB]
MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *AddrRegClass =
unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
sizeVReg = MI.getOperand(1).getReg(),
physSPReg =
IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
// Layout order after BB: bumpMBB, mallocMBB, continueMBB.
MachineFunction::iterator MBBIter = ++BB->getIterator();
MF->insert(MBBIter, bumpMBB);
MF->insert(MBBIter, mallocMBB);
MF->insert(MBBIter, continueMBB);
// Everything after MI moves to the start of continueMBB.
continueMBB->splice(continueMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
// Add code to the main basic block to check if the stack limit has been hit,
// and if so, jump to mallocMBB otherwise to bumpMBB.
BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Calls into a routine in libgcc to allocate more space from the heap.
const uint32_t *RegMask =
Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
// Three variants: LP64 passes the size in RDI, other 64-bit targets in EDI,
// and 32-bit targets push it on the stack.
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addReg(X86::RDI, RegState::Implicit)
.addReg(X86::RAX, RegState::ImplicitDefine);
} else if (Is64Bit) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addReg(X86::EDI, RegState::Implicit)
.addReg(X86::EAX, RegState::ImplicitDefine);
} else {
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
.addReg(X86::EAX, RegState::ImplicitDefine);
// Only the 32-bit path adjusted the stack pointer before the call, so only it
// needs the matching ADD afterwards.
if (!Is64Bit)
BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
// The call result is taken from RAX on LP64 and from EAX otherwise.
BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
.addReg(IsLP64 ? X86::RAX : X86::EAX);
BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Set up the CFG correctly.
// Take care of the PHI nodes.
BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
// Delete the original pseudo instruction.
// And we're done.
return continueMBB;
/// Lower a catchret pseudo \p MI.
///
/// On subtargets that are not 32-bit nothing is changed and \p BB is returned
/// immediately. On 32-bit subtargets a RestoreMBB is inserted right after
/// \p BB and given a JMP_4 to the catchret's target block (operand 0 of MI).
///
/// \param MI The catchret pseudo; operand 0 is the destination block.
/// \param BB The block containing \p MI.
/// \return \p BB in both cases.
///
/// NOTE(review): the opening of the personality assert and the RestoreMBB
/// initializer are not present in this excerpt; the text looks truncated --
/// confirm against the complete file.
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
DebugLoc DL = MI.getDebugLoc();
classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
"SEH does not use catchret!");
// Only 32-bit EH needs to worry about manually restoring stack pointers.
if (!Subtarget.is32Bit())
return BB;
// C++ EH creates a new target block to hold the restore code, and wires up
// the new block to the return destination with a normal JMP_4.
MachineBasicBlock *RestoreMBB =
// The catchret block is expected to have exactly one successor.
assert(BB->succ_size() == 1);
MF->insert(std::next(BB->getIterator()), RestoreMBB);
// Marking this as an EH pad but not a funclet entry block causes PEI to
// restore stack pointers in the block.
auto RestoreMBBI = RestoreMBB->begin();
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
return BB;
/// Bracket a TLSADDR pseudo with call-frame setup/destroy markers.
///
/// The original instruction is kept in place; a call-frame-setup instruction
/// is inserted immediately before it and a call-frame-destroy instruction
/// immediately after it, all with zero immediates.
///
/// \param MI The TLSADDR pseudo instruction (left in the block).
/// \param BB The block containing \p MI.
/// \return \p BB, unchanged apart from the two inserted markers.
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
                                      MachineBasicBlock *BB) const {
  // So, here we replace TLSADDR with the sequence:
  // adjust_stackdown -> TLSADDR -> adjust_stackup.
  // We need this because TLSADDR is lowered into calls
  // inside MC, therefore without the two markers shrink-wrapping
  // may push the prologue/epilogue past them.
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction &MF = *BB->getParent();
  const MachineBasicBlock::iterator TLSAddrIt(MI);

  // Emit CALLSEQ_START right before the instruction.
  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
  MachineInstrBuilder CallseqStart =
      BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
  BB->insert(TLSAddrIt, CallseqStart);

  // Emit CALLSEQ_END right after the instruction.
  // We don't call erase from parent because we want to keep the
  // original instruction around.
  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
  MachineInstrBuilder CallseqEnd =
      BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
  BB->insertAfter(TLSAddrIt, CallseqEnd);

  return BB;
}
/// Lower a Darwin TLS call pseudo \p MI into a load plus an indirect call.
///
/// The address is loaded from the global in operand 3 of \p MI into RDI
/// (64-bit) or EAX (32-bit), an indirect call through that register is
/// emitted, the return register (RAX or EAX) is marked implicitly defined,
/// and \p MI is erased.
///
/// \param MI The TLS call pseudo; operand 3 must be a global.
/// \param BB The block containing \p MI; new instructions go before \p MI.
/// \return \p BB.
///
/// NOTE(review): the addGlobalAddress(...) argument lists and closing braces
/// are cut off in this excerpt; the text looks truncated -- confirm against
/// the complete file.
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const {
// This is pretty easy. We're taking the value that we received from
// our load from the relocation, sticking it in either RDI (x86-64)
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
MachineFunction *F = BB->getParent();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
assert(MI.getOperand(3).isGlobal() && "This should be a global");
// Get a register mask for the lowered call.
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
const uint32_t *RegMask =
Subtarget.is64Bit() ?
Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
// 64-bit: load into RDI, call through [RDI], result in RAX.
if (Subtarget.is64Bit()) {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
addDirectMem(MIB, X86::RDI);
MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
// 32-bit, not position independent: load into EAX, call through [EAX].
} else if (!isPositionIndependent()) {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
// 32-bit, position independent: same opcodes as above; any difference lies
// in the load's address operands, which are not visible in this excerpt.
} else {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
/// Map an indirect-thunk pseudo opcode \p RPOpc to the direct call or
/// tail-call opcode used once the callee has been replaced by a thunk symbol.
/// The possible results are CALLpcrel32, CALL64pcrel32, TCRETURNdi and
/// TCRETURNdi64; any other input hits llvm_unreachable.
///
/// NOTE(review): the `case` labels that select each return are not present in
/// this excerpt, so the exact pseudo-to-opcode pairing cannot be read off
/// here -- confirm against the complete file.
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
switch (RPOpc) {
return X86::CALLpcrel32;
return X86::CALL64pcrel32;
return X86::TCRETURNdi;
return X86::TCRETURNdi64;
llvm_unreachable("not indirect thunk opcode");
/// Return the symbol name of the indirect-branch thunk to call for \p Reg.
///
/// Three thunk families are tried in order, according to the subtarget
/// features:
///   1. external retpoline thunks (`__x86_indirect_thunk_<reg>`),
///   2. LLVM-internal retpoline thunks (`__llvm_retpoline_<reg>`),
///   3. the LVI control-flow-integrity thunk (`__llvm_lvi_thunk_r11`).
///
/// \param Subtarget Supplies the thunk-related feature queries and the 32/64
///        bit mode used by the sanity asserts.
/// \param Reg The scratch register holding the callee. EAX, ECX, EDX and EDI
///        are accepted on 32-bit subtargets, R11 on 64-bit subtargets; any
///        other register is a programming error (llvm_unreachable).
/// \return A pointer to a string literal naming the thunk.
static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
                                          unsigned Reg) {
  if (Subtarget.useRetpolineExternalThunk()) {
    // When using an external thunk for retpolines, we pick names that match the
    // names GCC happens to use as well. This helps simplify the implementation
    // of the thunks for kernels where they have no easy ability to create
    // aliases and are doing non-trivial configuration of the thunk's body. For
    // example, the Linux kernel will do boot-time hot patching of the thunk
    // bodies and cannot easily export aliases of these to loaded modules.
    // Note that at any point in the future, we may need to change the semantics
    // of how we implement retpolines and at that time will likely change the
    // name of the called thunk. Essentially, there is no hard guarantee that
    // LLVM will generate calls to specific thunks, we merely make a best-effort
    // attempt to help out kernels and other systems where duplicating the
    // thunks is costly.
    switch (Reg) {
    case X86::EAX:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__x86_indirect_thunk_eax";
    case X86::ECX:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__x86_indirect_thunk_ecx";
    case X86::EDX:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__x86_indirect_thunk_edx";
    case X86::EDI:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__x86_indirect_thunk_edi";
    case X86::R11:
      assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
      return "__x86_indirect_thunk_r11";
    }
    // Every supported register returned above.
    llvm_unreachable("unexpected reg for external indirect thunk");
  }

  if (Subtarget.useRetpolineIndirectCalls() ||
      Subtarget.useRetpolineIndirectBranches()) {
    // When targeting an internal COMDAT thunk use an LLVM-specific name.
    switch (Reg) {
    case X86::EAX:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__llvm_retpoline_eax";
    case X86::ECX:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__llvm_retpoline_ecx";
    case X86::EDX:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__llvm_retpoline_edx";
    case X86::EDI:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__llvm_retpoline_edi";
    case X86::R11:
      assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
      return "__llvm_retpoline_r11";
    }
    // Every supported register returned above.
    llvm_unreachable("unexpected reg for retpoline");
  }

  if (Subtarget.useLVIControlFlowIntegrity()) {
    // Only a single R11-based thunk exists for LVI, so Reg is not consulted.
    assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
    return "__llvm_lvi_thunk_r11";
  }

  llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
}
/// Rewrite an indirect call/tail-call pseudo \p MI to go through an indirect
/// branch thunk.
///
/// A scratch physical register that is not already a use operand of \p MI is
/// chosen, the callee virtual register (operand 0) is copied into it before
/// \p MI, the thunk symbol is looked up with getIndirectThunkSymbol, and the
/// scratch register is added to \p MI as an implicit, killed use. If no
/// candidate register is free, report_fatal_error is called.
///
/// \param MI The indirect-thunk pseudo; operand 0 is the callee register.
/// \param BB The block containing \p MI.
/// \return \p BB.
///
/// NOTE(review): as shown here the 32-bit register list is appended under the
/// is64Bit() check, which contradicts the comment about using R11 on 64-bit;
/// the 64-bit branch body, the `else`, the loop `break` and the statements
/// that rewrite MI's opcode/callee operand appear to be missing from this
/// excerpt -- confirm against the complete file.
MachineBasicBlock *
X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
MachineBasicBlock *BB) const {
// Copy the virtual register into the R11 physical register and
// call the retpoline thunk.
DebugLoc DL = MI.getDebugLoc();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
Register CalleeVReg = MI.getOperand(0).getReg();
unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
// Find an available scratch register to hold the callee. On 64-bit, we can
// just use R11, but we scan for uses anyway to ensure we don't generate
// incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
// already a register use operand to the call to hold the callee. If none
// are available, use EDI instead. EDI is chosen because EBX is the PIC base
// register and ESI is the base pointer to realigned stack frames with VLAs.
SmallVector<unsigned, 3> AvailableRegs;
if (Subtarget.is64Bit())
AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
// Zero out any registers that are already used.
// A zero entry marks a candidate as unavailable; entries are modified in
// place, hence the by-reference inner loop variable.
for (const auto &MO : MI.operands()) {
if (MO.isReg() && MO.isUse())
for (unsigned &Reg : AvailableRegs)
if (Reg == MO.getReg())
Reg = 0;
// Choose the first remaining non-zero available register.
unsigned AvailableReg = 0;
for (unsigned MaybeReg : AvailableRegs) {
if (MaybeReg) {
AvailableReg = MaybeReg;
if (!AvailableReg)
report_fatal_error("calling convention incompatible with retpoline, no "
"available registers");
const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
// Materialize the callee in the chosen scratch register ahead of MI.
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
// Record on MI itself that the scratch register is read and dies there.
MachineInstrBuilder(*BB->getParent(), &MI)
.addReg(AvailableReg, RegState::Implicit | RegState::Kill);
return BB;
/// SetJmp implies future control flow change upon calling the corresponding
/// LongJmp.
/// Instead of using the 'return' instruction, the long jump fixes the stack and
/// performs an indirect branch. To do so it uses the registers that were stored
/// in the jump buffer (when calling SetJmp).
/// In case the shadow stack is enabled we need to fix it as well, because some
/// return addresses will be skipped.
/// The function will save the SSP for future fixing in the function
/// emitLongJmpShadowStackFix.
///
/// Concretely, the instructions emitted before \p MI zero a pointer-sized
/// register, read the shadow stack pointer into a second register with
/// RDSSP, and store that value into the jump buffer at byte offset
/// 3 * (pointer store size).
/// \sa emitLongJmpShadowStackFix
/// \param [in] MI The temporary Machine Instruction for the builtin.
/// \param [in] MBB The Machine Basic Block that will be modified.
///
/// NOTE(review): the MMOs initializer is cut off and the tail of the function
/// (closing braces and anything after the operand loop) is not present in
/// this excerpt -- confirm against the complete file.
void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineInstrBuilder MIB;
// Memory Reference.
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
// Initialize a register with zero.
// Opcode width follows the pointer type: 64-bit vs 32-bit variants are
// picked by comparing PVT against MVT::i64 throughout this function.
MVT PVT = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
Register ZReg = MRI.createVirtualRegister(PtrRC);
unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
// Both XOR sources are the same register flagged Undef, so its prior
// contents are irrelevant.
BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
.addReg(ZReg, RegState::Undef)
.addReg(ZReg, RegState::Undef);
// Read the current SSP Register value to the zeroed register.
Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
// Write the SSP register value to slot 3 of the input memory buffer (byte
// offset 3 * pointer store size).
unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
const int64_t SSPOffset = 3 * PVT.getStoreSize();
// MI's memory address operands start at operand index 1.
const unsigned MemOpndSlot = 1;
// Re-emit MI's address operands on the store, shifting the displacement by
// SSPOffset.
// NOTE(review): as shown, the plain add() is not guarded by an `else`, so the
// displacement operand would be appended twice; an `else` line is probably
// missing from this excerpt -- confirm.
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
MIB.add(MI.getOperand(MemOpndSlot + i));
/// Expand the SjLj setjmp pseudo \p MI into explicit control flow.
///
/// The address of a newly created restoreMBB is stored into the jump buffer
/// at byte offset 1 * (pointer store size). mainMBB produces 0, restoreMBB
/// produces 1 and jumps to sinkMBB, and a PHI at the start of sinkMBB defines
/// the destination register of \p MI. When the module carries the
/// "cf-protection-return" flag, emitSetJmpShadowStackFix is also invoked.
///
/// \param MI The setjmp pseudo; operand 0 is the i32 result register and the
///        following operands form the buffer address.
/// \param MBB The block containing \p MI.
/// \return sinkMBB, which receives everything that followed \p MI.
///
/// NOTE(review): several statements (MMOs and UseImmLabel initializers, LEA
/// operand chains, the EH_SjLj_Setup and PHI operands) stop mid-expression and
/// closing braces are absent in this excerpt; restoreMBB is created but its
/// insertion into the function is not visible -- confirm against the complete
/// file.
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
unsigned DstReg;
unsigned MemOpndSlot = 0;
unsigned CurOp = 0;
// Operand 0 is the result; the memory address operands begin right after it.
DstReg = MI.getOperand(CurOp++).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
// One result register per incoming path of the final PHI.
Register mainDstReg = MRI.createVirtualRegister(RC);
Register restoreDstReg = MRI.createVirtualRegister(RC);
MemOpndSlot = CurOp;
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
// For v = setjmp(buf), we generate
// thisMBB:
// buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
// SjLjSetup restoreMBB
// mainMBB:
// v_main = 0
// sinkMBB:
// v = phi(main, restore)
// restoreMBB:
// if base pointer being used, load it from frame
// v_restore = 1
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
MF->insert(I, sinkMBB);
MachineInstrBuilder MIB;
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
// thisMBB:
unsigned PtrStoreOpc = 0;
unsigned LabelReg = 0;
// The resume address lives in slot 1 of the jump buffer.
const int64_t LabelOffset = 1 * PVT.getStoreSize();
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
// Prepare IP either in reg or imm.
// Without an immediate label the block address is first computed into
// LabelReg with LEA and then stored with a register-to-memory MOV.
if (!UseImmLabel) {
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
LabelReg = MRI.createVirtualRegister(PtrRC);
if (Subtarget.is64Bit()) {
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
} else {
const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
.addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
} else
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
// Store IP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
// Re-emit MI's address operands on the store, shifting the displacement by
// LabelOffset.
// NOTE(review): as shown, the plain add() is not guarded by an `else`; an
// `else` line is probably missing from this excerpt -- confirm.
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
MIB.add(MI.getOperand(MemOpndSlot + i));
if (!UseImmLabel)
// Shadow-stack bookkeeping is only emitted when the module requests
// return-address protection via this module flag.
if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
emitSetJmpShadowStackFix(MI, thisMBB);
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// mainMBB:
// EAX = 0
BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
// sinkMBB:
BuildMI(*sinkMBB, sinkMBB->begin(), DL,
TII->get(X86::PHI), DstReg)
// restoreMBB:
// If the function uses a base pointer, reload it from the frame at the
// offset recorded in X86MachineFunctionInfo before producing the result.
if (RegInfo->hasBasePointer(*MF)) {
const bool Uses64BitFramePtr =
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
Register FramePtr = RegInfo->getFrameRegister(*MF);
Register BasePtr = RegInfo->getBaseRegister();
unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
FramePtr, true, X86FI->getRestoreBasePointerOffset())
// The restore path yields 1 and rejoins the normal flow at sinkMBB.
BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
return sinkMBB;
/// Fix the shadow stack using the previously saved SSP pointer.
///
/// Six blocks are inserted after \p MBB (checkSspMBB, fallMBB, fixShadowMBB,
/// fixShadowLoopPrepareMBB, fixShadowLoopMBB, sinkMBB). The emitted code reads
/// the current SSP, reloads the saved SSP from the jump buffer at byte offset
/// 3 * (pointer store size), and advances the shadow stack with INCSSP: once
/// for the low part of the delta and then in a counted loop for the rest.
/// \sa emitSetJmpShadowStackFix
/// \param [in] MI The temporary Machine Instruction for the builtin.
/// \param [in] MBB The Machine Basic Block that will be modified.
/// \return The sink MBB that will perform the future indirect branch.
///
/// NOTE(review): many BuildMI operand chains (TEST, SUB, SHR, SHL, MOV, PHI),
/// the splice call, the MMOs initializer and the operand-copy loop body are
/// cut off in this excerpt and closing braces are absent -- confirm against
/// the complete file.
MachineBasicBlock *
X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MVT PVT = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
// checkSspMBB:
// xor vreg1, vreg1
// rdssp vreg1
// test vreg1, vreg1
// je sinkMBB # Jump if Shadow Stack is not supported
// fallMBB:
// mov buf+24/12(%rip), vreg2
// sub vreg1, vreg2
// jbe sinkMBB # No need to fix the Shadow Stack
// fixShadowMBB:
// shr 3/2, vreg2
// incssp vreg2 # fix the SSP according to the lower 8 bits
// shr 8, vreg2
// je sinkMBB
// fixShadowLoopPrepareMBB:
// shl vreg2
// mov 128, vreg3
// fixShadowLoopMBB:
// incssp vreg3
// dec vreg2
// jne fixShadowLoopMBB # Iterate until you finish fixing
// # the Shadow Stack
// sinkMBB:
MachineFunction::iterator I = ++MBB->getIterator();
const BasicBlock *BB = MBB->getBasicBlock();
MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
// The blocks are laid out after MBB in the order they execute.
MF->insert(I, checkSspMBB);
MF->insert(I, fallMBB);
MF->insert(I, fixShadowMBB);
MF->insert(I, fixShadowLoopPrepareMBB);
MF->insert(I, fixShadowLoopMBB);
MF->insert(I, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
// The moved range starts at MI itself, so MI ends up in sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
// Initialize a register with zero.
// The zero is created as a 32-bit value; on 64-bit pointers it is widened
// into a pointer-class register via SUBREG_TO_REG.
Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
if (PVT == MVT::i64) {
Register TmpZReg = MRI.createVirtualRegister(PtrRC);
BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
ZReg = TmpZReg;
// Read the current SSP Register value to the zeroed register.
Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
// Check whether the result of the SSP register is zero and jump directly
// to the sink.
unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
// Reload the previously saved SSP register value.
Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
// Same slot (3 * pointer store size) that emitSetJmpShadowStackFix writes.
const int64_t SPPOffset = 3 * PVT.getStoreSize();
MachineInstrBuilder MIB =
BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
// Here MI's address operands start at operand index 0.
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (i == X86::AddrDisp)
MIB.addDisp(MO, SPPOffset);
else if (MO.isReg()) // Don't add the whole operand, we don't want to
// preserve kill flags.
// Subtract the current SSP from the previous SSP.
Register SspSubReg = MRI.createVirtualRegister(PtrRC);
unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
// Jump to sink in case PrevSSPReg <= SSPCopyReg.
BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
// Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
// Increase SSP when looking only on the lower 8 bits of the delta.
unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
// Reset the lower 8 bits.
Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
// Jump if the result of the shift is zero.
BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
// Do a single shift left.
unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
// Save the value 128 to a register (will be used next with incssp).
Register Value128InReg = MRI.createVirtualRegister(PtrRC);
unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
// Since incssp only looks at the lower 8 bits, we might need to do several
// iterations of incssp until we finish fixing the shadow stack.
// CounterReg is the loop-carried PHI; DecReg is its decremented value.
Register DecReg = MRI.createVirtualRegister(PtrRC);
Register CounterReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
// Every iteration we increase the SSP by 128.
BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
// Every iteration we decrement the counter by 1.
unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
// Jump if the counter is not zero yet.
BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
return sinkMBB;
/// Custom-inserter expansion of the EH_SjLj_LongJmp pseudo \p MI, emitted into
/// \p MBB.
///
/// Visible steps: optionally repair the shadow stack, then reload the frame
/// pointer, the jump target and the stack pointer through MI's address
/// operands, and finally jump indirectly through the reloaded target.
///
/// \returns the block that ends up holding the expansion (thisMBB), which may
/// differ from \p MBB when the shadow-stack fix is emitted.
///
/// NOTE(review): this excerpt appears to be missing lines (closing braces and
/// some operand-builder continuations); the comments describe only the
/// statements that are visible here.
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
const TargetRegisterClass *RC =
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
// Scratch virtual register that receives the reloaded jump target and feeds
// the final indirect jump.
Register Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
Register SP = RegInfo->getStackRegister();
MachineInstrBuilder MIB;
// Byte displacements, relative to the address carried by MI, of the saved
// jump target (one pointer in) and the saved stack pointer (two pointers in).
const int64_t LabelOffset = 1 * PVT.getStoreSize();
const int64_t SPOffset = 2 * PVT.getStoreSize();
// Pointer-width load and indirect-jump opcodes.
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
MachineBasicBlock *thisMBB = MBB;
// When CET and shadow stack is enabled, we need to fix the Shadow Stack.
if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
// Reload FP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (MO.isReg()) // Don't add the whole operand, we don't want to
// preserve kill flags.
// Reload IP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (i == X86::AddrDisp)
MIB.addDisp(MO, LabelOffset);
else if (MO.isReg()) // Don't add the whole operand, we don't want to
// preserve kill flags.
// Reload SP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), SPOffset);
MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
// the last instruction of the expansion.
// Jump
BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
return thisMBB;
/// Emits, before \p MI in \p MBB, a store instruction addressed at frame
/// object \p FI plus a fixed offset (56 on a 64-bit subtarget, 36 otherwise).
///
/// When UseImmLabel is set a store-immediate opcode is selected. Otherwise a
/// virtual register is created, an LEA that references \p DispatchBB is built
/// into it, and a register-to-memory move opcode is selected.
///
/// NOTE(review): the second half of the UseImmLabel condition and the operands
/// appended to the final store are not visible in this excerpt; presumably the
/// stored value is the address of \p DispatchBB -- confirm against the full
/// source.
void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB,
int FI) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
// Op is the store opcode chosen below; VR is only assigned on the
// non-immediate path.
unsigned Op = 0;
unsigned VR = 0;
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
if (UseImmLabel) {
Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
} else {
const TargetRegisterClass *TRC =
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
VR = MRI->createVirtualRegister(TRC);
Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
if (Subtarget.is64Bit())
BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
.addReg(0) /* TII->getGlobalBaseReg(MF) */
.addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
// Build the store itself; its memory operand is FI plus the fixed offset.
MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
if (UseImmLabel)
/// Custom-inserter expansion for the SjLj dispatch-setup pseudo \p MI located
/// in \p BB.
///
/// Visible steps:
///  1. Scan every EH pad of the function, read the MCSymbol of its EH_LABEL
///     and track the largest call-site number attached to that symbol.
///  2. Collect the jump-table targets (LPadList) and the predecessors of the
///     landing pads (InvokeBBs), iterating call-site numbers 1..MaxCSNum.
///  3. Create DispatchBB, TrapBB (holding X86::TRAP) and DispContBB, and call
///     SetupEntryBlockForSjLj for the entry block.
///  4. Create a jump table over LPadList and emit the dispatch sequence: load
///     a 32-bit index from the function-context frame object, compare it,
///     branch to TrapBB on COND_AE, otherwise jump through the table using
///     the encoding-specific sequence.
///  5. For each invoke block, look at its EH-pad successors and append
///     implicit-def, dead operands for callee-saved registers that the call
///     instruction does not already mention.
///
/// \returns \p BB.
///
/// NOTE(review): many lines of this function (closing braces, `continue`
/// statements, builder continuations, successor updates) are not visible in
/// this excerpt; the list above covers only what can be read here.
MachineBasicBlock *
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *BB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
// Frame index of the function context object.
int FI = MF->getFrameInfo().getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
unsigned MaxCSNum = 0;
for (auto &MBB : *MF) {
if (!MBB.isEHPad())
MCSymbol *Sym = nullptr;
for (const auto &MI : MBB) {
if (MI.isDebugInstr())
assert(MI.isEHLabel() && "expected EH_LABEL");
Sym = MI.getOperand(0).getMCSymbol();
if (!MF->hasCallSiteLandingPad(Sym))
for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
MaxCSNum = std::max(MaxCSNum, CSI);
// Get an ordered list of the machine basic blocks for the jump table.
std::vector<MachineBasicBlock *> LPadList;
SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
// Call-site numbers are visited starting at 1.
for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
for (auto &LP : CallSiteNumToLPad[CSI]) {
InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
assert(!LPadList.empty() &&
"No landing pad destinations for the dispatch jump table!");
// Create the MBBs for the dispatch code.
// Shove the dispatch's address into the return slot in the function context.
MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
BuildMI(TrapBB, DL, TII->get(X86::TRAP));
MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
// Insert MBBs.
// Insert code into the entry block that creates and registers the function
// context.
SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
// Create the jump table and associated information
unsigned JTE = getJumpTableEncoding();
MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
const X86RegisterInfo &RI = TII->getRegisterInfo();
// Add a register mask with no preserved registers. This results in all
// registers being marked as clobbered.
if (RI.hasBasePointer(*MF)) {
const bool FPIs64Bit =
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
Register FP = RI.getFrameRegister(*MF);
Register BP = RI.getBaseRegister();
// Load into the base pointer register from memory addressed off the frame
// register.
unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
} else {
BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
// IReg is used as an index in a memory operand and therefore can't be SP
Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
// The 32-bit index is read from the function context at offset 8 (64-bit)
// or 4 (32-bit).
addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
Subtarget.is64Bit() ? 8 : 4);
BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
// Unsigned above-or-equal after the compare goes to the trap block.
BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
if (Subtarget.is64Bit()) {
Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
// leaq .LJTI0_0(%rip), BReg
BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
// movzx IReg64, IReg
BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
switch (JTE) {
case MachineJumpTableInfo::EK_BlockAddress:
// jmpq *(BReg,IReg64,8)
BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
case MachineJumpTableInfo::EK_LabelDifference32: {
Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
// movl (BReg,IReg64,4), OReg
BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
// movsx OReg64, OReg
BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
// addq BReg, OReg64, TReg
BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
// jmpq *TReg
BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
llvm_unreachable("Unexpected jump table encoding");
} else {
// jmpl *.LJTI0_0(,IReg,4)
BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
// Add the jump table entries as successors to the MBB.
// SeenMBBs filters out landing pads that occur more than once in LPadList.
SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
for (auto &LP : LPadList)
if (SeenMBBs.insert(LP).second)
// N.B. the order the invoke BBs are processed in doesn't matter here.
SmallVector<MachineBasicBlock *, 64> MBBLPads;
// SavedRegs is walked below until a zero entry is reached.
const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
for (MachineBasicBlock *MBB : InvokeBBs) {
// Remove the landing pad successor from the invoke block and replace it
// with the new dispatch block.
// Keep a copy of Successors since it's modified inside the loop.
SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
// FIXME: Avoid quadratic complexity.
for (auto MBBS : Successors) {
if (MBBS->isEHPad()) {
// Find the invoke call and mark all of the callee-saved registers as
// 'implicit defined' so that they're spilled. This prevents code from
// moving instructions to before the EH block, where they will never be
// executed.
for (auto &II : reverse(*MBB)) {
if (!II.isCall())
// Registers already mentioned by a register operand of the call.
DenseMap<unsigned, bool> DefRegs;
for (auto &MOp : II.operands())
if (MOp.isReg())
DefRegs[MOp.getReg()] = true;
MachineInstrBuilder MIB(*MF, &II);
for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
unsigned Reg = SavedRegs[RegIdx];
if (!DefRegs[Reg])
MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
// Mark all former landing pads as non-landing pads. The dispatch is the only
// landing pad now.
for (auto &LP : MBBLPads)
// The instruction is gone now.
return BB;
/// Dispatches on the opcode of \p MI and expands it inside \p BB, either by
/// delegating to a dedicated Emit*/emit* helper or by building the replacement
/// instructions inline and erasing the pseudo.
///
/// \returns the basic block in which code following the expansion should be
/// placed: the helper's result, or \p BB for the inline expansions.
///
/// An opcode without a matching case reaches llvm_unreachable.
///
/// NOTE(review): several `case` labels, closing braces and builder
/// continuation lines are not visible in this excerpt, so some groups below
/// look as if they fall straight after a `return`; the comments only describe
/// the visible statements.
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
// Maps a tile index immediate (asserted to be below 8) to the register number
// X86::TMM0 + Imm.
auto TMMImmToTMMReg = [](unsigned Imm) {
assert (Imm < 8 && "Illegal tmm index");
return X86::TMM0 + Imm;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TLS_addr32:
case X86::TLS_addr64:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return EmitLoweredTLSAddr(MI, BB);
return EmitLoweredIndirectThunk(MI, BB);
case X86::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
case X86::SEG_ALLOCA_32:
case X86::SEG_ALLOCA_64:
return EmitLoweredSegAlloca(MI, BB);
case X86::PROBED_ALLOCA_32:
case X86::PROBED_ALLOCA_64:
return EmitLoweredProbedAlloca(MI, BB);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
// All CMOV_* pseudos share one select lowering.
case X86::CMOV_FR32:
case X86::CMOV_FR32X:
case X86::CMOV_FR64:
case X86::CMOV_FR64X:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
case X86::CMOV_VR64:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
case X86::CMOV_VK1:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
case X86::CMOV_VK16:
case X86::CMOV_VK32:
case X86::CMOV_VK64:
return EmitLoweredSelect(MI, BB);
// RDFLAGS: push the flags, then pop them into the pseudo's operand 0.
case X86::RDFLAGS32:
case X86::RDFLAGS64: {
unsigned PushF =
MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
// Permit reads of the EFLAGS and DF registers without them being defined.
// This intrinsic exists to read external processor state in flags, such as
// the trap flag, interrupt flag, and direction flag, none of which are
// modeled by the backend.
assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
"Unexpected register in operand!");
assert(Push->getOperand(3).getReg() == X86::DF &&
"Unexpected register in operand!");
BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
// WRFLAGS: push the pseudo's operand 0, then pop it into the flags.
case X86::WRFLAGS32:
case X86::WRFLAGS64: {
unsigned Push =
MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
unsigned PopF =
MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
BuildMI(*BB, MI, DL, TII->get(PopF));
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
case X86::FP32_TO_INT16_IN_MEM:
case X86::FP32_TO_INT32_IN_MEM:
case X86::FP32_TO_INT64_IN_MEM:
case X86::FP64_TO_INT16_IN_MEM:
case X86::FP64_TO_INT32_IN_MEM:
case X86::FP64_TO_INT64_IN_MEM:
case X86::FP80_TO_INT16_IN_MEM:
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
// A 2-byte stack slot receives the current control word.
int OrigCWFrameIdx =
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
// Load the old value of the control word...
Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
// OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
.addReg(OldCW, RegState::Kill).addImm(0xC00);
// Extract to 16 bits.
Register NewCW16 =
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
.addReg(NewCW, RegState::Kill, X86::sub_16bit);
// Prepare memory for FLDCW.
int NewCWFrameIdx =
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
.addReg(NewCW16, RegState::Kill);
// Reload the modified control word now...
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), NewCWFrameIdx);
// Get the X86 opcode to use.
unsigned Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
// The store uses the address taken from the pseudo, starting at operand 0.
X86AddressMode AM = getAddressFromInstr(&MI, 0);
addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
// Reload the original control word now.
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
// xbegin
case X86::XBEGIN:
return emitXBegin(MI, BB, Subtarget.getInstrInfo());
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
case X86::VAARG_64:
return EmitVAARG64WithCustomInserter(MI, BB);
case X86::EH_SjLj_SetJmp32:
case X86::EH_SjLj_SetJmp64:
return emitEHSjLjSetJmp(MI, BB);
case X86::EH_SjLj_LongJmp32:
case X86::EH_SjLj_LongJmp64:
return emitEHSjLjLongJmp(MI, BB);
case X86::Int_eh_sjlj_setup_dispatch:
return EmitSjLjDispatchBlock(MI, BB);
case TargetOpcode::STATEPOINT:
// As an implementation detail, STATEPOINT shares the STACKMAP format at
// this point in the process. We diverge later.
return emitPatchPoint(MI, BB);
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
case TargetOpcode::PATCHABLE_EVENT_CALL:
return emitXRayCustomEvent(MI, BB);
return emitXRayTypedEvent(MI, BB);
case X86::LCMPXCHG8B: {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
// In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
// requires a memory operand. If it happens that current architecture is
// i686 and for current function we need a base pointer
// - which is ESI for i686 - register allocator would not be able to
// allocate registers for an address in form of X(%reg, %reg, Y)
// - there never would be enough unreserved registers during regalloc
// (without the need for base ptr the only option would be X(%edi, %esi, Y).
// We are giving a hand to register allocator by precomputing the address in
// a new vreg using LEA.
// If it is not i686 or there is no base pointer - nothing to do here.
if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
return BB;
// Even though this code does not necessarily needs the base pointer to
// be ESI, we check for that. The reason: if this assert fails, there are
// some changes happened in the compiler base pointer handling, which most
// probably have to be addressed somehow here.
assert(TRI->getBaseRegister() == X86::ESI &&
"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind");
MachineRegisterInfo &MRI = MF->getRegInfo();
MVT SPTy = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
X86AddressMode AM = getAddressFromInstr(&MI, 0);
// Regalloc does not need any help when the memory operand of CMPXCHG8B
// does not use index register.
if (AM.IndexReg == X86::NoRegister)
return BB;
// After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
// four operand definitions that are E[ABCD] registers. We skip them and
// then insert the LEA.
MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
RMBBI->definesRegister(X86::EBX) ||
RMBBI->definesRegister(X86::ECX) ||
RMBBI->definesRegister(X86::EDX))) {
MachineBasicBlock::iterator MBBI(RMBBI);
BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
// Rewrite the pseudo's memory operand to use the precomputed address.
setDirectAddressInInstr(&MI, 0, computedAddrVReg);
return BB;
case X86::LCMPXCHG16B:
return BB;
unsigned BasePtr =
MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
if (!BB->isLiveIn(BasePtr))
return BB;
case TargetOpcode::PREALLOCATED_SETUP: {
assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
auto MFI = MF->getInfo<X86MachineFunctionInfo>();
int64_t PreallocatedId = MI.getOperand(0).getImm();
// Stack size recorded for this preallocated id; asserted to be non-zero.
size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
assert(StackAdjustment != 0 && "0 stack adjustment");
LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
<< StackAdjustment << "\n");
BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
return BB;
case TargetOpcode::PREALLOCATED_ARG: {
assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
int64_t PreallocatedId = MI.getOperand(1).getImm();
int64_t ArgIdx = MI.getOperand(2).getImm();
auto MFI = MF->getInfo<X86MachineFunctionInfo>();
// Offset recorded for argument ArgIdx of this preallocated id.
size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
<< ", arg offset " << ArgOffset << "\n");
// stack pointer + offset
BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
X86::ESP, false, ArgOffset);
return BB;
// Tile dot-product pseudos: pick the real opcode and turn the three tile
// index immediates into TMM register operands.
case X86::PTDPBSSD:
case X86::PTDPBSUD:
case X86::PTDPBUSD:
case X86::PTDPBUUD:
case X86::PTDPBF16PS: {
const DebugLoc &DL = MI.getDebugLoc();
unsigned Opc;
switch (MI.getOpcode()) {
case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
// Operand 0 names the tile that is both defined and read.
MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
case X86::PTILEZERO: {
const DebugLoc &DL = MI.getDebugLoc();
unsigned Imm = MI.getOperand(0).getImm();
BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
// Tile load/store pseudos: pick the real opcode, then copy the five memory
// operands from the pseudo in order.
case X86::PTILESTORED: {
const DebugLoc &DL = MI.getDebugLoc();
unsigned Opc;
switch (MI.getOpcode()) {
case X86::PTILELOADD: Opc = X86::TILELOADD; break;
case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
case X86::PTILESTORED: Opc = X86::TILESTORED; break;
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
// Index of the next pseudo operand to consume.
unsigned CurOp = 0;
if (Opc != X86::TILESTORED)
MIB.add(MI.getOperand(CurOp++)); // base
MIB.add(MI.getOperand(CurOp++)); // scale
MIB.add(MI.getOperand(CurOp++)); // index -- stride
MIB.add(MI.getOperand(CurOp++)); // displacement
MIB.add(MI.getOperand(CurOp++)); // segment
if (Opc == X86::TILESTORED)
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
// X86 Optimization Hooks
/// Target hook that decides how the constant operand of \p Op may be adjusted
/// given the bits (\p DemandedBits) and elements (\p DemandedElts) that are
/// actually used.
///
/// Vector path: for a legal-typed OR/XOR whose constant build-vector operand
/// is all sign bits within the active demanded bits, a new constant is built
/// and combined in via \p TLO.
///
/// Scalar path (AND only): the mask is widened to a zero-extend style
/// low-bits mask (at least 8 bits, power-of-two width, capped at the element
/// size) when the extra bits are either already set or not demanded.
///
/// \returns true when the mask is already the zero-extend mask (so it should
/// be kept as is), the result of TLO.CombineTo when a replacement is made, and
/// false otherwise.
///
/// NOTE(review): the return-type line and parts of the vector path (the node
/// that builds NewC, closing braces, a `continue`) are not visible in this
/// excerpt.
X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
const APInt &DemandedBits,
const APInt &DemandedElts,
TargetLoweringOpt &TLO) const {
EVT VT = Op.getValueType();
unsigned Opcode = Op.getOpcode();
unsigned EltSize = VT.getScalarSizeInBits();
if (VT.isVector()) {
// If the constant is only all signbits in the active bits, then we should
// extend it to the entire constant to allow it act as a boolean constant
// vector.
auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
return false;
for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
if (!DemandedElts[i] || V.getOperand(i).isUndef())
const APInt &Val = V.getConstantOperandAPInt(i);
// True for an element that is not already sign-extended across its full
// width but is all sign bits once truncated to ActiveBits.
if (Val.getBitWidth() > Val.getNumSignBits() &&
Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
return true;
return false;
// For vectors - if we have a constant, then try to sign extend.
// TODO: Handle AND/ANDN cases.
unsigned ActiveBits = DemandedBits.getActiveBits();
if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
(Opcode == ISD::OR || Opcode == ISD::XOR) &&
NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
SDValue NewC =
Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
SDValue NewOp =
TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
return TLO.CombineTo(Op, NewOp);
return false;
// Only optimize Ands to prevent shrinking a constant that could be
// matched by movzx.
if (Opcode != ISD::AND)
return false;
// Make sure the RHS really is a constant.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!C)
return false;
const APInt &Mask = C->getAPIntValue();
// Clear all non-demanded bits initially.
APInt ShrunkMask = Mask & DemandedBits;
// Find the width of the shrunk mask.
unsigned Width = ShrunkMask.getActiveBits();
// If the mask is all 0s there's nothing to do here.
if (Width == 0)
return false;
// Find the next power of 2 width, rounding up to a byte.
Width = PowerOf2Ceil(std::max(Width, 8U));
// Truncate the width to size to handle illegal types.
Width = std::min(Width, EltSize);
// Calculate a possible zero extend mask for this constant.
APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
// If we aren't changing the mask, just return true to keep it and prevent
// the caller from optimizing.
if (ZeroExtendMask == Mask)
return true;
// Make sure the new mask can be represented by a combination of mask bits
// and non-demanded bits.
if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
return false;
// Replace the constant with the zero extend mask.
SDLoc DL(Op);
SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
return TLO.CombineTo(Op, NewOp);
/// Fills \p Known with the bits of target node \p Op that can be proven zero
/// or one for the elements selected by \p DemandedElts.
///
/// \param Op           node to analyse; its opcode drives the switch below.
/// \param Known        in/out result; its bit width is read on entry.
/// \param DemandedElts one bit per result element that the caller cares about.
/// \param DAG          used for recursive computeKnownBits queries.
/// \param Depth        current recursion depth; recursive queries pass
///                     Depth + 1.
///
/// Opcodes without a case are left to the target-shuffle handling after the
/// switch.
///
/// NOTE(review): many statements (closing braces, `break`s, and the lines that
/// set the final known bits in several cases) are not visible in this excerpt;
/// the comments describe only what can be read here.
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = Known.getBitWidth();
unsigned NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert((Opc >= ISD::BUILTIN_OP_END ||
"Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!");
switch (Opc) {
default: break;
case X86ISD::SETCC:
case X86ISD::MOVMSK: {
// NumLoBits is the element count of the vector operand.
unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
case X86ISD::PEXTRB:
case X86ISD::PEXTRW: {
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
// Query the source with a single-element demanded mask, then resize the
// answer to the result width.
APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
Known = Known.anyextOrTrunc(BitWidth);
case X86ISD::VSRAI:
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
// Shift amount is the constant in operand 1.
unsigned ShAmt = Op.getConstantOperandVal(1);
if (ShAmt >= VT.getScalarSizeInBits()) {
Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Opc == X86ISD::VSHLI) {
Known.Zero <<= ShAmt;
Known.One <<= ShAmt;
// Low bits are known zero.
} else if (Opc == X86ISD::VSRLI) {
// High bits are known zero.
} else {
case X86ISD::PACKUS: {
// PACKUS is just a truncation if the upper half is zero.
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
// Start from "everything known" at twice the result width and intersect
// with each operand that has demanded elements.
Known.One = APInt::getAllOnesValue(BitWidth * 2);
Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
KnownBits Known2;
if (!!DemandedLHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
if (!!DemandedRHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
if (Known.countMinLeadingZeros() < BitWidth)
Known = Known.trunc(BitWidth);
case X86ISD::ANDNP: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// ANDNP = (~X & Y);
Known.One &= Known2.Zero;
Known.Zero |= Known2.One;
case X86ISD::FOR: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known |= Known2;
case X86ISD::PSADBW: {
assert(VT.getScalarType() == MVT::i64 &&
Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
"Unexpected PSADBW types");
// PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
case X86ISD::CMOV: {
Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
// If we don't know any bits, early out.
if (Known.isUnknown())
KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
case X86ISD::BEXTR: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
// The control constant carries the start bit in bits [7:0] and the field
// length in bits [15:8].
unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
// If the length is 0, the result is 0.
if (Length == 0) {
if ((Shift + Length) <= BitWidth) {
Known = DAG.computeKnownBits(Op0, Depth + 1);
Known = Known.extractBits(Length, Shift);
Known = Known.zextOrTrunc(BitWidth);
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P:
case X86ISD::CVTP2SI:
case X86ISD::CVTP2UI:
case X86ISD::MCVTP2SI:
case X86ISD::MCVTP2UI:
case X86ISD::CVTTP2SI:
case X86ISD::CVTTP2UI:
case X86ISD::MCVTSI2P:
case X86ISD::MCVTUI2P:
case X86ISD::CVTPS2PH:
case X86ISD::MCVTPS2PH: {
// Conversions - upper elements are known zero.
EVT SrcVT = Op.getOperand(0).getValueType();
if (SrcVT.isVector()) {
unsigned NumSrcElts = SrcVT.getVectorNumElements();
if (NumElts > NumSrcElts &&
DemandedElts.countTrailingZeros() >= NumSrcElts)
// Strict Conversions - upper elements are known zero.
EVT SrcVT = Op.getOperand(1).getValueType();
if (SrcVT.isVector()) {
unsigned NumSrcElts = SrcVT.getVectorNumElements();
if (NumElts > NumSrcElts &&
DemandedElts.countTrailingZeros() >= NumSrcElts)
case X86ISD::MOVQ2DQ: {
// Move from MMX to XMM. Upper half of XMM should be 0.
if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
// Handle target shuffles.
// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
if (isTargetShuffle(Opc)) {
bool IsUnary;
SmallVector<int, 64> Mask;
SmallVector<SDValue, 2> Ops;
if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
IsUnary)) {
unsigned NumOps = Ops.size();
unsigned NumElts = VT.getVectorNumElements();
if (Mask.size() == NumElts) {
// One demanded-elements mask per shuffle input, initially empty.
SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
int M = Mask[i];
if (M == SM_SentinelUndef) {
// For UNDEF elements, we don't know anything about the common state
// of the shuffle result.
} else if (M == SM_SentinelZero) {
assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
"Shuffle index out of range");
// Split the mask index into (input operand, element within it).
unsigned OpIdx = (unsigned)M / NumElts;
unsigned EltIdx = (unsigned)M % NumElts;
if (Ops[OpIdx].getValueType() != VT) {
// TODO - handle target shuffle ops with different value types.
// Known bits are the values that are shared by every demanded element.
for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
if (!DemandedOps[i])
KnownBits Known2 =
DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
/// Returns a lower bound on the number of sign bits of target node \p Op
/// (per scalar element), considering only the elements selected by
/// \p DemandedElts.
///
/// \param Op           node to analyse; its opcode drives the switch below.
/// \param DemandedElts one bit per result element that the caller cares about.
/// \param DAG          used for recursive ComputeNumSignBits queries.
/// \param Depth        current recursion depth; recursive queries pass
///                     Depth + 1.
///
/// \returns a value between 1 (nothing known beyond the sign bit itself) and
/// the scalar bit width of the result type (VTBits).
///
/// NOTE(review): some `case` labels, closing braces and loop-body lines are
/// not visible in this excerpt; the comments describe only what can be read
/// here.
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
unsigned Depth) const {
EVT VT = Op.getValueType();
unsigned VTBits = VT.getScalarSizeInBits();
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
return VTBits;
case X86ISD::VTRUNC: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
assert(VTBits < NumSrcBits && "Illegal truncation input type");
APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
// Only the sign bits beyond the truncated-away high part survive.
if (Tmp > (NumSrcBits - VTBits))
return Tmp - (NumSrcBits - VTBits);
return 1;
case X86ISD::PACKSS: {
// PACKSS is just a truncation if the sign bits extend to the packed size.
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
// An operand with no demanded elements keeps the neutral value SrcBits.
unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
if (!!DemandedLHS)
Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
if (!!DemandedRHS)
Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
unsigned Tmp = std::min(Tmp0, Tmp1);
if (Tmp > (SrcBits - VTBits))
return Tmp - (SrcBits - VTBits);
return 1;
case X86ISD::VSHLI: {
SDValue Src = Op.getOperand(0);
const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits))
return VTBits; // Shifted all bits out --> zero.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
if (ShiftVal.uge(Tmp))
return 1; // Shifted all sign bits out --> unknown.
return Tmp - ShiftVal.getZExtValue();
case X86ISD::VSRAI: {
SDValue Src = Op.getOperand(0);
APInt ShiftVal = Op.getConstantOperandAPInt(1);
if (ShiftVal.uge(VTBits - 1))
return VTBits; // Sign splat.
unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
// An arithmetic right shift adds the shift amount to the source's sign-bit
// count, capped at VTBits.
ShiftVal += Tmp;
return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
case X86ISD::PCMPGT:
case X86ISD::PCMPEQ:
case X86ISD::CMPP:
case X86ISD::VPCOM:
case X86ISD::VPCOMU:
// Vector compares return zero/all-bits result values.
return VTBits;
case X86ISD::ANDNP: {
unsigned Tmp0 =
DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Tmp0 == 1) return 1; // Early out.
unsigned Tmp1 =
DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
return std::min(Tmp0, Tmp1);
case X86ISD::CMOV: {
unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp0 == 1) return 1; // Early out.
unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
return std::min(Tmp0, Tmp1);
// Handle target shuffles.
// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
if (isTargetShuffle(Opcode)) {
bool IsUnary;
SmallVector<int, 64> Mask;
SmallVector<SDValue, 2> Ops;
if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
IsUnary)) {
unsigned NumOps = Ops.size();
unsigned NumElts = VT.getVectorNumElements();
if (Mask.size() == NumElts) {
// One demanded-elements mask per shuffle input, initially empty.
SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
int M = Mask[i];
if (M == SM_SentinelUndef) {
// For UNDEF elements, we don't know anything about the common state
// of the shuffle result.
return 1;
} else if (M == SM_SentinelZero) {
// Zero = all sign bits.
assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
"Shuffle index out of range");
// Split the mask index into (input operand, element within it).
unsigned OpIdx = (unsigned)M / NumElts;
unsigned EltIdx = (unsigned)M % NumElts;
if (Ops[OpIdx].getValueType() != VT) {
// TODO - handle target shuffle ops with different value types.
return 1;
// Take the minimum over all inputs with demanded elements; the loop stops
// early once the bound has dropped to 1.
unsigned Tmp0 = VTBits;
for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
if (!DemandedOps[i])
unsigned Tmp1 =
DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
Tmp0 = std::min(Tmp0, Tmp1);
return Tmp0;
// Fallback case.
return 1;
/// Strip an X86 address wrapper node from \p N.
///
/// \returns operand 0 of \p N when its opcode is X86ISD::Wrapper or
/// X86ISD::WrapperRIP; otherwise \p N itself, unchanged.
SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
  if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
    return N->getOperand(0);
  return N;
}
// Helper to look for a normal load that can be narrowed into a vzload with the
// specified VT and memory VT. Returns SDValue() on failure.
//
// \p LN    - the load to narrow; it must be "simple" (see below).
// \p MemVT - memory type handed to the new X86ISD::VZEXT_LOAD node.
// \p VT    - value type of the first result of the new node.
//
// The new node produces two results (VT and MVT::Other) and reuses the chain,
// base pointer, pointer info, original alignment and memory-operand flags of
// the load it replaces.
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
                                  SelectionDAG &DAG) {
  // Can't if the load is volatile or atomic.
  if (!LN->isSimple())
    return SDValue();

  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
  SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
  return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
                                 LN->getPointerInfo(), LN->getOriginalAlign(),
                                 LN->getMemOperand()->getFlags());
}
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
//
// On a successful match this returns true after setting \p Shuffle to the
// matched opcode and \p SrcVT / \p DstVT to the operand and result types to
// use for it. For the extend patterns \p V1 may additionally be replaced by a
// subvector extracted at index 0 when the source type is narrower than the
// mask type. Returns false if no pattern matched.
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
                              bool AllowFloatDomain, bool AllowIntDomain,
                              SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget, unsigned &Shuffle,
                              MVT &SrcVT, MVT &DstVT) {
  unsigned NumMaskElts = Mask.size();
  unsigned MaskEltSize = MaskVT.getScalarSizeInBits();

  // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
  if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
      isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
    Shuffle = X86ISD::VZEXT_MOVL;
    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
    return true;
  }

  // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
    unsigned MaxScale = 64 / MaskEltSize;
    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
      bool MatchAny = true;
      bool MatchZero = true;
      unsigned NumDstElts = NumMaskElts / Scale;
      for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
        // The first mask element of each group must reference source element
        // i (or be undef); otherwise this scale cannot be an extension.
        if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
          MatchAny = MatchZero = false;
          break;
        }
        // The remaining elements of the group must be undef (any-extend) or
        // undef/zero (zero-extend).
        MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
        MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
      }
      if (MatchAny || MatchZero) {
        assert(MatchZero && "Failed to match zext but matched aext?");
        // The source is at least 128 bits wide.
        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
        MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
                                          : MVT::getIntegerVT(MaskEltSize);
        SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);

        if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
          V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);

        Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
        // Switch to the *_EXTEND_VECTOR_INREG form when the source has more
        // elements than the destination.
        if (SrcVT.getVectorNumElements() != NumDstElts)
          Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);

        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
        return true;
      }
    }
  }

  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
      isUndefOrEqual(Mask[0], 0) &&
      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
    Shuffle = X86ISD::VZEXT_MOVL;
    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
    return true;
  }

  // Check if we have SSE3 which will let us use MOVDDUP etc. The
  // instructions are no slower than UNPCKLPD but has the option to
  // fold the input operand into even an unaligned memory load.
  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v2f64;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
  }

  // The same duplicate patterns for 256-bit vectors.
  if (MaskVT.is256BitVector() && AllowFloatDomain) {
    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v4f64;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
  }

  // The same duplicate patterns for 512-bit vectors.
  if (MaskVT.is512BitVector() && AllowFloatDomain) {
    assert(Subtarget.hasAVX512() &&
           "AVX512 required for 512-bit vector shuffles");
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v8f64;
      return true;
    }
    if (isTargetShuffleEquivalent(
            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
    if (isTargetShuffleEquivalent(
            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
  }

  return false;
}
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
//
// On a successful match this returns true with \p Shuffle set to the matched
// opcode, \p ShuffleVT to the vector type to perform it in and \p PermuteImm
// to the instruction immediate. Returns false if no pattern matched.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                     const APInt &Zeroable,
                                     bool AllowFloatDomain, bool AllowIntDomain,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &ShuffleVT,
                                     unsigned &PermuteImm) {
  unsigned NumMaskElts = Mask.size();
  unsigned InputSizeInBits = MaskVT.getSizeInBits();
  unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);

  bool ContainsZeros = isAnyZero(Mask);

  // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
  if (!ContainsZeros && MaskScalarSizeInBits == 64) {
    // Check for lane crossing permutes.
    if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
      // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
      if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
        Shuffle = X86ISD::VPERMI;
        ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
        PermuteImm = getV4X86ShuffleImm(Mask);
        return true;
      }
      // For 512-bit vectors the same immediate is used only when the mask
      // repeats across both 256-bit lanes.
      if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
        SmallVector<int, 4> RepeatedMask;
        if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
          Shuffle = X86ISD::VPERMI;
          ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
          PermuteImm = getV4X86ShuffleImm(RepeatedMask);
          return true;
        }
      }
    } else if (AllowFloatDomain && Subtarget.hasAVX()) {
      // VPERMILPD can permute with a non-repeating shuffle.
      Shuffle = X86ISD::VPERMILPI;
      ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
      PermuteImm = 0;
      // One immediate bit per element: bit i holds the low bit of Mask[i].
      // Undef elements leave their bit clear.
      for (int i = 0, e = Mask.size(); i != e; ++i) {
        int M = Mask[i];
        if (M == SM_SentinelUndef)
          continue;
        assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
        PermuteImm |= (M & 1) << i;
      }
      return true;
    }
  }

  // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
  if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
      !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
    SmallVector<int, 4> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
      // Narrow the repeated mask to create 32-bit element permutes.
      SmallVector<int, 4> WordMask = RepeatedMask;
      if (MaskScalarSizeInBits == 64)
        narrowShuffleMaskElts(2, RepeatedMask, WordMask);

      Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
      ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
      ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
      PermuteImm = getV4X86ShuffleImm(WordMask);
      return true;
    }
  }

  // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
    SmallVector<int, 4> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
      // Split the repeated mask into its first four and last four elements.
      ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
      ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);

      // PSHUFLW: permute lower 4 elements only.
      if (isUndefOrInRange(LoMask, 0, 4) &&
          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
        Shuffle = X86ISD::PSHUFLW;
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(LoMask);
        return true;
      }

      // PSHUFHW: permute upper 4 elements only.
      if (isUndefOrInRange(HiMask, 4, 8) &&
          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
        // Offset the HiMask so that we can create the shuffle immediate.
        int OffsetHiMask[4];
        for (int i = 0; i != 4; ++i)
          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

        Shuffle = X86ISD::PSHUFHW;
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
        return true;
      }
    }
  }

  // Attempt to match against byte/bit shifts.
  if (AllowIntDomain &&
      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
    // NOTE(review): ShuffleVT and Shuffle are passed straight to the helper
    // and read back below, so it presumably fills them in - confirm against
    // matchShuffleAsShift.
    int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
                                       Mask, 0, Zeroable, Subtarget);
    // 512-bit shifts with elements narrower than 32 bits additionally
    // require BWI.
    if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
                         32 <= ShuffleVT.getScalarSizeInBits())) {
      PermuteImm = (unsigned)ShiftAmt;
      return true;
    }
  }

  // Attempt to match against bit rotates.
  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
      ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
       Subtarget.hasAVX512())) {
    int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
                                            Subtarget, Mask);
    if (0 < RotateAmt) {
      Shuffle = X86ISD::VROTLI;
      PermuteImm = (unsigned)RotateAmt;
      return true;
    }
  }

  return false;
}
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
//
// On a successful match this returns true with \p Shuffle set to the matched
// opcode and \p SrcVT / \p DstVT set to the operand and result types. \p V1
// and \p V2 are in/out: several patterns rewrite them (duplicating, swapping
// or replacing an operand) to suit the matched instruction. Returns false if
// no pattern matched.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
                               bool AllowFloatDomain, bool AllowIntDomain,
                               SDValue &V1, SDValue &V2, const SDLoc &DL,
                               SelectionDAG &DAG, const X86Subtarget &Subtarget,
                               unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
                               bool IsUnary) {
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

  if (MaskVT.is128BitVector()) {
    // {0, 0}: UNPCKL with SSE2, otherwise MOVLHPS on v4f32.
    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
      V2 = V1;
      // The first operand can be undef when mask element 0 is undef.
      V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
      Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
      SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
      return true;
    }
    // {1, 1}: UNPCKH with SSE2, otherwise MOVHLPS on v4f32.
    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
      V2 = V1;
      Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
      SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
      return true;
    }
    // {0, 3}: MOVSD with the operands swapped.
    if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      std::swap(V1, V2);
      Shuffle = X86ISD::MOVSD;
      SrcVT = DstVT = MVT::v2f64;
      return true;
    }
    // {4, 1, 2, 3}: MOVSS.
    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      Shuffle = X86ISD::MOVSS;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
  }

  // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
  if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
      ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
      ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
    if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
                             Subtarget)) {
      DstVT = MaskVT;
      return true;
    }
  }

  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
      (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
      (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
      (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
    if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
                              Subtarget)) {
      SrcVT = DstVT = MaskVT;
      // Without AVX2, 256-bit unpacks are performed on floating point types.
      if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
        SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
      return true;
    }
  }

  return false;
}
// Attempt to match a combined shuffle mask against supported binary shuffle
// instructions that take an immediate operand.
//
// On a successful match this returns true with \p Shuffle set to the matched
// opcode, \p ShuffleVT to the vector type to perform it in and \p PermuteImm
// to the instruction immediate. \p V1 and \p V2 are in/out: some patterns
// replace an operand with a zero vector, an undef or the other input.
// Returns false if no pattern matched.
static bool matchBinaryPermuteShuffle(
    MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
    bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
    const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
    unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
  unsigned NumMaskElts = Mask.size();
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

  // Attempt to match against VALIGND/VALIGNQ rotate.
  if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
      ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
       (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
    if (!isAnyZero(Mask)) {
      int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
      if (0 < Rotation) {
        Shuffle = X86ISD::VALIGN;
        // Use an integer vector type with the same element width as the mask.
        if (EltSizeInBits == 64)
          ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
        else
          ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
        PermuteImm = Rotation;
        return true;
      }
    }
  }

  // Attempt to match against PALIGNR byte rotate.
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
                         (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
    int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
    if (0 < ByteRotation) {
      Shuffle = X86ISD::PALIGNR;
      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
      PermuteImm = ByteRotation;
      return true;
    }
  }

  // Attempt to combine to X86ISD::BLENDI.
  if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
                            (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
    uint64_t BlendMask = 0;
    bool ForceV1Zero = false, ForceV2Zero = false;
    SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
    if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
                            ForceV2Zero, BlendMask)) {
      if (MaskVT == MVT::v16i16) {
        // We can only use v16i16 PBLENDW if the lanes are repeated.
        SmallVector<int, 8> RepeatedMask;
        if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
                                        RepeatedMask)) {
          assert(RepeatedMask.size() == 8 &&
                 "Repeated mask size doesn't match!");
          // Build the 8-bit immediate from the repeated mask: bit i is set
          // when repeated element i has an index of 8 or more.
          PermuteImm = 0;
          for (int i = 0; i < 8; ++i)
            if (RepeatedMask[i] >= 8)
              PermuteImm |= 1 << i;
          V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
          V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
          Shuffle = X86ISD::BLENDI;
          ShuffleVT = MaskVT;
          return true;
        }
      } else {
        V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
        V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
        PermuteImm = (unsigned)BlendMask;
        Shuffle = X86ISD::BLENDI;
        ShuffleVT = MaskVT;
        return true;
      }
    }
  }

  // Attempt to combine to INSERTPS, but only if it has elements that need to
  // be set to zero.
  if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
      MaskVT.is128BitVector() && isAnyZero(Mask) &&
      matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
    Shuffle = X86ISD::INSERTPS;
    ShuffleVT = MVT::v4f32;
    return true;
  }

  // Attempt to combine to SHUFPD.
  if (AllowFloatDomain && EltSizeInBits == 64 &&
      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
    bool ForceV1Zero = false, ForceV2Zero = false;
    if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
                               PermuteImm, Mask, Zeroable)) {
      V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
      V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
      Shuffle = X86ISD::SHUFP;
      ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
      return true;
    }
  }

  // Attempt to combine to SHUFPS.
  if (AllowFloatDomain && EltSizeInBits == 32 &&
      ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
    SmallVector<int, 4> RepeatedMask;
    if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
      // Match each half of the repeated mask, to determine if its just
      // referencing one of the vectors, is zeroable or entirely undef.
      // Returns the operand to use for that half (an empty SDValue when the
      // half mixes sources) and writes the two selector values to S0/S1.
      auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
        int M0 = RepeatedMask[Offset];
        int M1 = RepeatedMask[Offset + 1];

        if (isUndefInRange(RepeatedMask, Offset, 2)) {
          return DAG.getUNDEF(MaskVT);
        } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : 0);
          S1 = (SM_SentinelUndef == M1 ? -1 : 1);
          return getZeroVector(MaskVT, Subtarget, DAG, DL);
        } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V1;
        } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V2;
        }

        return SDValue();
      };

      int ShufMask[4] = {-1, -1, -1, -1};
      SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
      SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

      if (Lo && Hi) {
        V1 = Lo;
        V2 = Hi;
        Shuffle = X86ISD::SHUFP;
        ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
        PermuteImm = getV4X86ShuffleImm(ShufMask);
        return true;
      }
    }
  }

  // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
  if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
      MaskVT.is128BitVector() &&
      matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
    Shuffle = X86ISD::INSERTPS;
    ShuffleVT = MVT::v4f32;
    return true;
  }

  return false;
}
// Forward declaration only: no body is given here, so the definition is
// expected elsewhere in this file. The parameter list is the same as that of
// combineX86ShuffleChain.
static SDValue combineX86ShuffleChainWithExtract(
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget);
/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask,
bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
"Unexpected number of shuffle inputs!");
// Find the inputs that enter the chain. Note that multiple uses are OK
// here, we're not going to remove the operands we find.
bool UnaryShuffle = (Inputs.size() == 1);
SDValue V1 = peekThroughBitcasts(Inputs[0]);
SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
: peekThroughBitcasts(Inputs[1]));
MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();
MVT RootVT = Root.getSimpleValueType();
assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
VT2.getSizeInBits() == RootVT.getSizeInBits() &&
"Vector size mismatch");
SDLoc DL(Root);
SDValue Res;
unsigned NumBaseMaskElts = BaseMask.size();
if (NumBaseMaskElts == 1) {
assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
return DAG.getBitcast(RootVT, V1);
bool OptForSize = DAG.shouldOptForSize();
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
(RootVT.isFloatingPoint() && Depth >= 1) ||
(RootVT.is256BitVector() && !Subtarget.hasAVX2());
// Don't combine if we are a AVX512/EVEX target and the mask element size
// is different from the root element size - this would prevent writemasks
// from being reused.
bool IsMaskedShuffle = false;
if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
IsMaskedShuffle = true;
// If we are shuffling a broadcast (and not introducing zeros) then
// we can just use the broadcast directly. This works for smaller broadcast
// elements as well as they already repeat across each mask element
if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
(BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0) {
return DAG.getBitcast(RootVT, V1);
// Attempt to match a subvector broadcast.
// shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
if (UnaryShuffle &&
(BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {
SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
SDValue Src = Inputs[0];
if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
Src.getOperand(0).isUndef() &&
Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
// Handle 128/256-bit lane shuffles of 512-bit vectors.
if (RootVT.is512BitVector() &&
(NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
// If the upper subvectors are zeroable, then an extract+insert is more
// optimal than using X86ISD::SHUF128. The insertion is free, even if it has
// to zero the upper subvectors.
if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
"Unexpected lane shuffle");
Res = DAG.getBitcast(ShuffleVT, V1);
unsigned SubIdx = BaseMask[0] * (8 / NumBaseMaskElts);
bool UseZero = isAnyZero(BaseMask);
Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
Res = widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
return DAG.getBitcast(RootVT, Res);
// Narrow shuffle mask to v4x128.
SmallVector<int, 4> Mask;
assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
// Try to lower to vshuf64x2/vshuf32x4.
auto MatchSHUF128 = [](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
SDValue V1, SDValue V2, SelectionDAG &DAG) {
unsigned PermMask = 0;
// Insure elements came from the same Op.
SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
for (int i = 0; i < 4; ++i) {
assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
if (Mask[i] < 0)
SDValue Op = Mask[i] >= 4 ? V2 : V1;
unsigned OpIndex = i / 2;
if (Ops[OpIndex].isUndef())
Ops[OpIndex] = Op;
else if (Ops[OpIndex] != Op)
return SDValue();
// Convert the 128-bit shuffle mask selection values into 128-bit
// selection bits defined by a vshuf64x2 instruction's immediate control
// byte.
PermMask |= (Mask[i] % 4) << (i * 2);
return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
DAG.getBitcast(ShuffleVT, Ops[0]),
DAG.getBitcast(ShuffleVT, Ops[1]),
DAG.getTargetConstant(PermMask, DL, MVT::i8));
// FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
// doesn't work because our mask is for 128 bits and we don't have an MVT
// to match that.
bool PreferPERMQ =
UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
isUndefOrInRange(Mask[3], 2, 4) &&
(Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
(Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
if (!isAnyZero(Mask) && !PreferPERMQ) {
if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
return DAG.getBitcast(RootVT, V);
// Handle 128-bit lane shuffles of 256-bit vectors.
if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
// If the upper half is zeroable, then an extract+insert is more optimal
// than using X86ISD::VPERM2X128. The insertion is free, even if it has to
// zero the upper half.
if (isUndefOrZero(BaseMask[1])) {
if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
Res = DAG.getBitcast(ShuffleVT, V1);
Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL);
Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
DL, 256);
return DAG.getBitcast(RootVT, Res);
if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
// we need to use the zeroing feature.
// Prefer blends for sequential shuffles unless we are optimizing for size.
if (UnaryShuffle &&
!(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
(OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
unsigned PermMask = 0;
PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
DAG.getTargetConstant(PermMask, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
return SDValue(); // Nothing to do!
// TODO - handle AVX512VL cases with X86ISD::SHUF128.
if (!UnaryShuffle && !IsMaskedShuffle) {
assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
"Unexpected shuffle sentinel value");
// Prefer blends to X86ISD::VPERM2X128.
if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
(BaseMask[0] == 2 && BaseMask[1] == 1))) {
unsigned PermMask = 0;
PermMask |= ((BaseMask[0] & 3) << 0);
PermMask |= ((BaseMask[1] & 3) << 4);
Res = DAG.getNode(
X86ISD::VPERM2X128, DL, ShuffleVT,
DAG.getBitcast(ShuffleVT, isInRange(BaseMask[0], 0, 2) ? V1 : V2),
DAG.getBitcast(ShuffleVT, isInRange(BaseMask[1], 0, 2) ? V1 : V2),
DAG.getTargetConstant(PermMask, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
// For masks that have been widened to 128-bit elements or more,
// narrow back down to 64-bit elements.
SmallVector<int, 64> Mask;
if (BaseMaskEltSizeInBits > 64) {
assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
int MaskScale = BaseMaskEltSizeInBits / 64;
narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
} else {
Mask.assign(BaseMask.begin(), BaseMask.end());
// For masked shuffles, we're trying to match the root width for better
// writemask folding, attempt to scale the mask.
// TODO - variable shuffles might need this to be widened again.
if (IsMaskedShuffle && NumRootElts > Mask.size()) {
assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
int MaskScale = NumRootElts / Mask.size();
SmallVector<int, 64> ScaledMask;
narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
Mask = std::move(ScaledMask);
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
// Determine the effective mask value type.
FloatDomain &= (32 <= MaskEltSizeInBits);
MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
: MVT::getIntegerVT(MaskEltSizeInBits);
MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
// Only allow legal mask types.
if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
return SDValue();
// Attempt to match the mask against known shuffle patterns.
MVT ShuffleSrcVT, ShuffleVT;
unsigned Shuffle, PermuteImm;
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
// TODO: Should we indicate which domain is preferred if both are allowed?
bool AllowFloatDomain = FloatDomain || (Depth >= 3);
bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
// Determine zeroable mask elements.
APInt KnownUndef, KnownZero;
resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
APInt Zeroable = KnownUndef | KnownZero;
if (UnaryShuffle) {
// Attempt to match against broadcast-from-vector.
// Limit AVX1 to cases where we're loading+broadcasting a scalar element.
if ((Subtarget.hasAVX2() ||
(Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
(!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
if (V1.getValueType() == MaskVT &&
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
MayFoldLoad(V1.getOperand(0))) {
if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = V1.getOperand(0);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
if (Subtarget.hasAVX2()) {
if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
SDValue NewV1 = V1; // Save operand in case early exit happens.
if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT) &&
(!IsMaskedShuffle ||
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
return DAG.getBitcast(RootVT, Res);
if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
PermuteImm) &&
(!IsMaskedShuffle ||
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
// Attempt to combine to INSERTPS, but only if the inserted element has come
// from a scalar.
// TODO: Handle other insertions here as well?
if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
MaskEltSizeInBits == 32 && Subtarget.hasSSE41() &&
!isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) {
SDValue SrcV1 = V1, SrcV2 = V2;
if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) &&
SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
return SDValue(); // Nothing to do!
Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
DAG.getBitcast(MVT::v4f32, SrcV1),
DAG.getBitcast(MVT::v4f32, SrcV2),
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
SDValue NewV1 = V1; // Save operands in case early exit happens.
SDValue NewV2 = V2;
if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT, UnaryShuffle) &&
(!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
return DAG.getBitcast(RootVT, Res);
NewV1 = V1; // Save operands in case early exit happens.
NewV2 = V2;
if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, NewV1, NewV2, DL, DAG,
Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
(!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
// Typically from here on, we need an integer version of MaskVT.
MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
// Annoyingly, SSE4A instructions don't map into the above match helpers.
if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
uint64_t BitLen, BitIdx;
if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
Zeroable)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
V2 = DAG.getBitcast(IntMaskVT, V2);
Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
// Match shuffle against TRUNCATE patterns.
if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
// Match against a VTRUNC instruction, accounting for src/dst sizes.
if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
Subtarget)) {
bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
unsigned Opc =
IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
if (Depth == 0 && Root.getOpcode() == Opc)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(ShuffleSrcVT, V1);
Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
if (ShuffleVT.getSizeInBits() < RootSizeInBits)
Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
return DAG.getBitcast(RootVT, Res);
// Do we need a more general binary truncation pattern?
if (RootSizeInBits < 512 &&
((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
(RootVT.is128BitVector() && Subtarget.hasVLX())) &&
(MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
return SDValue(); // Nothing to do!
ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
V1 = DAG.getBitcast(ShuffleSrcVT, V1);
V2 = DAG.getBitcast(ShuffleSrcVT, V2);
ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
return DAG.getBitcast(RootVT, Res);
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 1)
return SDValue();
// Depth threshold above which we can efficiently use variable mask shuffles.
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
bool MaskContainsZeros = isAnyZero(Mask);
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX2() &&
(MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
return DAG.getBitcast(RootVT, Res);
// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
// vector as the second source.
if (UnaryShuffle && AllowVariableMask &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
// Adjust shuffle mask - replace SM_SentinelZero with second source index.
for (unsigned i = 0; i != NumMaskElts; ++i)
if (Mask[i] == SM_SentinelZero)
Mask[i] = NumMaskElts + i;
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
Res = DAG.getBitcast(MaskVT, V1);
SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
return DAG.getBitcast(RootVT, Res);
// If that failed and either input is extracted then try to combine as a
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
DAG, Subtarget))
return WideShuffle;
// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
return DAG.getBitcast(RootVT, Res);
return SDValue();
// See if we can combine a single input shuffle with zeros to a bit-mask,
// which is much simpler than any shuffle.
if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
APInt UndefElts(NumMaskElts, 0);
SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
if (M == SM_SentinelZero)
EltBits[i] = AllOnes;
SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
Res = DAG.getBitcast(MaskVT, V1);
unsigned AndOpcode =
MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
return DAG.getBitcast(RootVT, Res);
// If we have a single input shuffle with different shuffle patterns in the
// 128-bit lanes, use a variable mask with VPERMILPS.
// TODO Combine other mask types at higher depths.
if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
SmallVector<SDValue, 16> VPermIdx;
for (int M : Mask) {
SDValue Idx =
M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
return DAG.getBitcast(RootVT, Res);
// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
// to VPERMIL2PD/VPERMIL2PS.
if (AllowVariableMask && Subtarget.hasXOP() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
MaskVT == MVT::v8f32)) {
// VPERMIL2 Operation.
// Bits[3] - Match Bit.
// Bits[2:1] - (Per Lane) PD Shuffle Mask.
// Bits[2:0] - (Per Lane) PS Shuffle Mask.
unsigned NumLanes = MaskVT.getSizeInBits() / 128;
unsigned NumEltsPerLane = NumMaskElts / NumLanes;
SmallVector<int, 8> VPerm2Idx;
unsigned M2ZImm = 0;
for (int M : Mask) {
if (M == SM_SentinelUndef) {
if (M == SM_SentinelZero) {
M2ZImm = 2;
int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
// If we have 3 or more shuffle instructions or a chain involving a variable
// mask, we can replace them with a single PSHUFB instruction profitably.
// Intel's manuals suggest only using PSHUFB if doing so replaces 5
// instructions, but in practice PSHUFB tends to be *very* fast so we're
// more aggressive.
if (UnaryShuffle && AllowVariableMask &&
((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
SmallVector<SDValue, 16> PSHUFBMask;
int NumBytes = RootVT.getSizeInBits() / 8;
int Ratio = NumBytes / NumMaskElts;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Ratio];
if (M == SM_SentinelUndef) {
if (M == SM_SentinelZero) {
PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
M = Ratio * M + i % Ratio;
assert((M / 16) == (i / 16) && "Lane crossing detected");
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
Res = DAG.getBitcast(ByteVT, V1);
SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
return DAG.getBitcast(RootVT, Res);
// With XOP, if we have a 128-bit binary input shuffle we can always combine
// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
// slower than PSHUFB on targets that support both.
if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
// VPPERM Mask Operation
// Bits[4:0] - Byte Index (0 - 31)
// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
SmallVector<SDValue, 16> VPPERMMask;
int NumBytes = 16;
int Ratio = NumBytes / NumMaskElts;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Ratio];
if (M == SM_SentinelUndef) {
if (M == SM_SentinelZero) {
VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
M = Ratio * M + i % Ratio;
VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
MVT ByteVT = MVT::v16i8;
V1 = DAG.getBitcast(ByteVT, V1);
V2 = DAG.getBitcast(ByteVT, V2);
SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
return DAG.getBitcast(RootVT, Res);
// If that failed and either input is extracted then try to combine as a
// shuffle with the larger type.
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
DAG, Subtarget))
return WideShuffle;
// If we have a dual input shuffle then lower to VPERMV3.
if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() &&
(MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() &&
(MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
V1 = DAG.getBitcast(MaskVT, V1);
V2 = DAG.getBitcast(MaskVT, V2);
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
return DAG.getBitcast(RootVT, Res);
// Failed to find any combines.
return SDValue();
// Combine an arbitrary chain of shuffles + extract_subvectors into a single
// instruction if possible.
//
// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
// type size to attempt to combine:
//   shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
//   -->
//   extract_subvector(shuffle(x,y,m2),0)
//
// Inputs and BaseMask describe the shuffle at the width of Root. Depth,
// HasVariableMask and AllowVariableMask are forwarded to
// combineX86ShuffleChain; Depth is first increased once for every input that
// was found at a non-zero offset inside its wider source.
//
// Returns the combined value bitcast to Root's type, or an empty SDValue when
// there are no inputs, every input already sits at offset 0, a widened input
// has an illegal type or a different scalar type, more than two inputs remain
// after de-duplication, or the wide combine itself fails.
//
// NOTE(review): this listing has lost its closing braces and a few
// continuation lines during extraction; check the upstream file before
// changing any code here.
static SDValue combineX86ShuffleChainWithExtract(
ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NumMaskElts = BaseMask.size();
unsigned NumInputs = Inputs.size();
// Nothing to widen without any inputs.
if (NumInputs == 0)
return SDValue();
// Working copies: WideInputs[i] is rewritten in place to the source found
// behind Inputs[i], and Offsets[i] records where Inputs[i] sits inside it.
SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
SmallVector<unsigned, 4> Offsets(NumInputs, 0);
// Peek through subvectors.
// TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
for (unsigned i = 0; i != NumInputs; ++i) {
SDValue &Src = WideInputs[i];
unsigned &Offset = Offsets[i];
Src = peekThroughBitcasts(Src);
// BaseVT is the input's type after bitcasts are looked through but before
// any EXTRACT_SUBVECTOR is stripped.
EVT BaseVT = Src.getValueType();
// Accumulate the extraction index of every nested EXTRACT_SUBVECTOR.
while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
Offset += Src.getConstantOperandVal(1);
Src = Src.getOperand(0);
// NOTE(review): the second argument of this std::max call is missing from
// this listing.
WideSizeInBits = std::max(WideSizeInBits,
assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
"Unexpected subvector extraction");
// Convert the element offset into a count of whole BaseVT-sized subvectors,
// then into mask-element units.
Offset /= BaseVT.getVectorNumElements();
Offset *= NumMaskElts;
// Bail if we're always extracting from the lowest subvectors,
// combineX86ShuffleChain should match this for the current width.
if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
return SDValue();
EVT RootVT = Root.getValueType();
unsigned RootSizeInBits = RootVT.getSizeInBits();
// Number of root-sized vectors that fit into the widest source.
unsigned Scale = WideSizeInBits / RootSizeInBits;
assert((WideSizeInBits % RootSizeInBits) == 0 &&
"Unexpected subvector extraction");
// If the src vector types aren't the same, see if we can extend
// them to match each other.
// TODO: Support different scalar types?
EVT WideSVT = WideInputs[0].getValueType().getScalarType();
// Give up if any source has an illegal type or a scalar type that differs
// from the first source's.
if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
Op.getValueType().getScalarType() != WideSVT;
return SDValue();
// Widen every source that is narrower than WideSizeInBits.
for (SDValue &NewInput : WideInputs) {
assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
"Shuffle vector size mismatch");
if (WideSizeInBits > NewInput.getValueSizeInBits())
NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
SDLoc(NewInput), WideSizeInBits);
assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
"Unexpected subvector extraction");
// Create new mask for larger type.
// Each widened input spans Scale * NumMaskElts mask slots, so input i's
// offset is shifted past the i inputs that precede it.
for (unsigned i = 1; i != NumInputs; ++i)
Offsets[i] += i * Scale * NumMaskElts;
SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
for (int &M : WideMask) {
// NOTE(review): the statement guarded by this 'if' is missing from this
// listing (presumably a 'continue' that leaves sentinel entries untouched).
if (M < 0)
// Remap: element index within its input, plus that input's wide offset.
M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
// Pad the mask out to the wide element count with undef entries.
WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
// Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
assert(!WideInputs.empty() && "Shuffle with no inputs detected");
// combineX86ShuffleChain is only handed unary or binary shuffles here.
if (WideInputs.size() > 2)
return SDValue();
// Increase depth for every upper subvector we've peeked through.
Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
// Attempt to combine wider chain.
// TODO: Can we use a better Root?
SDValue WideRoot = WideInputs[0];
if (SDValue WideShuffle = combineX86ShuffleChain(
WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
AllowVariableMask, DAG, Subtarget)) {
// Extract RootSizeInBits from index 0 of the wide result and cast it back
// to the root type.
WideShuffle =
extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
return DAG.getBitcast(RootVT, WideShuffle);
return SDValue();
// Attempt to constant fold all of the constant source ops.
//
// Ops are the shuffle sources and Mask indexes into their concatenation
// (op = M / Mask.size(), element = M % Mask.size()). Root supplies the result
// type and debug location.
//
// Returns the folded constant (or a zero vector) bitcast to Root's type when
// the entire shuffle folds. Returns an empty SDValue when
// getTargetConstantBitsFromNode fails for any op, when no op has a single use
// and HasVariableMask is false, or when the constant's vector type is not
// legal.
// TODO: Extend this to merge multiple constant Ops and update the mask.
//
// NOTE(review): this listing has lost its closing braces and several
// statements during extraction (for example the declaration of MaskSVT and
// the bodies of some branches); check the upstream file before changing code.
static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
ArrayRef<int> Mask, SDValue Root,
bool HasVariableMask,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Root.getSimpleValueType();
unsigned SizeInBits = VT.getSizeInBits();
unsigned NumMaskElts = Mask.size();
// Width in bits of one mask element.
unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
unsigned NumOps = Ops.size();
// Extract constant bits from each source op.
bool OneUseConstantOp = false;
SmallVector<APInt, 16> UndefEltsOps(NumOps);
SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
for (unsigned i = 0; i != NumOps; ++i) {
SDValue SrcOp = Ops[i];
OneUseConstantOp |= SrcOp.hasOneUse();
// NOTE(review): the remaining arguments of this call are missing from this
// listing.
if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
return SDValue();
// Only fold if at least one of the constants is only used once or
// the combined shuffle has included a variable mask shuffle, this
// is to avoid constant pool bloat.
if (!OneUseConstantOp && !HasVariableMask)
return SDValue();
// Shuffle the constant bits according to the mask.
SDLoc DL(Root);
// One bit per result element; the assert after the loop checks that every
// element ends up in at least one of these three sets.
APInt UndefElts(NumMaskElts, 0);
APInt ZeroElts(NumMaskElts, 0);
APInt ConstantElts(NumMaskElts, 0);
// NOTE(review): the initial-value argument of this constructor is missing
// from this listing.
SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
// NOTE(review): the bodies of the sentinel branches below are missing from
// this listing.
if (M == SM_SentinelUndef) {
} else if (M == SM_SentinelZero) {
assert(0 <= M && M < (int)(NumMaskElts * NumOps));
// Split M into the source op it refers to and the element within that op.
unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
if (SrcUndefElts[SrcMaskIdx]) {
auto &SrcEltBits = RawBitsOps[SrcOpIdx];
APInt &Bits = SrcEltBits[SrcMaskIdx];
if (!Bits) {
ConstantBitData[i] = Bits;
assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
// Attempt to create a zero vector.
if ((UndefElts | ZeroElts).isAllOnesValue())
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
// Create the constant data.
// A floating-point scalar type is used only for 32/64-bit elements of a
// floating-point root; the other assignment picks an integer type.
if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
return SDValue();
SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
return DAG.getBitcast(VT, CstOp);
/// Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
/// of single-use shuffle instructions, build a generic model of the cumulative
/// shuffle operation, and check for simpler instructions which implement this
/// operation. We use this primarily for two purposes:
///
/// 1) Collapse generic shuffles to specialized single instructions when
/// equivalent. In most cases, this is just an encoding size win, but
/// sometimes we will collapse multiple generic shuffles into a single
/// special-purpose shuffle.
/// 2) Look for sequences of shuffle instructions with 3 or more total
/// instructions, and replace them with the slightly more expensive SSSE3
/// PSHUFB instruction if available. We do this as the last combining step
/// to ensure we avoid using PSHUFB if we can implement the shuffle with
/// a suitable short sequence of other instructions. The PSHUFB will either
/// use a register or have to read from memory and so is slightly (but only
/// slightly) more expensive than the other shuffle instructions.
///
/// Because this is inherently a quadratic operation (for each shuffle in
/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
/// This should never be an issue in practice as the shuffle lowering doesn't
/// produce sequences of more than 8 instructions.
///
/// FIXME: We will currently miss some cases where the redundant shuffling
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
///
/// \param SrcOps Source operands accumulated so far.
/// \param SrcOpIndex Index into \p SrcOps of the operand examined by this call.
/// \param Root Value whose type determines the type of any result.
/// \param RootMask Mask accumulated so far; it is merged with the mask of
/// SrcOps[SrcOpIndex].
/// \param SrcNodes Shuffle nodes already folded into the chain.
/// \param Depth Current recursion depth; the call bails out at 8.
/// \param HasVariableMask Whether a variable-mask shuffle has been seen; OR'ed
/// with whether the current op is one.
/// \param AllowVariableMask Forwarded to the chain combine; when recursing it
/// is only passed on for ops with a single use or whose users have all been
/// combined.
/// \returns the replacement value, or an empty SDValue if nothing was combined.
///
/// NOTE(review): this listing has lost its closing braces and several
/// statements during extraction; check the upstream file before changing code.
static SDValue combineX86ShufflesRecursively(
ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(RootMask.size() > 0 &&
(RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
"Illegal shuffle root mask");
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
const unsigned MaxRecursionDepth = 8;
if (Depth >= MaxRecursionDepth)
return SDValue();
// Directly rip through bitcasts to find the underlying operand.
SDValue Op = SrcOps[SrcOpIndex];
Op = peekThroughOneUseBitcasts(Op);
MVT VT = Op.getSimpleValueType();
if (!VT.isVector())
return SDValue(); // Bail if we hit a non-vector.
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
assert(VT.getSizeInBits() == RootSizeInBits &&
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
// TODO - determine Op's demanded elts from RootMask.
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
APInt OpUndef, OpZero;
// Every element of Op is treated as demanded for now (see the TODO above).
APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
OpZero, DAG, Depth, false))
return SDValue();
// Shuffle inputs must be the same size as the result, bail on any larger
// inputs and widen any smaller inputs.
if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) {
return Op.getValueSizeInBits() > RootSizeInBits;
return SDValue();
for (SDValue &Op : OpInputs)
if (Op.getValueSizeInBits() < RootSizeInBits)
Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG,
SDLoc(Op), RootSizeInBits);
// Mask and Ops hold the merged shuffle produced by this level.
SmallVector<int, 64> Mask;
SmallVector<SDValue, 16> Ops;
// We don't need to merge masks if the root is empty.
bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
if (EmptyRoot) {
// Only resolve zeros if it will remove an input, otherwise we might end
// up in an infinite loop.
bool ResolveKnownZeros = true;
if (!OpZero.isNullValue()) {
// One bit per input of Op; a bit is set for each input the mask references.
APInt UsedInputs = APInt::getNullValue(OpInputs.size());
for (int i = 0, e = OpMask.size(); i != e; ++i) {
int M = OpMask[i];
// NOTE(review): the statement guarded by this 'if' is missing from this
// listing (presumably a 'continue' that skips undef/zero elements).
if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
UsedInputs.setBit(M / OpMask.size());
if (UsedInputs.isAllOnesValue()) {
ResolveKnownZeros = false;
// NOTE(review): the final argument of this call is missing from this listing.
resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
// With an empty root, Op's own mask and inputs are used as they are.
Mask = OpMask;
Ops.append(OpInputs.begin(), OpInputs.end());
} else {
resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
// Add the inputs to the Ops list, avoiding duplicates.
Ops.append(SrcOps.begin(), SrcOps.end());
// AddOp returns the index in Ops of Input. It reuses an existing entry that
// matches once bitcasts are looked through, otherwise it overwrites
// Ops[InsertionPoint] when InsertionPoint is non-negative.
auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
// Attempt to find an existing match.
SDValue InputBC = peekThroughBitcasts(Input);
for (int i = 0, e = Ops.size(); i < e; ++i)
if (InputBC == peekThroughBitcasts(Ops[i]))
return i;
// Match failed - should we replace an existing Op?
if (InsertionPoint >= 0) {
Ops[InsertionPoint] = Input;
return InsertionPoint;
// Add to the end of the Ops list.
// NOTE(review): the statement that appends Input is missing from this listing.
return Ops.size() - 1;
// OpInputIdx[k] is the index in Ops of Op's k-th input. Only the first input
// is offered the slot of the op being replaced (SrcOpIndex).
SmallVector<int, 2> OpInputIdx;
for (SDValue OpInput : OpInputs)
// NOTE(review): the start of this statement is missing from this listing;
// the unmatched ')' belongs to it.
AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
assert(((RootMask.size() > OpMask.size() &&
RootMask.size() % OpMask.size() == 0) ||
(OpMask.size() > RootMask.size() &&
OpMask.size() % RootMask.size() == 0) ||
OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.");
// This function can be performance-critical, so we rely on the power-of-2
// knowledge that we have about the mask sizes to replace div/rem ops with
// bit-masks and shifts.
assert(isPowerOf2_32(RootMask.size()) &&
"Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
// The merged mask uses the finer of the two granularities. The ratios say how
// many merged elements one root-mask / op-mask element covers; at most one of
// them exceeds 1.
unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
unsigned RootRatio =
std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
assert((RootRatio == 1 || OpRatio == 1) &&
"Must not have a ratio for both incoming and op masks!");
assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
Mask.resize(MaskWidth, SM_SentinelUndef);
// Merge this shuffle operation's mask into our accumulated mask. Note that
// this shuffle's mask will be the first applied to the input, followed by
// the root mask to get us all the way to the root value arrangement. The
// reason for this order is that we are recursing up the operation chain.
// NOTE(review): the 'continue' statements that presumably end each early-out
// branch in this loop are missing from this listing.
for (unsigned i = 0; i < MaskWidth; ++i) {
unsigned RootIdx = i >> RootRatioLog2;
if (RootMask[RootIdx] < 0) {
// This is a zero or undef lane, we're done.
Mask[i] = RootMask[RootIdx];
// Scale the root mask entry up to the merged mask's granularity.
unsigned RootMaskedIdx =
RootRatio == 1
? RootMask[RootIdx]
: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
// Just insert the scaled root mask value if it references an input other
// than the SrcOp we're currently inserting.
if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
Mask[i] = RootMaskedIdx;
// Reduce to an element index within the op being replaced.
RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
if (OpMask[OpIdx] < 0) {
// The incoming lanes are zero or undef, it doesn't matter which ones we
// are using.
Mask[i] = OpMask[OpIdx];
// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
: (OpMask[OpIdx] << OpRatioLog2) +
(RootMaskedIdx & (OpRatio - 1));
OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
// Which of Op's inputs the element comes from, re-based onto that input's
// position in Ops.
int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
Mask[i] = OpMaskedIdx;
// Remove unused/repeated shuffle source ops.
resolveTargetShuffleInputsAndMask(Ops, Mask);
// Handle the all undef/zero cases early.
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
return DAG.getUNDEF(Root.getValueType());
// TODO - should we handle the mixed zero/undef case as well? Just returning
// a zero mask will lose information on undef elements possibly reducing
// future combine possibilities.
if (all_of(Mask, [](int Idx) { return Idx < 0; }))
// NOTE(review): the final argument of this call is missing from this listing.
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
assert(!Ops.empty() && "Shuffle with no inputs detected");
HasVariableMask |= IsOpVariableMask;
// Update the list of shuffle nodes that have been combined so far.
// NOTE(review): the rest of this declaration is missing from this listing.
SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
// See if we can recurse into each shuffle source op (if it's a target
// shuffle). The source op should only be generally combined if it either has
// a single use (i.e. current Op) or all its users have already been combined,
// if not then we can still combine but should prevent generation of variable
// shuffles to avoid constant pool bloat.
// Don't recurse if we already have more source ops than we can combine in
// the remaining recursion depth.
if (Ops.size() < (MaxRecursionDepth - Depth)) {
for (int i = 0, e = Ops.size(); i < e; ++i) {
// For empty roots, we need to resolve zeroable elements before combining
// them with other shuffles.
SmallVector<int, 64> ResolvedMask = Mask;
if (EmptyRoot)
resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
// Variable masks stay disabled for this operand unless it has one use or
// all of its users are already in CombinedNodes.
bool AllowVar = false;
if (Ops[i].getNode()->hasOneUse() ||
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
AllowVar = AllowVariableMask;
if (SDValue Res = combineX86ShufflesRecursively(
Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
HasVariableMask, AllowVar, DAG, Subtarget))
return Res;
// Attempt to constant fold all of the constant source ops.
if (SDValue Cst = combineX86ShufflesConstants(
Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
return Cst;
// We can only combine unary and binary shuffle mask cases.
if (Ops.size() <= 2) {
// Minor canonicalization of the accumulated shuffle mask to make it easier
// to match below. All this does is detect masks with sequential pairs of
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
SmallVector<int, 64> WidenedMask;
while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
Mask = std::move(WidenedMask);
// Canonicalization of binary shuffle masks to improve pattern matching by
// commuting the inputs.
if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
std::swap(Ops[0], Ops[1]);
// Finally, try to combine into a single shuffle instruction.
return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
AllowVariableMask, DAG, Subtarget);
// If that failed and any input is extracted then try to combine as a
// shuffle with the larger type.
return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
HasVariableMask, AllowVariableMask,
DAG, Subtarget);
/// Helper entry wrapper to combineX86ShufflesRecursively.
///
/// Starts the recursion with \p Op as both the only source operand and the
/// root, the single-element root mask {0}, no previously combined nodes,
/// depth 0, no variable mask seen yet, and variable masks allowed.
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
/*HasVarMask*/ false,
/*AllowVarMask*/ true, DAG, Subtarget);
/// Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
///
/// \param N The shuffle node; its opcode selects how the mask is trimmed.
/// \returns the mask decoded from \p N, adjusted per opcode.
///
/// NOTE(review): this listing has lost its closing braces, the '#endif', and
/// most of the switch's case labels during extraction; check the upstream file
/// before changing code.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
SmallVector<SDValue, 2> Ops;
bool IsUnary;
bool HaveMask =
getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
// If we have more than 128-bits, only the low 128-bits of shuffle mask
// matter. Check that the upper masks are repeats and remove them.
if (VT.getSizeInBits() > 128) {
// Number of elements in one 128-bit lane.
int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
// Debug-only check: lane i must equal lane 0 shifted up by LaneElts * i.
for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
for (int j = 0; j < LaneElts; ++j)
assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
"Mask doesn't repeat in high 128-bit lanes!");
switch (N.getOpcode()) {
case X86ISD::PSHUFD:
return Mask;
return Mask;
// Drop the first four entries and re-base the rest to start at 0.
// NOTE(review): the case label for this path is missing from this listing
// (presumably the high-word shuffle) - confirm against upstream.
Mask.erase(Mask.begin(), Mask.begin() + 4);
for (int &M : Mask)
M -= 4;
return Mask;
llvm_unreachable("No valid shuffle instruction found!");
/// Search for a combinable shuffle across a chain ending in pshufd.
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
/// \param N    the PSHUFD node to start from (asserted below).
/// \param Mask the 4-element mask of \p N; it is overwritten in place when a
///             combinable shuffle is found.
/// \param DAG  the DAG used to create the replacement nodes.
/// \returns the rebuilt chain (bitcast to N's type if needed), or an empty
///          SDValue when nothing was combined.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(N.getOpcode() == X86ISD::PSHUFD &&
"Called with something other than an x86 128-bit half shuffle!");
SDLoc DL(N);
// Walk up a single-use chain looking for a combinable shuffle. Keep a stack
// of the shuffles in the chain so that we can form a fresh chain to replace
// this one.
SmallVector<SDValue, 8> Chain;
SDValue V = N.getOperand(0);
// NOTE(review): several case labels and loop-control statements of the
// switch below are not visible in this excerpt; the comments describe only
// the checks that can be read here.
for (; V.hasOneUse(); V = V.getOperand(0)) {
switch (V.getOpcode()) {
return SDValue(); // Nothing combined!
// Skip bitcasts as we always know the type for the target specific
// instructions.
case X86ISD::PSHUFD:
// Found another dword shuffle.
// Check that the low words (being shuffled) are the identity in the
// dword shuffle, and the high words are self-contained.
if (Mask[0] != 0 || Mask[1] != 1 ||
!(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
return SDValue();
// Check that the high words (being shuffled) are the identity in the
// dword shuffle, and the low words are self-contained.
if (Mask[2] != 2 || Mask[3] != 3 ||
!(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
return SDValue();
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
// shuffle into a preceding word shuffle.
if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
V.getSimpleValueType().getVectorElementType() != MVT::i16)
return SDValue();
// Search for a half-shuffle which we can combine with.
// UNPCKL pairs with PSHUFLW and UNPCKH pairs with PSHUFHW.
unsigned CombineOp =
V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
if (V.getOperand(0) != V.getOperand(1) ||
return SDValue();
V = V.getOperand(0);
do {
switch (V.getOpcode()) {
return SDValue(); // Nothing to combine.
if (V.getOpcode() == CombineOp)
V = V.getOperand(0);
} while (V.hasOneUse());
// Break out of the loop if we break out of the switch.
if (!V.hasOneUse())
// We fell out of the loop without finding a viable combining instruction.
return SDValue();
// Merge this node's mask and our incoming mask.
// Each incoming index M is replaced by the found shuffle's entry VMask[M].
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
for (int &M : Mask)
M = VMask[M];
// Re-create the found shuffle on its own input with the merged mask encoded
// as an immediate.
V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Rebuild the chain around this new shuffle.
while (!Chain.empty()) {
SDValue W = Chain.pop_back_val();
// Bitcast to the operand type the chain node W originally consumed.
if (V.getValueType() != W.getOperand(0).getValueType())
V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
switch (W.getOpcode()) {
llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
// Unpacks are rebuilt with the new value as both operands.
V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
case X86ISD::PSHUFD:
// Other shuffles keep their original second operand.
V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
if (V.getValueType() != N.getValueType())
V = DAG.getBitcast(N.getValueType(), V);
// Return the new chain to replace N.
return V;
// Attempt to commute shufps LHS loads:
// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
// \param N   the node being combined; its opcode selects the handling below.
// \param VT  the vector type; only v4f32, v8f32 and v16f32 are accepted.
// \param DL  debug location used for the new nodes.
// \param DAG the DAG used to create the new nodes.
// \returns the rewritten node, or an empty SDValue if nothing was changed.
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
SelectionDAG &DAG) {
// TODO: Add vXf64 support.
if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
return SDValue();
// SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
// Parent must be the only user of V, and V must itself be a SHUFP.
auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
return SDValue();
SDValue N0 = V.getOperand(0);
SDValue N1 = V.getOperand(1);
unsigned Imm = V.getConstantOperandVal(2);
// NOTE(review): the second half of this condition is not visible in this
// excerpt.
if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
return SDValue();
// Swap the low and high nibbles of the immediate to match the swapped
// operand order used in the new node.
Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
DAG.getTargetConstant(Imm, DL, MVT::i8));
// NOTE(review): the case label for the first branch below is not visible in
// this excerpt; it rebuilds an X86ISD::VPERMILPI node.
switch (N.getOpcode()) {
if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
unsigned Imm = N.getConstantOperandVal(1);
// The outer immediate is XORed with 0xAA to account for the commuted input.
return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
case X86ISD::SHUFP: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
unsigned Imm = N.getConstantOperandVal(2);
// Three sub-cases: both operands are the same commuted SHUFP (XOR 0xAA),
// only the first operand is commuted (XOR 0x0A), or only the second operand
// is commuted (XOR 0xA0).
if (N0 == N1) {
if (SDValue NewSHUFP = commuteSHUFP(N, N0))
return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
} else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
} else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
return SDValue();
/// Try to combine x86 target specific shuffles.
/// \param N         the target shuffle node to simplify.
/// \param DAG       the DAG used to create replacement nodes.
/// \param DCI       combiner info; used here for CombineTo replacements.
/// \param Subtarget the X86 subtarget queried for feature checks.
/// \returns a replacement value, \p N itself when the node was already
///          replaced through \p DCI, or an empty SDValue if nothing changed.
/// NOTE(review): this excerpt is missing a number of lines (closing braces,
/// break/default statements, some case labels and expression tails); the
/// comments below describe only what the visible lines do.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
bool IsUnary;
SmallVector<int, 64> TargetMask;
SmallVector<SDValue, 2> TargetOps;
// Decode the shuffle mask and operands when N is a target shuffle.
if (isTargetShuffle(Opcode))
getTargetShuffleMask(N.getNode(), VT, true, TargetOps, TargetMask, IsUnary);
// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
// single instruction. Attempt to match a v2X64 repeating shuffle pattern that
// represents the LHS/RHS inputs for the lower/upper halves.
SmallVector<int, 16> TargetMask128;
if (!TargetMask.empty() && 0 < TargetOps.size() && TargetOps.size() <= 2 &&
isRepeatedTargetShuffleMask(128, VT, TargetMask, TargetMask128)) {
SmallVector<int, 16> WidenedMask128 = TargetMask128;
// Repeatedly widen the per-lane mask while it has more than two elements.
while (WidenedMask128.size() > 2) {
SmallVector<int, 16> WidenedMask;
if (!canWidenShuffleElements(WidenedMask128, WidenedMask))
WidenedMask128 = std::move(WidenedMask);
if (WidenedMask128.size() == 2) {
assert(isUndefOrZeroOrInRange(WidenedMask128, 0, 4) && "Illegal shuffle");
SDValue BC0 = peekThroughBitcasts(TargetOps.front());
SDValue BC1 = peekThroughBitcasts(TargetOps.back());
EVT VT0 = BC0.getValueType();
EVT VT1 = BC1.getValueType();
unsigned Opcode0 = BC0.getOpcode();
unsigned Opcode1 = BC1.getOpcode();
bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
// Both sources must be the same horizontal-op or pack opcode on the same
// type.
if (Opcode0 == Opcode1 && VT0 == VT1 &&
(isHoriz || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
bool SingleOp = (TargetOps.size() == 1);
if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
// Mask values 0..1 select from BC0, otherwise from BC1; the low bit of
// the mask value selects which operand of that source node to use.
SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1;
SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ? BC0 : BC1;
Lo = Lo.getOperand(WidenedMask128[0] & 1);
Hi = Hi.getOperand(WidenedMask128[1] & 1);
if (SingleOp) {
// Replace zero/undef sentinel halves with explicit zero/undef vectors.
MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
SDValue Undef = DAG.getUNDEF(SrcVT);
SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
Lo = (WidenedMask128[0] == SM_SentinelZero ? Zero : Lo);
Hi = (WidenedMask128[1] == SM_SentinelZero ? Zero : Hi);
Lo = (WidenedMask128[0] == SM_SentinelUndef ? Undef : Lo);
Hi = (WidenedMask128[1] == SM_SentinelUndef ? Undef : Hi);
SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
return DAG.getBitcast(VT, Horiz);
if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
return R;
// Canonicalize UNARYSHUFFLE(XOR(X,-1) -> XOR(UNARYSHUFFLE(X),-1) to
// help expose the 'NOT' pattern further up the DAG.
// TODO: This might be beneficial for any binop with a 'splattable' operand.
// NOTE(review): only the PSHUFD label is visible here, but the body also
// tests for MOVDDUP, so further case labels are presumably missing.
switch (Opcode) {
case X86ISD::PSHUFD: {
SDValue Src = N.getOperand(0);
if (Src.hasOneUse() && Src.getValueType() == VT) {
if (SDValue Not = IsNOT(Src, DAG, /*OneUse*/ true)) {
Not = DAG.getBitcast(VT, Not);
// MOVDDUP takes a single operand; the other shuffles also take the
// original second operand of N.
Not = Opcode == X86ISD::MOVDDUP
? DAG.getNode(Opcode, DL, VT, Not)
: DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1));
// Re-apply the NOT as an integer XOR with all-ones, then cast back.
EVT IntVT = Not.getValueType().changeTypeToInteger();
SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);
Not = DAG.getBitcast(IntVT, Not);
Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);
return DAG.getBitcast(VT, Not);
// Handle specific target shuffles.
switch (Opcode) {
case X86ISD::MOVDDUP: {
SDValue Src = N.getOperand(0);
// Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
if (VT == MVT::v2f64 && Src.hasOneUse() &&
ISD::isNormalLoad(Src.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src);
if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
DCI.CombineTo(N.getNode(), Movddup);
// Redirect users of the old load's chain result to the new load's chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
return N; // Return N so it doesn't get rechecked!
return SDValue();
// NOTE(review): the case label for the following section is not visible in
// this excerpt; the code builds X86ISD::VBROADCAST / VBROADCAST_LOAD nodes.
SDValue Src = N.getOperand(0);
SDValue BC = peekThroughBitcasts(Src);
EVT SrcVT = Src.getValueType();
EVT BCVT = BC.getValueType();
// If broadcasting from another shuffle, attempt to simplify it.
// TODO - we really need a general SimplifyDemandedVectorElts mechanism.
if (isTargetShuffle(BC.getOpcode()) &&
VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
// Only the first Scale elements of the source are marked as demanded.
for (unsigned i = 0; i != Scale; ++i)
DemandedMask[i] = i;
if (SDValue Res = combineX86ShufflesRecursively(
{BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
// broadcast(bitcast(src)) -> bitcast(broadcast(src))
// 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
if (Src.getOpcode() == ISD::BITCAST &&
SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
// Reduce broadcast source vector to lowest 128-bits.
if (SrcVT.getSizeInBits() > 128)
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
extract128BitVector(Src, 0, DAG, DL));
// broadcast(scalar_to_vector(x)) -> broadcast(x).
if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
// Share broadcast with the longest vector and extract low subvector (free).
for (SDNode *User : Src->uses())
if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
User->getValueSizeInBits(0) > VT.getSizeInBits()) {
return extractSubVector(SDValue(User, 0), 0, DAG, DL,
// vbroadcast(scalarload X) -> vbroadcast_load X
// For float loads, extract other uses of the scalar from the broadcast.
if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
ISD::isNormalLoad(Src.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src);
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
// If the load value is used only by N, replace it via CombineTo N.
bool NoReplaceExtract = Src.hasOneUse();
DCI.CombineTo(N.getNode(), BcastLd);
if (NoReplaceExtract) {
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
} else {
// Other users of the scalar load now read element 0 of the broadcast.
SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
DAG.getIntPtrConstant(0, DL));
DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
return N; // Return N so it doesn't get rechecked!
// Due to isTypeDesirableForOp, we won't always shrink a load truncated to
// i16. So shrink it ourselves if we can make a broadcast_load.
if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
assert(Subtarget.hasAVX2() && "Expected AVX2");
SDValue TruncIn = Src.getOperand(0);
// If this is a truncate of a non extending load we can just narrow it to
// use a broadcast_load.
if (ISD::isNormalLoad(TruncIn.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
// Unless it's volatile or atomic.
if (LN->isSimple()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue BcastLd = DAG.getMemIntrinsicNode(
LN->getPointerInfo(), LN->getOriginalAlign(),
DCI.CombineTo(N.getNode(), BcastLd);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
return N; // Return N so it doesn't get rechecked!
// If this is a truncate of an i16 extload, we can directly replace it.
if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
ISD::isEXTLoad(Src.getOperand(0).getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
if (LN->getMemoryVT().getSizeInBits() == 16) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
DCI.CombineTo(N.getNode(), BcastLd);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
return N; // Return N so it doesn't get rechecked!
// If this is a truncate of load that has been shifted right, we can
// offset the pointer and use a narrower load.
if (TruncIn.getOpcode() == ISD::SRL &&
TruncIn.getOperand(0).hasOneUse() &&
isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
// Make sure the shift amount and the load size are divisible by 16.
// Don't do this if the load is volatile or atomic.
if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
LN->isSimple()) {
// Convert the bit shift amount into a byte offset from the base pointer.
unsigned Offset = ShiftAmt / 8;
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), Offset, DL);
SDValue Ops[] = { LN->getChain(), Ptr };
SDValue BcastLd = DAG.getMemIntrinsicNode(
DCI.CombineTo(N.getNode(), BcastLd);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
return N; // Return N so it doesn't get rechecked!
// vbroadcast(vzload X) -> vbroadcast_load X
if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
// Only when the loaded memory size matches the broadcast element size.
if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
DCI.CombineTo(N.getNode(), BcastLd);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
return N; // Return N so it doesn't get rechecked!
// vbroadcast(vector load X) -> vbroadcast_load
if (SrcVT == MVT::v2f64 && Src.hasOneUse() &&
ISD::isNormalLoad(Src.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src);
// Unless the load is volatile or atomic.
if (LN->isSimple()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue BcastLd = DAG.getMemIntrinsicNode(
LN->getPointerInfo(), LN->getOriginalAlign(),
DCI.CombineTo(N.getNode(), BcastLd);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
return N; // Return N so it doesn't get rechecked!
return SDValue();
case X86ISD::VZEXT_MOVL: {
SDValue N0 = N.getOperand(0);
// If this is a vzmovl of a full vector load, replace it with a vzload, unless
// the load is volatile.
if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
auto *LN = cast<LoadSDNode>(N0);
if (SDValue VZLoad =
narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
DCI.CombineTo(N.getNode(), VZLoad);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
return N;
// If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
// and can just use a VZEXT_LOAD.
// FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
auto *LN = cast<MemSDNode>(N0);
if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
SDValue VZLoad =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
DCI.CombineTo(N.getNode(), VZLoad);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
return N;
// Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
// (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
// if the upper bits of the i64 are zero.
if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
N0.getOperand(0).hasOneUse() &&
N0.getOperand(0).getValueType() == MVT::i64) {
SDValue In = N0.getOperand(0);
// Mask covering the upper 32 bits of the 64-bit scalar.
APInt Mask = APInt::getHighBitsSet(64, 32);
if (DAG.MaskedValueIsZero(In, Mask)) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
return DAG.getBitcast(VT, Movl);
// Load a scalar integer constant directly to XMM instead of transferring an
// immediate value from GPR.
// vzext_movl (scalar_to_vector C) --> load [C,0...]
if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
// Create a vector constant - scalar constant followed by zeros.
EVT ScalarVT = N0.getOperand(0).getValueType();
Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
unsigned NumElts = VT.getVectorNumElements();
Constant *Zero = ConstantInt::getNullValue(ScalarTy);
SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
// Load the vector constant from constant pool.
MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
MachinePointerInfo MPI =
Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
return SDValue();
case X86ISD::BLENDI: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
// blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
// TODO: Handle MVT::v16i16 repeated blend mask.
if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
MVT SrcVT = N0.getOperand(0).getSimpleValueType();
// The source elements must evenly divide the blend elements and be at
// least 32 bits wide.
if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
SrcVT.getScalarSizeInBits() >= 32) {
unsigned BlendMask = N.getConstantOperandVal(2);
unsigned Size = VT.getVectorNumElements();
unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
return SDValue();
case X86ISD::VPERMI: {
// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
// TODO: Remove when we have preferred domains in combineX86ShuffleChain.
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
if (N0.getOpcode() == ISD::BITCAST &&
N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
SDValue Src = N0.getOperand(0);
EVT SrcVT = Src.getValueType();
SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
return DAG.getBitcast(VT, Res);
return SDValue();
case X86ISD::VPERM2X128: {
// If both 128-bit values were inserted into high halves of 256-bit values,
// the shuffle can be reduced to a concatenation of subvectors:
// vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
// Note: We are only looking for the exact high/high shuffle mask because we
// expect to fold other similar patterns before creating this opcode.
SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
unsigned Imm = N.getConstantOperandVal(2);
if (!(Imm == 0x31 &&
Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&
Ins0.getValueType() == Ins1.getValueType()))
return SDValue();
SDValue X = Ins0.getOperand(1);
SDValue Y = Ins1.getOperand(1);
unsigned C1 = Ins0.getConstantOperandVal(2);
unsigned C2 = Ins1.getConstantOperandVal(2);
MVT SrcVT = X.getSimpleValueType();
unsigned SrcElts = SrcVT.getVectorNumElements();
// Both inserted subvectors must be the same 128-bit type, and both insert
// indices must equal that subvector's element count.
if (SrcVT != Y.getSimpleValueType() || SrcVT.getSizeInBits() != 128 ||
C1 != SrcElts || C2 != SrcElts)
return SDValue();
return DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL,
Ins1.getValueType(), X, Y));
case X86ISD::PSHUFD:
// Decode the 4-element mask used by the simplifications after this switch.
Mask = getPSHUFShuffleMask(N);
assert(Mask.size() == 4);
case X86ISD::MOVSD:
case X86ISD::MOVSS: {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
// Canonicalize scalar FPOps:
// MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
// If commutable, allow OP(N1[0], N0[0]).
unsigned Opcode1 = N1.getOpcode();
if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
Opcode1 == ISD::FDIV) {
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
// N0 may be the second operand only for FADD and FMUL.
if (N10 == N0 ||
(N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
if (N10 != N0)
std::swap(N10, N11);
MVT SVT = VT.getVectorElementType();
SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
return DAG.getNode(Opcode, DL, VT, N0, SclVec);
return SDValue();
case X86ISD::INSERTPS: {
assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
SDValue Op0 = N.getOperand(0);
SDValue Op1 = N.getOperand(1);
unsigned InsertPSMask = N.getConstantOperandVal(2);
// Immediate layout as decoded here: bits 7:6 = source index, bits 5:4 =
// destination index, bits 3:0 = zero mask.
unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
unsigned ZeroMask = InsertPSMask & 0xF;
// If we zero out all elements from Op0 then we don't need to reference it.
if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
// If we zero out the element from Op1 then we don't need to reference it.
if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
// Attempt to merge insertps Op1 with an inner target shuffle node.
SmallVector<int, 8> TargetMask1;
SmallVector<SDValue, 2> Ops1;
APInt KnownUndef1, KnownZero1;
if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
KnownZero1)) {
if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
// Zero/UNDEF insertion - zero out element and remove dependency.
InsertPSMask |= (1u << DstIdx);
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
// Update insertps mask srcidx and reference the source input directly.
int M = TargetMask1[SrcIdx];
assert(0 <= M && M < 8 && "Shuffle index out of range");
// Keep the low six bits and store the new source index in bits 7:6.
InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
Op1 = Ops1[M < 4 ? 0 : 1];
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
// Attempt to merge insertps Op0 with an inner target shuffle node.
SmallVector<int, 8> TargetMask0;
SmallVector<SDValue, 2> Ops0;
APInt KnownUndef0, KnownZero0;
if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
KnownZero0)) {
bool Updated = false;
bool UseInput00 = false;
bool UseInput01 = false;
for (int i = 0; i != 4; ++i) {
if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
// No change if element is already zero or the inserted element.
} else if (KnownUndef0[i] || KnownZero0[i]) {
// If the target mask is undef/zero then we must zero the element.
InsertPSMask |= (1u << i);
Updated = true;
// The input vector element must be inline.
int M = TargetMask0[i];
if (M != i && M != (i + 4))
return SDValue();
// Determine which inputs of the target shuffle we're using.
UseInput00 |= (0 <= M && M < 4);
UseInput01 |= (4 <= M);
// If we're not using both inputs of the target shuffle then use the
// referenced input directly.
if (UseInput00 && !UseInput01) {
Updated = true;
Op0 = Ops0[0];
} else if (!UseInput00 && UseInput01) {
Updated = true;
Op0 = Ops0[1];
if (Updated)
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
// If we're inserting an element from a vbroadcast load, fold the
// load into the X86insertps instruction. We need to convert the scalar
// load to a vector and clear the source lane of the INSERTPS control.
if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
return Insert;
return SDValue();
return SDValue();
// Nuke no-op shuffles that show up after combining.
if (isNoopShuffleMask(Mask))
return N.getOperand(0);
// Look for simplifications involving one or two shuffle instructions.
SDValue V = N.getOperand(0);
// NOTE(review): the case labels for the word-shuffle section below are not
// visible in this excerpt; the code distinguishes PSHUFLW from PSHUFHW.
switch (N.getOpcode()) {
assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
// See if this reduces to a PSHUFD which is no more expensive and can
// combine with more operations. Note that it has to at least flip the
// dwords as otherwise it would have been removed as a no-op.
if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
int DMask[] = {0, 1, 2, 3};
// Swap the two dwords of the affected half: the low half for PSHUFLW,
// otherwise the high half.
int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
DMask[DOffset + 0] = DOffset + 1;
DMask[DOffset + 1] = DOffset + 0;
MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
V = DAG.getBitcast(DVT, V);
V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
return DAG.getBitcast(VT, V);
// Look for shuffle patterns which can be implemented as a single unpack.
// FIXME: This doesn't handle the location of the PSHUFD generically, and
// only works when we have a PSHUFD followed by two half-shuffles.
if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
(V.getOpcode() == X86ISD::PSHUFLW ||
V.getOpcode() == X86ISD::PSHUFHW) &&
V.getOpcode() != N.getOpcode() &&
V.hasOneUse() && V.getOperand(0).hasOneUse()) {
SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
if (D.getOpcode() == X86ISD::PSHUFD) {
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
// Word offsets of the half each shuffle acts on (0 = low, 4 = high).
int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
int WordMask[8];
for (int i = 0; i < 4; ++i) {
WordMask[i + NOffset] = Mask[i] + NOffset;
WordMask[i + VOffset] = VMask[i] + VOffset;
// Map the word mask through the DWord mask.
int MappedMask[8];
for (int i = 0; i < 8; ++i)
MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
// We can replace all three shuffles with an unpack.
V = DAG.getBitcast(VT, D.getOperand(0));
return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
DL, VT, V, V);
case X86ISD::PSHUFD:
if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
return NewN;
return SDValue();
/// Checks if the shuffle mask takes subsequent elements
/// alternately from two vectors.
/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
/// \param Mask    the shuffle mask; indices >= Mask.size() refer to the
///                second input (source = M / Size, as computed below).
/// \param Op0Even set on success to true when the even result positions are
///                taken from the first input.
/// \returns true if the mask matches the alternating pattern.
static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
// Source input seen so far for even ([0]) and odd ([1]) positions; -1 means
// no element of that parity has been seen yet.
int ParitySrc[2] = {-1, -1};
unsigned Size = Mask.size();
for (unsigned i = 0; i != Size; ++i) {
int M = Mask[i];
// NOTE(review): the statement guarded by this check is not visible in this
// excerpt (presumably negative entries are skipped) — confirm against the
// full file.
if (M < 0)
// Make sure we are using the matching element from the input.
if ((M % Size) != i)
return false;
// Make sure we use the same input for all elements of the same parity.
int Src = M / Size;
if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
return false;
ParitySrc[i % 2] = Src;
// Make sure each input is used.
if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
return false;
Op0Even = ParitySrc[0] == 0;
return true;
/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
/// so it is easier to generically match. We also insert dummy vector shuffle
/// nodes for the operands which explicitly discard the lanes which are unused
/// by this operation to try to flow through the rest of the combiner
/// the fact that they're unused.
/// \param IsSubAdd set on success; true when the input feeding the even
///                 result positions is the FADD.
static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
bool &IsSubAdd) {
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Requires SSE3 and a legal type.
// NOTE(review): the remainder of this condition is not visible in this
// excerpt.
if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
return false;
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
// extraction tool to support more.
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
return false;
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
// Make sure we have an FADD and an FSUB.
if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
(V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
V1.getOpcode() == V2.getOpcode())
return false;
// If there are other uses of these operations we can't fold them.
if (!V1->hasOneUse() || !V2->hasOneUse())
return false;
// Ensure that both operations have the same operands. Note that we can
// commute the FADD operands.
// LHS/RHS are always taken from the FSUB, whose operand order matters.
// NOTE(review): the declaration of LHS and RHS is not visible in this
// excerpt.
if (V1.getOpcode() == ISD::FSUB) {
LHS = V1->getOperand(0); RHS = V1->getOperand(1);
if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
(V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
return false;
} else {
assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
LHS = V2->getOperand(0); RHS = V2->getOperand(1);
if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
(V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
return false;
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
bool Op0Even;
if (!isAddSubOrSubAddMask(Mask, Op0Even))
return false;
// It's a subadd if the vector in the even parity is an FADD.
IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
: V2->getOpcode() == ISD::FADD;
Opnd0 = LHS;
Opnd1 = RHS;
return true;
/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
/// Returns the new X86ISD::FMADDSUB / X86ISD::FMSUBADD node on a match and an
/// empty SDValue otherwise.
static SDValue combineShuffleToFMAddSub(SDNode *N,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
// extraction tool to support more.
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
return SDValue();
MVT VT = N->getSimpleValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Requires some form of FMA support and a legal result type.
if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
return SDValue();
// We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
// Accept the two nodes in either operand order: start by assuming Op0 is the
// FMA and Op1 the FMSUB, and swap if Op1 is not an FMSUB.
SDValue FMAdd = Op0, FMSub = Op1;
if (FMSub.getOpcode() != X86ISD::FMSUB)
std::swap(FMAdd, FMSub);
// Both nodes must have the expected opcode, a single use, and identical
// operands 0, 1 and 2.
if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
FMAdd.getOperand(2) != FMSub.getOperand(2))
return SDValue();
// Check for correct shuffle mask.
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
bool Op0Even;
if (!isAddSubOrSubAddMask(Mask, Op0Even))
return SDValue();
// FMAddSub takes zeroth operand from FMSub node.
SDLoc DL(N);
// When Op0Even is set the test is made on Op0, otherwise on Op1; the result
// is FMSUBADD if that operand is the FMA node, and FMADDSUB otherwise.
bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
// NOTE(review): the call below is cut off after its fifth argument in this
// copy; the remaining argument(s) and the closing brace are not visible.
return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
/// Try to combine a shuffle into a target-specific add-sub or
/// mul-add-sub node.
/// Returns the replacement node, or an empty SDValue if nothing matched.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// First try the shuffle-of-(FMA, FMSUB) pattern.
if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
return V;
// Otherwise look for a shuffle of an FADD and an FSUB over the same operands.
SDValue Opnd0, Opnd1;
bool IsSubAdd;
if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
return SDValue();
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
// Try to generate X86ISD::FMADDSUB node here.
// isFMAddSubOrFMSubAdd receives Opnd0/Opnd1/Opnd2; when it succeeds those
// three values are used as the operands of the fused node.
// NOTE(review): the closing brace of this `if` is not visible in this copy.
SDValue Opnd2;
if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
// Only the ADDSUB form (not SUBADD) is emitted without a fused multiply.
if (IsSubAdd)
return SDValue();
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
// the ADDSUB idiom has been successfully recognized. There are no known
// X86 targets with 512-bit ADDSUB instructions!
if (VT.is512BitVector())
return SDValue();
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
// Returns the rewritten single-source shuffle, or an empty SDValue when the
// pattern does not apply.
static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
return SDValue();
EVT VT = N->getValueType(0);
// We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
if (VT.getVectorElementType() != MVT::i32 &&
VT.getVectorElementType() != MVT::i64 &&
VT.getVectorElementType() != MVT::f32 &&
VT.getVectorElementType() != MVT::f64)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Check that both sources are concats with undef.
// NOTE(review): the condition below ends in `||`; its last operand
// (presumably the matching undef check on N1 - confirm) is not visible here.
if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
return SDValue();
// Construct the new shuffle mask. Elements from the first source retain their
// index, but elements from the second source no longer need to skip an undef.
SmallVector<int, 8> Mask;
int NumElts = VT.getVectorNumElements();
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
// Indices >= NumElts refer to the second source and are moved down by
// NumElts / 2; all other indices are copied unchanged.
for (int Elt : SVOp->getMask())
Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
SDLoc DL(N);
// NOTE(review): the CONCAT_VECTORS call below is cut off after its first
// operand in this copy, and the function's closing brace is not visible.
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
/// Eliminate a redundant shuffle of a horizontal math op.
/// N may be a MOVDDUP, a VBROADCAST, or a VECTOR_SHUFFLE whose second operand
/// is undef. Returns the (possibly rebuilt) horizontal op that can replace N,
/// or an empty SDValue if the fold does not apply.
static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
return SDValue();
// For a broadcast, peek through an extract element of index 0 to find the
// horizontal op: broadcast (ext_vec_elt HOp, 0)
// NOTE(review): the condition below ends in `&&`; its final operand and the
// closing brace of the enclosing `if` are not visible in this copy.
EVT VT = N->getValueType(0);
if (Opcode == X86ISD::VBROADCAST) {
SDValue SrcOp = N->getOperand(0);
if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
SrcOp.getValueType() == MVT::f64 &&
SrcOp.getOperand(0).getValueType() == VT &&
N = SrcOp.getNode();
// The (possibly peeked-through) node's first operand must be one of the four
// horizontal add/sub opcodes.
SDValue HOp = N->getOperand(0);
if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
return SDValue();
// 128-bit horizontal math instructions are defined to operate on adjacent
// lanes of each operand as:
// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
// ...similarly for v2f64 and v8i16.
// Bail out unless the two operands are identical or at least one is undef.
if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
HOp.getOperand(0) != HOp.getOperand(1))
return SDValue();
// The shuffle that we are eliminating may have allowed the horizontal op to
// have an undemanded (undefined) operand. Duplicate the other (defined)
// operand to ensure that the results are defined across all lanes without the
// shuffle.
// updateHOp returns HorizOp unchanged when neither operand is undef, and
// otherwise a new node with the same opcode and type using the defined
// operand for both inputs.
// NOTE(review): the closing braces of the `else` and of the lambda are not
// visible in this copy.
auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) {
SDValue X;
if (HorizOp.getOperand(0).isUndef()) {
assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op");
X = HorizOp.getOperand(1);
} else if (HorizOp.getOperand(1).isUndef()) {
assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op");
X = HorizOp.getOperand(0);
} else {
return HorizOp;
return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp),
HorizOp.getValueType(), X, X);
// When the operands of a horizontal math op are identical, the low half of
// the result is the same as the high half. If a target shuffle is also
// replicating low and high halves (and without changing the type/length of
// the vector), we don't need the shuffle.
if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
// Only 64-bit-element horizontal ops whose type equals the result type fold.
if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
// movddup (hadd X, X) --> hadd X, X
// broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
assert((HOp.getValueType() == MVT::v2f64 ||
HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op");
return updateHOp(HOp, DAG);
return SDValue();
// shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
// but this should be tied to whatever horizontal op matching and shuffle
// canonicalization are producing.
// 128-bit case: the mask repeats the low half of the vector in the high half.
if (HOp.getValueSizeInBits() == 128 &&
(isTargetShuffleEquivalent(Mask, {0, 0}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
return updateHOp(HOp, DAG);
// 256-bit case: the listed masks repeat the low half within each 128-bit
// half of the vector.
// NOTE(review): the third alternative below has lost the line that opens its
// call (only the argument list `Mask, {...}` remains).
if (HOp.getValueSizeInBits() == 256 &&
(isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
return updateHOp(HOp, DAG);
return SDValue();
/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
/// low half of each source vector and does not set any high half elements in
/// the destination vector, narrow the shuffle to half its original size.
/// Returns the value built by getShuffleHalfVectors, or an empty SDValue when
/// any precondition fails.
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
// getSimpleValueType below requires a simple (MVT) type.
if (!Shuf->getValueType(0).isSimple())
return SDValue();
MVT VT = Shuf->getSimpleValueType(0);
if (!VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
// See if we can ignore all of the high elements of the shuffle.
ArrayRef<int> Mask = Shuf->getMask();
if (!isUndefUpperHalf(Mask))
return SDValue();
// Check if the shuffle mask accesses only the low half of each input vector
// (half-index output is 0 or 2).
// An odd half-index is rejected, leaving only even values.
int HalfIdx1, HalfIdx2;
SmallVector<int, 8> HalfMask(Mask.size() / 2);
if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
(HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
return SDValue();
// Create a half-width shuffle to replace the unnecessarily wide shuffle.
// The trick is knowing that all of the insert/extract are actually free
// subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
// of narrow inputs into a narrow output, and that is always cheaper than
// the wide shuffle that we started with.
// NOTE(review): the function's closing brace is not visible in this copy.
return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
Shuf->getOperand(1), HalfMask, HalfIdx1,
HalfIdx2, false, DAG, /*UseConcat*/true);
// DAG-combine entry point for shuffle nodes. Tries a sequence of independent
// folds in order and returns the first replacement found; returns an empty
// SDValue if none applies.
// NOTE(review): brace-only lines are missing throughout this copy, so the
// exact nesting of the statements below cannot be read off the text - confirm
// against the upstream source.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// Generic shuffles: try to halve the width first.
if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
if (SDValue V = narrowShuffle(Shuf, DAG))
return V;
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
SDLoc dl(N);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(VT)) {
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
return HAddSub;
// Attempt to combine into a vector load/broadcast.
if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG,
Subtarget, true))
return LD;
// For AVX2, we sometimes want to combine
// (vector_shuffle <mask> (concat_vectors t1, undef)
// (concat_vectors t2, undef))
// Into:
// (vector_shuffle <mask> (concat_vectors t1, t2), undef)
// Since the latter can be efficiently lowered with VPERMD/VPERMQ
if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
return ShufConcat;
// Folds that apply only to X86 target shuffle opcodes.
if (isTargetShuffle(N->getOpcode())) {
SDValue Op(N, 0);
if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
return Shuffle;
// Try recursively combining arbitrary sequences of x86 shuffle
// instructions into higher-order shuffles. We do this after combining
// specific PSHUF instruction sequences into their minimal form so that we
// can evaluate how many specialized shuffle instructions are involved in
// a particular chain.
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
// Simplify source operands based on shuffle mask.
// TODO - merge this into combineX86ShufflesRecursively.
// All result elements are demanded here. Returning SDValue(N, 0) reports
// that N was updated in place.
APInt KnownUndef, KnownZero;
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
return SDValue(N, 0);
// Pull subvector inserts into undef through VZEXT_MOVL by making it an
// insert into a zero vector. This helps get VZEXT_MOVL closer to
// scalar_to_vectors where 256/512 are canonicalized to an insert and a
// 128-bit scalar_to_vector. This reduces the number of isel patterns.
if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
N->getOperand(0).hasOneUse()) {
SDValue V = peekThroughOneUseBitcasts(N->getOperand(0));
// Match an insert of a subvector at index 0 into an undef vector.
if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
V.getOperand(0).isUndef() && isNullConstant(V.getOperand(2))) {
SDValue In = V.getOperand(1);
// NOTE(review): the line that starts the declaration of SubVT is not visible
// in this copy; only the tail of its initializer remains on the next line.
In.getValueSizeInBits() / VT.getScalarSizeInBits());
In = DAG.getBitcast(SubVT, In);
// Apply VZEXT_MOVL on the narrow value, then insert the result into a zero
// vector of the original type at the same index.
SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, SubVT, In);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
Movl, V.getOperand(2));
return SDValue();
// Simplify variable target shuffle masks based on the demanded elements.
// TODO: Handle DemandedBits in mask indices as well?
// Op is the shuffle node and MaskIndex is the operand number of its mask.
// Returns true if a simplification was made (either by the generic recursive
// call or by replacing the constant-pool mask), false otherwise.
bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
// If we're demanding all elements don't bother trying to simplify the mask.
unsigned NumElts = DemandedElts.getBitWidth();
if (DemandedElts.isAllOnesValue())
return false;
// The mask is only rewritten when this shuffle is its sole user.
SDValue Mask = Op.getOperand(MaskIndex);
if (!Mask.hasOneUse())
return false;
// Attempt to generically simplify the variable shuffle mask.
APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
Depth + 1))
return true;
// Attempt to extract+simplify a (constant pool load) shuffle mask.
// TODO: Support other types from getTargetShuffleMaskIndices?
SDValue BC = peekThroughOneUseBitcasts(Mask);
EVT BCVT = BC.getValueType();
auto *Load = dyn_cast<LoadSDNode>(BC);
if (!Load)
return false;
const Constant *C = getTargetConstantFromNode(Load);
if (!C)
return false;
// The constant must be a vector with the same total bit size as the mask.
Type *CTy = C->getType();
if (!CTy->isVectorTy() ||
CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
return false;
// Handle scaling for i64 elements on 32-bit targets.
// The constant may have either NumElts or 2 * NumElts elements; Scale is the
// number of constant elements per demanded-element bit (1 or 2).
unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
return false;
unsigned Scale = NumCstElts / NumElts;
// Simplify mask if we have an undemanded element that is not undef.
// NOTE(review): the loop body is incomplete in this copy - nothing visible
// appends to ConstVecOps and the closing braces are missing.
bool Simplified = false;
SmallVector<Constant *, 32> ConstVecOps;
for (unsigned i = 0; i != NumCstElts; ++i) {
Constant *Elt = C->getAggregateElement(i);
if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
Simplified = true;
if (!Simplified)
return false;
// Generate new constant pool entry + legalize immediately for the load.
SDLoc DL(Op);
SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
// NOTE(review): the getLoad call below is cut off after its fourth argument
// in this copy.
SDValue NewMask = TLO.DAG.getLoad(
BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
// Replace the old mask with the new load, bitcast back to the mask's type.
return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
// Simplifies the X86 target node Op given that only the result elements set
// in DemandedElts are needed. Returns true as soon as a recursive
// simplification reports a change, returns the result of TLO.CombineTo when
// Op itself is replaced, and returns false otherwise. KnownUndef and
// KnownZero are outputs that some of the cases below fill in.
// NOTE(review): this copy of the function has lost many lines (brace-only
// lines, what look like `break;` statements, some case labels and the tails
// of a few calls). The notes below flag the most visible gaps; confirm every
// one against the upstream source before relying on this text.
bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
TargetLoweringOpt &TLO, unsigned Depth) const {
int NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
// Handle special case opcodes.
switch (Opc) {
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: {
// Both multiplicands are simplified with the same demanded elements.
APInt LHSUndef, LHSZero;
APInt RHSUndef, RHSZero;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
Depth + 1))
return true;
if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
Depth + 1))
return true;
// Multiply by zero.
KnownZero = LHSZero | RHSZero;
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA: {
// We only need the bottom 64-bits of the (128-bit) shift amount.
SDValue Amt = Op.getOperand(1);
MVT AmtVT = Amt.getSimpleValueType();
assert(AmtVT.is128BitVector() && "Unexpected value type");
// If we reuse the shift amount just for sse shift amounts then we know that
// only the bottom 64-bits are ever used.
// True when every user of Amt is a VSHL/VSRL/VSRA whose operand 0 is not Amt.
bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
unsigned UseOpc = Use->getOpcode();
return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
UseOpc == X86ISD::VSRA) &&
Use->getOperand(0) != Amt;
// Demand only the low half of the amount vector's elements.
APInt AmtUndef, AmtZero;
unsigned NumAmtElts = AmtVT.getVectorNumElements();
APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
Depth + 1, AssumeSingleUse))
return true;
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI: {
// The source is simplified with the same demanded elements; its zero
// elements are written straight into KnownZero.
SDValue Src = Op.getOperand(0);
APInt SrcUndef;
if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
Depth + 1))
return true;
// TODO convert SrcUndef to KnownUndef.
case X86ISD::KSHIFTL: {
SDValue Src = Op.getOperand(0);
auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
unsigned ShiftAmt = Amt->getZExtValue();
// A shift by zero is replaced by its source.
if (ShiftAmt == 0)
return TLO.CombineTo(Op, Src);
// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
// single shift. We can do this if the bottom bits (which are shifted
// out) are never demanded.
if (Src.getOpcode() == X86ISD::KSHIFTR) {
if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
unsigned C1 = Src.getConstantOperandVal(1);
unsigned NewOpc = X86ISD::KSHIFTL;
int Diff = ShiftAmt - C1;
// NOTE(review): NewOpc is declared as a variable but never reassigned in the
// visible text; a line inside this `if` may be missing.
if (Diff < 0) {
Diff = -Diff;
SDLoc dl(Op);
SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
return TLO.CombineTo(
Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
// Result element i comes from source element i - ShiftAmt, so shift the
// demanded set right for the source and the known sets left for the result.
APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
Depth + 1))
return true;
KnownUndef <<= ShiftAmt;
KnownZero <<= ShiftAmt;
case X86ISD::KSHIFTR: {
SDValue Src = Op.getOperand(0);
auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
unsigned ShiftAmt = Amt->getZExtValue();
// A shift by zero is replaced by its source.
if (ShiftAmt == 0)
return TLO.CombineTo(Op, Src);
// If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
// single shift. We can do this if the top bits (which are shifted
// out) are never demanded.
if (Src.getOpcode() == X86ISD::KSHIFTL) {
if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
unsigned C1 = Src.getConstantOperandVal(1);
unsigned NewOpc = X86ISD::KSHIFTR;
int Diff = ShiftAmt - C1;
// NOTE(review): as in the KSHIFTL case, NewOpc is never reassigned in the
// visible text; a line inside this `if` may be missing.
if (Diff < 0) {
Diff = -Diff;
SDLoc dl(Op);
SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
return TLO.CombineTo(
Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
// The demanded source elements are the demanded result elements shifted left.
APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
Depth + 1))
return true;
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: {
// The demanded set is resized to the source's element count.
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
APInt SrcUndef, SrcZero;
APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
// getPackDemandedElts splits the demanded set into per-operand sets.
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
// Aggressively peek through ops to get at the demanded elts.
// TODO - we should do this for all target/faux shuffles ops.
if (!DemandedElts.isAllOnesValue()) {
SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
TLO.DAG, Depth + 1);
SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
TLO.DAG, Depth + 1);
// If either operand could be replaced, rebuild the pack with the new
// operand(s), keeping the original for whichever side was not replaced.
if (NewN0 || NewN1) {
NewN0 = NewN0 ? NewN0 : N0;
NewN1 = NewN1 ? NewN1 : N1;
return TLO.CombineTo(Op,
TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
case X86ISD::HADD:
case X86ISD::HSUB:
case X86ISD::FHADD:
case X86ISD::FHSUB: {
// getHorizDemandedElts splits the demanded set into per-operand sets.
APInt DemandedLHS, DemandedRHS;
getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt LHSUndef, LHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
LHSZero, TLO, Depth + 1))
return true;
APInt RHSUndef, RHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
RHSZero, TLO, Depth + 1))
return true;
case X86ISD::VTRUNC:
case X86ISD::VTRUNCUS: {
// Demanded and known sets are resized between the source's and the result's
// element counts.
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
KnownZero = SrcZero.zextOrTrunc(NumElts);
KnownUndef = SrcUndef.zextOrTrunc(NumElts);
case X86ISD::BLENDV: {
// Operand 0 is simplified first (named Sel here), then operands 1 and 2.
APInt SelUndef, SelZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
SelZero, TLO, Depth + 1))
return true;
// TODO: Use SelZero to adjust LHS/RHS DemandedElts.
APInt LHSUndef, LHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
LHSZero, TLO, Depth + 1))
return true;
APInt RHSUndef, RHSZero;
if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
RHSZero, TLO, Depth + 1))
return true;
// An element is known zero/undef only if it is so in both blended inputs.
KnownZero = LHSZero & RHSZero;
KnownUndef = LHSUndef & RHSUndef;
case X86ISD::VZEXT_MOVL: {
// If upper demanded elements are already zero then we have nothing to do.
SDValue Src = Op.getOperand(0);
APInt DemandedUpperElts = DemandedElts;
if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
return TLO.CombineTo(Op, Src);
// NOTE(review): a case label seems to be missing before the next line (Src
// is declared again, and the comments below talk about broadcasting).
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
if (!SrcVT.isVector())
return false;
// Don't bother broadcasting if we just need the 0'th element.
// NOTE(review): the widenSubVector call below is cut off in this copy.
if (DemandedElts == 1) {
if (Src.getValueType() != VT)
Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
return TLO.CombineTo(Op, Src);
// Only element 0 of the source is demanded.
APInt SrcUndef, SrcZero;
APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
// Aggressively peek through src to get at the demanded elt.
// TODO - we should do this for all target/faux shuffles ops.
if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
Src, SrcElts, TLO.DAG, Depth + 1))
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
// Variable-mask shuffles: the integer argument is the operand index of the
// mask (0, 1 or 2 depending on the opcode).
// NOTE(review): each of the three calls below is cut off after `TLO,` in
// this copy.
case X86ISD::VPERMV:
if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
return true;
case X86ISD::PSHUFB:
case X86ISD::VPERMV3:
if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
return true;
case X86ISD::VPPERM:
case X86ISD::VPERMIL2:
if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
return true;
// For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
// demand any of the high elements, then narrow the op to 128/256-bits: e.g.
// (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
if ((VT.is256BitVector() || VT.is512BitVector()) &&
DemandedElts.lshr(NumElts / 2) == 0) {
unsigned SizeInBits = VT.getSizeInBits();
unsigned ExtSizeInBits = SizeInBits / 2;
// See if 512-bit ops only use the bottom 128-bits.
if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
ExtSizeInBits = SizeInBits / 4;
switch (Opc) {
// Subvector broadcast.
// NOTE(review): no case label is visible for this section, and the
// declaration of SrcVT below has lost its first line.
SDLoc DL(Op);
SDValue Src = Op.getOperand(0);
if (Src.getValueSizeInBits() > ExtSizeInBits)
Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
else if (Src.getValueSizeInBits() < ExtSizeInBits) {
MVT SrcSVT = Src.getSimpleValueType().getScalarType();
MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
TLO.DAG, DL, ExtSizeInBits));
// Byte shifts by immediate.
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
// Shift by uniform.
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA:
// Shift by immediate.
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI: {
// Apply the same shift to the low ExtSizeInBits of operand 0 (operand 1 is
// passed through unchanged), then insert the result into an undef vector of
// the original type.
SDLoc DL(Op);
SDValue Ext0 =
extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
SDValue ExtOp =
TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
case X86ISD::VPERMI: {
// Simplify PERMPD/PERMQ to extract_subvector.
// TODO: This should be done in shuffle combining.
if (VT == MVT::v4f64 || VT == MVT::v4i64) {
SmallVector<int, 4> Mask;
DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
// If the low two result elements read source elements 2 and 3 (or are
// undef), extract the 128-bit subvector starting at element 2 and insert it
// at element 0 of an undef vector.
if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
SDLoc DL(Op);
SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
return TLO.CombineTo(Op, Insert);
// Zero upper elements.
// Target unary shuffles by immediate:
case X86ISD::PSHUFD:
// (Non-Lane Crossing) Target Shuffles.
case X86ISD::VPERMIL2:
case X86ISD::PSHUFB:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::BLENDI:
// Saturated Packs.
case X86ISD::PACKSS:
case X86ISD::PACKUS:
// Horizontal Ops.
case X86ISD::HADD:
case X86ISD::HSUB:
case X86ISD::FHADD:
case X86ISD::FHSUB: {
// Rebuild the op on the low ExtSizeInBits of every vector operand;
// non-vector operands are passed through unchanged.
// NOTE(review): the extractSubVector call inside the loop is cut off in this
// copy.
SDLoc DL(Op);
SmallVector<SDValue, 4> Ops;
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
SDValue SrcOp = Op.getOperand(i);
EVT SrcVT = SrcOp.getValueType();
assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
"Unsupported vector size");
Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
: SrcOp);
// The narrow type keeps the scalar type and has ExtSizeInBits total bits.
MVT ExtVT = VT.getSimpleVT();
ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
ExtSizeInBits / ExtVT.getScalarSizeInBits());
SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
// Get target/faux shuffle mask.
APInt OpUndef, OpZero;
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
OpZero, TLO.DAG, Depth, false))
return false;
// Shuffle inputs must be the same size as the result.
// NOTE(review): the lambda below ends in `||` with its last operand and
// closing tokens missing in this copy.
if (OpMask.size() != (unsigned)NumElts ||
llvm::any_of(OpInputs, [VT](SDValue V) {
return VT.getSizeInBits() != V.getValueSizeInBits() ||
return false;
KnownZero = OpZero;
KnownUndef = OpUndef;
// Check if shuffle mask can be simplified to undef/zero/identity.
// Undemanded elements are first treated as undef in the mask.
int NumSrcs = OpInputs.size();
for (int i = 0; i != NumElts; ++i)
if (!DemandedElts[i])
OpMask[i] = SM_SentinelUndef;
if (isUndefInRange(OpMask, 0, NumElts)) {
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
return TLO.CombineTo(
Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
// Identity: the mask selects input Src's elements in order, so Op can be
// replaced by a bitcast of that input.
for (int Src = 0; Src != NumSrcs; ++Src)
if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
// Attempt to simplify inputs.
for (int Src = 0; Src != NumSrcs; ++Src) {
// TODO: Support inputs of different types.
// NOTE(review): the statement controlled by the next `if` is not visible in
// this copy, nor is the statement that should record M in SrcElts.
if (OpInputs[Src].getValueType() != VT)
// Lo is the first mask index that refers to input Src; M is the element of
// that input selected by demanded result element i.
int Lo = Src * NumElts;
APInt SrcElts = APInt::getNullValue(NumElts);
for (int i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
int M = OpMask[i] - Lo;
if (0 <= M && M < NumElts)
// TODO - Propagate input undef/zero elts.
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
TLO, Depth + 1))
return true;
// If we don't demand all elements, then attempt to combine to a simpler
// shuffle.
// TODO: Handle other depths, but first we need to handle the fact that
// it might combine to the same shuffle.
if (!DemandedElts.isAllOnesValue() && Depth == 0) {
// Identity mask over the demanded elements, undef elsewhere.
SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i)
if (DemandedElts[i])
DemandedMask[i] = i;
SDValue NewShuffle = combineX86ShufflesRecursively(
{Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false,
/*AllowVarMask*/ true, TLO.DAG, Subtarget);
if (NewShuffle)
return TLO.CombineTo(Op, NewShuffle);
return false;
bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
SDValue Op, const APInt &OriginalDemandedBits,
const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
unsigned Depth) const {
EVT VT = Op.getValueType();
unsigned BitWidth = OriginalDemandedBits.getBitWidth();
unsigned Opc = Op.getOpcode();
switch(Opc) {
case X86ISD::VTRUNC: {
KnownBits KnownOp;
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
// Simplify the input, using demanded bit information.
APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
return true;
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: {
// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
KnownBits KnownOp;
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
// FIXME: Can we bound this better?
APInt DemandedMask = APInt::getLowBitsSet(64, 32);
if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
TLO, Depth + 1))
return true;
if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
TLO, Depth + 1))
return true;
// Aggressively peek through ops to get at the demanded low bits.
SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
if (DemandedLHS || DemandedRHS) {
DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
return TLO.CombineTo(
Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
case X86ISD::VSHLI: {
SDValue Op0 = Op.getOperand(0);
unsigned ShAmt = Op.getConstantOperandVal(1);
if (ShAmt >= BitWidth)
APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
// single shift. We can do this if the bottom bits (which are shifted
// out) are never demanded.
if (Op0.getOpcode() == X86ISD::VSRLI &&
OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
unsigned Shift2Amt = Op0.getConstantOperandVal(1);
if (Shift2Amt < BitWidth) {
int Diff = ShAmt - Shift2Amt;
if (Diff == 0)
return TLO.CombineTo(Op, Op0.getOperand(0));
unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
SDValue NewShift = TLO.DAG.getNode(
NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
return TLO.CombineTo(Op, NewShift);
// If we are only demanding sign bits then we can use the shift source directly.
unsigned NumSignBits =
TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
unsigned UpperDemandedBits =
BitWidth - OriginalDemandedBits.countTrailingZeros();
if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
return TLO.CombineTo(Op, Op0);
if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero <<= ShAmt;
Known.One <<= ShAmt;
// Low bits known zero.
case X86ISD::VSRLI: {
unsigned ShAmt = Op.getConstantOperandVal(1);
if (ShAmt >= BitWidth)
APInt DemandedMask = OriginalDemandedBits << ShAmt;
if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
OriginalDemandedElts, Known, TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
// High bits known zero.
case X86ISD::VSRAI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
if (ShAmt >= BitWidth)
APInt DemandedMask = OriginalDemandedBits << ShAmt;
// If we just want the sign bit then we don't need to shift it.
if (OriginalDemandedBits.isSignMask())
return TLO.CombineTo(Op, Op0);
// fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
if (Op0.getOpcode() == X86ISD::VSHLI &&
Op.getOperand(1) == Op0.getOperand(1)) {
SDValue Op00 = Op0.getOperand(0);
unsigned NumSignBits =
TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
if (ShAmt < NumSignBits)
return TLO.CombineTo(Op, Op00);
// If any of the demanded bits are produced by the sign extension, we also
// demand the input sign bit.
if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
TLO, Depth + 1))
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
if (Known.Zero[BitWidth - ShAmt - 1] ||
OriginalDemandedBits.countLeadingZeros() >= ShAmt)
return TLO.CombineTo(
Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
// High bits are known one.
if (Known.One[BitWidth - ShAmt - 1])
case X86ISD::PEXTRB:
case X86ISD::PEXTRW: {
SDValue Vec = Op.getOperand(0);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
MVT VecVT = Vec.getSimpleValueType();
unsigned NumVecElts = VecVT.getVectorNumElements();
if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
unsigned Idx = CIdx->getZExtValue();
unsigned VecBitWidth = VecVT.getScalarSizeInBits();
// If we demand no bits from the vector then we must have demanded
// bits from the implicit zext - simplify to zero.
APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
if (DemandedVecBits == 0)
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
APInt KnownUndef, KnownZero;
APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
KnownZero, TLO, Depth + 1))
return true;
KnownBits KnownVec;
if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
KnownVec, TLO, Depth + 1))
return true;
if (SDValue V = SimplifyMultipleUseDemandedBits(
Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
return TLO.CombineTo(
Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
Known = KnownVec.zext(BitWidth);
return false;
case X86ISD::PINSRB:
case X86ISD::PINSRW: {
SDValue Vec = Op.getOperand(0);
SDValue Scl = Op.getOperand(1);
auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
MVT VecVT = Vec.getSimpleValueType();
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
unsigned Idx = CIdx->getZExtValue();
if (!OriginalDemandedElts[Idx])
return TLO.CombineTo(Op, Vec);
KnownBits KnownVec;
APInt DemandedVecElts(OriginalDemandedElts);
if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
KnownVec, TLO, Depth + 1))
return true;
KnownBits KnownScl;
unsigned NumSclBits = Scl.getScalarValueSizeInBits();
APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
return true;
KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
Known.One = KnownVec.One & KnownScl.One;
Known.Zero = KnownVec.Zero & KnownScl.Zero;
return false;
case X86ISD::PACKSS:
// PACKSS saturates to MIN/MAX integer values. So if we just want the
// sign bit then we can just ask for the source operands sign bit.
// TODO - add known bits handling.
if (OriginalDemandedBits.isSignMask()) {
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
KnownBits KnownLHS, KnownRHS;
APInt SignMask = APInt::getSignMask(BitWidth * 2);
if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
KnownLHS, TLO, Depth + 1))
return true;
if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
KnownRHS, TLO, Depth + 1))
return true;
// Attempt to avoid multi-use ops if we don't need anything from them.
SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
if (DemandedOp0 || DemandedOp1) {
SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
// TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
// iff we only need the sign bit then we can use R directly.
if (OriginalDemandedBits.isSignMask() &&
return TLO.CombineTo(Op, Op.getOperand(1));
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
unsigned SrcBits = SrcVT.getScalarSizeInBits();
unsigned NumElts = SrcVT.getVectorNumElements();
// If we don't need the sign bits at all just return zero.
if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
// Only demand the vector elements of the sign bits we need.
APInt KnownUndef, KnownZero;
APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
TLO, Depth + 1))
return true;
Known.Zero = KnownZero.zextOrSelf(BitWidth);
Known.Zero.setHighBits(BitWidth - NumElts);
// MOVMSK only uses the MSB from each vector element.
KnownBits KnownSrc;
APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
Depth + 1))
return true;
if (KnownSrc.One[SrcBits - 1])
else if (KnownSrc.Zero[SrcBits - 1])
// Attempt to avoid multi-use ops if we don't need anything from them.
if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
return false;
case X86ISD::BEXTR: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Only bottom 16-bits of the control bits are required.
if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
// NOTE: SimplifyDemandedBits won't do this for constants.
const APInt &Val1 = Cst1->getAPIntValue();
APInt MaskedVal1 = Val1 & 0xFFFF;
if (MaskedVal1 != Val1) {
SDLoc DL(Op);
return TLO.CombineTo(
Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
TLO.DAG.getConstant(MaskedVal1, DL, VT)));
KnownBits Known1;
APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
return true;
// If the length is 0, replace with 0.
KnownBits LengthBits = Known1.extractBits(8, 8);
if (LengthBits.isZero())
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
return TargetLowering::SimplifyDemandedBitsForTargetNode(
Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
// Try to find an existing value that can stand in for the multi-use node Op
// when only DemandedBits of the DemandedElts lanes are observed. Nothing is
// mutated; on success the replacement value is returned, otherwise the generic
// TargetLowering implementation is consulted.
//
// Op           - target node being inspected.
// DemandedBits - per-element mask of the bits the user cares about.
// DemandedElts - mask of the vector lanes the user cares about.
// DAG          - DAG used for sign-bit queries and for creating bitcasts.
// Depth        - current recursion depth, forwarded to nested queries.
SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
    SelectionDAG &DAG, unsigned Depth) const {
  int NumElts = DemandedElts.getBitWidth();
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();

  switch (Opc) {
  case X86ISD::PINSRB:
  case X86ISD::PINSRW: {
    // If we don't demand the inserted element, return the base vector.
    SDValue Vec = Op.getOperand(0);
    auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    MVT VecVT = Vec.getSimpleValueType();
    if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
        !DemandedElts[CIdx->getZExtValue()])
      return Vec;
    break;
  }
  case X86ISD::VSHLI: {
    // If we are only demanding sign bits then we can use the shift source
    // directly.
    SDValue Op0 = Op.getOperand(0);
    unsigned ShAmt = Op.getConstantOperandVal(1);
    unsigned BitWidth = DemandedBits.getBitWidth();
    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
    unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
    if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
      return Op0;
    break;
  }
  case X86ISD::VSRAI:
    // iff we only need the sign bit then we can use the source directly.
    // TODO: generalize where we only demand extended signbits.
    if (DemandedBits.isSignMask())
      return Op.getOperand(0);
    break;
  case X86ISD::PCMPGT:
    // icmp sgt(0, R) == ashr(R, BitWidth-1).
    // iff we only need the sign bit then we can use R directly.
    if (DemandedBits.isSignMask() &&
        ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
      return Op.getOperand(1);
    break;
  }

  APInt ShuffleUndef, ShuffleZero;
  SmallVector<int, 16> ShuffleMask;
  SmallVector<SDValue, 2> ShuffleOps;
  if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
                             ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
    // If all the demanded elts are from one operand and are inline,
    // then we can use the operand directly.
    int NumOps = ShuffleOps.size();
    if (ShuffleMask.size() == (unsigned)NumElts &&
        llvm::all_of(ShuffleOps, [VT](SDValue V) {
          return VT.getSizeInBits() == V.getValueSizeInBits();
        })) {

      if (DemandedElts.isSubsetOf(ShuffleUndef))
        return DAG.getUNDEF(VT);
      if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));

      // Bitmask that indicates which ops have only been accessed 'inline'.
      APInt IdentityOp = APInt::getAllOnesValue(NumOps);
      for (int i = 0; i != NumElts; ++i) {
        int M = ShuffleMask[i];
        // Lanes nobody looks at (or that are undef) cannot disqualify an op.
        if (!DemandedElts[i] || ShuffleUndef[i])
          continue;
        int OpIdx = M / NumElts;
        int EltIdx = M % NumElts;
        // A sentinel or out-of-place element means no operand is an identity.
        if (M < 0 || EltIdx != i) {
          IdentityOp.clearAllBits();
          break;
        }
        IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
        if (IdentityOp == 0)
          break;
      }
      assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
             "Multiple identity shuffles detected");

      if (IdentityOp != 0)
        return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
    }
  }

  return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
      Op, DemandedBits, DemandedElts, DAG, Depth);
}
// Helper to peek through bitops/setcc to determine size of source vector.
// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
//
// Returns true when every SETCC leaf reachable through AND/OR/XOR nodes
// compares operands that are exactly Size bits wide; any other opcode yields
// false.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
  switch (Src.getOpcode()) {
  case ISD::SETCC:
    return Src.getOperand(0).getValueSizeInBits() == Size;
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
    // Both sides of the logic op must originate from a Size-bit compare.
    return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
           checkBitcastSrcVectorSize(Src.getOperand(1), Size);
  }
  return false;
}
// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
// Maps ISD::AND/OR/XOR and X86ISD::ANDNP to X86ISD::FAND/FOR/FXOR/FANDN; any
// other opcode is a programming error and hits llvm_unreachable.
static unsigned getAltBitOpcode(unsigned Opcode) {
  switch (Opcode) {
  case ISD::AND:      return X86ISD::FAND;
  case ISD::OR:       return X86ISD::FOR;
  case ISD::XOR:      return X86ISD::FXOR;
  case X86ISD::ANDNP: return X86ISD::FANDN;
  }
  llvm_unreachable("Unknown bitwise opcode");
}
// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
//
// Walks a v4i1 expression built from (setlt v4i32 X, 0) leaves combined with
// AND/OR/XOR and rebuilds it in the v4f32 domain. Returns the v4f32 value, or
// an empty SDValue when Src is not v4i1 or any leaf does not match.
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
                                          const SDLoc &DL) {
  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::v4i1)
    return SDValue();

  switch (Src.getOpcode()) {
  case ISD::SETCC:
    // Only a signed "less than zero" test of a v4i32 value qualifies.
    if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
        ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
        cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
      SDValue Op0 = Src.getOperand(0);
      if (ISD::isNormalLoad(Op0.getNode()))
        return DAG.getBitcast(MVT::v4f32, Op0);
      // Look through an existing bitcast from v4f32 instead of stacking one.
      if (Op0.getOpcode() == ISD::BITCAST &&
          Op0.getOperand(0).getValueType() == MVT::v4f32)
        return Op0.getOperand(0);
    }
    break;
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR: {
    // Both operands must convert, then the logic op moves to its FP twin.
    SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
    SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
    if (Op0 && Op1)
      return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
                         Op1);
    break;
  }
  }
  return SDValue();
}
// Helper to push sign extension of vXi1 SETCC result through bitops.
//
// SETCC leaves are wrapped in a SIGN_EXTEND to SExtVT; AND/OR/XOR nodes are
// re-created in SExtVT over their recursively extended operands. Any other
// opcode is unexpected and hits llvm_unreachable.
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
                                          SDValue Src, const SDLoc &DL) {
  switch (Src.getOpcode()) {
  case ISD::SETCC:
    return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
    return DAG.getNode(
        Src.getOpcode(), DL, SExtVT,
        signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
        signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
  }
  llvm_unreachable("Unexpected node type for vXi1 sign extension");
}
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
//
// DAG       - DAG in which replacement nodes are created.
// VT        - result type of the bitcast being combined.
// Src       - the vXi1 value being bitcast.
// DL        - debug location used for every node created here.
// Subtarget - queried for SSE1/SSE2/AVX/AVX512/BWI availability.
// Returns the replacement value, or an empty SDValue when nothing matches.
//
// NOTE(review): in this copy of the function several lines appear to be
// missing (the declaration of SExtVT and IntVT, the switch's default label,
// the per-case breaks and the closing braces) - confirm against the real file
// before relying on the control flow as printed here.
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
const SDLoc &DL,
const X86Subtarget &Subtarget) {
EVT SrcVT = Src.getValueType();
// Only simple vector types with i1 elements are handled.
if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();
// Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
// legalization destroys the v4i32 type.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
DAG.getBitcast(MVT::v4f32, V));
return DAG.getZExtOrTrunc(V, DL, VT);
// If the input is a truncate from v16i8 or v32i8 go ahead and use a
// movmskb even with avx512. This will be better than truncating to vXi1 and
// using a kmov. This can especially help KNL if the input is a v16i8/v32i8
// vpcmpeqb/vpcmpgtb.
// (The check below also accepts a truncate from v64i8.)
bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
(Src.getOperand(0).getValueType() == MVT::v16i8 ||
Src.getOperand(0).getValueType() == MVT::v32i8 ||
Src.getOperand(0).getValueType() == MVT::v64i8);
// Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
// directly with vpmovmskb/vmovmskps/vmovmskpd.
if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
EVT CmpVT = Src.getOperand(0).getValueType();
EVT EltVT = CmpVT.getVectorElementType();
// Limited to compares of at most 256 bits with i8, i32 or i64 elements.
if (CmpVT.getSizeInBits() <= 256 &&
(EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
PreferMovMsk = true;
// With AVX512 vxi1 types are legal and we prefer using k-regs.
// MOVMSK is supported in SSE2 or later.
if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
return SDValue();
// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
// v8f64. So all legal 128-bit and 256-bit vectors are covered except for
// v8i16 and v16i16.
// For these two cases, we can shuffle the upper element bytes to a
// consecutive sequence at the start of the vector and treat the results as
// v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
// for v16i16 this is not the case, because the shuffle is expensive, so we
// avoid sign-extending to this type entirely.
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
// PropagateSExt selects whether the sign extension is pushed through the
// logic ops down to the SETCC leaves (see its use after the switch).
bool PropagateSExt = false;
// Choose the vector type to sign-extend the vXi1 source to.
switch (SrcVT.getSimpleVT().SimpleTy) {
return SDValue();
case MVT::v2i1:
SExtVT = MVT::v2i64;
case MVT::v4i1:
SExtVT = MVT::v4i32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
SExtVT = MVT::v4i64;
PropagateSExt = true;
case MVT::v8i1:
SExtVT = MVT::v8i16;
// For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
// sign-extend to a 256-bit operation to match the compare.
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
checkBitcastSrcVectorSize(Src, 512))) {
SExtVT = MVT::v8i32;
PropagateSExt = true;
case MVT::v16i1:
SExtVT = MVT::v16i8;
// For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
// it is not profitable to sign-extend to 256-bit because this will
// require an extra cross-lane shuffle which is more expensive than
// truncating the result of the compare to 128-bits.
case MVT::v32i1:
SExtVT = MVT::v32i8;
case MVT::v64i1:
// If we have AVX512F, but not AVX512BW and the input is truncated from
// v64i8 checked earlier. Then split the input and make two pmovmskbs.
if (Subtarget.hasAVX512()) {
if (Subtarget.hasBWI())
return SDValue();
SExtVT = MVT::v64i8;
// Split if this is a <64 x i8> comparison result.
if (checkBitcastSrcVectorSize(Src, 512)) {
SExtVT = MVT::v64i8;
return SDValue();
// Materialize the sign-extended vector, either leaf-by-leaf or as one node.
SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
: DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
// Byte vectors go through getPMOVMSKB; everything else uses MOVMSK, with
// v8i16 first packed down to v16i8.
if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
V = getPMOVMSKB(DL, V, DAG, Subtarget);
} else {
if (SExtVT == MVT::v8i16)
V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
// Resize the mask to one bit per source element, then bitcast to VT.
V = DAG.getZExtOrTrunc(V, DL, IntVT);
return DAG.getBitcast(VT, V);
// Convert a vXi1 constant build vector to the same width scalar integer.
//
// Op must be a BUILD_VECTOR of constants with i1 elements (asserted). Bit Idx
// of the result is set when operand Idx is a defined constant whose low bit is
// 1; undef operands contribute a zero bit. The result is an integer constant
// with one bit per vector element.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
  EVT SrcVT = Op.getValueType();
  assert(SrcVT.getVectorElementType() == MVT::i1 &&
         "Expected a vXi1 vector");
  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
         "Expected a constant build vector");

  APInt Imm(SrcVT.getVectorNumElements(), 0);
  for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
    SDValue In = Op.getOperand(Idx);
    if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
      Imm.setBit(Idx);
  }
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
  return DAG.getConstant(Imm, SDLoc(Op), IntVT);
}
// Try to remove bitcasts around a single-use AND/OR/XOR that sits between the
// vXi1 mask domain and the scalar integer domain, by performing the logic op
// directly in the destination type of the bitcast N.
//
// N         - the ISD::BITCAST node being combined (asserted).
// DAG       - DAG in which replacement nodes are created.
// DCI       - combiner state; the fold only runs before operation legalization.
// Subtarget - the fold requires AVX512.
// Returns the replacement logic op, or an empty SDValue when nothing matches.
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");

  if (!DCI.isBeforeLegalizeOps())
    return SDValue();

  // Only do this if we have k-registers.
  if (!Subtarget.hasAVX512())
    return SDValue();

  EVT DstVT = N->getValueType(0);
  SDValue Op = N->getOperand(0);
  EVT SrcVT = Op.getValueType();

  if (!Op.hasOneUse())
    return SDValue();

  // Look for logic ops.
  if (Op.getOpcode() != ISD::AND &&
      Op.getOpcode() != ISD::OR &&
      Op.getOpcode() != ISD::XOR)
    return SDValue();

  // Make sure we have a bitcast between mask registers and a scalar type.
  if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
        DstVT.isScalarInteger()) &&
      !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
        SrcVT.isScalarInteger()))
    return SDValue();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  // If one side is already a single-use bitcast from DstVT, strip it and cast
  // the other side instead.
  if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
      LHS.getOperand(0).getValueType() == DstVT)
    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
                       DAG.getBitcast(DstVT, RHS));

  if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
      RHS.getOperand(0).getValueType() == DstVT)
    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
                       DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));

  // If the RHS is a vXi1 build vector, this is a good reason to flip too.
  // Most of these have to move a constant from the scalar domain anyway.
  if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
    RHS = combinevXi1ConstantToInteger(RHS, DAG);
    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
                       DAG.getBitcast(DstVT, LHS), RHS);
  }

  return SDValue();
}
// Build an x86mmx value from the operands of the build vector BV.
//
// BV        - build vector whose operands supply the elements.
// DAG       - DAG in which the new nodes are created.
// Subtarget - queried for SSE1, which enables the MOVDQ2Q and PSHUFW paths.
// Returns the x86mmx value left in Ops[0] after the PUNPCKL reduction, or an
// early result from the splat path.
//
// NOTE(review): DL is used throughout but its declaration is not visible in
// this copy; the first two arguments of the getNode calls in the splat path,
// the body of the element loop and the closing braces also appear to be
// missing - confirm against the real file.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NumElts = BV->getNumOperands();
SDValue Splat = BV->getSplatValue();
// Build MMX element from integer GPR or SSE float values.
auto CreateMMXElement = [&](SDValue V) {
if (V.isUndef())
return DAG.getUNDEF(MVT::x86mmx);
if (V.getValueType().isFloatingPoint()) {
// Non-constant FP values with SSE1 go via SCALAR_TO_VECTOR + MOVDQ2Q.
if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
V = DAG.getBitcast(MVT::v2i64, V);
return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
V = DAG.getBitcast(MVT::i32, V);
} else {
V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
// Everything else reaches MMX as an i32 through MMX_MOVW2D.
return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
// Convert build vector ops to MMX data in the bottom elements.
SmallVector<SDValue, 8> Ops;
// Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
if (Splat) {
if (Splat.isUndef())
return DAG.getUNDEF(MVT::x86mmx);
Splat = CreateMMXElement(Splat);
if (Subtarget.hasSSE1()) {
// Unpack v8i8 to splat i8 elements to lowest 16-bits.
if (NumElts == 8)
Splat = DAG.getNode(
DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
// Use PSHUFW to repeat 16-bit elements.
// The shuffle immediate is 0 for more than two elements, 0x44 otherwise.
unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
return DAG.getNode(
DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32),
Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
// Otherwise replicate the splat element NumElts times for the reduction below.
Ops.append(NumElts, Splat);
} else {
for (unsigned i = 0; i != NumElts; ++i)
// Use tree of PUNPCKLs to build up general MMX vector.
while (Ops.size() > 1) {
unsigned NumOps = Ops.size();
// 2 ops remaining -> punpckldq, 4 -> punpcklwd, otherwise punpcklbw.
unsigned IntrinOp =
(NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
: (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
: Intrinsic::x86_mmx_punpcklbw));
SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
// Combine adjacent pairs in place, halving the number of live operands.
for (unsigned i = 0; i != NumOps; i += 2)
Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
Ops[i], Ops[i + 1]);
Ops.resize(NumOps / 2);
return Ops[0];
// Recursive function that attempts to find if a bool vector node was originally
// a vector/float/double that got truncated/extended/bitcast to/from a scalar
// integer. If so, replace the scalar ops with bool vector equivalents back down
// the chain.
//
// VT        - the bool vector type to produce.
// V         - the scalar value being examined.
// DL        - debug location for the nodes created.
// DAG       - DAG in which the new nodes are created.
// Subtarget - forwarded unchanged to the recursive calls.
// Returns the bool vector equivalent of V, or an empty SDValue on failure.
//
// NOTE(review): in this copy the case labels of the second and third switch
// arms, the heads of their return statements, the opcode arguments of the SHL
// arm's getNode call, and the break/closing-brace lines are not visible -
// confirm against the real file. Going by the comments, the arms are
// presumably the truncate and the extend cases.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, SDLoc DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned Opc = V.getOpcode();
switch (Opc) {
case ISD::BITCAST: {
// Bitcast from a vector/float/double, we can cheaply bitcast to VT.
SDValue Src = V.getOperand(0);
EVT SrcVT = Src.getValueType();
if (SrcVT.isVector() || SrcVT.isFloatingPoint())
return DAG.getBitcast(VT, Src);
// If we find a suitable source, a truncated scalar becomes a subvector.
SDValue Src = V.getOperand(0);
// One i1 element per bit of the wider source value.
EVT NewSrcVT =
EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
if (TLI.isTypeLegal(NewSrcVT))
if (SDValue N0 =
combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
DAG.getIntPtrConstant(0, DL));
// If we find a suitable source, an extended scalar becomes a subvector.
SDValue Src = V.getOperand(0);
EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
if (TLI.isTypeLegal(NewSrcVT))
if (SDValue N0 =
combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
: DAG.getConstant(0, DL, VT),
N0, DAG.getIntPtrConstant(0, DL));
case ISD::OR: {
// If we find suitable sources, we can just move an OR to the vector domain.
// Both operands must convert for the OR to be rebuilt in VT.
SDValue Src0 = V.getOperand(0);
SDValue Src1 = V.getOperand(1);
if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
return DAG.getNode(Opc, DL, VT, N0, N1);
case ISD::SHL: {
// If we find a suitable source, a SHL becomes a KSHIFTL.
// Only constant shift amounts are handled; the amount becomes an i8
// target constant.
SDValue Src0 = V.getOperand(0);
if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
return DAG.getNode(
DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
return SDValue();
// DAG combine for ISD::BITCAST nodes on X86. Tries, in order: the vXi1 ->
// MOVMSK rewrite and v2i1/v4i1 widening (before legalization), removal of
// scalar<->vXi1 round trips, canonicalization of bitcast(vbroadcast_load),
// the x86mmx special cases, constant vXi1 folding, MOVMSK -> k-register
// compares, mask arithmetic, and finally integer logic -> FP logic.
//
// N         - the bitcast node being combined.
// DAG       - DAG in which replacement nodes are created.
// DCI       - combiner state (used for the legalization-phase checks).
// Subtarget - queried for SSE/AVX512/DQI/BWI availability.
// Returns the replacement value, or an empty SDValue when nothing applies.
//
// NOTE(review): this copy of the function appears to be missing its closing
// braces and a few continuation lines (e.g. the last operand of the
// condition ending at "== MVT::v16i1 &&" and of the CONCAT_VECTORS call in
// the FP_TO_SINT case) - confirm against the real file.
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SrcVT = N0.getValueType();
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (16i8 sext (v16i1 x)))
// before the setcc result is scalarized on subtargets that don't have legal
// vxi1 types.
if (DCI.isBeforeLegalize()) {
SDLoc dl(N);
if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
// (scalar -> v4i1/v2i1 direction: go through i8 and v8i1.)
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
Subtarget.hasAVX512()) {
N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
N0 = DAG.getBitcast(MVT::v8i1, N0);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
DAG.getIntPtrConstant(0, dl));
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
// (v4i1/v2i1 -> scalar direction: concat up to v8i1, bitcast, truncate.)
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
Subtarget.hasAVX512()) {
// Use zeros for the widening if we already have some zeroes. This can
// allow SimplifyDemandedBits to remove scalar ANDs that may be down
// stream of this.
// FIXME: It might make sense to detect a concat_vectors with a mix of
// zeroes and undef and turn it into insert_subvector for i1 vectors as
// a separate combine. What we can't do is canonicalize the operands of
// such a concat or we'll get into a loop with SimplifyDemandedBits.
if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
SrcVT = LastOp.getValueType();
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
N0 = DAG.getBitcast(MVT::i8, N0);
return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
// Default widening: pad the source with undef subvectors up to v8i1.
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
Ops[0] = N0;
N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
N0 = DAG.getBitcast(MVT::i8, N0);
return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
} else {
// If we're bitcasting from iX to vXi1, see if the integer originally
// began as a vXi1 and whether we can remove the bitcast entirely.
if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
SrcVT.isScalarInteger() &&
DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
if (SDValue V =
combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
return V;
// Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
// replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
// due to insert_subvector legalization on KNL. By promoting the copy to i16
// we can help with known bits propagation from the vXi1 domain to the
// scalar domain.
if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
!Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N0.getOperand(0).getValueType() == MVT::v16i1 &&
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
DAG.getBitcast(MVT::i16, N0.getOperand(0)));
// Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
// and the vbroadcast_load are both integer or both fp. In some cases this
// will remove the bitcast entirely.
if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
auto *BCast = cast<MemIntrinsicSDNode>(N0);
unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
// Don't swap i8/i16 since we don't have fp types of that size.
if (MemSize >= 32) {
// Re-type both the memory element and the loaded vector to VT's domain
// (fp or integer) while keeping their bit sizes.
MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
: MVT::getIntegerVT(MemSize);
MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
: MVT::getIntegerVT(SrcVTSize);
LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
MemVT, BCast->getMemOperand());
// Redirect users of the old load's chain result to the new load.
DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
return DAG.getBitcast(VT, ResNode);
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
if (VT == MVT::x86mmx) {
// Detect MMX constant vectors.
APInt UndefElts;
SmallVector<APInt, 1> EltBits;
if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
SDLoc DL(N0);
// Handle zero-extension of i32 with MOVD.
if (EltBits[0].countLeadingZeros() >= 32)
return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
// Else, bitcast to a double.
// TODO - investigate supporting sext 32-bit immediates on x86_64.
APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
// Detect bitcasts to x86mmx low word.
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
// LowUndef: every element in the lower half except element 0 is undef.
// AllUndefOrZero: every element except element 0 is undef or zero.
bool LowUndef = true, AllUndefOrZero = true;
for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
SDValue Op = N0.getOperand(i);
LowUndef &= Op.isUndef() || (i >= e/2);
AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
if (AllUndefOrZero) {
SDValue N00 = N0.getOperand(0);
SDLoc dl(N00);
// Any-extend is enough when the rest of the low half is undef; otherwise
// zero-extend so the zero elements are honoured.
N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
: DAG.getZExtOrTrunc(N00, dl, MVT::i32);
return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
// Detect bitcasts of 64-bit build vectors and convert to a
// MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
// lowest element.
if (N0.getOpcode() == ISD::BUILD_VECTOR &&
(SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
SrcVT == MVT::v8i8))
return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
// Detect bitcasts between element or subvector extraction to x86mmx.
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
isNullConstant(N0.getOperand(1))) {
SDValue N00 = N0.getOperand(0);
if (N00.getValueType().is128BitVector())
return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
DAG.getBitcast(MVT::v2i64, N00));
// Detect bitcasts from FP_TO_SINT to x86mmx.
if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
SDLoc DL(N0);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
DAG.getBitcast(MVT::v2i64, Res));
// Try to remove a bitcast of constant vXi1 vector. We have to legalize
// most of these to scalar anyway.
if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
return combinevXi1ConstantToInteger(N0, DAG);
// Conversely, fold a bitcast of an all-ones or all-zeros scalar constant to
// vXi1 into the matching vector constant.
if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
isa<ConstantSDNode>(N0)) {
auto *C = cast<ConstantSDNode>(N0);
if (C->isAllOnesValue())
return DAG.getConstant(1, SDLoc(N0), VT);
if (C->isNullValue())
return DAG.getConstant(0, SDLoc(N0), VT);
// Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
// Turn it into a sign bit compare that produces a k-register. This avoids
// a trip through a GPR.
if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
isPowerOf2_32(VT.getVectorNumElements())) {
unsigned NumElts = VT.getVectorNumElements();
SDValue Src = N0;
// Peek through truncate.
if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
Src = N0.getOperand(0);
if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
SDValue MovmskIn = Src.getOperand(0);
MVT MovmskVT = MovmskIn.getSimpleValueType();
unsigned MovMskElts = MovmskVT.getVectorNumElements();
// We allow extra bits of the movmsk to be used since they are known zero.
// We can't convert a VPMOVMSKB without avx512bw.
if (MovMskElts <= NumElts &&
(Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
// Compare the integer view of the MOVMSK input against zero (signed <).
EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
SDLoc dl(N);
MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
DAG.getConstant(0, dl, IntVT), ISD::SETLT);
if (EVT(CmpVT) == VT)
return Cmp;
// Pad with zeroes up to original VT to replace the zeroes that were
// being used from the MOVMSK.
unsigned NumConcats = NumElts / MovMskElts;
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
Ops[0] = Cmp;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
// Try to remove bitcasts from input and output of mask arithmetic to
// remove GPR<->K-register crossings.
if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
return V;
// Convert a bitcasted integer logic operation that has one bitcasted
// floating-point operand into a floating-point logic operation. This may
// create a load of a constant, but that is cheaper than materializing the
// constant in an integer register and transferring it to an SSE register or
// transferring the SSE operand to integer register and back.
unsigned FPOpcode;
switch (N0.getOpcode()) {
case ISD::AND: FPOpcode = X86ISD::FAND; break;
case ISD::OR: FPOpcode = X86ISD::FOR; break;
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
default: return SDValue();
// Only f32 with SSE1 or f64 with SSE2 results are handled.
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
(Subtarget.hasSSE2() && VT == MVT::f64)))
return SDValue();
SDValue LogicOp0 = N0.getOperand(0);
SDValue LogicOp1 = N0.getOperand(1);
SDLoc DL0(N0);
// bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
// bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
return SDValue();
// Given a ABS node, detect the following pattern:
// This is useful as it is the input into a SAD pattern.
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
SDValue AbsOp1 = Abs->getOperand(0);
if (AbsOp1.getOpcode() != ISD::SUB)
return false;
Op0 = AbsOp1.getOperand(0);
Op1 = AbsOp1.getOperand(1);
// Check if the operands of the sub are zero-extended from vectors of i8.
if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
Op1.getOpcode() != ISD::ZERO_EXTEND ||
Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
return false;
return true;
/// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
/// to these zexts.
///
/// \param Zext0,Zext1  The two ZERO_EXTEND nodes; only their first operand
///                     (the narrow i8 vector) is used.
/// \param DL           Debug location attached to every node created here.
/// \param Subtarget    Forwarded to SplitOpsAndApply to pick the split width.
/// \returns The PSADBW result as a vector of i64 with RegSize / 64 elements.
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
                            const SDValue &Zext1, const SDLoc &DL,
                            const X86Subtarget &Subtarget) {
  // Find the appropriate width for the PSADBW: at least 128 bits.
  EVT InVT = Zext0.getOperand(0).getValueType();
  unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());

  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
  // fill in the missing vector elements with 0.
  unsigned NumConcat = RegSize / InVT.getSizeInBits();
  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);

  // Slot 0 carries the real data; the remaining slots stay zero.
  Ops[0] = Zext0.getOperand(0);
  SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
  Ops[0] = Zext1.getOperand(0);
  SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);

  // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
  auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                          ArrayRef<SDValue> Ops) {
    // One i64 result lane per 64 bits of input.
    MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
    return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
  };
  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
  return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
                          PSADBWBuilder);
}
/// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
/// PHMINPOSUW.
///
/// \param Extract    The extract node that terminates the reduction; its
///                   result type must be i16 or i8.
/// \param Subtarget  Must report SSE4.1, otherwise nothing is done.
/// \returns The replacement scalar, or an empty SDValue when the pattern does
///          not match.
static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
                                             const X86Subtarget &Subtarget) {
  // Bail without SSE41.
  if (!Subtarget.hasSSE41())
    return SDValue();

  EVT ExtractVT = Extract->getValueType(0);
  if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
    return SDValue();

  // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
  ISD::NodeType BinOp;
  SDValue Src = DAG.matchBinOpReduction(
      Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
  if (!Src)
    return SDValue();

  // The reduced vector must have the extracted element type and be a whole
  // number of 128-bit chunks.
  EVT SrcVT = Src.getValueType();
  EVT SrcSVT = SrcVT.getScalarType();
  if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
    return SDValue();

  SDLoc DL(Extract);
  SDValue MinPos = Src;

  // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
  while (SrcVT.getSizeInBits() > 128) {
    SDValue Lo, Hi;
    std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
    SrcVT = Lo.getValueType();
    MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
  }
  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
         "Unexpected value type");

  // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
  // to flip the value accordingly. For UMIN the mask stays empty and both
  // XORs below are skipped.
  SDValue Mask;
  unsigned MaskEltsBits = ExtractVT.getSizeInBits();
  if (BinOp == ISD::SMAX)
    Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
  else if (BinOp == ISD::SMIN)
    Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
  else if (BinOp == ISD::UMAX)
    Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);

  if (Mask)
    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

  // For v16i8 cases we need to perform UMIN on pairs of byte elements,
  // shuffling each upper element down and insert zeros. This means that the
  // v16i8 UMIN will leave the upper element as zero, performing zero-extension
  // ready for the PHMINPOS.
  if (ExtractVT == MVT::i8) {
    SDValue Upper = DAG.getVectorShuffle(
        SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
        {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
    MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
  }

  // Perform the PHMINPOS on a v8i16 vector,
  MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
  MinPos = DAG.getBitcast(SrcVT, MinPos);

  // Undo the value flip applied before the reduction.
  if (Mask)
    MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
                     DAG.getIntPtrConstant(0, DL));
}
/// Attempt to replace an all_of/any_of/parity style horizontal reduction with
/// a MOVMSK.
///
/// The reduction is recognised through matchBinOpReduction on OR (any_of),
/// AND (all_of) and, for i1 extracts only, XOR (parity).
///
/// \param Extract    The extract node that terminates the reduction.
/// \param Subtarget  Must report SSE2, otherwise nothing is done.
/// \returns For any_of/all_of a 0/-1 value of the extracted type; for parity
///          the low bit of the population count of the mask, zero-extended
///          or truncated to the extracted type. Returns an empty SDValue when
///          the pattern is not handled.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
                                                SelectionDAG &DAG,
                                                const X86Subtarget &Subtarget) {
  // Bail without SSE2.
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT ExtractVT = Extract->getValueType(0);
  unsigned BitWidth = ExtractVT.getSizeInBits();
  if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
      ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
    return SDValue();

  // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
  ISD::NodeType BinOp;
  SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
  if (!Match && ExtractVT == MVT::i1)
    Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
  if (!Match)
    return SDValue();

  // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
  // which we can't support here for now.
  if (Match.getScalarValueSizeInBits() != BitWidth)
    return SDValue();

  SDValue Movmsk;
  SDLoc DL(Extract);
  EVT MatchVT = Match.getValueType();
  unsigned NumElts = MatchVT.getVectorNumElements();
  unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (ExtractVT == MVT::i1) {
    // Special case for (pre-legalization) vXi1 reductions.
    if (NumElts > 64 || !isPowerOf2_32(NumElts))
      return SDValue();
    if (TLI.isTypeLegal(MatchVT)) {
      // If this is a legal AVX512 predicate type then we can just bitcast.
      EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
      Movmsk = DAG.getBitcast(MovmskVT, Match);
    } else {
      // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
      // PCMPEQQ (SSE41+), use PCMPEQD instead.
      if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
          Match.getOpcode() == ISD::SETCC &&
          ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
          cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
              ISD::CondCode::SETEQ) {
        SDValue Vec = Match.getOperand(0);
        if (Vec.getValueType().getScalarType() == MVT::i64 &&
            (2 * NumElts) <= MaxElts) {
          // Re-express the compare on twice as many i32 lanes.
          NumElts *= 2;
          EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
          MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
          Match = DAG.getSetCC(
              DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
              DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
        }
      }

      // Use combineBitcastvxi1 to create the MOVMSK. Halve the vector with
      // BinOp until it has at most MaxElts elements.
      while (NumElts > MaxElts) {
        SDValue Lo, Hi;
        std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
        Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
        NumElts /= 2;
      }
      EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
      Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
    }
    if (!Movmsk)
      return SDValue();
    Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
  } else {
    // FIXME: Better handling of k-registers or 512-bit vectors?
    unsigned MatchSizeInBits = Match.getValueSizeInBits();
    if (!(MatchSizeInBits == 128 ||
          (MatchSizeInBits == 256 && Subtarget.hasAVX())))
      return SDValue();

    // Make sure this isn't a vector of 1 element. The perf win from using
    // MOVMSK diminishes with less elements in the reduction, but it is
    // generally better to get the comparison over to the GPRs as soon as
    // possible to reduce the number of vector ops.
    if (Match.getValueType().getVectorNumElements() < 2)
      return SDValue();

    // Check that we are extracting a reduction of all sign bits.
    if (DAG.ComputeNumSignBits(Match) != BitWidth)
      return SDValue();

    // 256-bit i8/i16 vectors without Int256: fold the halves together first.
    if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
      Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
      MatchSizeInBits = Match.getValueSizeInBits();
    }

    // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
    MVT MaskSrcVT;
    if (64 == BitWidth || 32 == BitWidth)
      MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
                                   MatchSizeInBits / BitWidth);
    else
      MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);

    SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
    Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
    NumElts = MaskSrcVT.getVectorNumElements();
  }
  assert((NumElts <= 32 || NumElts == 64) &&
         "Not expecting more than 64 elements");

  MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
  if (BinOp == ISD::XOR) {
    // parity -> (AND (CTPOP(MOVMSK X)), 1)
    SDValue Mask = DAG.getConstant(1, DL, CmpVT);
    SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk);
    Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask);
    return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
  }

  SDValue CmpC;
  ISD::CondCode CondCode;
  if (BinOp == ISD::OR) {
    // any_of -> MOVMSK != 0
    CmpC = DAG.getConstant(0, DL, CmpVT);
    CondCode = ISD::CondCode::SETNE;
  } else {
    // all_of -> MOVMSK == ((1 << NumElts) - 1)
    CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
                           DL, CmpVT);
    CondCode = ISD::CondCode::SETEQ;
  }

  // The setcc produces an i8 of 0/1, so extend that to the result width and
  // negate to get the final 0/-1 mask value.
  EVT SetccVT =
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
  SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
  SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
  SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
  return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
}
/// Recognise an ADD reduction of an abs-diff of zero-extended i8 vectors and
/// replace it with a PSADBW-based sequence.
///
/// \param Extract    The extract node that terminates the reduction; its
///                   result type must be i32 or i64.
/// \param Subtarget  Must report SSE2, otherwise nothing is done.
/// \returns The replacement scalar, or an empty SDValue when the pattern does
///          not match.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // PSADBW is only supported on SSE2 and up.
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT ExtractVT = Extract->getValueType(0);
  // Verify the type we're extracting is either i32 or i64.
  // FIXME: Could support other types, but this is what we have coverage for.
  if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
    return SDValue();

  EVT VT = Extract->getOperand(0).getValueType();
  if (!isPowerOf2_32(VT.getVectorNumElements()))
    return SDValue();

  // Match shuffle + add pyramid.
  ISD::NodeType BinOp;
  SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});

  // The operand is expected to be zero extended from i8
  // (verified in detectZextAbsDiff).
  // In order to convert to i64 and above, additional any/zero/sign
  // extend is expected.
  // The zero extend from 32 bit has no mathematical effect on the result.
  // Also the sign extend is basically zero extend
  // (extends the sign bit which is zero).
  // So it is correct to skip the sign/zero extend instruction.
  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
               Root.getOpcode() == ISD::ZERO_EXTEND ||
               Root.getOpcode() == ISD::ANY_EXTEND))
    Root = Root.getOperand(0);

  // If there was a match, we want Root to be an ABS that is the root of an
  // abs-diff pattern.
  if (!Root || Root.getOpcode() != ISD::ABS)
    return SDValue();

  // Check whether we have an abs-diff pattern feeding into the ABS.
  SDValue Zext0, Zext1;
  if (!detectZextAbsDiff(Root, Zext0, Zext1))
    return SDValue();

  // Create the SAD instruction.
  SDLoc DL(Extract);
  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);

  // If the original vector was wider than 8 elements, sum over the results
  // in the SAD vector.
  unsigned Stages = Log2_32(VT.getVectorNumElements());
  EVT SadVT = SAD.getValueType();
  if (Stages > 3) {
    unsigned SadElems = SadVT.getVectorNumElements();

    for (unsigned i = Stages - 3; i > 0; --i) {
      // Shuffle the upper MaskEnd lanes down onto the lower MaskEnd lanes;
      // every other lane is left undefined (-1).
      SmallVector<int, 16> Mask(SadElems, -1);
      for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
        Mask[j] = MaskEnd + j;

      SDValue Shuffle =
          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
    }
  }

  unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
  // Return the lowest ExtractSizeInBits bits.
  EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
                               SadVT.getSizeInBits() / ExtractSizeInBits);
  SAD = DAG.getBitcast(ResVT, SAD);
  // NOTE(review): the index operand was missing from this copy of the file;
  // the original extract's own index is reused here. Presumably that is
  // element 0 (the "lowest bits" above) - confirm against upstream.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
                     Extract->getOperand(1));
}
// Attempt to peek through a target shuffle and extract the scalar from the
// source.
//
// N is the extract node: operand 0 is the source vector and operand 1 the
// element index. Nothing is attempted before operation legalization, for i1
// element vectors, for non-constant indices, or for indices at or beyond the
// number of source elements. Returns the replacement value, or an empty
// SDValue when no simplification applies.
//
// NOTE(review): this copy of the function has lost its block-closing braces
// and at least one argument line (see the getLoad call below); restore them
// from the upstream source before building.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDLoc dl(N);
SDValue Src = N->getOperand(0);
SDValue Idx = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getVectorElementType();
unsigned SrcEltBits = SrcSVT.getSizeInBits();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
// Don't attempt this for boolean mask vectors or unknown extraction indices.
if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
return SDValue();
// Out-of-range constant indices are left alone.
const APInt &IdxC = N->getConstantOperandAPInt(1);
if (IdxC.uge(NumSrcElts))
return SDValue();
SDValue SrcBC = peekThroughBitcasts(Src);
// Handle extract(bitcast(broadcast(scalar_value))).
if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
SDValue SrcOp = SrcBC.getOperand(0);
EVT SrcOpVT = SrcOp.getValueType();
if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
(SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
// Scale is how many source elements fit in the broadcast scalar; Offset is
// the bit position of the requested element inside that scalar.
unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
// TODO support non-zero offsets.
if (Offset == 0) {
SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
return SrcOp;
// If we're extracting a single element from a broadcast load and there are
// no other users, just create a single load.
if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
// Only when the memory type, the result type and the source element all
// have the same width as the broadcast element.
if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
// NOTE(review): the argument list of this getLoad call is truncated in this
// copy (nothing follows the chain operand) - restore from upstream.
SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
// Redirect users of the broadcast load's chain result to the new load's
// chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
return Load;
// Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
// TODO: Move to DAGCombine?
if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
SrcBC.getValueType().isInteger() &&
(SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
SrcBC.getScalarValueSizeInBits() ==
SrcBC.getOperand(0).getValueSizeInBits()) {
unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
// Only elements that lie inside the original scalar are handled.
if (IdxC.ult(Scale)) {
unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
SDValue Scl = SrcBC.getOperand(0);
EVT SclVT = Scl.getValueType();
// Shift the requested element down to bit 0 before truncating.
if (Offset) {
Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
DAG.getShiftAmountConstant(Offset, SclVT, dl));
Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
return Scl;
// Handle extract(truncate(x)) for 0'th index.
// TODO: Treat this as a faux shuffle?
// TODO: When can we use this for general indices?
if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && IdxC == 0) {
Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
Src = DAG.getBitcast(SrcVT, Src);
return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx);
// Resolve the target shuffle inputs and mask.
SmallVector<int, 16> Mask;
SmallVector<SDValue, 2> Ops;
if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
return SDValue();
// Shuffle inputs must be the same size as the result.
if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
return SDValue();
// Attempt to narrow/widen the shuffle mask to the correct size.
if (Mask.size() != NumSrcElts) {
if ((NumSrcElts % Mask.size()) == 0) {
// The mask is coarser than the source vector: split each mask element into
// Scale narrower ones.
SmallVector<int, 16> ScaledMask;
int Scale = NumSrcElts / Mask.size();
narrowShuffleMaskElts(Scale, Mask, ScaledMask);
Mask = std::move(ScaledMask);
} else if ((Mask.size() % NumSrcElts) == 0) {
// Simplify Mask based on demanded element.
int ExtractIdx = (int)N->getConstantOperandVal(1);
int Scale = Mask.size() / NumSrcElts;
// [Lo, Hi) is the range of mask elements covering the extracted element;
// everything outside it is marked undef.
int Lo = Scale * ExtractIdx;
int Hi = Scale * (ExtractIdx + 1);
for (int i = 0, e = (int)Mask.size(); i != e; ++i)
if (i < Lo || Hi <= i)
Mask[i] = SM_SentinelUndef;
SmallVector<int, 16> WidenedMask;
while (Mask.size() > NumSrcElts &&
canWidenShuffleElements(Mask, WidenedMask))
Mask = std::move(WidenedMask);
// TODO - investigate support for wider shuffle masks with known upper
// undef/zero elements for implicit zero-extension.
// Check if narrowing/widening failed.
if (Mask.size() != NumSrcElts)
return SDValue();
int SrcIdx = Mask[IdxC.getZExtValue()];
// If the shuffle source element is undef/zero then we can just accept it.
if (SrcIdx == SM_SentinelUndef)
return DAG.getUNDEF(VT);
if (SrcIdx == SM_SentinelZero)
return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
: DAG.getConstant(0, dl, VT);
// Split the mask value into the shuffle input it selects and the element
// index within that input.
SDValue SrcOp = Ops[SrcIdx / Mask.size()];
SrcIdx = SrcIdx % Mask.size();
// We can only extract other elements from 128-bit vectors and in certain
// circumstances, depending on SSE-level.
// TODO: Investigate using extract_subvector for larger vectors.
// TODO: Investigate float/double extraction if it will be just stored.
if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
assert(SrcSVT == VT && "Unexpected extraction type");
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
// v8i16 uses PEXTRW (SSE2) and v16i8 uses PEXTRB (SSE4.1); the node is
// created with an i32 result that is then resized to VT.
if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
(SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
assert(VT.getSizeInBits() >= SrcEltBits && "Unexpected extraction type");
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
return DAG.getZExtOrTrunc(ExtOp, dl, VT);
return SDValue();
/// Extracting a scalar FP value from vector element 0 is free, so extract each
/// operand first, then perform the math as a scalar op.
///
/// \param ExtElt  An EXTRACT_VECTOR_ELT node (asserted). The transform only
///                applies when the vector has a single use, the index is the
///                constant 0 and the vector's scalar type equals the result
///                type.
/// \returns The scalarized operation, or an empty SDValue when the vector
///          operand is not one of the handled opcodes.
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
  SDValue Vec = ExtElt->getOperand(0);
  SDValue Index = ExtElt->getOperand(1);
  EVT VT = ExtElt->getValueType(0);
  EVT VecVT = Vec.getValueType();

  // TODO: If this is a unary/expensive/expand op, allow extraction from a
  // non-zero element because the shuffle+scalar op will be cheaper?
  if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
    return SDValue();

  // Vector FP compares don't fit the pattern of FP math ops (propagate, not
  // extract, the condition code), so deal with those as a special-case.
  if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
    EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
    if (OpVT != MVT::f32 && OpVT != MVT::f64)
      return SDValue();

    // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
    SDLoc DL(ExtElt);
    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
                               Vec.getOperand(0), Index);
    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
                               Vec.getOperand(1), Index);
    return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
  }

  if (VT != MVT::f32 && VT != MVT::f64)
    return SDValue();

  // Vector FP selects don't fit the pattern of FP math ops (because the
  // condition has a different type and we have to change the opcode), so deal
  // with those here.
  // FIXME: This is restricted to pre type legalization by ensuring the setcc
  // has i1 elements. If we loosen this we need to convert vector bool to a
  // scalar bool.
  if (Vec.getOpcode() == ISD::VSELECT &&
      Vec.getOperand(0).getOpcode() == ISD::SETCC &&
      Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
      Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
    // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
    SDLoc DL(ExtElt);
    // The condition is extracted with its own scalar type (i1, per the check
    // above); the two value operands are extracted as VT.
    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                               Vec.getOperand(0).getValueType().getScalarType(),
                               Vec.getOperand(0), Index);
    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                               Vec.getOperand(1), Index);
    SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                               Vec.getOperand(2), Index);
    return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
  }

  // TODO: This switch could include FNEG and the x86-specific FP logic ops
  // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
  // missed load folding and fma+fneg combining.
  // NOTE(review): the case list is reproduced exactly as found in this copy
  // of the file; verify against upstream that no opcodes were dropped.
  switch (Vec.getOpcode()) {
  case ISD::FMA: // Begin 3 operands
  case ISD::FMAD:
  case ISD::FADD: // Begin 2 operands
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case X86ISD::FMAX:
  case X86ISD::FMIN:
  case ISD::FABS: // Begin 1 operand
  case ISD::FSQRT:
  case ISD::FRINT:
  case ISD::FCEIL:
  case X86ISD::FRCP:
  case X86ISD::FRSQRT: {
    // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
    SDLoc DL(ExtElt);
    SmallVector<SDValue, 4> ExtOps;
    for (SDValue Op : Vec->ops())
      ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
    return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
  }
  default:
    return SDValue();
  }
  llvm_unreachable("All opcodes should return within switch");
}
/// Try to convert a vector reduction sequence composed of binops and shuffles
/// into horizontal ops.
///
/// Handles ADD/FADD reductions found by matchBinOpReduction. i8 reductions
/// are summed with PSADBW against zero; other types use HADD/FHADD when
/// shouldUseHorizontalOp allows it.
///
/// \param ExtElt     An EXTRACT_VECTOR_ELT node (asserted) that terminates
///                   the reduction.
/// \param Subtarget  Must report SSE2, otherwise nothing is done.
/// \returns The replacement scalar, or an empty SDValue when the reduction is
///          not handled.
static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
                                            const X86Subtarget &Subtarget) {
  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");

  // We need at least SSE2 to do anything here.
  if (!Subtarget.hasSSE2())
    return SDValue();

  ISD::NodeType Opc;
  SDValue Rdx =
      DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
  if (!Rdx)
    return SDValue();

  SDValue Index = ExtElt->getOperand(1);
  assert(isNullConstant(Index) &&
         "Reduction doesn't end in an extract from index 0");

  EVT VT = ExtElt->getValueType(0);
  EVT VecVT = Rdx.getValueType();
  if (VecVT.getScalarType() != VT)
    return SDValue();

  SDLoc DL(ExtElt);

  // vXi8 reduction - sub 128-bit vector.
  if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
    if (VecVT == MVT::v4i8) {
      // Pad with zero.
      if (Subtarget.hasSSE41()) {
        // Insert the four bytes as one i32 into lane 0 of a zero v4i32.
        Rdx = DAG.getBitcast(MVT::i32, Rdx);
        Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
                          DAG.getConstant(0, DL, MVT::v4i32), Rdx,
                          DAG.getIntPtrConstant(0, DL));
        Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
      } else {
        Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
                          DAG.getConstant(0, DL, VecVT));
      }
    }
    if (Rdx.getValueType() == MVT::v8i8) {
      // Pad with undef.
      Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
                        DAG.getUNDEF(MVT::v8i8));
    }
    Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
                      DAG.getConstant(0, DL, MVT::v16i8));
    Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
  }

  // Must be a >=128-bit vector with pow2 elements.
  if ((VecVT.getSizeInBits() % 128) != 0 ||
      !isPowerOf2_32(VecVT.getVectorNumElements()))
    return SDValue();

  // vXi8 reduction - sum lo/hi halves then use PSADBW.
  if (VT == MVT::i8) {
    while (Rdx.getValueSizeInBits() > 128) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
      VecVT = Lo.getValueType();
      Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
    }
    assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");

    // Add the upper eight bytes onto the lower eight, then let PSADBW against
    // zero sum the bytes of each 64-bit half.
    SDValue Hi = DAG.getVectorShuffle(
        MVT::v16i8, DL, Rdx, Rdx,
        {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
    Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
    Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
                      getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
    Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
  }

  // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
  if (!shouldUseHorizontalOp(true, DAG, Subtarget))
    return SDValue();

  unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;

  // 256-bit horizontal instructions operate on 128-bit chunks rather than
  // across the whole vector, so we need an extract + hop preliminary stage.
  // This is the only step where the operands of the hop are not the same value.
  // TODO: We could extend this to handle 512-bit or even longer vectors.
  if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
      ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
    unsigned NumElts = VecVT.getVectorNumElements();
    SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
    SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
    Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
    VecVT = Rdx.getValueType();
  }
  if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
      !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
    return SDValue();

  // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
  unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
  for (unsigned i = 0; i != ReductionSteps; ++i)
    Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}
/// DAG combine for vector element extraction (EXTRACT_VECTOR_ELT and the
/// PEXTR* nodes that reach this function).
///
/// Tries, in order: extraction through a target shuffle, folding of
/// out-of-range and constant extracts, PEXTR*-specific simplifications, MMX
/// bitcast shortcuts, the SAD / horizontal predicate / horizontal min-max /
/// horizontal add reduction combines, FP scalarization, and finally a shared
/// MOVMSK for multiple constant-index i1 extracts of the same vector.
///
/// \returns The replacement value, SDValue(N, 0) when N was updated in place
///          or replaced through DCI, or an empty SDValue when nothing changed.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget &Subtarget) {
  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
    return NewOp;

  SDValue InputVector = N->getOperand(0);
  SDValue EltIdx = N->getOperand(1);
  auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);

  EVT SrcVT = InputVector.getValueType();
  EVT VT = N->getValueType(0);
  SDLoc dl(InputVector);
  // Anything that is not a plain EXTRACT_VECTOR_ELT is treated as PEXTR*.
  bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
  unsigned NumSrcElts = SrcVT.getVectorNumElements();

  // Out-of-range constant index: zero for PEXTR*, undef otherwise.
  if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
    return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);

  // Integer Constant Folding.
  if (CIdx && VT.isInteger()) {
    APInt UndefVecElts;
    SmallVector<APInt, 16> EltBits;
    unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
    if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
                                      EltBits, true, false)) {
      uint64_t Idx = CIdx->getZExtValue();
      if (UndefVecElts[Idx])
        return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
      return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
                             dl, VT);
    }
  }

  if (IsPextr) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLI.SimplifyDemandedBits(
            SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
      return SDValue(N, 0);

    // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
    if ((InputVector.getOpcode() == X86ISD::PINSRB ||
         InputVector.getOpcode() == X86ISD::PINSRW) &&
        InputVector.getOperand(2) == EltIdx) {
      assert(SrcVT == InputVector.getOperand(0).getValueType() &&
             "Vector type mismatch");
      SDValue Scl = InputVector.getOperand(1);
      Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
      return DAG.getZExtOrTrunc(Scl, dl, VT);
    }

    // TODO - Remove this once we can handle the implicit zero-extension of
    // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
    // combineBasicSADPattern.
    return SDValue();
  }

  // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
    SDValue MMXSrc = InputVector.getOperand(0);

    // The bitcast source is a direct mmx result.
    if (MMXSrc.getValueType() == MVT::x86mmx)
      return DAG.getBitcast(VT, InputVector);
  }

  // Detect mmx to i32 conversion through a v2i32 elt extract.
  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
      VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
    SDValue MMXSrc = InputVector.getOperand(0);

    // The bitcast source is a direct mmx result.
    if (MMXSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
  }

  // Check whether this extract is the root of a sum of absolute differences
  // pattern. This has to be done here because we really want it to happen
  // pre-legalization,
  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
    return SAD;

  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
    return Cmp;

  // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
  if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
    return MinMax;

  if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
    return V;

  if (SDValue V = scalarizeExtEltFP(N, DAG))
    return V;

  // Attempt to extract a i1 element by using MOVMSK to extract the signbits
  // and then testing the relevant element.
  //
  // Note that we only combine extracts on the *same* result number, i.e.
  //   t0 = merge_values a0, a1, a2, a3
  //   i1 = extract_vector_elt t0, Constant:i64<2>
  //   i1 = extract_vector_elt t0, Constant:i64<3>
  // but not
  //   i1 = extract_vector_elt t0:1, Constant:i64<2>
  // since the latter would need its own MOVMSK.
  if (CIdx && SrcVT.getScalarType() == MVT::i1) {
    SmallVector<SDNode *, 16> BoolExtracts;
    unsigned ResNo = InputVector.getResNo();
    // Accepts constant-index i1 extracts of the same result number and
    // records each accepted use for rewriting below.
    auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
      if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          isa<ConstantSDNode>(Use->getOperand(1)) &&
          Use->getOperand(0).getResNo() == ResNo &&
          Use->getValueType(0) == MVT::i1) {
        BoolExtracts.push_back(Use);
        return true;
      }
      return false;
    };
    // Only worthwhile when every use qualifies and there is more than one.
    if (all_of(InputVector->uses(), IsBoolExtract) &&
        BoolExtracts.size() > 1) {
      EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
      if (SDValue BC =
              combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
        for (SDNode *Use : BoolExtracts) {
          // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
          unsigned MaskIdx = Use->getConstantOperandVal(1);
          APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
          SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
          SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
          Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
          DCI.CombineTo(Use, Res);
        }
        return SDValue(N, 0);
      }
    }
  }

  return SDValue();
}
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
///
/// \param N         The select node; anything other than ISD::VSELECT is
///                  rejected.
/// \param DAG       The DAG in which replacement nodes are created.
/// \param DCI       Combiner state, used to ask whether type legalization has
///                  already run.
/// \param Subtarget Not referenced by this function.
/// \returns A zero constant, the (bitcast) condition itself, or an
///          OR / AND / X86ISD::ANDNP of the condition with the remaining
///          operand; an empty SDValue when no fold applies.
///
/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
static SDValue
combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT VT = LHS.getValueType();
  EVT CondVT = Cond.getValueType();
  SDLoc DL(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (N->getOpcode() != ISD::VSELECT)
    return SDValue();

  assert(CondVT.isVector() && "Vector select expects a vector selector!");

  // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
  // TODO: Can we assert that both operands are not zeros (because that should
  //       get simplified at node creation time)?
  bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

  // If both inputs are 0/undef, create a complete zero vector.
  // FIXME: As noted above this should be handled by DAGCombiner/getNode.
  if (TValIsAllZeros && FValIsAllZeros) {
    if (VT.isFloatingPoint())
      return DAG.getConstantFP(0.0, DL, VT);
    return DAG.getConstant(0, DL, VT);
  }

  // To use the condition operand as a bitwise mask, it must have elements that
  // are the same size as the select elements. Ie, the condition operand must
  // have already been promoted from the IR select condition type <N x i1>.
  // Don't check if the types themselves are equal because that excludes
  // vector floating-point selects.
  if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  // Try to invert the condition if true value is not all 1s and false value is
  // not all 0s. Only do this if the condition has one use.
  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
  if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
      // Check if the selector will be produced by CMPP*/PCMP*.
      Cond.getOpcode() == ISD::SETCC &&
      // Check if SETCC has already been promoted.
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
          CondVT) {
    bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());

    if (TValIsAllZeros || FValIsAllOnes) {
      // Rebuild the compare with the inverse predicate and swap the select
      // arms, so the constant arm lands in the position the folds below
      // expect. The true/false flags are swapped to match.
      SDValue CC = Cond.getOperand(2);
      ISD::CondCode NewCC = ISD::getSetCCInverse(
          cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
                          NewCC);
      std::swap(LHS, RHS);
      TValIsAllOnes = FValIsAllOnes;
      FValIsAllZeros = TValIsAllZeros;
    }
  }

  // Cond value must be 'sign splat' to be converted to a logical op.
  if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
    return SDValue();

  // vselect Cond, 111..., 000... -> Cond
  if (TValIsAllOnes && FValIsAllZeros)
    return DAG.getBitcast(VT, Cond);

  // The remaining folds create logic ops in CondVT, so once type legalization
  // has run that type must itself be legal.
  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
    return SDValue();

  // vselect Cond, 111..., X -> or Cond, X
  if (TValIsAllOnes) {
    SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
    SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
    return DAG.getBitcast(VT, Or);
  }

  // vselect Cond, X, 000... -> and Cond, X
  if (FValIsAllZeros) {
    SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
    SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
    return DAG.getBitcast(VT, And);
  }

  // vselect Cond, 000..., X -> andn Cond, X
  if (TValIsAllZeros) {
    // The ANDNP node is built on a vector of i64 lanes covering the same
    // number of bits as the condition.
    MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
    SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
    SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
    SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
    return DAG.getBitcast(VT, AndN);
  }

  return SDValue();
}
/// If both arms of a vector select are concatenated vectors, split the select,
/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
///   vselect Cond, (concat T0, T1), (concat F0, F1) -->
///   concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
///
/// \param N         An ISD::VSELECT or X86ISD::BLENDV node; any other opcode
///                  is rejected.
/// \param DAG       The DAG in which the narrower selects are created.
/// \param Subtarget Forwarded to SplitOpsAndApply.
/// \returns The value produced by SplitOpsAndApply, or an empty SDValue when
///          the node is not a 256-bit select whose two arms are single-use
///          concatenations.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
    return SDValue();

  // TODO: Split 512-bit vectors too?
  EVT VT = N->getValueType(0);
  if (!VT.is256BitVector())
    return SDValue();

  // TODO: Split as long as any 2 of the 3 operands are concatenated?
  SDValue Cond = N->getOperand(0);
  SDValue TVal = N->getOperand(1);
  SDValue FVal = N->getOperand(2);
  SmallVector<SDValue, 4> CatOpsT, CatOpsF;
  if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
      !collectConcatOps(TVal.getNode(), CatOpsT) ||
      !collectConcatOps(FVal.getNode(), CatOpsF))
    return SDValue();

  // Builds one narrow select with the original opcode. The result type is
  // taken from the true-value operand (Ops[1]), not from the condition.
  auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
                            ArrayRef<SDValue> Ops) {
    return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
  };
  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
                          makeBlend, /*CheckBWI*/ false);
}
/// Fold a select between two integer constants on an i1 condition into
/// arithmetic on the zero-extended condition:
///   select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
///
/// Only applied when the difference of the constants is a power of two, or is
/// 3, 5 or 9 for i32/i64, and the subtraction does not overflow.
///
/// \param N   The select node (operand 0 is the condition, operands 1 and 2
///            the true and false values).
/// \param DAG The DAG in which replacement nodes are created.
/// \returns The arithmetic replacement, or an empty SDValue when either arm is
///          not a ConstantSDNode, the type is not legal, the condition is not
///          i1, or the difference does not qualify.
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  SDLoc DL(N);

  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
  if (!TrueC || !FalseC)
    return SDValue();

  // Don't do this for crazy integer types.
  EVT VT = N->getValueType(0);
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // We're going to use the condition bit in math or logic ops. We could allow
  // this with a wider condition value (post-legalization it becomes an i8),
  // but if nothing is creating selects that late, it doesn't matter.
  if (Cond.getValueType() != MVT::i1)
    return SDValue();

  // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
  // 3, 5, or 9 with i32/i64, so those get transformed too.
  // TODO: For constants that overflow or do not differ by power-of-2 or small
  // multiplier, convert to 'and' + 'add'.
  const APInt &TrueVal = TrueC->getAPIntValue();
  const APInt &FalseVal = FalseC->getAPIntValue();
  bool OV;
  APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
  if (OV)
    return SDValue();

  APInt AbsDiff = Diff.abs();
  if (AbsDiff.isPowerOf2() ||
      ((VT == MVT::i32 || VT == MVT::i64) &&
       (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {

    // We need a positive multiplier constant for shift/LEA codegen. The 'not'
    // of the condition can usually be folded into a compare predicate, but even
    // without that, the sequence should be cheaper than a CMOV alternative.
    if (TrueVal.slt(FalseVal)) {
      Cond = DAG.getNOT(DL, Cond, MVT::i1);
      std::swap(TrueC, FalseC);
    }

    // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
    SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);

    // Multiply condition by the difference if non-one.
    if (!AbsDiff.isOneValue())
      R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));

    // Add the base if non-zero. After the swap above, FalseC is the smaller
    // of the two constants.
    if (!FalseC->isNullValue())
      R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));

    return R;
  }

  return SDValue();
}
/// If this is a *dynamic* select (non-constant condition) and we can match
/// this node with one of the variable blend instructions, restructure the
/// condition so that blends can use the high (sign) bit of each element.
/// This function will also call SimplifyDemandedBits on already created
/// BLENDV to perform additional simplifications.
///
/// \param N         The ISD::VSELECT or X86ISD::BLENDV node being combined.
/// \param DAG       The DAG in which nodes are created and replaced.
/// \param DCI       Combiner state, consulted for the legalization phase.
/// \param Subtarget Queried for SSE4.1 / AVX2 availability.
/// \returns SDValue(N, 0) when the users of the condition were rewritten in
///          place, a new X86ISD::BLENDV node when only a multiple-use
///          simplification of the condition was possible, or an empty SDValue
///          otherwise.
///
/// NOTE(review): this copy of the function has lost lines (closing braces and
/// several statement/continuation lines, flagged individually below). Restore
/// them from the upstream source before building.
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Cond = N->getOperand(0);
// NOTE(review): the condition below ends in `||` with no right-hand operand;
// a line appears to be missing here. Given the doc comment it presumably
// rejects constant conditions - confirm against upstream.
if ((N->getOpcode() != ISD::VSELECT &&
N->getOpcode() != X86ISD::BLENDV) ||
return SDValue();
// Don't optimize before the condition has been transformed to a legal type
// and don't ever optimize vector selects that map to AVX512 mask-registers.
unsigned BitWidth = Cond.getScalarValueSizeInBits();
if (BitWidth < 8 || BitWidth > 64)
return SDValue();
// We can only handle the cases where VSELECT is directly legal on the
// subtarget. We custom lower VSELECT nodes with constant conditions and
// this makes it hard to see whether a dynamic VSELECT will correctly
// lower, so we both check the operation's status and explicitly handle the
// cases where a *dynamic* blend will fail even though a constant-condition
// blend could be custom lowered.
// FIXME: We should find a better way to handle this class of problems.
// Potentially, we should combine constant-condition vselect nodes
// pre-legalization into shuffles and not mark as many types as custom
// lowered.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
return SDValue();
// FIXME: We don't support i16-element blends currently. We could and
// should support them by making *all* the bits in the condition be set
// rather than just the high bit and using an i8-element blend.
if (VT.getVectorElementType() == MVT::i16)
return SDValue();
// Dynamic blending was only available from SSE4.1 onward.
if (VT.is128BitVector() && !Subtarget.hasSSE41())
return SDValue();
// Byte blends are only available in AVX2
if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
return SDValue();
// There are no 512-bit blend instructions that use sign bits.
if (VT.is512BitVector())
return SDValue();
// Returns true only if every use of Cond is as operand 0 (the condition) of
// a VSELECT or BLENDV node.
// NOTE(review): the lambda's closing `};` is missing from this copy.
auto OnlyUsedAsSelectCond = [](SDValue Cond) {
for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
UI != UE; ++UI)
if ((UI->getOpcode() != ISD::VSELECT &&
UI->getOpcode() != X86ISD::BLENDV) ||
UI.getOperandNo() != 0)
return false;
return true;
// Only the sign bit of each condition element is demanded by a blend.
APInt DemandedBits(APInt::getSignMask(BitWidth));
if (OnlyUsedAsSelectCond(Cond)) {
KnownBits Known;
// NOTE(review): the TargetLoweringOpt constructor call below is cut off
// after its second argument's comma; the remaining argument(s) are missing.
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
return SDValue();
// If we changed the computation somewhere in the DAG, this change will
// affect all users of Cond. Update all the nodes so that we do not use
// the generic VSELECT anymore. Otherwise, we may perform wrong
// optimizations as we messed with the actual expectation for the vector
// boolean values.
for (SDNode *U : Cond->uses()) {
// NOTE(review): the statement controlled by this `if` is missing; as
// written it would guard the SDValue declaration below. Presumably users
// that are already BLENDV are skipped - confirm against upstream.
if (U->getOpcode() == X86ISD::BLENDV)
SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
Cond, U->getOperand(1), U->getOperand(2));
DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
// N itself was replaced as one of Cond's users; returning SDValue(N, 0)
// hands the same node back rather than a new replacement value.
return SDValue(N, 0);
// Otherwise we can still at least try to simplify multiple use bits.
if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
N->getOperand(1), N->getOperand(2));
return SDValue();
// Try to match:
//   (or (and (M, (sub 0, X)), (pandn M, X)))
// which is a special case of:
//   (select M, (sub 0, X), X)
// Per:
// We know that, if fNegate is 0 or 1:
//   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
//
// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
//   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
//   (  M     ? -X : X) == ((X ^   M     ) + (M & 1))
// This lets us transform our vselect to:
//   (add (xor X, M), (and M, 1))
// And further to:
//   (sub (xor X, M), M)
//
// VT is the type of the returned value (the result is bitcast to it). Mask
// must be an integer value whose elements are all-zeros or all-ones. X and Y
// are the two select arms; one of them must be (sub 0, other) for the fold to
// apply. Returns an empty SDValue when the pattern does not match, when X or
// Y does not have the mask's type, or when ISD::SUB is not legal on that
// type. Subtarget is not referenced by this function.
static SDValue combineLogicBlendIntoConditionalNegate(
    EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  EVT MaskVT = Mask.getValueType();
  assert(MaskVT.isInteger() &&
         DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
         "Mask must be zero/all-bits");

  if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
    return SDValue();
  if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
    return SDValue();

  // True if N computes (sub 0, V), i.e. the negation of V.
  auto IsNegV = [](SDNode *N, SDValue V) {
    return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
           ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
  };

  // V is the un-negated value: whichever of X / Y the other arm negates.
  SDValue V;
  if (IsNegV(Y.getNode(), X))
    V = X;
  else if (IsNegV(X.getNode(), Y))
    V = Y;
  else
    return SDValue();

  SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
  SDValue SubOp2 = Mask;

  // If the negate was on the false side of the select, then
  // the operands of the SUB need to be swapped. PR 27251.
  // This is because the pattern being matched above is
  // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
  // but if the pattern matched was
  // (vselect M, X, (sub 0, X)), that is really negation of the pattern
  // above, -(vselect M, (sub 0, X), X), and therefore the replacement
  // pattern also needs to be a negation of the replacement pattern above.
  // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
  // sub accomplishes the negation of the replacement pattern.
  if (V == Y)
    std::swap(SubOp1, SubOp2);

  SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
  return DAG.getBitcast(VT, Res);
}
/// Do target-specific dag combines on SELECT and VSELECT nodes.
///
/// The folds below are tried in order and the first one that produces a value
/// is returned:
///  - generic simplification via DAG.simplifySelect,
///  - conditional negate (combineLogicBlendIntoConditionalNegate),
///  - constant-condition vselect -> vector shuffle,
///  - FP select on the operands of its own compare -> X86ISD::FMIN / FMAX,
///  - AVX512 rewrites of scalar-mask tests and vXi1 conditions,
///  - select of two constants -> zext / mul / add,
///  - min/max condition canonicalization,
///  - vselect -> ISD::USUBSAT / ISD::UADDSAT,
///  - all-ones / all-zeros logic folds, BLENDV formation, 256-bit narrowing,
///  - select(~Cond, X, Y) operand swap, vXi1 select via an integer select, and
///    single-bit mask test rewrites.
///
/// \param N         The select node (operand 0 is the condition, operands 1
///                  and 2 the true and false values).
/// \param DAG       The DAG in which replacement nodes are created.
/// \param DCI       Combiner state, consulted for the legalization phase.
/// \param Subtarget Queried for SSE / AVX / AVX512 feature availability.
/// \returns The replacement value, or an empty SDValue if nothing applied.
///
/// NOTE(review): this copy of the function has lost many lines - closing
/// braces, most `case` labels / `break` statements in the FP min/max switches,
/// several `else` keywords and some call-argument continuation lines. The
/// individual gaps that are evident from the remaining text are flagged below;
/// restore them from the upstream source before building.
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
// Try simplification again because we use this function to optimize
// BLENDV nodes that are not handled by the generic combiner.
if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
return V;
EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
// Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
// Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
// can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
// Note the helper is called with (RHS, LHS), i.e. the false value first.
if (CondVT.isVector() && CondVT.isInteger() &&
CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
(!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
DL, DAG, Subtarget))
return V;
// Convert vselects with constant condition into shuffles.
if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
SmallVector<int, 64> Mask;
if (createShuffleMaskFromVSELECT(Mask, Cond))
return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
VT != MVT::f80 && VT != MVT::f128 &&
(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget.hasSSE2() ||
(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Opcode stays 0 unless one of the switch arms below selects FMIN or FMAX.
unsigned Opcode = 0;
// Check for x CC y ? x : y.
if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
// NOTE(review): only the SETLT/SETLE and SETGT/SETGE labels of this switch
// survive in this copy; the remaining `case` labels, `break`s and some
// condition continuation lines are missing, so the arm boundaries below
// cannot be read off the text as shown.
switch (CC) {
default: break;
// Converting this to a min would handle NaNs incorrectly, and swapping
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
std::swap(LHS, RHS);
Opcode = X86ISD::FMIN;
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
Opcode = X86ISD::FMIN;
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
case ISD::SETLT:
case ISD::SETLE:
Opcode = X86ISD::FMIN;
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
Opcode = X86ISD::FMAX;
// Converting this to a max would handle NaNs incorrectly, and swapping
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
std::swap(LHS, RHS);
Opcode = X86ISD::FMAX;
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
case ISD::SETGT:
case ISD::SETGE:
Opcode = X86ISD::FMAX;
// Check for x CC y ? y : x -- a min/max with reversed arms.
} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
DAG.isEqualTo(RHS, Cond.getOperand(0))) {
// NOTE(review): as in the switch above, most `case` labels and `break`s
// are missing from this copy.
switch (CC) {
default: break;
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS))) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
std::swap(LHS, RHS);
Opcode = X86ISD::FMIN;
// Converting this to a min would handle NaNs incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
Opcode = X86ISD::FMIN;
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
case ISD::SETGT:
case ISD::SETGE:
Opcode = X86ISD::FMIN;
// Converting this to a max would handle NaNs incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
Opcode = X86ISD::FMAX;
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) &&
!DAG.isKnownNeverZeroFloat(RHS)) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
std::swap(LHS, RHS);
Opcode = X86ISD::FMAX;
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
case ISD::SETLT:
case ISD::SETLE:
Opcode = X86ISD::FMAX;
if (Opcode)
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
// Some mask scalar intrinsics rely on checking if only one bit is set
// and implement it in C code like this:
// A[0] = (U & 1) ? A[0] : W[0];
// This creates some redundant instructions that break pattern matching.
// fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue AndNode = Cond.getOperand(0);
if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
isNullConstant(Cond.getOperand(1)) &&
isOneConstant(AndNode.getOperand(1))) {
// LHS and RHS swapped due to
// setcc outputting 1 when AND resulted in 0 and vice versa.
AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
// v16i8 (select v16i1, v16i8, v16i8) does not have a proper
// lowering on KNL. In this case we convert it to
// v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
// The same situation applies to all vectors of i8 and i16 without BWI.
// Make sure we extend these even before type legalization gets a chance to
// split wide vectors.
// Since SKX these selects have a proper lowering.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1 &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
// AVX512 - Extend select with zero to merge with target shuffle.
// select(mask, extract_subvector(shuffle(x)), zero) -->
// extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
// TODO - support non target shuffles as well.
if (Subtarget.hasAVX512() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1) {
// True for a single-use extract of the low subvector (index 0) of a
// single-use target shuffle whose type is legal.
// NOTE(review): the lambda's closing `};` is missing from this copy.
auto SelectableOp = [&TLI](SDValue Op) {
return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
isTargetShuffle(Op.getOperand(0).getOpcode()) &&
isNullConstant(Op.getOperand(1)) &&
TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
Op.hasOneUse() && Op.getOperand(0).hasOneUse();
bool SelectableLHS = SelectableOp(LHS);
bool SelectableRHS = SelectableOp(RHS);
bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
: RHS.getOperand(0).getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
// NOTE(review): the two insertSubVector calls below are cut off after `DL,`
// and the statement that widens Cond has lost its first line; only its
// trailing arguments remain.
LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
DAG.getUNDEF(SrcCondVT), Cond,
DAG.getIntPtrConstant(0, DL));
SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
if (SDValue V = combineSelectOfTwoConstants(N, DAG))
return V;
// Canonicalize max and min:
// (x > y) ? x : y -> (x >= y) ? x : y
// (x < y) ? x : y -> (x <= y) ? x : y
// This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
// the need for an extra compare
// against zero. e.g.
// (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
// subl %esi, %edi
// testl %edi, %edi
// movl $0, %eax
// cmovgl %edi, %eax
// =>
// xorl %eax, %eax
// subl %esi, %edi
// cmovsl %eax, %edi
if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
Cond.hasOneUse() &&
DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
switch (CC) {
default: break;
case ISD::SETLT:
case ISD::SETGT: {
// NOTE(review): NewCC is used below but its declaration is missing from
// this copy. Per the comment above it is presumably the non-strict form of
// CC (SETLE / SETGE) - confirm against upstream.
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
return DAG.getSelect(DL, VT, Cond, LHS, RHS);
// Match VSELECTs into subs with unsigned saturation.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// psubus is available in SSE2 for i8 and i16 vectors.
Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
isPowerOf2_32(VT.getVectorNumElements()) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Check if one of the arms of the VSELECT is a zero vector. If it's on the
// left side invert the predicate to simplify logic below.
SDValue Other;
if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
Other = RHS;
CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
} else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
Other = LHS;
// Other is left null when neither arm is a zero vector, which the
// getNode() test below rejects.
if (Other.getNode() && Other->getNumOperands() == 2 &&
Other->getOperand(0) == Cond.getOperand(0)) {
SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
SDValue CondRHS = Cond->getOperand(1);
// Look for a general sub with unsigned saturation first.
// x >= y ? x-y : 0 --> subus x, y
// x > y ? x-y : 0 --> subus x, y
if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
if (isa<BuildVectorSDNode>(CondRHS)) {
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > C-1 ? x+-C : 0 --> subus x, C
// The predicate accepts a pair of null (undef) elements, or a pair of
// constants where Cond == -Op - 1.
auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
return (!Op && !Cond) ||
(Op && Cond &&
Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
/*AllowUndefs*/ true)) {
// NOTE(review): the getNode call below is cut off; its final operand is
// missing from this copy.
OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
// Another special case: If C was a sign bit, the sub has been
// canonicalized into a xor.
// FIXME: Would it be better to use computeKnownBits to determine
// whether it's safe to decanonicalize the xor?
// x s< 0 ? x^C : 0 --> subus x, C
if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
OpRHSConst->getAPIntValue().isSignMask()) {
// Note that we have to rebuild the RHS constant here to ensure we
// don't rely on particular values of undef lanes.
OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
// Match VSELECTs into add with unsigned saturation.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// paddus is available in SSE2 for i8 and i16 vectors.
Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
isPowerOf2_32(VT.getVectorNumElements()) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue CondLHS = Cond->getOperand(0);
SDValue CondRHS = Cond->getOperand(1);
// Check if one of the arms of the VSELECT is vector with all bits set.
// If it's on the left side invert the predicate to simplify logic below.
SDValue Other;
if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
Other = RHS;
CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
} else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
Other = LHS;
if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
// Canonicalize condition operands.
// NOTE(review): only the operand swap is visible here; the tests below
// compare against SETULE, so CC is presumably also rewritten to its
// swapped form on a line missing from this copy - confirm.
if (CC == ISD::SETUGE) {
std::swap(CondLHS, CondRHS);
// We can test against either of the addition operands.
// x <= x+y ? x+y : ~0 --> addus x, y
// x+y >= x ? x+y : ~0 --> addus x, y
if (CC == ISD::SETULE && Other == CondRHS &&
(OpLHS == CondLHS || OpRHS == CondLHS))
return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
CondLHS == OpLHS) {
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > ~C ? x+C : ~0 --> addus x, C
// Both pointers are dereferenced unconditionally; matchBinaryPredicate is
// called below without an AllowUndefs argument.
auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
return Cond->getAPIntValue() == ~Op->getAPIntValue();
if (CC == ISD::SETULE &&
ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
// Check if the first operand is all zeros and Cond type is vXi1.
// If this is an avx512 target we can improve the use of zero masking by
// swapping the operands and inverting the condition.
if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
ISD::isBuildVectorAllZeros(LHS.getNode()) &&
!ISD::isBuildVectorAllZeros(RHS.getNode())) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
// Early exit check
if (!TLI.isTypeLegal(VT))
return SDValue();
if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
return V;
if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
return V;
if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
return V;
// select(~Cond, X, Y) -> select(Cond, Y, X)
if (CondVT.getScalarType() != MVT::i1)
if (SDValue CondNot = IsNOT(Cond, DAG))
return DAG.getNode(N->getOpcode(), DL, VT,
DAG.getBitcast(CondVT, CondNot), RHS, LHS);
// Try to optimize vXi1 selects if both operands are either all constants or
// bitcasts from scalar integer type. In that case we can convert the operands
// to integer and use an integer select which will be converted to a CMOV.
// We need to take a little bit of care to avoid creating an i64 type after
// type legalization.
if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
VT.getVectorElementType() == MVT::i1 &&
(DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
if ((LHSIsConst ||
(LHS.getOpcode() == ISD::BITCAST &&
LHS.getOperand(0).getValueType() == IntVT)) &&
(RHSIsConst ||
(RHS.getOpcode() == ISD::BITCAST &&
RHS.getOperand(0).getValueType() == IntVT))) {
// NOTE(review): the `else` keywords pairing each constant conversion with
// the bitcast-peeling assignment appear to be missing from this copy; as
// shown, the second assignment would always overwrite the first.
if (LHSIsConst)
LHS = combinevXi1ConstantToInteger(LHS, DAG);
LHS = LHS.getOperand(0);
if (RHSIsConst)
RHS = combinevXi1ConstantToInteger(RHS, DAG);
RHS = RHS.getOperand(0);
SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
return DAG.getBitcast(VT, Select);
// If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
// single bits, then invert the predicate and swap the select operands.
// This can lower using a vector shift bit-hack rather than mask and compare.
if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
Cond.getOperand(0).getOpcode() == ISD::AND &&
isNullOrNullSplat(Cond.getOperand(1)) &&
cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
Cond.getOperand(0).getValueType() == VT) {
// The 'and' mask must be composed of power-of-2 constants.
SDValue And = Cond.getOperand(0);
auto *C = isConstOrConstSplat(And.getOperand(1));
if (C && C->getAPIntValue().isPowerOf2()) {
// vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
SDValue NotCond =
DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
// If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
// and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
// 16-bit lacks a proper blendv.
unsigned EltBitWidth = VT.getScalarSizeInBits();
// NOTE(review): the CanShiftBlend expression below ends in `||`; its last
// alternative is missing from this copy.
bool CanShiftBlend =
TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
(Subtarget.hasAVX2() && EltBitWidth == 64) ||
if (CanShiftBlend &&
ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
return C->getAPIntValue().isPowerOf2();
})) {
// Create a left-shift constant to get the mask bits over to the sign-bit.
SDValue Mask = And.getOperand(1);
SmallVector<int, 32> ShlVals;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
// NOTE(review): the push_back expression below is cut off after
// `EltBitWidth - 1 -`; the subtrahend derived from MaskVal is missing.
ShlVals.push_back(EltBitWidth - 1 -
// vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
SDValue NewCond =
DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
return SDValue();
/// Combine:
/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// to:
/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
/// Note that this is only legal for some op/cc combinations.
///
/// \param Cmp       The flag-producing compare: an X86ISD::CMP, or an
///                  X86ISD::SUB whose value result 0 is unused.
/// \param CC        In/out condition code; rewritten when the compare against
///                  zero can be expressed on the flags of the locked op.
/// \param DAG       The DAG in which replacement nodes are created.
/// \param Subtarget Forwarded to lowerAtomicArithWithLOCK.
/// \returns The value produced by lowerAtomicArithWithLOCK, or an empty
///          SDValue when the pattern does not match.
///
/// NOTE(review): this copy of the function has lost lines (closing braces, an
/// opcode guard, a call-argument tail and an `else`, flagged individually
/// below). Restore them from the upstream source before building.
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// This combine only operates on CMP-like nodes.
if (!(Cmp.getOpcode() == X86ISD::CMP ||
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
return SDValue();
// Can't replace the cmp if it has more uses than the one we're looking at.
// FIXME: We would like to be able to handle this, but would need to make sure
// all uses were updated.
if (!Cmp.hasOneUse())
return SDValue();
// This only applies to variations of the common case:
// (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
// (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
// (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
// (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
// Using the proper condcodes (see below), overflow is checked for.
// FIXME: We can generalize both constraints:
// - XOR/OR/AND (if they were made to survive AtomicExpand)
// - LHS != 1
// if the result is compared.
SDValue CmpLHS = Cmp.getOperand(0);
SDValue CmpRHS = Cmp.getOperand(1);
if (!CmpLHS.hasOneUse())
return SDValue();
// NOTE(review): the `return SDValue();` below is unconditional as shown; the
// guard on Opc that should precede it is missing from this copy. Presumably
// it rejects anything other than an atomic add/sub - confirm against
// upstream.
unsigned Opc = CmpLHS.getOpcode();
return SDValue();
// Operand 2 of the atomic node is the value being added or subtracted; it
// must be a constant.
SDValue OpRHS = CmpLHS.getOperand(2);
auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
if (!OpRHSC)
return SDValue();
// NOTE(review): the negation below is unconditional as shown, which does not
// fit the `Addend == 1` add case in the header example. It is presumably
// guarded on the opcode being a subtract - confirm against upstream.
APInt Addend = OpRHSC->getAPIntValue();
Addend = -Addend;
auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
if (!CmpRHSC)
return SDValue();
APInt Comparison = CmpRHSC->getAPIntValue();
// If the addend is the negation of the comparison value, then we can do
// a full comparison by emitting the atomic arithmetic as a locked sub.
if (Comparison == -Addend) {
// The CC is fine, but we need to rewrite the LHS of the comparison as an
// atomic sub.
// NOTE(review): AN is not referenced in the visible text, and the getAtomic
// call below is cut off after its RHS argument; the trailing argument(s)
// are missing from this copy.
auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
auto AtomicSub = DAG.getAtomic(
ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
/*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
/*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
// Redirect users of the old atomic node's result 1 to the locked op.
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
return LockOp;
// We can handle comparisons with zero in a number of cases by manipulating
// the CC used.
if (!Comparison.isNullValue())
return SDValue();
if (CC == X86::COND_S && Addend == 1)
CC = X86::COND_LE;
else if (CC == X86::COND_NS && Addend == 1)
CC = X86::COND_G;
else if (CC == X86::COND_G && Addend == -1)
CC = X86::COND_GE;
else if (CC == X86::COND_LE && Addend == -1)
CC = X86::COND_L;
// NOTE(review): the `else` that should introduce this return appears to be
// missing; as shown it makes the code below unreachable.
return SDValue();
SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
return LockOp;
// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
// code.
// Simplify the following patterns:
// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
// to (Op EFLAGS Cond)
// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
// to (Op EFLAGS !Cond)
// where Op could be BRCOND or CMOV.
//
// On success CC is overwritten with the condition to test on the returned
// flags value; on failure an empty SDValue is returned.
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
// This combine only operates on CMP-like nodes.
if (!(Cmp.getOpcode() == X86ISD::CMP ||
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
return SDValue();
// Quit if not used as a boolean value.
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
// Check CMP operands. One of them should be 0 or 1 and the other should be
// a SetCC or extended from it.
SDValue Op1 = Cmp.getOperand(0);
SDValue Op2 = Cmp.getOperand(1);
SDValue SetCC;
const ConstantSDNode* C = nullptr;
// Starts out true for an equality test; toggled below when the constant
// compared against is 1.
bool needOppositeCond = (CC == X86::COND_E);
bool checkAgainstTrue = false; // Is it a comparison against 1?
if ((C = dyn_cast<ConstantSDNode>(Op1)))
SetCC = Op2;
else if ((C = dyn_cast<ConstantSDNode>(Op2)))
SetCC = Op1;
else // Quit if neither operand is a constant.
return SDValue();
if (C->getZExtValue() == 1) {
needOppositeCond = !needOppositeCond;
checkAgainstTrue = true;
} else if (C->getZExtValue() != 0)
// Quit if the constant is neither 0 nor 1.
return SDValue();
bool truncatedToBoolWithAnd = false;
// Skip (zext $x), (trunc $x), or (and $x, 1) node.
while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
SetCC.getOpcode() == ISD::TRUNCATE ||
SetCC.getOpcode() == ISD::AND) {
if (SetCC.getOpcode() == ISD::AND) {
// Find which AND operand is the constant 1; OpIdx becomes the index of the
// other operand.
int OpIdx = -1;
if (isOneConstant(SetCC.getOperand(0)))
OpIdx = 1;
if (isOneConstant(SetCC.getOperand(1)))
OpIdx = 0;
// NOTE(review): as listed, getOperand(OpIdx) is reached only when OpIdx is
// negative; presumably a statement leaving the loop was lost after this
// `if` -- confirm against upstream.
if (OpIdx < 0)
SetCC = SetCC.getOperand(OpIdx);
truncatedToBoolWithAnd = true;
} else
SetCC = SetCC.getOperand(0);
switch (SetCC.getOpcode()) {
// NOTE(review): the case label for the SETCC_CARRY handling described below
// is not visible in this listing -- confirm against upstream.
// Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
// simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
// i.e. it's a comparison against true but the result of SETCC_CARRY is not
// truncated to i1 using 'and'.
if (checkAgainstTrue && !truncatedToBoolWithAnd)
assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
"Invalid use of SETCC_CARRY!");
case X86ISD::SETCC:
// Set the condition code or opposite one if necessary.
CC = X86::CondCode(SetCC.getConstantOperandVal(0));
if (needOppositeCond)
CC = X86::GetOppositeBranchCondition(CC);
// Operand 1 of the SETCC is the flags value it tested.
return SetCC.getOperand(1);
case X86ISD::CMOV: {
// Check whether false/true value has canonical one, i.e. 0 or 1.
ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
// Quit if true value is not a constant.
if (!TVal)
return SDValue();
// Quit if false value is not a constant.
if (!FVal) {
SDValue Op = SetCC.getOperand(0);
// Skip 'zext' or 'trunc' node.
if (Op.getOpcode() == ISD::ZERO_EXTEND ||
Op.getOpcode() == ISD::TRUNCATE)
Op = Op.getOperand(0);
// A special case for rdrand/rdseed, where 0 is set if false cond is
// found.
if ((Op.getOpcode() != X86ISD::RDRAND &&
Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
return SDValue();
// Quit if false value is not the constant 0 or 1.
bool FValIsFalse = true;
if (FVal && FVal->getZExtValue() != 0) {
if (FVal->getZExtValue() != 1)
return SDValue();
// If FVal is 1, opposite cond is needed.
needOppositeCond = !needOppositeCond;
FValIsFalse = false;
// Quit if TVal is not the constant opposite of FVal.
if (FValIsFalse && TVal->getZExtValue() != 1)
return SDValue();
if (!FValIsFalse && TVal->getZExtValue() != 0)
return SDValue();
// For CMOV the condition code is operand 2 and the tested flags are
// operand 3.
CC = X86::CondCode(SetCC.getConstantOperandVal(2));
if (needOppositeCond)
CC = X86::GetOppositeBranchCondition(CC);
return SetCC.getOperand(3);
return SDValue();
/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match:
/// (X86or (X86setcc) (X86setcc))
/// (X86cmp (and (X86setcc) (X86setcc)), 0)
///
/// On a match, returns true and fills in \p CC0 / \p CC1 with the two
/// condition codes, \p Flags with the shared flags operand, and \p isAnd with
/// whether the combining node was an AND (true) or an OR (false).
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
X86::CondCode &CC1, SDValue &Flags,
bool &isAnd) {
// Look through a compare against zero to the value being tested.
if (Cond->getOpcode() == X86ISD::CMP) {
if (!isNullConstant(Cond->getOperand(1)))
return false;
Cond = Cond->getOperand(0);
isAnd = false;
SDValue SetCC0, SetCC1;
switch (Cond->getOpcode()) {
default: return false;
case ISD::AND:
case X86ISD::AND:
// AND shares the operand extraction of the OR cases that follow.
isAnd = true;
case ISD::OR:
case X86ISD::OR:
SetCC0 = Cond->getOperand(0);
SetCC1 = Cond->getOperand(1);
// Make sure we have SETCC nodes, using the same flags value.
if (SetCC0.getOpcode() != X86ISD::SETCC ||
SetCC1.getOpcode() != X86ISD::SETCC ||
SetCC0->getOperand(1) != SetCC1->getOperand(1))
return false;
// SETCC operand 0 is the condition code, operand 1 the flags.
CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
Flags = SetCC0->getOperand(1);
return true;
// When legalizing carry, we create carries via add X, -1
// If that comes from an actual carry, via setcc, we use the
// carry directly.
//
// Returns a flags value that can be tested in place of EFLAGS, or an empty
// SDValue if EFLAGS does not match.
static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
if (EFLAGS.getOpcode() == X86ISD::ADD) {
if (isAllOnesConstant(EFLAGS.getOperand(1))) {
SDValue Carry = EFLAGS.getOperand(0);
// Strip truncates/extensions (and an AND form) wrapping the carry value.
// NOTE(review): the last clause of this condition is cut off in this
// listing; the constraint on the AND's other operand is not visible.
while (Carry.getOpcode() == ISD::TRUNCATE ||
Carry.getOpcode() == ISD::ZERO_EXTEND ||
Carry.getOpcode() == ISD::SIGN_EXTEND ||
Carry.getOpcode() == ISD::ANY_EXTEND ||
(Carry.getOpcode() == ISD::AND &&
Carry = Carry.getOperand(0);
if (Carry.getOpcode() == X86ISD::SETCC ||
Carry.getOpcode() == X86ISD::SETCC_CARRY) {
// TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
uint64_t CarryCC = Carry.getConstantOperandVal(0);
SDValue CarryOp1 = Carry.getOperand(1);
// A COND_B setcc already tests the carry flag: reuse its flags operand.
if (CarryCC == X86::COND_B)
return CarryOp1;
if (CarryCC == X86::COND_A) {
// Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
// Do not flip "e > c", where "c" is a constant, because Cmp
// instruction cannot take an immediate as its first operand.
if (CarryOp1.getOpcode() == X86ISD::SUB &&
CarryOp1.getNode()->hasOneUse() &&
CarryOp1.getValueType().isInteger() &&
!isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
// Build the SUB with its operands swapped and return the same result
// number of the new node.
SDValue SubCommute =
DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
CarryOp1.getOperand(1), CarryOp1.getOperand(0));
return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
// If this is a check of the z flag of an add with 1, switch to the
// C flag.
// NOTE(review): the final clause of this condition (presumably the
// "add with 1" check) is not visible in this listing -- confirm.
if (CarryCC == X86::COND_E &&
CarryOp1.getOpcode() == X86ISD::ADD &&
return CarryOp1;
return SDValue();
/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
/// to avoid the inversion.
///
/// \p CC may be rewritten in place. Returns the replacement flags node, or an
/// empty SDValue if \p EFLAGS is not a PTEST/TESTP or no simplification
/// applies.
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (EFLAGS.getOpcode() != X86ISD::PTEST &&
EFLAGS.getOpcode() != X86ISD::TESTP)
return SDValue();
// TESTZ: ZF = (Op0 & Op1) == 0
// TESTC: CF = (~Op0 & Op1) == 0
// TESTNZC: ZF == 0 && CF == 0
EVT VT = EFLAGS.getValueType();
SDValue Op0 = EFLAGS.getOperand(0);
SDValue Op1 = EFLAGS.getOperand(1);
EVT OpVT = Op0.getValueType();
// TEST*(~X,Y) == TEST*(X,Y)
if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
X86::CondCode InvCC;
// NOTE(review): the `break` statements between the cases below, and a
// `default` case, are not visible in this listing; each case is presumably
// independent (the InvCC != COND_INVALID test after the switch suggests a
// default that sets COND_INVALID) -- confirm against upstream.
switch (CC) {
case X86::COND_B:
// testc -> testz.
InvCC = X86::COND_E;
case X86::COND_AE:
// !testc -> !testz.
InvCC = X86::COND_NE;
case X86::COND_E:
// testz -> testc.
InvCC = X86::COND_B;
case X86::COND_NE:
// !testz -> !testc.
InvCC = X86::COND_AE;
case X86::COND_A:
case X86::COND_BE:
// testnzc -> testnzc (no change).
InvCC = CC;
if (InvCC != X86::COND_INVALID) {
CC = InvCC;
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
DAG.getBitcast(OpVT, NotOp0), Op1);
if (CC == X86::COND_E || CC == X86::COND_NE) {
// TESTZ(X,~Y) == TESTC(Y,X)
if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
DAG.getBitcast(OpVT, NotOp1), Op0);
// Both operands are the same value: inspect what that value is.
if (Op0 == Op1) {
SDValue BC = peekThroughBitcasts(Op0);
EVT BCVT = BC.getValueType();
assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
"Unexpected vector type");
// The operand is itself an AND: test that AND's two operands directly.
if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
DAG.getBitcast(OpVT, BC.getOperand(0)),
DAG.getBitcast(OpVT, BC.getOperand(1)));
// The operand is an and-not: switch from the Z conditions to the C
// conditions and test its two operands directly.
if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
DAG.getBitcast(OpVT, BC.getOperand(0)),
DAG.getBitcast(OpVT, BC.getOperand(1)));
// If every element is an all-sign value, see if we can use MOVMSK to
// more efficiently extract the sign bits and compare that.
// TODO: Handle TESTC with comparison inversion.
// TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
// MOVMSK combines to make sure it's never worse than PTEST?
unsigned EltBits = BCVT.getScalarSizeInBits();
if (DAG.ComputeNumSignBits(BC) == EltBits) {
assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
APInt SignMask = APInt::getSignMask(EltBits);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// NOTE(review): `DL` is used below but its declaration is not visible in
// this listing -- confirm against upstream.
if (SDValue Res =
TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
// For vXi16 cases we need to use pmovmskb and extract every other
// sign bit.
if (EltBits == 16) {
MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
Res = DAG.getBitcast(MovmskVT, Res);
Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
// 0xAAAAAAAA selects the odd-numbered mask bits.
Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
} else {
Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
// Compare the extracted sign-bit mask against zero.
return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
DAG.getConstant(0, DL, MVT::i32));
// TESTZ(-1,X) == TESTZ(X,X)
if (ISD::isBuildVectorAllOnes(Op0.getNode()))
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
// TESTZ(X,-1) == TESTZ(X,X)
if (ISD::isBuildVectorAllOnes(Op1.getNode()))
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
return SDValue();
// Attempt to simplify the MOVMSK input based on the comparison type.
//
// Only equality conditions (COND_E / COND_NE) on an i32 CMP/SUB of a MOVMSK
// against a constant are considered. Returns the replacement flags node, or
// an empty SDValue when nothing applies.
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Handle eq/ne against zero (any_of).
// Handle eq/ne against -1 (all_of).
if (!(CC == X86::COND_E || CC == X86::COND_NE))
return SDValue();
if (EFLAGS.getValueType() != MVT::i32)
return SDValue();
unsigned CmpOpcode = EFLAGS.getOpcode();
if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
return SDValue();
// The right-hand side of the compare must be a constant.
auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
if (!CmpConstant)
return SDValue();
const APInt &CmpVal = CmpConstant->getAPIntValue();
SDValue CmpOp = EFLAGS.getOperand(0);
// CmpBits is measured before any truncate is peeled off below.
unsigned CmpBits = CmpOp.getValueSizeInBits();
assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
// Peek through any truncate.
if (CmpOp.getOpcode() == ISD::TRUNCATE)
CmpOp = CmpOp.getOperand(0);
// Bail if we don't find a MOVMSK.
if (CmpOp.getOpcode() != X86ISD::MOVMSK)
return SDValue();
SDValue Vec = CmpOp.getOperand(0);
MVT VecVT = Vec.getSimpleValueType();
assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
"Unexpected MOVMSK operand");
unsigned NumElts = VecVT.getVectorNumElements();
unsigned NumEltBits = VecVT.getScalarSizeInBits();
bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
// NOTE(review): the initializer of IsAllOf is cut off in this listing; the
// final clause (presumably a test of CmpVal against the all-elements mask)
// is not visible -- confirm against upstream.
bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
if (!IsAnyOf && !IsAllOf)
return SDValue();
// See if we can peek through to a vector with a wider element type, if the
// signbits extend down to all the sub-elements as well.
// Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
// potential SimplifyDemandedBits/Elts cases.
// NOTE(review): `DL` is used below but its declaration is not visible in
// this listing -- confirm against upstream.
if (Vec.getOpcode() == ISD::BITCAST) {
SDValue BC = peekThroughBitcasts(Vec);
MVT BCVT = BC.getSimpleValueType();
unsigned BCNumElts = BCVT.getVectorNumElements();
unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
BCNumEltBits > NumEltBits &&
DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
// all_of compares against a mask with one bit per wide element.
unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
DAG.getConstant(CmpMask, DL, MVT::i32));
// MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
// MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
if (IsAllOf && Subtarget.hasSSE41()) {
SDValue BC = peekThroughBitcasts(Vec);
if (BC.getOpcode() == X86ISD::PCMPEQ &&
ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
// See if we can avoid a PACKSS by calling MOVMSK on the sources.
// For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
// sign bits prior to the comparison with zero unless we know that
// the vXi16 splats the sign bit down to the lower i8 half.
// TODO: Handle all_of patterns.
if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
SDValue VecOp0 = Vec.getOperand(0);
SDValue VecOp1 = Vec.getOperand(1);
// More than 8 sign bits means the sign reaches into the low byte of each
// 16-bit element.
bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
// Case 1: only the first PACKSS input is defined and 8 mask bits are
// compared.
if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
if (!SignExt0) {
// Keep only the odd-numbered mask bits.
Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
DAG.getConstant(0xAAAA, DL, MVT::i16));
return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
DAG.getConstant(0, DL, MVT::i16));
// Case 2: both PACKSS inputs are subvectors at indices 0 and 8 of the same
// source vector; run MOVMSK on that source as v32i8 instead.
if (CmpBits == 16 && Subtarget.hasInt256() &&
VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
VecOp0.getConstantOperandAPInt(1) == 0 &&
VecOp1.getConstantOperandAPInt(1) == 8 &&
(IsAnyOf || (SignExt0 && SignExt1))) {
SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
if (!SignExt0 || !SignExt1) {
assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
DAG.getConstant(CmpMask, DL, MVT::i32));
// MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
SmallVector<int, 32> ShuffleMask;
SmallVector<SDValue, 2> ShuffleInputs;
if (NumElts == CmpBits &&
getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
ShuffleMask, DAG) &&
ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
unsigned NumShuffleElts = ShuffleMask.size();
APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
// NOTE(review): no statement recording M in DemandedElts is visible in this
// listing; presumably each referenced index is marked here -- confirm.
for (int M : ShuffleMask) {
assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
if (DemandedElts.isAllOnesValue()) {
SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
// Convert the mask back to the type of the original compare operand.
Result =
DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
// NOTE(review): the final argument of this getNode call is not visible in
// this listing -- confirm against upstream.
return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
return SDValue();
/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
///
/// The individual combines are tried in a fixed order and the first one that
/// yields a value wins; an empty SDValue means none applied.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// The carry-through-ADD combine is only attempted for the carry condition.
if (CC == X86::COND_B)
if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
return Flags;
// Boolean test of a SETCC/CMOV result.
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
return R;
// PTEST/TESTP operand simplification.
if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
return R;
// any_of / all_of tests on a MOVMSK result.
if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
return R;
// Last resort: reuse the flags of a LOCKed atomic arithmetic instruction.
return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
///
/// Operand 0 is the value selected when the condition is false, operand 1 the
/// value selected when it is true, operand 2 the condition code and operand 3
/// the flags value. Returns the replacement value, or an empty SDValue if no
/// fold applies.
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue FalseOp = N->getOperand(0);
SDValue TrueOp = N->getOperand(1);
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
// cmov X, X, ?, ? --> X
if (TrueOp == FalseOp)
return TrueOp;
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
if (!(FalseOp.getValueType() == MVT::f80 ||
(FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
(FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
!Subtarget.hasCMov() || hasFPCMov(CC)) {
// NOTE(review): the initializer list below is not closed in this listing;
// its last element (presumably the simplified Flags) is not visible.
SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
// If this is a select between two integer constants, try to do some
// optimizations. Note that the operands are ordered the opposite of SELECT
// operands.
if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
// Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
// larger than FalseC (the false value).
if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueC, FalseC);
std::swap(TrueOp, FalseOp);
// Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
// This is efficient for any integer data type (including i8/i16) and
// shift amount.
if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
unsigned ShAmt = TrueC->getAPIntValue().logBase2();
Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
DAG.getConstant(ShAmt, DL, MVT::i8));
return Cond;
// Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
// for any integer data type, including i8/i16.
if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
FalseC->getValueType(0), Cond);
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
"Implicit constant truncation");
bool isFastMultiplier = false;
if (Diff.ult(10)) {
switch (Diff.getZExtValue()) {
default: break;
case 1: // result = add base, cond
case 2: // result = lea base( , cond*2)
case 3: // result = lea base(cond, cond*2)
case 4: // result = lea base( , cond*4)
case 5: // result = lea base(cond, cond*4)
case 8: // result = lea base( , cond*8)
case 9: // result = lea base(cond, cond*8)
isFastMultiplier = true;
if (isFastMultiplier) {
Cond = getSETCC(CC, Cond, DL ,DAG);
// Zero extend the condition if needed.
// NOTE(review): the last argument of this getNode call is not visible in
// this listing -- confirm against upstream.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
// Scale the condition by the difference.
if (Diff != 1)
Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
DAG.getConstant(Diff, DL, Cond.getValueType()));
// Add the base if non-zero.
if (FalseC->getAPIntValue() != 0)
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
// Handle these cases:
// (select (x != c), e, c) -> (select (x != c), e, x),
// (select (x == c), c, e) -> (select (x == c), x, e)
// where the c is an integer constant, and the "select" is the combination
// of CMOV and CMP.
// The rationale for this change is that the conditional-move from a constant
// needs two instructions, however, conditional-move from a register needs
// only one instruction.
// CAVEAT: By replacing a constant with a symbolic value, it may obscure
// some instruction-combining opportunities. This opt needs to be
// postponed as late as possible.
if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
// the DCI.xxxx conditions are provided to postpone the optimization as
// late as possible.
ConstantSDNode *CmpAgainst = nullptr;
if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
!isa<ConstantSDNode>(Cond.getOperand(0))) {
// Canonicalize the != form into the == form by swapping the arms.
if (CC == X86::COND_NE &&
CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueOp, FalseOp);
// Replace the constant true arm with the compared (non-constant) value.
if (CC == X86::COND_E &&
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
SDValue Ops[] = {FalseOp, Cond.getOperand(0),
DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
// Fold and/or of setcc's to double CMOV:
// (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
// (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
// This combine lets us generate:
// cmovcc1 (jcc1 if we don't have CMOV)
// cmovcc2 (same)
// instead of:
// setcc1
// setcc2
// and/or
// cmovne (jne if we don't have CMOV)
// When we can't use the CMOV instruction, it might increase branch
// mispredicts.
// When we can use CMOV, or when there is no mispredict, this improves
// throughput and reduces register pressure.
if (CC == X86::COND_NE) {
SDValue Flags;
X86::CondCode CC0, CC1;
bool isAndSetCC;
if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
// For the AND form, swap the arms and invert both conditions (second
// pattern in the comment above).
if (isAndSetCC) {
std::swap(FalseOp, TrueOp);
CC0 = X86::GetOppositeBranchCondition(CC0);
CC1 = X86::GetOppositeBranchCondition(CC1);
SDValue LOps[] = {FalseOp, TrueOp,
DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
// NOTE(review): the initializer list below is not closed in this listing;
// its last element (presumably Flags) is not visible.
SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
return CMOV;
// Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
// (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
// Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
// (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
if ((CC == X86::COND_NE || CC == X86::COND_E) &&
Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
SDValue Add = TrueOp;
SDValue Const = FalseOp;
// Canonicalize the condition code for easier matching and output.
if (CC == X86::COND_E)
std::swap(Add, Const);
// We might have replaced the constant in the cmov with the LHS of the
// compare. If so change it to the RHS of the compare.
if (Const == Cond.getOperand(0))
Const = Cond.getOperand(1);
// Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
(Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
EVT VT = N->getValueType(0);
// This should constant fold.
SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
// Select between C1-C2 and the CTTZ result, then add C2 back afterwards.
SDValue CMov =
DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
return SDValue();
/// The ways a vector multiply can be narrowed, named after the signedness
/// (S/U) and bit width (8/16) that the operand values fit in.
enum class ShrinkMode {
  MULS8,  ///< Operands fit in a signed 8-bit range.
  MULU8,  ///< Operands fit in an unsigned 8-bit range.
  MULS16, ///< Operands fit in a signed 16-bit range.
  MULU16  ///< Operands fit in an unsigned 16-bit range.
};
/// Decide whether the two operands of the multiply \p N are narrow enough for
/// the multiply to be shrunk, and report the applicable mode through \p Mode.
/// Only operands with 32-bit scalar elements are considered. The decision is
/// based on the number of known sign bits of each operand and on whether each
/// operand's sign bit is known to be zero.
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
EVT VT = N->getOperand(0).getValueType();
if (VT.getScalarSizeInBits() != 32)
return false;
assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
unsigned SignBits[2] = {1, 1};
bool IsPositive[2] = {false, false};
// Gather sign information for both operands.
for (unsigned i = 0; i < 2; i++) {
SDValue Opd = N->getOperand(i);
SignBits[i] = DAG.ComputeNumSignBits(Opd);
IsPositive[i] = DAG.SignBitIsZero(Opd);
bool AllPositive = IsPositive[0] && IsPositive[1];
// The operand with fewer sign bits limits how far we can shrink.
unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
// When ranges are from -128 ~ 127, use MULS8 mode.
if (MinSignBits >= 25)
Mode = ShrinkMode::MULS8;
// When ranges are from 0 ~ 255, use MULU8 mode.
else if (AllPositive && MinSignBits >= 24)
Mode = ShrinkMode::MULU8;
// When ranges are from -32768 ~ 32767, use MULS16 mode.
else if (MinSignBits >= 17)
Mode = ShrinkMode::MULS16;
// When ranges are from 0 ~ 65535, use MULU16 mode.
else if (AllPositive && MinSignBits >= 16)
Mode = ShrinkMode::MULU16;
// NOTE(review): no `else` is visible before this return in this listing,
// which would make the `return true` below unreachable; presumably this is
// the final arm of the chain above -- confirm against upstream.
return false;
return true;
/// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of mul may be shrunk to generate more
/// efficient code. Two typical patterns are handled:
/// Pattern1:
/// %2 = sext/zext <N x i8> %1 to <N x i32>
/// %4 = sext/zext <N x i8> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
/// %5 = mul <N x i32> %2, %4
/// Pattern2:
/// %2 = zext/sext <N x i16> %1 to <N x i32>
/// %4 = zext/sext <N x i16> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
/// %5 = mul <N x i32> %2, %4
/// There are four mul shrinking modes:
/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
/// generate pmullw+sext32 for it (MULS8 mode).
/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
/// generate pmullw+zext32 for it (MULU8 mode).
/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
/// generate pmullw+pmulhw for it (MULS16 mode).
/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
/// generate pmullw+pmulhuw for it (MULU16 mode).
///
/// Returns the replacement value, or an empty SDValue if the transform is not
/// legal, not profitable, or not applicable to \p N.
static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Check for legality
// pmullw/pmulhw are not supported by SSE.
if (!Subtarget.hasSSE2())
return SDValue();
// Check for profitability
// pmulld is supported since SSE41. It is better to use pmulld
// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// the expansion.
bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
return SDValue();
ShrinkMode Mode;
if (!canReduceVMulWidth(N, DAG, Mode))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getOperand(0).getValueType();
unsigned NumElts = VT.getVectorNumElements();
// The repacking below splits the result into two halves, so an odd element
// count is rejected.
if ((NumElts % 2) != 0)
return SDValue();
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
// Shrink the operands of mul.
SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
// Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
// lower part is needed.
SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
// NOTE(review): the second arm of the conditional opcode below is not
// visible in this listing (presumably the sign-extending counterpart).
if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
DL, VT, MulLo);
// Each half of the final result holds NumElts / 2 i32 elements.
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
// the higher part is also needed.
SDValue MulHi =
DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
ReducedVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result.
// Generate shuffle functioning as punpcklwd.
// Interleaves element i of MulLo with element i of MulHi for the low half;
// indices >= NumElts select from the second shuffle input.
SmallVector<int, 16> ShuffleMask(NumElts);
for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
ShuffleMask[2 * i] = i;
ShuffleMask[2 * i + 1] = i + NumElts;
SDValue ResLo =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
ResLo = DAG.getBitcast(ResVT, ResLo);
// Generate shuffle functioning as punpckhwd.
// Same interleave for the upper half of both inputs.
for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
ShuffleMask[2 * i] = i + NumElts / 2;
ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
SDValue ResHi =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
ResHi = DAG.getBitcast(ResVT, ResHi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
/// Expand a multiply of N's operand 0 by selected constants \p MulAmt into a
/// short sequence of X86ISD::MUL_IMM, SHL, ADD and SUB nodes of type \p VT.
/// Returns an empty SDValue when \p MulAmt is not one of the handled values.
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
EVT VT, const SDLoc &DL) {
// Builds ((x * Mult) << Shift) followed by an add or subtract.
// NOTE(review): the final operand of the ADD/SUB node is not visible in this
// listing; per the case comments below it is presumably x -- confirm.
auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(Mult, DL, VT));
Result = DAG.getNode(ISD::SHL, DL, VT, Result,
DAG.getConstant(Shift, DL, MVT::i8));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
return Result;
// Builds ((x * Mul1) * Mul2) followed by an add or subtract.
// NOTE(review): as above, the final operand of the ADD/SUB node is not
// visible in this listing.
auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(Mul1, DL, VT));
Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
DAG.getConstant(Mul2, DL, VT));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
return Result;
switch (MulAmt) {
case 11:
// mul x, 11 => add ((shl (mul x, 5), 1), x)
return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
case 21:
// mul x, 21 => add ((shl (mul x, 5), 2), x)
return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
case 41:
// mul x, 41 => add ((shl (mul x, 5), 3), x)
return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
case 22:
// mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
case 19:
// mul x, 19 => add ((shl (mul x, 9), 1), x)
return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
case 37:
// mul x, 37 => add ((shl (mul x, 9), 2), x)
return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
case 73:
// mul x, 73 => add ((shl (mul x, 9), 3), x)
return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
case 13:
// mul x, 13 => add ((shl (mul x, 3), 2), x)
return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
case 23:
// mul x, 23 => sub ((shl (mul x, 3), 3), x)
return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
case 26:
// mul x, 26 => add ((mul (mul x, 5), 5), x)
return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
case 28:
// mul x, 28 => add ((mul (mul x, 9), 3), x)
return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
case 29:
// mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
// Another trick. If this is a power 2 + 2/4/8, we can use a shift followed
// by a single LEA.
// First check if this a sum of two power of 2s because that's easy. Then
// count how many zeros are up to the first bit.
// TODO: We can do this even without LEA at a cost of two shifts and an add.
// MulAmt & (MulAmt - 1) clears the lowest set bit; if what remains is a
// power of two, MulAmt has exactly two bits set.
if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
unsigned ScaleShift = countTrailingZeros(MulAmt);
// The low set bit must be 2, 4 or 8 (shift of 1..3).
if (ScaleShift >= 1 && ScaleShift < 4) {
unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(ShiftAmt, DL, MVT::i8));
SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(ScaleShift, DL, MVT::i8));
return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
return SDValue();
/// Try to turn a vXi32 multiply into X86ISD::VPMADDWD.
///
/// If the upper 17 bits of each element are zero then we can use PMADDWD,
/// which is always at least as quick as PMULLD, except on KNL.
///
/// \param N          Node being combined (invoked from combineMul).
/// \param DAG        DAG used for known-bits queries and node creation.
/// \param Subtarget  Queried for SSE2, SSE4.1, BWI and isPMADDWDSlow().
/// \returns The replacement value, or an empty SDValue if the fold is rejected.
// NOTE(review): brace-only lines and some continuation lines look missing in
// this copy of the function -- verify against upstream before editing code.
static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail out without SSE2, or when the subtarget reports PMADDWD as slow.
if (!Subtarget.hasSSE2())
return SDValue();
if (Subtarget.isPMADDWDSlow())
return SDValue();
EVT VT = N->getValueType(0);
// Only support vXi32 vectors.
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
return SDValue();
// Make sure the type is legal or will be widened to a legal type.
if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
// WVT views the same vector as i16 lanes: twice as many elements as VT.
MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
// Without BWI, we would need to split v32i16.
if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// If we are zero extending two steps without SSE4.1, it's better to reduce
// the vmul width instead.
if (!Subtarget.hasSSE41() &&
(N0.getOpcode() == ISD::ZERO_EXTEND &&
N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
(N1.getOpcode() == ISD::ZERO_EXTEND &&
N1.getOperand(0).getScalarValueSizeInBits() <= 8))
return SDValue();
// Both operands must have the top 17 bits of every 32-bit element known zero.
APInt Mask17 = APInt::getHighBitsSet(32, 17);
if (!DAG.MaskedValueIsZero(N1, Mask17) ||
!DAG.MaskedValueIsZero(N0, Mask17))
return SDValue();
// Use SplitOpsAndApply to handle AVX splitting.
// The builder derives its result type from the operand width it is handed:
// one i32 element per 32 bits of Ops[0].
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
// Operands are bitcast to the i16 view (WVT) before being passed on.
// NOTE(review): the final argument of this call is not visible in this copy;
// presumably it is PMADDWDBuilder -- TODO confirm.
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
{ DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
/// Try to lower a vXi64 multiply to X86ISD::PMULDQ or X86ISD::PMULUDQ.
///
/// PMULDQ is used (SSE4.1 only) when both operands have more than 32 sign
/// bits; PMULUDQ is used when the upper 32 bits of both operands are known
/// zero.
///
/// \param N          Node being combined (invoked from combineMul).
/// \param DAG        DAG used for sign-bit/known-bits queries and node creation.
/// \param Subtarget  Queried for SSE2 and SSE4.1.
/// \returns The replacement value, or an empty SDValue if neither form applies.
// NOTE(review): brace-only lines look missing in this copy of the function --
// verify against upstream before editing code.
static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
EVT VT = N->getValueType(0);
// Only support vXi64 vectors.
// NOTE(review): the condition below ends with a dangling `||`; its last
// operand is not visible in this copy -- TODO restore from upstream.
if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
VT.getVectorNumElements() < 2 ||
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// MULDQ returns the 64-bit result of the signed multiplication of the lower
// 32-bits. We can lower with this if the sign bits stretch that far.
if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
DAG.ComputeNumSignBits(N1) > 32) {
// The builder reuses the (possibly split) operand type as the result type.
auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
PMULDQBuilder, /*CheckBWI*/false);
// If the upper bits are zero we can use a single pmuludq.
APInt Mask = APInt::getHighBitsSet(64, 32);
if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
// Same shape as the signed builder above, but emits the unsigned node.
auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
PMULUDQBuilder, /*CheckBWI*/false);
return SDValue();
/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
///
/// The vector folds (PMADDWD, PMULDQ/PMULUDQ, reduceVMULWidth) are tried
/// first. The scalar constant decomposition that follows only runs when
/// MulConstantOptimization is set, the function is not minsize, the combine
/// is neither before legalization nor called by the legalizer, the type is
/// i32 or i64, and operand 1 is a constant that is not a power of two.
///
/// \param N          Multiply node being combined.
/// \param DAG        DAG used to build replacement nodes.
/// \param DCI        Tells which combine phase is running.
/// \param Subtarget  Queried for slowLEA() and passed to the vector helpers.
/// \returns The replacement value, or an empty SDValue if nothing was done.
// NOTE(review): brace-only lines, several `else` lines and some trailing call
// operands look missing in this copy of the function -- verify against
// upstream before editing code.
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
// Vector-specific folds take priority.
if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
return V;
if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
return V;
// Before legalization every remaining vector multiply goes to
// reduceVMULWidth, whatever it returns.
if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);
if (!MulConstantOptimization)
return SDValue();
// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!C)
return SDValue();
// Power-of-two multiplies are left alone here.
if (isPowerOf2_64(C->getZExtValue()))
return SDValue();
// Work on the magnitude of the constant; negative amounts are handled by
// negating (or re-ordering a subtract) at the end of each case.
int64_t SignMulAmt = C->getSExtValue();
assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
SDLoc DL(N);
// |amount| of 3, 5 or 9 becomes a single MUL_IMM, negated when needed.
if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(AbsMulAmt, DL, VT));
// NOTE(review): the last operand of this SUB is not visible in this copy;
// presumably NewMul (0 - NewMul) -- TODO confirm.
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
return NewMul;
// Try to split the amount as MulAmt1 * MulAmt2 with MulAmt1 in {9, 5, 3},
// tested in that order. Both stay 0 when none of them divides the amount.
uint64_t MulAmt1 = 0;
uint64_t MulAmt2 = 0;
if ((AbsMulAmt % 9) == 0) {
MulAmt1 = 9;
MulAmt2 = AbsMulAmt / 9;
} else if ((AbsMulAmt % 5) == 0) {
MulAmt1 = 5;
MulAmt2 = AbsMulAmt / 5;
} else if ((AbsMulAmt % 3) == 0) {
MulAmt1 = 3;
MulAmt2 = AbsMulAmt / 3;
SDValue NewMul;
// For negative multiply amounts, only allow MulAmt2 to be a power of 2.
if (MulAmt2 &&
(isPowerOf2_64(MulAmt2) ||
(SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
if (isPowerOf2_64(MulAmt2) &&
!(SignMulAmt >= 0 && N->hasOneUse() &&
N->use_begin()->getOpcode() == ISD::ADD))
// If second multiplier is pow2, issue it first. We want the multiply by
// 3, 5, or 9 to be folded into the addressing mode unless the lone use
// is an add. Only do this for positive multiply amounts since the
// negate would prevent it from being used as an address mode anyway.
std::swap(MulAmt1, MulAmt2);
// First factor: a shift when it is a power of two, a MUL_IMM otherwise.
// NOTE(review): the `else` separating the two assignments below is not
// visible in this copy -- TODO confirm.
if (isPowerOf2_64(MulAmt1))
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(MulAmt1, DL, VT));
// Second factor, applied to the result of the first.
// NOTE(review): same missing `else` as above -- TODO confirm.
if (isPowerOf2_64(MulAmt2))
NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
DAG.getConstant(MulAmt2, DL, VT));
// Negate the result.
// NOTE(review): the last operand of this SUB is not visible in this copy;
// presumably NewMul -- TODO confirm.
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
} else if (!Subtarget.slowLEA())
NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
// Nothing matched so far: fall back to shift plus add/sub patterns around a
// power of two.
if (!NewMul) {
assert(C->getZExtValue() != 0 &&
C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
"Both cases that could cause potential overflows should have "
"already been handled.");
if (isPowerOf2_64(AbsMulAmt - 1)) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
// NOTE(review): the tail of this getNode call (shift-amount type and
// closing parentheses) is not visible in this copy -- TODO confirm.
NewMul = DAG.getNode(
ISD::ADD, DL, VT, N->getOperand(0),
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
// To negate, subtract the number from zero
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT,
DAG.getConstant(0, DL, VT), NewMul);
} else if (isPowerOf2_64(AbsMulAmt + 1)) {
// (mul x, 2^N - 1) => (sub (shl x, N), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt + 1),
DL, MVT::i8));
// To negate, reverse the operands of the subtract.
// NOTE(review): the `else` between the two SUBs below is not visible in
// this copy -- TODO confirm.
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
// (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt - 2),
DL, MVT::i8));
NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
} else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
// (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(AbsMulAmt + 2),
DL, MVT::i8));
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
// NewMul is still empty here when no pattern applied.
return NewMul;
/// Try to form a MULHU or MULHS node by looking for
/// (srl (mul ext, ext), 16)
///
/// TODO: This is X86 specific because we want to be able to handle wide types
/// before type legalization. But we can only do it if the vector will be
/// legalized via widening/splitting. Type legalization can't handle promotion
/// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
/// combiner.
///
/// \param N          SRL or SRA node (asserted).
/// \param DAG        DAG used to build the MULH and the re-extension.
/// \param Subtarget  Queried for SSE4.1.
/// \returns An extend of the new MULHS/MULHU to N's type, or an empty SDValue
///          if the pattern does not match.
// NOTE(review): brace-only lines look missing in this copy of the function --
// verify against upstream before editing code.
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
"SRL or SRA node is required here!");
SDLoc DL(N);
// Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
// the multiply.
if (!Subtarget.hasSSE41())
return SDValue();
// The operation feeding into the shift must be a multiply.
// It must also have no other users.
SDValue ShiftOperand = N->getOperand(0);
if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
return SDValue();
// Input type should be at least vXi32.
EVT VT = N->getValueType(0);
if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
return SDValue();
// Need a shift by 16.
APInt ShiftAmt;
if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
ShiftAmt != 16)
return SDValue();
SDValue LHS = ShiftOperand.getOperand(0);
SDValue RHS = ShiftOperand.getOperand(1);
// Both multiply operands must be the same kind of extend (sign or zero).
unsigned ExtOpc = LHS.getOpcode();
if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
RHS.getOpcode() != ExtOpc)
return SDValue();
// Peek through the extends.
LHS = LHS.getOperand(0);
RHS = RHS.getOperand(0);
// Ensure the input types match.
// The pre-extend element type must be i16.
EVT MulVT = LHS.getValueType();
if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
return SDValue();
// The extend kind of the inputs selects signed vs. unsigned high multiply.
unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
// The shift kind (SRA vs. SRL) selects how the result is widened back to VT.
ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
return DAG.getNode(ExtOpc, DL, VT, Mulh);
/// Target combines for a left-shift node.
///
/// Two folds are attempted:
///  - (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) for scalar
///    integers, when the shifted mask is provably safe;
///  - (shl V, 1) -> (add V, V) when the shift amount is a constant-splat
///    build vector of ones.
///
/// \param N    Shift node being combined.
/// \param DAG  DAG used to build replacement nodes.
/// \returns The replacement value, or an empty SDValue if neither fold applies.
// NOTE(review): brace-only lines look missing in this copy of the function --
// verify against upstream before editing code.
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
EVT VT = N0.getValueType();
// fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
// since the result of setcc_c is all zero's or all ones.
if (VT.isInteger() && !VT.isVector() &&
N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
// Mask becomes c1 << c2.
APInt Mask = N0.getConstantOperandAPInt(1);
Mask <<= N1C->getAPIntValue();
bool MaskOK = false;
// We can handle cases concerning bit-widening nodes containing setcc_c if
// we carefully interrogate the mask to make sure we are semantics
// preserving.
// The transform is not safe if the result of C1 << C2 exceeds the bitwidth
// of the underlying setcc_c operation if the setcc_c was zero extended.
// Consider the following example:
// zext(setcc_c) -> i32 0x0000FFFF
// c1 -> i32 0x0000FFFF
// c2 -> i32 0x00000001
// (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
// (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = true;
} else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = true;
} else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
N00.getOpcode() == ISD::ANY_EXTEND) &&
N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
// For zero/any extends the shifted mask must still fit in the width of the
// un-extended SETCC_CARRY value.
MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
// A mask that became zero is not rewritten here.
if (MaskOK && Mask != 0) {
SDLoc DL(N);
return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
// Hardware support for vector shifts is sparse which makes us scalarize the
// vector operations in many cases. Also, on sandybridge ADD is faster than
// shl.
// (shl V, 1) -> add V,V
if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
assert(N0.getValueType().isVector() && "Invalid vector shift type");
// We shift all of the values by one. In many cases we do not have
// hardware support for this operation. This is better expressed as an ADD
// of two values.
if (N1SplatC->isOne())
return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
return SDValue();
/// Target combines for an arithmetic right-shift node.
///
/// First tries combineShiftToPMULH. Otherwise, for scalar types, rewrites
/// (ashr (shl a, C), SarConst) -- with a single-use shl whose amount C equals
/// the value size minus 8, 16 or 32 -- as a SIGN_EXTEND_INREG followed by
/// whatever shift is still needed.
///
/// \param N          Shift node being combined.
/// \param DAG        DAG used to build replacement nodes.
/// \param Subtarget  Forwarded to combineShiftToPMULH.
/// \returns The replacement value, or an empty SDValue if nothing applies.
// NOTE(review): brace-only lines look missing in this copy of the function --
// verify against upstream before editing code.
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned Size = VT.getSizeInBits();
if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
return V;
// fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
// into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
// into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
// depending on sign of (SarConst - [56,48,32,24,16])
// sexts in X86 are MOVs. The MOVs have the same code size
// as above SHIFTs (only SHIFT on 1 has lower code size).
// However the MOVs have 2 advantages to a SHIFT:
// 1. MOVs can write to a register that differs from source
// 2. MOVs accept memory operands
if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
N0.getOperand(1).getOpcode() != ISD::Constant)
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
// New shift-amount constants are created with the original amount's type.
EVT CVT = N1.getValueType();
if (SarConst.isNegative())
return SDValue();
// Look for the narrow type whose sign extension the shl/ashr pair encodes.
for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
unsigned ShiftSize = SVT.getSizeInBits();
// skipping types without corresponding sext/zext and
// ShlConst that is not one of [56,48,32,24,16]
// NOTE(review): the statement controlled by this `if` is not visible in this
// copy; presumably a `continue;` -- TODO confirm.
if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
SDLoc DL(N);
SDValue NN =
DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
// Remaining shift after the sign extension: zero means done, negative means
// a left shift by the magnitude, positive means an arithmetic right shift.
SarConst = SarConst - (Size - ShiftSize);
if (SarConst == 0)
return NN;
else if (SarConst.isNegative())
return DAG.getNode(ISD::SHL, DL, VT, NN,
DAG.getConstant(-SarConst, DL, CVT));
return DAG.getNode(ISD::SRA, DL, VT, NN,
DAG.getConstant(SarConst, DL, CVT));
return SDValue();
/// Target combines for a logical right-shift node.
///
/// First tries combineShiftToPMULH. Then, only after DAG legalization,
/// rewrites srl (and X, C1), C2 as and (srl X, C2), (C1 >> C2) when that
/// shrinks the mask constant across the 8-bit or 32-bit size boundary.
///
/// \param N          Shift node being combined.
/// \param DAG        DAG used to build replacement nodes.
/// \param DCI        Tells which combine phase is running.
/// \param Subtarget  Forwarded to combineShiftToPMULH.
/// \returns The replacement value, or an empty SDValue if nothing applies.
// NOTE(review): brace-only lines look missing in this copy of the function --
// verify against upstream before editing code.
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
return V;
// Only do this on the last DAG combine as it can interfere with other
// combines.
if (!DCI.isAfterLegalizeDAG())
return SDValue();
// Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
// TODO: This is a generic DAG combine that became an x86-only combine to
// avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
// and-not ('andn').
if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
return SDValue();
// Both the shift amount and the AND mask must be scalar constants.
auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!ShiftC || !AndC)
return SDValue();
// If we can shrink the constant mask below 8-bits or 32-bits, then this
// transform should reduce code size. It may also enable secondary transforms
// from improved known-bits analysis or instruction selection.
APInt MaskVal = AndC->getAPIntValue();
// If this can be matched by a zero extend, don't optimize.
// That is: a low-bits mask whose width is a power of two and at least 8.
if (MaskVal.isMask()) {
unsigned TO = MaskVal.countTrailingOnes();
if (TO >= 8 && isPowerOf2_32(TO))
return SDValue();
// Compare the signed encoding size of the mask before and after shifting.
APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
unsigned OldMaskSize = MaskVal.getMinSignedBits();
unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
(OldMaskSize > 32 && NewMaskSize <= 32)) {
// srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
SDLoc DL(N);
SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
return SDValue();
/// Try to move a shuffle from the inputs of a PACKSS/PACKUS node to its
/// output.
///
/// Two shapes are handled: a 128-bit PACK of the low and high subvectors of a
/// single 256-bit unary shuffle, and a 256-bit PACK whose two operands are
/// shuffles of the same pair of sources.
///
/// \param N    PACKSS or PACKUS node (asserted).
/// \param DAG  DAG used to build replacement nodes.
/// \returns The replacement value, or an empty SDValue if nothing applies.
// NOTE(review): brace-only lines look missing in this copy of the function --
// verify against upstream before editing code.
static SDValue combineVectorPackWithShuffle(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
"Unexpected pack opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
unsigned NumDstElts = VT.getVectorNumElements();
// Attempt to fold a PACK of the low and high subvectors of a shuffle of X
// to SHUFFLE(PACK(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
// truncation trees that help us avoid lane crossing shuffles.
// TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
// N0 must extract from index 0 and N1 from index NumDstElts / 2 of the same
// 256-bit source, and the PACK result must be 128 bits wide.
if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N0.getConstantOperandAPInt(1) == 0 &&
N1.getConstantOperandAPInt(1) == (NumDstElts / 2) &&
N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
N0.getOperand(0).getValueType().is256BitVector()) {
// TODO - support target/faux shuffles.
SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
// To keep the PACK LHS/RHS coherency, we must be able to scale the unary
// shuffle to a vXi64 width - we can probably relax this in the future.
SmallVector<int, 4> ShuffleMask;
if (SVN->getOperand(1).isUndef() &&
scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
SDLoc DL(N);
// Pack the two halves of the un-shuffled source, then apply the scaled
// 4-element mask to the packed result viewed as v4i32.
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
Lo = DAG.getBitcast(N0.getValueType(), Lo);
Hi = DAG.getBitcast(N1.getValueType(), Hi);
SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
Res = DAG.getBitcast(MVT::v4i32, Res);
Res = DAG.getVectorShuffle(MVT::v4i32, DL, Res, Res, ShuffleMask);
return DAG.getBitcast(VT, Res);
// Attempt to fold PACK(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(PACK(X,Y)).
// TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles.
if (VT.is256BitVector()) {
if (auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(N0)) {
if (auto *SVN1 = dyn_cast<ShuffleVectorSDNode>(N1)) {
// Each input shuffle must be expressible as a 2-element mask.
SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
if (scaleShuffleElements(SVN0->getMask(), 2, ShuffleMask0) &&
scaleShuffleElements(SVN1->getMask(), 2, ShuffleMask1)) {
SDValue Op00 = SVN0->getOperand(0);
SDValue Op01 = SVN0->getOperand(1);
SDValue Op10 = SVN1->getOperand(0);
SDValue Op11 = SVN1->getOperand(1);
// Accept the second shuffle with its operands in swapped order.
// NOTE(review): only the operands are swapped in the visible code; whether
// ShuffleMask1 is also adjusted cannot be seen in this copy -- TODO confirm.
if ((Op00 == Op11) && (Op01 == Op10)) {
std::swap(Op10, Op11);
if ((Op00 == Op10) && (Op01 == Op11)) {
// Concatenate the two 2-element masks into one 4-element mask applied to
// the packed result viewed as v4i64.
SmallVector<int, 4> ShuffleMask;
ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
SDLoc DL(N);
SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
Res = DAG.getBitcast(MVT::v4i64, Res);
Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, ShuffleMask);
return DAG.getBitcast(VT, Res);
return SDValue();
/// Target combines for X86ISD::PACKSS / X86ISD::PACKUS.
///
/// In order: constant-fold the pack when both inputs are constants,
/// try combineVectorPackWithShuffle, merge the pack with a feeding
/// v8i32 truncate on AVX512, and finally try the generic recursive shuffle
/// combine.
///
/// \param N          PACKSS or PACKUS node (asserted).
/// \param DAG        DAG used to build replacement nodes.
/// \param DCI        Not referenced in the visible code.
/// \param Subtarget  Queried for AVX512/VLX and forwarded to the shuffle
///                   combiner.
/// \returns The replacement value, or an empty SDValue if nothing applies.
// NOTE(review): brace-only lines and several `else`/`continue` lines look
// missing in this copy of the function -- verify against upstream before
// editing code.
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
"Unexpected pack opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
unsigned NumDstElts = VT.getVectorNumElements();
unsigned DstBitsPerElt = VT.getScalarSizeInBits();
// Source elements are twice as wide as destination elements.
unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
"Unexpected PACKSS/PACKUS input type");
bool IsSigned = (X86ISD::PACKSS == Opcode);
// Constant Folding.
// Each input must be undef or used only by this node, and both must yield
// constant bits.
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
(N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumSrcElts = NumDstElts / 2;
unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
APInt Undefs(NumDstElts, 0);
SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
// Within each 128-bit lane the first half of the destination elements comes
// from operand 0 and the second half from operand 1.
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
// An undef source element yields an undef destination element.
// NOTE(review): the rest of this `if` body is not visible in this copy;
// presumably `continue;` -- TODO confirm.
if (UndefElts[SrcIdx]) {
Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
APInt &Val = EltBits[SrcIdx];
if (IsSigned) {
// PACKSS: Truncate signed value with signed saturation.
// Source values less than dst minint are saturated to minint.
// Source values greater than dst maxint are saturated to maxint.
// NOTE(review): the `else` before the final assignment is not visible in
// this copy -- TODO confirm.
if (Val.isSignedIntN(DstBitsPerElt))
Val = Val.trunc(DstBitsPerElt);
else if (Val.isNegative())
Val = APInt::getSignedMinValue(DstBitsPerElt);
Val = APInt::getSignedMaxValue(DstBitsPerElt);
} else {
// PACKUS: Truncate signed value with unsigned saturation.
// Source values less than zero are saturated to zero.
// Source values greater than dst maxuint are saturated to maxuint.
// NOTE(review): the `else` before the final assignment is not visible in
// this copy -- TODO confirm.
if (Val.isIntN(DstBitsPerElt))
Val = Val.trunc(DstBitsPerElt);
else if (Val.isNegative())
Val = APInt::getNullValue(DstBitsPerElt);
Val = APInt::getAllOnesValue(DstBitsPerElt);
Bits[Lane * NumDstEltsPerLane + Elt] = Val;
return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
// Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
if (SDValue V = combineVectorPackWithShuffle(N, DAG))
return V;
// Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
// truncate to create a larger truncate.
if (Subtarget.hasAVX512() &&
N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
N0.getOperand(0).getValueType() == MVT::v8i32) {
// The pack must not saturate: N0 needs more than 8 sign bits for the signed
// form, or known-zero upper 8 bits of each 16-bit element for the unsigned
// form.
if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
(!IsSigned &&
DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
if (Subtarget.hasVLX())
return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
// Widen input to v16i32 so we can truncate that.
SDLoc dl(N);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
// Attempt to combine as shuffle.
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
/// Target combines for the X86ISD::VSHL / VSRA / VSRL shift nodes.
///
/// Folds a shift of an all-zeros vector to zero, converts a shift whose
/// amount operand is constant into the shift-by-constant form, and otherwise
/// runs SimplifyDemandedVectorElts on the node.
///
/// \param N          VSHL, VSRA or VSRL node (asserted).
/// \param DAG        DAG used to build replacement nodes.
/// \param DCI        Passed to SimplifyDemandedVectorElts.
/// \param Subtarget  Not referenced in the visible code.
/// \returns The replacement value, SDValue(N, 0) if the node was simplified in
///          place, or an empty SDValue if nothing changed.
// NOTE(review): brace-only lines look missing in this copy of the function --
// verify against upstream before editing code.
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
X86ISD::VSRL == N->getOpcode()) &&
"Unexpected shift opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Shift zero -> zero.
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
// Detect constant shift amounts.
// The amount operand is read as 64-bit elements; only element 0 is used.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
EltBits[0].getZExtValue(), DAG);
// All result elements are demanded.
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
return SDValue();
/// Target combines for the shift-by-immediate nodes X86ISD::VSHLI / VSRAI /
/// VSRLI.
///
/// Handles out-of-range amounts, shifts by zero, shifts of all-zeros /
/// all-ones inputs, merging of two stacked shifts with the same opcode,
/// whole-byte logical shifts (via the shuffle combiner), constant folding,
/// and finally SimplifyDemandedBits.
///
/// \param N          VSHLI, VSRAI or VSRLI node (asserted); operand 1 is an i8
///                   shift amount (asserted).
/// \param DAG        DAG used to build replacement nodes.
/// \param DCI        Passed to SimplifyDemandedBits.
/// \param Subtarget  Forwarded to the shuffle combiner.
/// \returns The replacement value, SDValue(N, 0) if the node was simplified in
///          place, or an empty SDValue if nothing changed.
// NOTE(review): brace-only lines and part of the constant-folding loop look
// missing in this copy of the function -- verify against upstream before
// editing code.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
X86ISD::VSRLI == Opcode) &&
"Unexpected shift opcode");
bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
"Unexpected value type");
assert(N->getOperand(1).getValueType() == MVT::i8 &&
"Unexpected shift amount type");
// Out of range logical bit shifts are guaranteed to be zero.
// Out of range arithmetic bit shifts splat the sign bit.
// For the arithmetic case the amount is clamped to NumBitsPerElt - 1.
unsigned ShiftVal = N->getConstantOperandVal(1);
if (ShiftVal >= NumBitsPerElt) {
if (LogicalShift)
return DAG.getConstant(0, SDLoc(N), VT);
ShiftVal = NumBitsPerElt - 1;
// (shift X, 0) -> X
if (!ShiftVal)
return N0;
// (shift 0, C) -> 0
if (ISD::isBuildVectorAllZeros(N0.getNode()))
// N0 is all zeros or undef. We guarantee that the bits shifted into the
// result are all zeros, not undef.
return DAG.getConstant(0, SDLoc(N), VT);
// (VSRAI -1, C) -> -1
if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
// N0 is all ones or undef. We guarantee that the bits shifted into the
// result are all ones, not undef.
return DAG.getConstant(-1, SDLoc(N), VT);
// (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
// Only applies when the inner node has the same opcode as this one.
if (Opcode == N0.getOpcode()) {
unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
unsigned NewShiftVal = ShiftVal + ShiftVal2;
if (NewShiftVal >= NumBitsPerElt) {
// Out of range logical bit shifts are guaranteed to be zero.
// Out of range arithmetic bit shifts splat the sign bit.
if (LogicalShift)
return DAG.getConstant(0, SDLoc(N), VT);
NewShiftVal = NumBitsPerElt - 1;
return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
// We can decode 'whole byte' logical bit shifts as shuffles.
if (LogicalShift && (ShiftVal % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
// Constant Folding.
// Only done when this node is the sole user of the constant input.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
if (N->isOnlyUserOf(N0.getNode()) &&
getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
assert(EltBits.size() == VT.getVectorNumElements() &&
"Unexpected shift value type");
// Undef elements need to fold to 0. It's possible SimplifyDemandedBits
// created an undef input due to no input bits being demanded, but user
// still expects 0 in other bits.
for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
APInt &Elt = EltBits[i];
if (UndefElts[i])
Elt = 0;
else if (X86ISD::VSHLI == Opcode)
Elt <<= ShiftVal;
// NOTE(review): the bodies for the VSRAI case and the remaining (VSRLI)
// case are not visible in this copy; presumably in-place arithmetic and
// logical right shifts of Elt -- TODO restore from upstream.
else if (X86ISD::VSRAI == Opcode)
// Reset undef elements since they were zeroed above.
UndefElts = 0;
return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
// Last resort: let the generic demanded-bits machinery simplify the node,
// with every bit of each element demanded.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0),
APInt::getAllOnesValue(NumBitsPerElt), DCI))
return SDValue(N, 0);
return SDValue();
/// Target combines for X86ISD::PINSRB, X86ISD::PINSRW and
/// ISD::INSERT_VECTOR_ELT.
///
/// For the two PINSR opcodes the node is first run through
/// SimplifyDemandedBits. After DAG legalization, simple-typed insertions are
/// also offered to the recursive shuffle combiner.
///
/// \param N          Insertion node; for PINSRB/PINSRW the result type must be
///                   v16i8/v8i16 respectively (asserted).
/// \param DAG        DAG used to build replacement nodes.
/// \param DCI        Gives the combine phase and is passed to
///                   SimplifyDemandedBits.
/// \param Subtarget  Forwarded to the shuffle combiner.
/// \returns The replacement value, SDValue(N, 0) if the node was simplified in
///          place, or an empty SDValue if nothing changed.
// NOTE(review): brace-only lines look missing in this copy of the function --
// verify against upstream before editing code.
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
(N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
"Unexpected vector insertion");
// Demanded-bits simplification is only applied to the X86-specific opcodes.
if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0),
APInt::getAllOnesValue(NumBitsPerElt), DCI))
return SDValue(N, 0);
// Attempt to combine insertion patterns to a shuffle.
if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
/// reference the same FP CMP, and rewrite for CMPEQSS and friends.
/// NOTE(review): the original sentence continued with "Likewise for" and is
/// cut off in this copy; presumably it described the OR form accepted by
/// isAndOrOfSetCCs -- TODO restore from upstream.
///
/// \param N          Logic node being combined.
/// \param DAG        DAG used to build replacement nodes.
/// \param DCI        Not referenced in the visible code.
/// \param Subtarget  Queried for SSE2, AVX512 and 64-bit mode.
/// \returns The replacement value, or an empty SDValue if the pattern does not
///          match.
// NOTE(review): brace-only lines, `break;` lines and several trailing call
// operands look missing in this copy of the function -- verify against
// upstream before editing code.
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned opcode;
// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
// we're requiring SSE2 for both.
if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Operand 1 of each setcc is the compare it reads.
SDValue CMP0 = N0.getOperand(1);
SDValue CMP1 = N1.getOperand(1);
SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
return SDValue();
SDValue CMP00 = CMP0->getOperand(0);
SDValue CMP01 = CMP0->getOperand(1);
EVT VT = CMP00.getValueType();
if (VT == MVT::f32 || VT == MVT::f64) {
bool ExpectingFlags = false;
// Check for any users that want flags:
// NOTE(review): the case list is incomplete in this copy (no `break;`, and
// the CopyToReg case has no visible body) -- TODO restore from upstream.
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
!ExpectingFlags && UI != UE; ++UI)
switch (UI->getOpcode()) {
case ISD::BR_CC:
ExpectingFlags = true;
case ISD::CopyToReg:
if (!ExpectingFlags) {
// Operand 0 of each setcc is its X86 condition code.
enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
// Normalize so that an E/NE condition, if present, ends up in cc0.
if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
X86::CondCode tmp = cc0;
cc0 = cc1;
cc1 = tmp;
if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
// FIXME: need symbolic constants for these magic numbers.
// See X86ATTInstPrinter.cpp:printSSECC().
// 0 is used for the COND_E pairing and 4 for the COND_NE pairing.
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
if (Subtarget.hasAVX512()) {
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
DAG.getTargetConstant(x86cc, DL, MVT::i8));
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
DAG.getConstant(0, DL, MVT::v16i1),
FSetCC, DAG.getIntPtrConstant(0, DL));
// NOTE(review): the destination-type operand of getZExtOrTrunc is not
// visible in this copy -- TODO restore from upstream.
return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
SDValue OnesOrZeroesF =
DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
bool is64BitFP = (CMP00.getValueType() == MVT::f64);
MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
if (is64BitFP && !Subtarget.is64Bit()) {
// On a 32-bit target, we cannot bitcast the 64-bit float to a
// 64-bit integer, since that's not a legal type. Since
// OnesOrZeroesF is all ones or all zeroes, we don't need all the
// bits, but can do this little dance to extract the lowest 32 bits
// and work with those going forward.
// NOTE(review): the operand of SCALAR_TO_VECTOR is not visible in this copy;
// presumably OnesOrZeroesF -- TODO confirm.
SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
Vector32, DAG.getIntPtrConstant(0, DL));
IntVT = MVT::i32;
// Reinterpret the FP result as an integer and keep only its lowest bit.
SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
DAG.getConstant(1, DL, IntVT));
// NOTE(review): the operand of TRUNCATE is not visible in this copy;
// presumably ANDed -- TODO confirm.
SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
return OneBitOfTruth;
return SDValue();
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
///
/// Only 128-, 256- and 512-bit vector types are handled. Either operand of
/// the AND may be the inverted one; a VBROADCAST of an inverted vector source
/// is also recognized.
///
/// \param N    ISD::AND node (asserted).
/// \param DAG  DAG used to build replacement nodes.
/// \returns An X86ISD::ANDNP node, or an empty SDValue if neither operand is a
///          recognized NOT.
// NOTE(review): brace-only lines look missing in this copy of the function --
// verify against upstream before editing code.
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::AND);
MVT VT = N->getSimpleValueType(0);
if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
SDValue X, Y;
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Returns the un-inverted value when V is a NOT (or a broadcast of a NOT),
// and an empty SDValue otherwise.
auto GetNot = [&VT, &DAG](SDValue V) {
// Basic X = NOT(Y) detection.
if (SDValue Not = IsNOT(V, DAG))
return Not;
// Look through a broadcast whose source is itself a vector, and rebuild the
// broadcast on top of the un-inverted source.
if (V.getOpcode() == X86ISD::VBROADCAST) {
SDValue Src = V.getOperand(0);
EVT SrcVT = Src.getValueType();
if (!SrcVT.isVector())
return SDValue();
if (SDValue Not = IsNOT(Src, DAG))
return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
DAG.getBitcast(SrcVT, Not));
return SDValue();
// X is the operand that was inverted, Y is the other AND operand.
if (SDValue Not = GetNot(N0)) {
X = Not;
Y = N1;
} else if (SDValue Not = GetNot(N1)) {
X = Not;
Y = N0;
} else
return SDValue();
// Both operands are bitcast to the result type before forming ANDNP.
X = DAG.getBitcast(VT, X);
Y = DAG.getBitcast(VT, Y);
return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
// logical operations, like in the example below.
// or (and (truncate x, truncate y)),
// (xor (truncate z, build_vector (constants)))
// Given a target type \p VT, we generate
// or (and x, y), (xor z, zext(build_vector (constants)))
// given x, y and z are of type \p VT. We can do so, if operands are either
// truncates from VT types, the second operand is a vector of constants or can
// be recursively promoted.
// \p Depth is the current recursion depth. An empty SDValue is returned when
// \p N cannot be promoted.
static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
unsigned Depth) {
// Limit recursion to avoid excessive compile times.
if (Depth >= SelectionDAG::MaxRecursionDepth)
return SDValue();
// Only bitwise logic nodes are candidates.
if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
N->getOpcode() != ISD::OR)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
// The same opcode must be legal (or promotable) on the wide type.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
return SDValue();
// First try to promote the left operand recursively.
if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
N0 = NN0;
else {
// The Left side has to be a trunc.
if (N0.getOpcode() != ISD::TRUNCATE)
return SDValue();
// The type of the truncated inputs.
if (N0.getOperand(0).getValueType() != VT)
return SDValue();
// Use the pre-truncation value directly.
N0 = N0.getOperand(0);
// Same for the right operand, which may additionally be a constant vector.
if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
N1 = NN1;
else {
// The right side has to be a 'trunc' or a constant vector.
bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getValueType() == VT;
if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
return SDValue();
if (RHSTrunc)
N1 = N1.getOperand(0);
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
// Rebuild the logic op on the wide type.
return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
// Even with AVX-512 this is still useful for removing casts around logical
// operations on vXi1 mask types.
// \p N must be a vector ANY_EXTEND, ZERO_EXTEND or SIGN_EXTEND node. Returns
// an empty SDValue if the narrow operand cannot be promoted to N's type.
static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Expected vector type");
SDLoc DL(N);
assert((N->getOpcode() == ISD::ANY_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND ||
N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
SDValue Narrow = N->getOperand(0);
EVT NarrowVT = Narrow.getValueType();
// Generate the wide operation.
SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
if (!Op)
return SDValue();
// The widened value is then adjusted according to the kind of extension
// that N performs.
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode");
return Op;
return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
Op, DAG.getValueType(NarrowVT));
/// Map an integer logic opcode (ISD::AND, ISD::OR or ISD::XOR) to the
/// matching X86 floating point logic opcode (X86ISD::FAND, X86ISD::FOR or
/// X86ISD::FXOR). Any other opcode is treated as unreachable.
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
unsigned FPOpcode;
switch (Opcode) {
default: llvm_unreachable("Unexpected input node for FP logic conversion");
case ISD::AND: FPOpcode = X86ISD::FAND; break;
case ISD::OR: FPOpcode = X86ISD::FOR; break;
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
return FPOpcode;
/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
///
/// \p N is the integer logic node. The FP logic result is bitcast back to N's
/// value type. Returns an empty SDValue when the pattern does not match.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
// Both operands have to be bitcasts.
if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
return SDValue();
// Look at the values feeding the bitcasts.
SDValue N00 = N0.getOperand(0);
SDValue N10 = N1.getOperand(0);
EVT N00Type = N00.getValueType();
EVT N10Type = N10.getValueType();
// Ensure that both types are the same and are legal scalar fp types.
// f32 requires SSE1 and f64 requires SSE2.
if (N00Type != N10Type ||
!((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
(Subtarget.hasSSE2() && N00Type == MVT::f64)))
return SDValue();
// Perform the logic op in the FP domain, then cast back to the integer type.
unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
return DAG.getBitcast(VT, FPLogic);
// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
// to reduce XMM->GPR traffic.
// \p N must be an OR, AND or XOR node. Returns an empty SDValue if the
// operands do not match.
static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
unsigned Opc = N->getOpcode();
assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
"Unexpected bit opcode");
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Both operands must be single use MOVMSK.
if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
return SDValue();
SDValue Vec0 = N0.getOperand(0);
SDValue Vec1 = N1.getOperand(0);
EVT VecVT0 = Vec0.getValueType();
EVT VecVT1 = Vec1.getValueType();
// Both MOVMSK operands must be from vectors of the same size and same element
// size, but it's OK for a fp/int diff.
if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
return SDValue();
SDLoc DL(N);
// The vector op is done in the first operand's type: use the FP logic opcode
// if that type is floating point, otherwise keep the integer opcode.
unsigned VecOpc =
VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
// The second vector is bitcast to the first one's type to resolve any fp/int
// difference.
SDValue Result =
DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
/// If this is a zero/all-bits result that is bitwise-anded with a low bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
///
/// Returns the shift bitcast to N's value type, or an empty SDValue if the
/// pattern does not match.
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Inspect both operands with any bitcasts stripped.
SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
EVT VT0 = Op0.getValueType();
EVT VT1 = Op1.getValueType();
// Both stripped operands must share one simple integer type.
if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
return SDValue();
// The second operand has to be a constant splat; its value lands in SplatVal.
APInt SplatVal;
if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
return SDValue();
// Don't prevent creation of ANDN.
if (isBitwiseNot(Op0))
return SDValue();
// The target must support a logical right shift by immediate for this type.
if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
return SDValue();
// Every bit of each Op0 element must be a copy of the sign bit, i.e. each
// element is all zeroes or all ones.
unsigned EltBitWidth = VT0.getScalarSizeInBits();
if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
return SDValue();
SDLoc DL(N);
// Shifting right by (element width - number of trailing ones in the splat)
// leaves exactly that many low bits of each element.
unsigned ShiftVal = SplatVal.countTrailingOnes();
SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
return DAG.getBitcast(N->getValueType(0), Shift);
// Get the index node from the lowered DAG of a GEP IR instruction with one
// indexing dimension.
// The load must be unindexed and its base pointer must have the form
// (add (shl Index, ...), ...); the value being shifted is returned. Any other
// shape yields an empty SDValue.
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
if (Ld->isIndexed())
return SDValue();
SDValue Base = Ld->getBasePtr();
if (Base.getOpcode() != ISD::ADD)
return SDValue();
// Only the first ADD operand is checked for the shift.
SDValue ShiftedIndex = Base.getOperand(0);
if (ShiftedIndex.getOpcode() != ISD::SHL)
return SDValue();
return ShiftedIndex.getOperand(0);
/// Return true if BZHI can be used on a scalar integer of type \p VT.
///
/// This requires BMI2 on \p Subtarget and a scalar integer type that is
/// either 32 bits wide, or 64 bits wide on a 64-bit subtarget. Every other
/// type is rejected.
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
  if (!Subtarget.hasBMI2() || !VT.isScalarInteger())
    return false;

  switch (VT.getSizeInBits()) {
  case 32:
    return true;
  case 64:
    // 64-bit operands are only accepted on 64-bit subtargets.
    return Subtarget.is64Bit();
  default:
    return false;
  }
}
// This function recognizes cases where X86 bzhi instruction can replace an
// 'and-load' sequence.
// In case of loading integer value from an array of constants which is defined
// as follows:
//
//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
//
// then applying a bitwise and on the result with another input.
// It's equivalent to performing bzhi (zero high bits) on the input, with the
// same index of the load.
//
// \p Node is the AND node. Both of its operands are tried as the table load.
// Returns the replacement value, or an empty SDValue if nothing matched.
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  // Check if subtarget has BZHI instruction for the node's type
  if (!hasBZHI(Subtarget, VT))
    return SDValue();

  // Try matching the pattern for both operands.
  for (unsigned i = 0; i < 2; i++) {
    SDValue N = Node->getOperand(i);
    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
    // Move on to the other operand if this one is not a load instruction.
    // AND is commutative, so the table load may be on either side.
    if (!Ld)
      continue;

    // Without an IR value for the memory operand the table cannot be
    // identified; the other operand may still match.
    const Value *MemOp = Ld->getMemOperand()->getValue();
    if (!MemOp)
      continue;

    if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
          // The table must be an array of integers as wide as VT, with no
          // more elements than an element has bits.
          Constant *Init = GV->getInitializer();
          Type *Ty = Init->getType();
          if (!isa<ConstantDataArray>(Init) ||
              !Ty->getArrayElementType()->isIntegerTy() ||
              Ty->getArrayElementType()->getScalarSizeInBits() !=
                  VT.getSizeInBits() ||
              Ty->getArrayNumElements() >
                  Ty->getArrayElementType()->getScalarSizeInBits())
            continue;

          // Check if the array's constant elements are suitable to our case:
          // element j must equal 2^j - 1 (the j lowest bits set).
          uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
          bool ConstantsMatch = true;
          for (uint64_t j = 0; j < ArrayElementCount; j++) {
            ConstantInt *Elem =
                dyn_cast<ConstantInt>(Init->getAggregateElement(j));
            if (!Elem || Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
              ConstantsMatch = false;
              break;
            }
          }
          if (!ConstantsMatch)
            continue;

          // Do the transformation (For 32-bit type):
          //   from: (and (load arr[idx]), inp)
          //   to:   (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
          // that will be replaced with one bzhi instruction.
          SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
          SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);

          // Get the Node which indexes into the array.
          SDValue Index = getIndexFromUnindexedLoad(Ld);
          if (!Index)
            return SDValue();
          Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);

          // The shift amount (bit width - index) is built as an i8 value.
          SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
          Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);

          SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
          SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

          return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
        }
      }
    }
  }
  return SDValue();
}
// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
// Turn it into series of XORs and a setnp.
// \p N is the AND node. Returns the replacement value zero-extended to N's
// type, or an empty SDValue if the pattern does not match.
static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
// We only support 64-bit and 32-bit. 64-bit requires special handling
// unless the 64-bit popcnt instruction is legal.
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
// Nothing to do when CTPOP is already legal on a legal type.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// LHS needs to be a single use CTPOP.
if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse())
return SDValue();
// RHS needs to be 1.
if (!isOneConstant(N1))
return SDValue();
SDLoc DL(N);
SDValue X = N0.getOperand(0);
// If this is 64-bit, it's always best to xor the two 32-bit pieces together
// even if we have popcnt.
if (VT == MVT::i64) {
// Hi is the upper 32 bits of X, Lo the lower 32 bits.
SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
DAG.getNode(ISD::SRL, DL, VT, X,
DAG.getConstant(32, DL, MVT::i8)));
SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
// Generate a 32-bit parity idiom. This will bring us back here if we need
// to expand it too.
SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
DAG.getConstant(1, DL, MVT::i32));
return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity);
assert(VT == MVT::i32 && "Unexpected VT!");
// Xor the high and low 16-bits together using a 32-bit operation.
SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
DAG.getConstant(16, DL, MVT::i8));
X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);
// Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
// This should allow an h-reg to be used to save a shift.
// FIXME: We only get an h-reg in 32-bit mode.
SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
DAG.getNode(ISD::SRL, DL, VT, X,
DAG.getConstant(8, DL, MVT::i8)));
SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
// The X86ISD::XOR node has two results (i8, i32); only the second one,
// used as the flags input of the setcc below, is needed.
SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
// Copy the inverse of the parity flag into a register with setcc.
SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
// Zero extend to original type.
return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
// Where C is a mask containing the same number of bits as the setcc and
// where the setcc will freely 0 upper bits of k-register. We can replace the
// undef in the concat with 0s and remove the AND. This mainly helps with
// v2i1/v4i1 setcc being casted to scalar.
// \p N must be an ISD::AND node. Returns the rebuilt concat bitcast to N's
// type, or an empty SDValue if the pattern does not match.
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
EVT VT = N->getValueType(0);
// Make sure this is an AND with constant. We will check the value of the
// constant later.
if (!isa<ConstantSDNode>(N->getOperand(1)))
return SDValue();
// This is implied by the ConstantSDNode.
assert(!VT.isVector() && "Expected scalar VT!");
// The other operand must be a single-use bitcast.
if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
!N->getOperand(0).hasOneUse() ||
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// The bitcast source must be a vector of i1 elements.
SDValue Src = N->getOperand(0).getOperand(0);
EVT SrcVT = Src.getValueType();
if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
return SDValue();
if (Src.getOpcode() != ISD::CONCAT_VECTORS)
return SDValue();
// We only care about the first subvector of the concat, we expect the
// other subvectors to be ignored due to the AND if we make the change.
SDValue SubVec = Src.getOperand(0);
EVT SubVecVT = SubVec.getValueType();
// First subvector should be a setcc with a legal result type. The RHS of the
// AND should be a mask with this many bits.
if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
return SDValue();
// The type being compared must be legal, and must be 512 bits wide unless
// VLX is available.
EVT SetccVT = SubVec.getOperand(0).getValueType();
if (!TLI.isTypeLegal(SetccVT) ||
!(Subtarget.hasVLX() || SetccVT.is512BitVector()))
return SDValue();
// Compared elements narrower than 32 bits additionally require BWI.
if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
return SDValue();
// We passed all the checks. Rebuild the concat_vectors with zeroes
// and cast it back to VT.
SDLoc dl(N);
// Start with every subvector set to zero, then put the setcc back in slot 0.
SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
DAG.getConstant(0, dl, SubVecVT));
Ops[0] = SubVec;
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
return DAG.getBitcast(VT, Concat);
/// Combine routine for AND nodes.
///
/// Tries a sequence of folds in order and returns the first replacement
/// found, or an empty SDValue if none applies. The folds placed after the
/// DCI.isBeforeLegalizeOps() check are skipped while that check holds.
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
// If this is SSE1 only convert to FAND to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
return DAG.getBitcast(
MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
// Use a 32-bit and+zext if upper bits known zero.
// Only done when the second operand is not a constant.
if (VT == MVT::i64 && Subtarget.is64Bit() &&
!isa<ConstantSDNode>(N->getOperand(1))) {
APInt HiMask = APInt::getHighBitsSet(64, 32);
// It is enough for either operand to have its upper 32 bits known zero.
if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
SDLoc dl(N);
SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
// This must be done before legalization has expanded the ctpop.
if (SDValue V = combineParity(N, DAG, Subtarget))
return V;
// Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
SmallVector<APInt, 2> SrcPartials;
if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
SrcOps.size() == 1) {
SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Turn the source vector into an integer mask with one bit per element.
unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
// Fall back to a plain bitcast if the source type is legal.
if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
assert(SrcPartials[0].getBitWidth() == NumElts &&
"Unexpected partial reduction mask");
// All-of: after masking with the partial bits, the mask must equal them.
SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
return V;
if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
return R;
// The remaining folds are not attempted before operation legalization.
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
return R;
if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
return ShiftRight;
if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
return R;
// Attempt to recursively combine a bitmask AND with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
// Attempt to combine a scalar bitmask AND with an extracted shuffle.
if ((VT.getScalarSizeInBits() % 8) == 0 &&
N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
SDValue BitMask = N->getOperand(1);
SDValue SrcVec = N->getOperand(0).getOperand(0);
EVT SrcVecVT = SrcVec.getValueType();
// Check that the constant bitmask masks whole bytes.
// Each byte of the mask must be either all zeroes or all ones.
APInt UndefElts;
SmallVector<APInt, 64> EltBits;
if (VT == SrcVecVT.getScalarType() &&
N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
llvm::all_of(EltBits, [](APInt M) {
return M.isNullValue() || M.isAllOnesValue();
})) {
// Scale is the number of bytes per source element, Idx the extracted
// element index.
unsigned NumElts = SrcVecVT.getVectorNumElements();
unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
// Create a root shuffle mask from the byte mask and the extracted index.
// Bytes outside the extracted element stay undef.
SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
for (unsigned i = 0; i != Scale; ++i) {
if (UndefElts[i])
int VecIdx = Scale * Idx + i;
ShuffleMask[VecIdx] =
EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
if (SDValue Shuffle = combineX86ShufflesRecursively(
{SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
return SDValue();
// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
// \p N must be an ISD::OR node with a vector type whose element width is a
// multiple of 8 bits. Returns an empty SDValue if the pattern does not match.
static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
MVT VT = N->getSimpleValueType(0);
if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
return SDValue();
// Both OR operands, with bitcasts stripped, must be ANDs.
SDValue N0 = peekThroughBitcasts(N->getOperand(0));
SDValue N1 = peekThroughBitcasts(N->getOperand(1));
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
return SDValue();
// On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
// VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
!N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
return SDValue();
// Attempt to extract constant byte masks.
APInt UndefElts0, UndefElts1;
SmallVector<APInt, 32> EltBits0, EltBits1;
if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
false, false))
return SDValue();
if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
false, false))
return SDValue();
// The two masks must be exact bitwise complements, byte by byte.
for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
// TODO - add UNDEF elts support.
if (UndefElts0[i] || UndefElts1[i])
return SDValue();
if (EltBits0[i] != ~EltBits1[i])
return SDValue();
SDLoc DL(N);
// Emit a VPTERNLOG node directly.
// A is the first AND's mask, B its other operand, C the second AND's value.
SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
// Keep the first OR operand as is and rewrite the second one as an ANDNP of
// the first AND's mask with the second AND's value.
SDValue X = N->getOperand(0);
SDValue Y =
DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
DAG.getBitcast(VT, N1.getOperand(0)));
return DAG.getNode(ISD::OR, DL, VT, X, Y);
// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
// \p N is the candidate OR node. \p X, \p Y and \p Mask are outputs:
// Mask and X are taken from the ANDNP operands, Y is the AND operand that is
// not the mask.
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
if (N->getOpcode() != ISD::OR)
return false;
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Canonicalize AND to LHS.
if (N1.getOpcode() == ISD::AND)
std::swap(N0, N1);
// Attempt to match OR(AND(M,Y),ANDNP(M,X)).
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
return false;
// The ANDNP supplies the mask (operand 0) and X (operand 1).
Mask = N1.getOperand(0);
X = N1.getOperand(1);
// Check to see if the mask appeared in both the AND and ANDNP.
if (N0.getOperand(0) == Mask)
Y = N0.getOperand(1);
else if (N0.getOperand(1) == Mask)
Y = N0.getOperand(0);
return false;
// TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
// ANDNP combine allows other combines to happen that prevent matching.
return true;
// Try to fold:
// (or (and (m, y), (pandn m, x)))
// into:
// (vselect m, x, y)
// As a special case, try to fold:
// (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
// (sub (xor X, M), M)
// \p N must be an ISD::OR node. Returns an empty SDValue if no fold applies.
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
EVT VT = N->getValueType(0);
// Only 128-bit vectors with SSE2 and 256-bit vectors with Int256 are handled.
if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
SDValue X, Y, Mask;
if (!matchLogicBlend(N, X, Y, Mask))
return SDValue();
// Look through any bitcasts on X, Y, and Mask.
Mask = peekThroughBitcasts(Mask);
X = peekThroughBitcasts(X);
Y = peekThroughBitcasts(Y);
EVT MaskVT = Mask.getValueType();
unsigned EltBits = MaskVT.getScalarSizeInBits();
// The mask must be an integer vector in which every bit of an element is a
// copy of its sign bit.
// TODO: Attempt to handle floating point cases as well?
if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
return SDValue();
SDLoc DL(N);
// Attempt to combine to conditional negate: (sub (xor X, M), M)
if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
DAG, Subtarget))
return Res;
// PBLENDVB is only available on SSE 4.1.
if (!Subtarget.hasSSE41())
return SDValue();
// If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
if (Subtarget.hasVLX())
return SDValue();
// Do the select on byte vectors, then cast the result back to VT.
MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
X = DAG.getBitcast(BlendVT, X);
Y = DAG.getBitcast(BlendVT, Y);
Mask = DAG.getBitcast(BlendVT, Mask);
Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
return DAG.getBitcast(VT, Mask);
// Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
// seteq(cmp x, 0)
// into:
// srl(ctlz x), log2(bitsize(x))
// Input pattern is checked by caller.
// \p Op is the setcc node; its operand 1 is the compare and the compare's
// operand 0 is x. The result is zero-extended or truncated to \p ExtTy.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
SelectionDAG &DAG) {
SDValue Cmp = Op.getOperand(1);
EVT VT = Cmp.getOperand(0).getValueType();
// Shift amount: log2 of the bit width of x.
unsigned Log2b = Log2_32(VT.getSizeInBits());
SDLoc dl(Op);
SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
// The result of the shift is true or false, and on X86, the 32-bit
// encoding of shr and lzcnt is more desirable.
SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
DAG.getConstant(Log2b, dl, MVT::i8));
return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
// Try to transform:
// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
// Will also attempt to match more generic cases, eg:
// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
// Only applies if the target supports the FastLZCNT feature.
// \p N is the zero-extend node; its operand 0 is the root of the OR chain.
// Returns an empty SDValue if the pattern does not match.
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// Not done before legalization, nor when ctlz is not fast on this target.
if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
return SDValue();
// A candidate OR is a single-use ISD::OR.
auto isORCandidate = [](SDValue N) {
return (N->getOpcode() == ISD::OR && N->hasOneUse());
// Check the zero extend is extending to 32-bit or more. The code generated by
// srl(ctlz) for 16-bit or less variants of the pattern would require extra
// instructions to clear the upper bits.
if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
return SDValue();
// Check the node matches: setcc(eq, cmp 0)
auto isSetCCCandidate = [](SDValue N) {
return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
N->getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(N->getOperand(1).getOperand(1)) &&
SDNode *OR = N->getOperand(0).getNode();
SDValue LHS = OR->getOperand(0);
SDValue RHS = OR->getOperand(1);
// Save nodes matching or(or, setcc(eq, cmp 0)).
// Each iteration descends into the operand that is the nested OR.
SmallVector<SDNode *, 2> ORNodes;
while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
LHS = OR->getOperand(0);
RHS = OR->getOperand(1);
// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
!isORCandidate(SDValue(OR, 0)))
return SDValue();
// We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
// to
// or(srl(ctlz),srl(ctlz)).
// The dag combiner can then fold it into:
// srl(or(ctlz, ctlz)).
EVT VT = OR->getValueType(0);
SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
SDValue Ret, NewRHS;
if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
if (!Ret)
return SDValue();
// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
// The saved ORs are popped in reverse order, each time OR-ing the lowered
// setcc into the accumulated result.
while (ORNodes.size() > 0) {
OR = ORNodes.pop_back_val();
LHS = OR->getOperand(0);
RHS = OR->getOperand(1);
// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
if (RHS->getOpcode() == ISD::OR)
std::swap(LHS, RHS);
NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
if (!NewRHS)
return SDValue();
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
// Zero-extend the combined value to the type of the original node.
if (Ret)
Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
return Ret;
/// Combine routine for OR nodes.
///
/// Tries a sequence of folds in order and returns the first replacement
/// found, or an empty SDValue if none applies. The folds placed after the
/// DCI.isBeforeLegalizeOps() check are skipped while that check holds.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
// If this is SSE1 only convert to FOR to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
return DAG.getBitcast(MVT::v4i32,
DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N0),
DAG.getBitcast(MVT::v4f32, N1)));
// Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
SmallVector<APInt, 2> SrcPartials;
if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
SrcOps.size() == 1) {
SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Turn the source vector into an integer mask with one bit per element.
unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
// Fall back to a plain bitcast if the source type is legal.
if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
assert(SrcPartials[0].getBitWidth() == NumElts &&
"Unexpected partial reduction mask");
// Any-of: after masking with the partial bits, the mask must be nonzero.
SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
return R;
// The remaining folds are not attempted before operation legalization.
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
return R;
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
// Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
// Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
// iff the upper elements of the non-shifted arg are zero.
// KUNPCK require 16+ bool vector elements.
if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
unsigned NumElts = VT.getVectorNumElements();
unsigned HalfElts = NumElts / 2;
// UpperElts selects the upper half of the vector elements.
APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
// Case 1: the shift is the right operand.
if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
N1.getConstantOperandAPInt(1) == HalfElts &&
DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
SDLoc dl(N);
return DAG.getNode(
extractSubVector(N0, 0, DAG, dl, HalfElts),
extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
// Case 2: the shift is the left operand.
if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
N0.getConstantOperandAPInt(1) == HalfElts &&
DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
SDLoc dl(N);
return DAG.getNode(
extractSubVector(N1, 0, DAG, dl, HalfElts),
extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
// Attempt to recursively combine an OR of shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
/// Try to turn tests against the signbit in the form of:
/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
/// SETGT(X, -1)
///
/// \p N is the XOR node. Returns an empty SDValue if the pattern does not
/// match.
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
// This is only worth doing if the output type is i8 or i1.
EVT ResultType = N->getValueType(0);
if (ResultType != MVT::i8 && ResultType != MVT::i1)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// We should be performing an xor against a truncated shift.
if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
return SDValue();
// Make sure we are performing an xor against one.
if (!isOneConstant(N1))
return SDValue();
// SetCC on x86 zero extends so only act on this if it's a logical shift.
SDValue Shift = N0.getOperand(0);
if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
return SDValue();
// Make sure we are truncating from one of i16, i32 or i64.
EVT ShiftTy = Shift.getValueType();
if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
return SDValue();
// Make sure the shift amount extracts the sign bit.
if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1.
// N.B. Using SETGE against 0 works but we want a canonical looking
// comparison, using SETGT matches up with what TranslateX86CC.
SDLoc DL(N);
SDValue ShiftOp = Shift.getOperand(0);
EVT ShiftOpTy = ShiftOp.getValueType();
// Build the compare in the target's setcc result type for ResultType.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), ResultType);
SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
// Zero-extend when the setcc result type differs from the requested type.
if (SetCCResultType != ResultType)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
return Cond;
/// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
/// pcmpgt X, -1
/// This should be called before type legalization because the pattern may not
/// persist after that.
///
/// \p N is the XOR node. Returns an empty SDValue if the type is unsupported
/// or the pattern does not match.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.isSimple())
return SDValue();
// The 128-bit integer vector types listed here need SSE2, the 256-bit ones
// need AVX2; every other type is rejected.
switch (VT.getSimpleVT().SimpleTy) {
default: return SDValue();
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
// There must be a shift right algebraic before the xor, and the xor must be a
// 'not' operation.
SDValue Shift = N->getOperand(0);
SDValue Ones = N->getOperand(1);
if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
return SDValue();
// The shift should be smearing the sign bit across each vector element.
auto *ShiftAmt =
isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
if (!ShiftAmt ||
ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
return SDValue();
// Create a greater-than comparison against -1. We don't use the more obvious
// greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
// The xor's second operand is reused as the right-hand side of the compare.
return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
/// Detect patterns of truncation with unsigned saturation:
/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value x to be truncated or SDValue() if the pattern was
/// not matched.
/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
/// where C1 >= 0 and C2 is unsigned max of destination type.
/// (truncate (smax (smin (x, C2), C1)) to dest_type)
/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
/// These two patterns are equivalent to:
/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
/// So return the smax(x, C1) value to be truncated or SDValue() if the
/// pattern was not matched.
/// \param In the value feeding the truncate (the min/max expression).
/// \param VT the destination type of the truncate; its scalar width must be
/// smaller than that of In (asserted).
/// \param DAG used to build a new SMAX node for the smax-of-smin form.
/// \param DL debug location for any node created here.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const SDLoc &DL) {
EVT InVT = In.getValueType();
// Saturation with truncation. We truncate from InVT to VT.
assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
"Unexpected types for truncate operation");
// Match min/max and return limit value as a parameter.
// On a match the lambda returns operand 0 of V and stores the splat constant
// from operand 1 in Limit; otherwise it returns an empty SDValue.
auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
if (V.getOpcode() == Opcode &&
ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
return V.getOperand(0);
return SDValue();
APInt C1, C2;
if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
// C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
// the element size of the destination type.
if (C2.isMask(VT.getScalarSizeInBits()))
return UMin;
// smin(smax(x, C1), C2): here SMin names the smin's first operand, i.e. the
// inner smax(x, C1) node, which is what gets returned.
if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
if (MatchMinMax(SMin, ISD::SMAX, C1))
if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
return SMin;
// smax(smin(x, C2), C1): SMin is x here, so build a fresh smax(x, C1) using the
// outer node's constant operand.
if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
C2.uge(C1)) {
return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
return SDValue();
/// Detect patterns of truncation with signed saturation:
/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
/// signed_max_of_dest_type)) to dest_type)
/// or:
/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
/// signed_min_of_dest_type)) to dest_type).
/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
/// \param In the value feeding the truncate (the smin/smax expression).
/// \param VT the destination type of the truncate; its scalar width must be
/// smaller than that of In (asserted).
/// \param MatchPackUS when true, match the clamp range [0, unsigned max of
/// the destination scalar type] instead of the signed min/max range.
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
unsigned NumDstBits = VT.getScalarSizeInBits();
unsigned NumSrcBits = In.getScalarValueSizeInBits();
assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
// Returns operand 0 of V when V has the given opcode and its operand 1 is a
// constant splat equal to Limit; otherwise returns an empty SDValue.
auto MatchMinMax = [](SDValue V, unsigned Opcode,
const APInt &Limit) -> SDValue {
APInt C;
if (V.getOpcode() == Opcode &&
ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
return V.getOperand(0);
return SDValue();
// Clamp bounds, expressed at the source element width.
APInt SignedMax, SignedMin;
if (MatchPackUS) {
SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
SignedMin = APInt(NumSrcBits, 0);
} else {
SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
// smin(smax(x, min), max): the inner match yields x.
if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
return SMax;
// smax(smin(x, max), min): the inner match yields x.
if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
return SMin;
return SDValue();
/// Try to lower a truncate of a saturating clamp to VT using either the
/// PACKSS/PACKUS nodes or the AVX512 VTRUNCS/VTRUNCUS nodes.
/// \param In the value being truncated (expected to be a min/max clamp
/// recognised by detectSSatPattern or detectUSatPattern).
/// \param VT the truncated result type; must be a vector.
/// \param DL debug location for created nodes.
/// \returns the replacement value, or an empty SDValue when SSE2 is
/// unavailable, VT is not a vector, or no saturation pattern matches.
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2() || !VT.isVector())
return SDValue();
EVT SVT = VT.getVectorElementType();
EVT InVT = In.getValueType();
EVT InSVT = InVT.getVectorElementType();
// If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
// split across two registers. We can use a packusdw+perm to clamp to 0-65535
// and concatenate at the same time. Then we can use a final vpmovuswb to
// clip to 0-255.
if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
InVT == MVT::v16i32 && VT == MVT::v16i8) {
if (auto USatVal = detectSSatPattern(In, VT, true)) {
// Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
DL, DAG, Subtarget);
assert(Mid && "Failed to pack!");
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
// vXi32 truncate instructions are available with AVX512F.
// vXi16 truncate instructions are only available with AVX512BW.
// For 256-bit or smaller vectors, we require VLX.
// FIXME: We could widen truncates to 512 to remove the VLX restriction.
// If the result type is 256-bits or larger and we have disabled 512-bit
// registers, we should go ahead and use the pack instructions if possible.
bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
(Subtarget.hasBWI() && InSVT == MVT::i16)) &&
(InVT.getSizeInBits() > 128) &&
(Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
!(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
// PACK-based path: i16/i32 source elements narrowed to i8/i16, with a
// power-of-two element count and a result of at least 64 bits.
if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
VT.getSizeInBits() >= 64 &&
(SVT == MVT::i8 || SVT == MVT::i16) &&
(InSVT == MVT::i16 || InSVT == MVT::i32)) {
if (auto USatVal = detectSSatPattern(In, VT, true)) {
// vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
// Only do this when the result is at least 64 bits or we'll be leaving
// dangling PACKSSDW nodes.
if (SVT == MVT::i8 && InSVT == MVT::i32) {
EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
DAG, Subtarget);
assert(Mid && "Failed to pack!");
SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
assert(V && "Failed to pack!");
return V;
} else if (SVT == MVT::i8 || Subtarget.hasSSE41())
return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
if (auto SSatVal = detectSSatPattern(In, VT))
return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
// AVX512 path: use the saturating truncate nodes when the input type is legal.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
unsigned TruncOpc = 0;
SDValue SatVal;
if (auto SSatVal = detectSSatPattern(In, VT)) {
SatVal = SSatVal;
TruncOpc = X86ISD::VTRUNCS;
} else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
SatVal = USatVal;
TruncOpc = X86ISD::VTRUNCUS;
if (SatVal) {
unsigned ResElts = VT.getVectorNumElements();
// If the input type is less than 512 bits and we don't have VLX, we need
// to widen to 512 bits.
if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
unsigned NumConcats = 512 / InVT.getSizeInBits();
ResElts *= NumConcats;
// Pad with undef vectors; only the first concat operand carries real data.
SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
ConcatOps[0] = SatVal;
InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
NumConcats * InVT.getVectorNumElements());
SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
// Widen the result if it's narrower than 128 bits.
if (ResElts * SVT.getSizeInBits() < 128)
ResElts = 128 / SVT.getSizeInBits();
EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
// NOTE(review): the next line is the tail of a call whose first line is
// missing from this copy (presumably the node that extracts the original
// VT-sized result from Res) - confirm against the upstream file.
DAG.getIntPtrConstant(0, DL));
return SDValue();
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
/// X86ISD::AVG instruction.
/// \param In the value being truncated (expected to be the logical shift
/// right by one of the widened sum).
/// \param VT the truncated result type; must be a vector of i8 or i16 with a
/// power-of-two element count of at least 2.
/// \param DL debug location for created nodes.
/// \returns the AVG-based replacement, or an empty SDValue when the pattern is
/// not matched or SSE2 is unavailable.
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (!VT.isVector())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = VT.getVectorNumElements();
EVT ScalarVT = VT.getVectorElementType();
if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
NumElems >= 2 && isPowerOf2_32(NumElems)))
return SDValue();
// InScalarVT is the intermediate type in AVG pattern and it should be greater
// than the original input type (i8/i16).
EVT InScalarVT = InVT.getVectorElementType();
if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
return SDValue();
if (!Subtarget.hasSSE2())
return SDValue();
// Detect the following pattern:
// %1 = zext <N x i8> %a to <N x i32>
// %2 = zext <N x i8> %b to <N x i32>
// %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
// %4 = add nuw nsw <N x i32> %3, %2
// %5 = lshr <N x i32> %4, <i32 1 x N>
// %6 = trunc <N x i32> %5 to <N x i8>
// In AVX512, the last instruction can also be a trunc store.
if (In.getOpcode() != ISD::SRL)
return SDValue();
// A lambda checking the given SDValue is a constant vector and each element
// is in the range [Min, Max].
auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
// Check if each element of the vector is right-shifted by one.
auto LHS = In.getOperand(0);
auto RHS = In.getOperand(1);
if (!IsConstVectorInRange(RHS, 1, 1))
return SDValue();
if (LHS.getOpcode() != ISD::ADD)
return SDValue();
// Detect a pattern of a + b + 1 where the order doesn't matter.
SDValue Operands[3];
Operands[0] = LHS.getOperand(0);
Operands[1] = LHS.getOperand(1);
// Callback that builds one X86ISD::AVG node from a pair of operands.
auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
// Take care of the case when one of the operands is a constant vector whose
// element is in the range [1, 256] for i8 or [1, 65536] for i16.
if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
Operands[0].getOperand(0).getValueType() == VT) {
// The pattern is detected. Subtract one from the constant vector, then
// demote it and emit X86ISD::AVG instruction.
SDValue VecOnes = DAG.getConstant(1, DL, InVT);
Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
return SplitOpsAndApply(DAG, Subtarget, DL, VT,
{ Operands[0].getOperand(0), Operands[1] },
// Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
// Match the or case only if it's 'add-like' - can be replaced by an add.
auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
if (ISD::ADD == V.getOpcode()) {
Op0 = V.getOperand(0);
Op1 = V.getOperand(1);
return true;
if (ISD::ZERO_EXTEND != V.getOpcode())
return false;
V = V.getOperand(0);
if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
!DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
return false;
Op0 = V.getOperand(0);
Op1 = V.getOperand(1);
return true;
SDValue Op0, Op1;
// Arrange for Operands[0] to hold the operand that is not the inner add-like
// node; the inner node's two operands go to Operands[2] and Operands[1].
if (FindAddLike(Operands[0], Op0, Op1))
std::swap(Operands[0], Operands[1]);
else if (!FindAddLike(Operands[1], Op0, Op1))
return SDValue();
Operands[2] = Op0;
Operands[1] = Op1;
// Now we have three operands of two additions. Check that one of them is a
// constant vector with ones, and the other two can be promoted from i8/i16.
for (int i = 0; i < 3; ++i) {
if (!IsConstVectorInRange(Operands[i], 1, 1))
// NOTE(review): a statement appears to be missing after the `if` above in this
// copy (presumably a `continue`, so that the swap below moves the vector of
// ones into Operands[2]) - confirm against the upstream file.
std::swap(Operands[i], Operands[2]);
// Check if Operands[0] and Operands[1] are results of type promotion.
for (int j = 0; j < 2; ++j)
if (Operands[j].getValueType() != VT) {
if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
Operands[j].getOperand(0).getValueType() != VT)
return SDValue();
Operands[j] = Operands[j].getOperand(0);
// The pattern is detected, emit X86ISD::AVG instruction(s).
return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
return SDValue();
/// DAG combine for a load node. Three rewrites are attempted in order:
///  1. split a 256-bit vector load into two half-width loads 16 bytes apart
///     when the access is reported as not fast, or when it is a non-temporal
///     load aligned to at least 16 on a subtarget without Int256;
///  2. before legalization and without AVX512, load a vXi1 vector as an
///     integer of the same bit count and bitcast it back;
///  3. address-space-cast PTR64 / PTR32_SPTR / PTR32_UPTR base pointers to the
///     default pointer type.
/// \returns the replacement, or an empty SDValue if nothing applies.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
SDLoc dl(Ld);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// For chips with slow 32-byte unaligned loads, break the 32-byte operation
// into two 16-byte operations. Also split non-temporal aligned loads on
// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
Ld->getAlignment() >= 16) ||
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
*Ld->getMemOperand(), &Fast) &&
!Fast))) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
// The second half starts 16 bytes past the original base pointer.
unsigned HalfOffset = 16;
SDValue Ptr1 = Ld->getBasePtr();
SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfOffset, dl);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems / 2);
SDValue Load1 =
DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
// Merge the two output chains so users of the original chain see both loads.
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Load1.getValue(1), Load2.getValue(1));
SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
return DCI.CombineTo(N, NewVec, TF, true);
// Bool vector load - attempt to cast to an integer, as we have good
// (vXiY *ext(vXi1 bitcast(iX))) handling.
if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
unsigned NumElts = RegVT.getVectorNumElements();
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
if (TLI.isTypeLegal(IntVT)) {
SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
// Cast ptr32 and ptr64 pointers to the default address space before a load.
unsigned AddrSpace = Ld->getAddressSpace();
if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
AddrSpace == X86AS::PTR32_UPTR) {
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
SDValue Cast =
DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
return SDValue();
/// If V is a build vector of boolean constants and exactly one of those
/// constants is true, return the operand index of that true element.
/// Otherwise, return -1.
/// -1 is also returned when V is not a build vector of i1 elements or when a
/// non-constant operand is encountered.
static int getOneTrueElt(SDValue V) {
// This needs to be a build vector of booleans.
// TODO: Checking for the i1 type matches the IR definition for the mask,
// but the mask check could be loosened to i8 or other types. That might
// also require checking more than 'allOnesValue'; eg, the x86 HW
// instructions only require that the MSB is set for each mask element.
// The ISD::MSTORE comments/definition do not specify how the mask operand
// is formatted.
auto *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
return -1;
int TrueIndex = -1;
unsigned NumElts = BV->getValueType(0).getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
const SDValue &Op = BV->getOperand(i);
if (Op.isUndef())
// NOTE(review): the statement controlled by the undef check above appears to
// be missing from this copy (presumably a `continue` that skips undef
// elements) - confirm against the upstream file.
auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
if (!ConstNode)
return -1;
if (ConstNode->getAPIntValue().isAllOnesValue()) {
// If we already found a one, this is too many.
if (TrueIndex >= 0)
return -1;
TrueIndex = i;
return TrueIndex;
/// Given a masked memory load/store operation, return true if it has one mask
/// bit set. If it has one mask bit set, then also return the memory address of
/// the scalar element to load/store, the vector index to insert/extract that
/// scalar element, and the alignment for the scalar memory access.
/// \param MaskedOp the masked load or store being inspected.
/// \param DAG used to build the offset address and the index constant.
/// \param[out] Addr base pointer plus (element index * element store size).
/// \param[out] Index the true element's index as a pointer-sized constant.
/// \param[out] Alignment MinAlign of the operation's alignment and the
/// element store size.
/// The output parameters are left untouched when false is returned.
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
SelectionDAG &DAG, SDValue &Addr,
SDValue &Index, unsigned &Alignment) {
int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
if (TrueMaskElt < 0)
return false;
// Get the address of the one scalar element that is specified by the mask
// using the appropriate offset from the base pointer.
EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
Addr = MaskedOp->getBasePtr();
// Element 0 lives at the base pointer, so an offset is only needed otherwise.
if (TrueMaskElt != 0) {
unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
return true;
/// If exactly one element of the mask is set for a non-extending masked load,
/// it is a scalar load and vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
/// \param ML the masked load; must be unindexed (asserted).
/// \returns the result of DCI.CombineTo replacing ML with the inserted vector
/// and the scalar load's chain, or an empty SDValue when the mask does not
/// have exactly one true element.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
assert(ML->isUnindexed() && "Unexpected indexed masked load!");
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
unsigned Alignment;
if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
return SDValue();
// Load the one scalar element that is specified by the mask using the
// appropriate offset from the base pointer.
// NOTE(review): `DL` is used below but its declaration is not present in this
// copy of the function - confirm against the upstream file.
EVT VT = ML->getValueType(0);
EVT EltVT = VT.getVectorElementType();
SDValue Load =
DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
Alignment, ML->getMemOperand()->getFlags());
// Insert the loaded element into the appropriate place in the vector.
// The pass-through vector supplies every other lane.
SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
ML->getPassThru(), Load, VecIndex);
return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
/// Simplify a masked load whose mask is a build vector of constants.
/// If both the first and the last mask elements are non-zero, the masked load
/// is replaced by a full vector load plus a select. Otherwise, unless the
/// pass-through operand is undef or all-zeros, it is replaced by a masked load
/// with an undef pass-through plus a select.
/// \param ML the masked load; must be unindexed (asserted).
/// \returns the result of DCI.CombineTo, or an empty SDValue when the mask is
/// not constant or no rewrite applies.
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
assert(ML->isUnindexed() && "Unexpected indexed masked load!");
if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
return SDValue();
// NOTE(review): `DL` is used below but its declaration is not present in this
// copy of the function - confirm against the upstream file.
EVT VT = ML->getValueType(0);
// If we are loading the first and last elements of a vector, it is safe and
// always faster to load the whole vector. Replace the masked load with a
// vector load and select.
unsigned NumElts = VT.getVectorNumElements();
BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
if (LoadFirstElt && LoadLastElt) {
SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
// Convert a masked load with a constant mask into a masked load and a select.
// This allows the select operation to use a faster kind of select instruction
// (for example, vblendvps -> vblendps).
// Don't try this if the pass-through operand is already undefined. That would
// cause an infinite loop because that's what we're about to create.
if (ML->getPassThru().isUndef())
return SDValue();
// An all-zeros pass-through is also left alone.
if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
return SDValue();
// The new masked load has an undef pass-through operand. The select uses the
// original pass-through operand.
SDValue NewML = DAG.getMaskedLoad(
VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
ML->getAddressingMode(), ML->getExtensionType());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
/// DAG combine for a masked load node.
/// Expanding loads are not handled. For non-extending loads this first tries
/// reduceMaskedLoadToScalarLoad and, without AVX512,
/// combineMaskedLoadConstantMask. If the mask's scalar type is wider than one
/// bit, it then tries to simplify the mask given that only the sign bit of
/// each lane is demanded.
/// \returns the replacement value, SDValue(N, 0) when the mask was simplified
/// in place, or an empty SDValue if nothing changed.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
auto *Mld = cast<MaskedLoadSDNode>(N);
// TODO: Expanding load with constant mask may be optimized as well.
if (Mld->isExpandingLoad())
return SDValue();
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
return ScalarLoad;
// TODO: Do some AVX512 subsets benefit from this transform?
if (!Subtarget.hasAVX512())
if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
return Blend;
// If the mask value has been legalized to a non-boolean vector, try to
// simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mld->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
EVT VT = Mld->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
// Only hand N back if it is still a live node.
if (N->getOpcode() != ISD::DELETED_NODE)
return SDValue(N, 0);
// Otherwise try for a simpler mask value and rebuild the masked load with it.
if (SDValue NewMask =
TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
return DAG.getMaskedLoad(
VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
Mld->getAddressingMode(), Mld->getExtensionType());
return SDValue();
/// If exactly one element of the mask is set for a non-truncating masked store,
/// it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
/// \param MS the masked store to simplify.
/// \returns the scalar store node, or an empty SDValue when the mask does not
/// have exactly one true element.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
SelectionDAG &DAG) {
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
unsigned Alignment;
if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
return SDValue();
// Extract the one scalar element that is actually being stored.
// NOTE(review): `DL` is used below but its declaration is not present in this
// copy of the function - confirm against the upstream file.
EVT VT = MS->getValue().getValueType();
EVT EltVT = VT.getVectorElementType();
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
MS->getValue(), VecIndex);
// Store that element at the appropriate offset from the base pointer.
return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
Alignment, MS->getMemOperand()->getFlags());
/// DAG combine for a masked store node.
/// Compressing and truncating stores are not handled. Otherwise this tries, in
/// order: reducing a single-true-element store to a scalar store, simplifying
/// a non-boolean mask given that only each lane's sign bit is demanded, and
/// folding a single-use TRUNCATE of the stored value into a truncating masked
/// store.
/// \returns the replacement value, SDValue(N, 0) when the mask was simplified
/// in place, or an empty SDValue if nothing changed.
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
if (Mst->isCompressingStore())
return SDValue();
EVT VT = Mst->getValue().getValueType();
SDLoc dl(Mst);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Mst->isTruncatingStore())
return SDValue();
if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
return ScalarStore;
// If the mask value has been legalized to a non-boolean vector, try to
// simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mst->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
// Only hand N back if it is still a live node.
if (N->getOpcode() != ISD::DELETED_NODE)
return SDValue(N, 0);
if (SDValue NewMask =
TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
Mst->getBasePtr(), Mst->getOffset(), NewMask,
Mst->getMemoryVT(), Mst->getMemOperand(),
// Fold a single-use truncate of the stored value into the store itself.
SDValue Value = Mst->getValue();
if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
// NOTE(review): the condition is missing a line here in this copy (the next
// line is the tail of a call, presumably a legality check for the truncating
// store) - confirm against the upstream file.
Mst->getMemoryVT())) {
// The trailing `true` argument requests a truncating masked store of the
// pre-truncate value.
return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
Mst->getBasePtr(), Mst->getOffset(), Mask,
Mst->getMemoryVT(), Mst->getMemOperand(),
Mst->getAddressingMode(), true);
return SDValue();
/// DAG combine for a store node. The rewrites below are tried in order and the
/// first one that applies is returned:
///  - vXi1 stores: bitcast to an integer (no AVX512), scalarize a v1i1
///    scalar_to_vector, widen v2i1/v4i1 to v8i1, or turn constant masks into
///    integer stores;
///  - split 256-bit stores that are reported as not fast, and split or
///    scalarize under-aligned non-temporal vector stores;
///  - fold truncates (plain, saturating, or AVG-pattern) into the store;
///  - address-space-cast PTR64 / PTR32_SPTR / PTR32_UPTR base pointers;
///  - on 32-bit targets with usable f64, turn i64 load->store pairs and i64
///    extract-element stores into f64 operations.
/// \returns the replacement, or an empty SDValue if nothing applies.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
StoreSDNode *St = cast<StoreSDNode>(N);
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
SDValue StoredVal = St->getValue();
EVT VT = StoredVal.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Convert a store of vXi1 into a store of iX and a bitcast.
if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
VT.getVectorElementType() == MVT::i1) {
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
StoredVal = DAG.getBitcast(NewVT, StoredVal);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
// If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
// This will avoid a copy to k-register.
if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
StoredVal.getOperand(0).getValueType() == MVT::i8) {
return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
St->getBasePtr(), St->getPointerInfo(),
// Widen v2i1/v4i1 stores to v8i1.
if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
Subtarget.hasAVX512()) {
unsigned NumConcats = 8 / VT.getVectorNumElements();
// Pad with undef vectors; only the first concat operand carries real data.
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
Ops[0] = StoredVal;
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
// Turn vXi1 stores of constants into a scalar store.
if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
// If it's a v64i1 store without 64-bit support, we need two stores.
if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
StoredVal->ops().slice(0, 32));
Lo = combinevXi1ConstantToInteger(Lo, DAG);
SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
StoredVal->ops().slice(32, 32));
Hi = combinevXi1ConstantToInteger(Hi, DAG);
// The high 32 mask bits are stored 4 bytes after the low 32.
SDValue Ptr0 = St->getBasePtr();
SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
SDValue Ch0 =
DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
SDValue Ch1 =
DAG.getStore(St->getChain(), dl, Hi, Ptr1,
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
// If we are saving a 32-byte vector and 32-byte stores are slow, such as on
// Sandy Bridge, perform two 16-byte stores.
bool Fast;
if (VT.is256BitVector() && StVT == VT &&
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
*St->getMemOperand(), &Fast) &&
!Fast) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
return splitVectorStore(St, DAG);
// Split under-aligned vector non-temporal stores.
if (St->isNonTemporal() && StVT == VT &&
St->getAlignment() < VT.getStoreSize()) {
// ZMM/YMM nt-stores - either it can be stored as a series of shorter
// vectors or the legalizer can scalarize it to use MOVNTI.
if (VT.is256BitVector() || VT.is512BitVector()) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
return splitVectorStore(St, DAG);
// XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
// to use MOVNTI.
if (VT.is128BitVector() && Subtarget.hasSSE2()) {
MVT NTVT = Subtarget.hasSSE4A()
? MVT::v2f64
: (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
return scalarizeVectorStore(St, NTVT, DAG);
// Try to optimize v16i16->v16i8 truncating stores when BWI is not
// supported, but avx512f is by extending to v16i32 and truncating.
if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
St->getValue().getOpcode() == ISD::TRUNCATE &&
St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
MVT::v16i8, St->getMemOperand());
// Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
(StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
return EmitTruncSStore(IsSigned, St->getChain(),
dl, StoredVal.getOperand(0), St->getBasePtr(),
VT, St->getMemOperand(), DAG);
// Optimize trunc store (of multiple scalars) to shuffle and store.
// First, pack all of the elements in one place. Next, store to memory
// in fewer chunks.
if (St->isTruncatingStore() && VT.isVector()) {
// Check if we can detect an AVG pattern from the truncation. If yes,
// replace the trunc store by a normal store with the result of X86ISD::AVG
// instruction.
if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
Subtarget, dl))
return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
// With a legal truncating store, fold a signed or unsigned saturation clamp
// into a saturating truncating store.
if (TLI.isTruncStoreLegal(VT, StVT)) {
if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
dl, Val, St->getBasePtr(),
St->getMemoryVT(), St->getMemOperand(), DAG);
if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
DAG, dl))
return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
dl, Val, St->getBasePtr(),
St->getMemoryVT(), St->getMemOperand(), DAG);
return SDValue();
// Cast ptr32 and ptr64 pointers to the default address space before a store.
unsigned AddrSpace = St->getAddressSpace();
if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
AddrSpace == X86AS::PTR32_UPTR) {
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
if (PtrVT != St->getBasePtr().getSimpleValueType()) {
SDValue Cast =
DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags(), St->getAAInfo());
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
// places to insert EMMS. This qualifies as a quick hack.
// Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
if (VT.getSizeInBits() != 64)
return SDValue();
// f64 is only usable here with SSE2, hard float, and no noimplicitfloat
// attribute on the function.
const Function &F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
isa<LoadSDNode>(St->getValue()) &&
cast<LoadSDNode>(St->getValue())->isSimple() &&
St->getChain().hasOneUse() && St->isSimple()) {
LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
if (!ISD::isNormalLoad(Ld))
return SDValue();
// Avoid the transformation if there are multiple uses of the loaded value.
if (!Ld->hasNUsesOfValue(1, 0))
return SDValue();
SDLoc LdDL(Ld);
SDLoc StDL(N);
// Lower to a single movq load/store pair.
SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
Ld->getBasePtr(), Ld->getMemOperand());
// Make sure new load is placed in same chain order.
DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
// This is similar to the above case, but here we handle a scalar 64-bit
// integer store that is extracted from a vector on a 32-bit target.
// If we have SSE2, then we can treat it like a floating-point double
// to get past legalization. The execution dependencies fixup pass will
// choose the optimal machine instruction for the store if this really is
// an integer or v2f32 rather than an f64.
if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue OldExtract = St->getOperand(1);
SDValue ExtOp0 = OldExtract.getOperand(0);
// Reinterpret the source vector as f64 lanes and extract the same index.
unsigned VecSize = ExtOp0.getValueSizeInBits();
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
BitCast, OldExtract.getOperand(1));
return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
return SDValue();
/// DAG combine for a vector-extract store: only the low elements of the stored
/// vector that fit in the memory type are written, so try to simplify the
/// stored value knowing that just those elements are demanded.
/// \returns SDValue(N, 0) if the stored value was simplified and N was not
/// deleted in the process; otherwise an empty SDValue.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
auto *St = cast<MemIntrinsicSDNode>(N);
// Operand 1 is treated as the vector value being stored.
SDValue StoredVal = N->getOperand(1);
MVT VT = StoredVal.getSimpleValueType();
EVT MemVT = St->getMemoryVT();
// Figure out which elements we demand.
// The number of stored elements is the memory size divided by the element
// size; the demanded set is the low StElts elements of the vector.
unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
KnownZero, DCI)) {
// Only report N as changed if it was not turned into a deleted node while
// simplifying.
if (N->getOpcode() != ISD::DELETED_NODE)
return SDValue(N, 0);
return SDValue();
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector. For example, if
/// A = < float a0, float a1, float a2, float a3 >
/// and
/// B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
/// \p IsCommutative additionally accepts element pairs whose LHS/RHS indices
/// are swapped.
/// In the '+' (new) side of this diff, \p PostShuffleMask is an output: on
/// success it is either empty, or a shuffle mask to be applied to the
/// horizontal-op result to move each element to its expected position.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- bool IsCommutative) {
+ const X86Subtarget &Subtarget, bool IsCommutative,
+ SmallVectorImpl<int> &PostShuffleMask) {
// If either operand is undef, bail out. The binop should be simplified.
if (LHS.isUndef() || RHS.isUndef())
return false;
// Look for the following pattern:
// A = < float a0, float a1, float a2, float a3 >
// B = < float b0, float b1, float b2, float b3 >
// and
// LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
// RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
// which is A horizontal-op B.
MVT VT = LHS.getSimpleValueType();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
unsigned NumElts = VT.getVectorNumElements();
// TODO - can we make a general helper method that does all of this for us?
// Helper: decompose Op into its shuffle inputs (N0, N1) and append its mask
// to ShuffleMask. Handles generic VECTOR_SHUFFLE nodes and target shuffles
// (optionally seen through a low EXTRACT_SUBVECTOR of a 256-bit vector).
// ShuffleMask is left untouched when Op is not a recognised shuffle.
auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
SmallVectorImpl<int> &ShuffleMask) {
if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
// Undef shuffle inputs are left as default-constructed SDValues.
if (!Op.getOperand(0).isUndef())
N0 = Op.getOperand(0);
if (!Op.getOperand(1).isUndef())
N1 = Op.getOperand(1);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
ShuffleMask.append(Mask.begin(), Mask.end());
bool UseSubVector = false;
// Look through an extract of the low subvector (index 0) of a 256-bit source.
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Op.getOperand(0).getValueType().is256BitVector() &&
llvm::isNullConstant(Op.getOperand(1))) {
Op = Op.getOperand(0);
UseSubVector = true;
bool IsUnary;
SmallVector<SDValue, 2> SrcOps;
SmallVector<int, 16> SrcShuffleMask;
SDValue BC = peekThroughBitcasts(Op);
if (isTargetShuffle(BC.getOpcode()) &&
getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
SrcOps, SrcShuffleMask, IsUnary)) {
// Same-width target shuffle with at most two inputs: use it directly.
if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
SrcOps.size() <= 2) {
N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
// Double-width unary shuffle seen through the subvector extract: treat the
// two 128-bit halves of its single source as N0/N1 and keep only the low
// NumElts mask entries.
if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
SrcOps.size() == 1) {
N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
ShuffleMask.append(Mask.begin(), Mask.end());
// View LHS in the form
//   LHS = VECTOR_SHUFFLE A, B, LMask
// If LHS is not a shuffle, then pretend it is the identity shuffle:
// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
// NOTE: A default initialized SDValue represents an UNDEF of type VT.
SDValue A, B;
SmallVector<int, 16> LMask;
GetShuffle(LHS, A, B, LMask);
// Likewise, view RHS in the form
//   RHS = VECTOR_SHUFFLE C, D, RMask
SDValue C, D;
SmallVector<int, 16> RMask;
GetShuffle(RHS, C, D, RMask);
// At least one of the operands should be a vector shuffle.
unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
if (NumShuffles == 0)
return false;
// An operand that is not a shuffle is used as-is as its own first input.
if (LMask.empty()) {
A = LHS;
for (unsigned i = 0; i != NumElts; ++i)
if (RMask.empty()) {
C = RHS;
for (unsigned i = 0; i != NumElts; ++i)
+ // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
+ if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
+ (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) ||
+ isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
+ return false;
// If A and B occur in reverse order in RHS, then canonicalize by commuting
// RHS operands and shuffle mask.
if (A != C) {
std::swap(C, D);
// Check that the shuffles are both shuffling the same vectors.
if (!(A == C && B == D))
return false;
// Start from an all-undef post-shuffle mask; entries are filled in below.
+ PostShuffleMask.clear();
+ PostShuffleMask.append(NumElts, SM_SentinelUndef);
// LHS and RHS are now:
// LHS = shuffle A, B, LMask
// RHS = shuffle A, B, RMask
// Check that the masks correspond to performing a horizontal operation.
// AVX defines horizontal add/sub to operate independently on 128-bit lanes,
// so we just repeat the inner loop if this is a 256-bit op.
unsigned Num128BitChunks = VT.getSizeInBits() / 128;
unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
+ unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
assert((NumEltsPer128BitChunk % 2 == 0) &&
"Vector type should have an even number of elements in each lane");
// j walks the 128-bit lanes, i walks the elements within a lane.
for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
// Ignore undefined components.
int LIdx = LMask[i + j], RIdx = RMask[i + j];
if (LIdx < 0 || RIdx < 0 ||
(!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
(!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
+ // Check that successive odd/even elements are being operated on. If not,
+ // this is not a horizontal operation.
+ if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
+ !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
+ return false;
+ // Compute the post-shuffle mask index based on where the element
+ // is stored in the HOP result, and where it needs to be moved to.
+ int Base = LIdx & ~1u;
+ int Index = ((Base % NumEltsPer128BitChunk) / 2) +
+ ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
// The low half of the 128-bit result must choose from A.
// The high half of the 128-bit result must choose from B,
// unless B is undef. In that case, we are always choosing from A.
- unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
- unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
- // Check that successive elements are being operated on. If not, this is
- // not a horizontal operation.
- int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
- if (!(LIdx == Index && RIdx == Index + 1) &&
- !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
- return false;
+ if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
+ Index += NumEltsPer64BitChunk;
+ PostShuffleMask[i + j] = Index;
LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
- if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
// An identity (or all-undef) post-shuffle is reported to the caller as an
// empty mask, meaning no extra shuffle is required.
+ bool IsIdentityPostShuffle =
+ isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
+ if (IsIdentityPostShuffle)
+ PostShuffleMask.clear();
+ // Assume a SingleSource HOP if we only shuffle one input and don't need to
+ // shuffle the result.
+ if (!shouldUseHorizontalOp(LHS == RHS &&
+ (NumShuffles < 2 || !IsIdentityPostShuffle),
+ DAG, Subtarget))
return false;
// Hand the operands back in the binop's vector type.
LHS = DAG.getBitcast(VT, LHS);
RHS = DAG.getBitcast(VT, RHS);
return true;
/// Do target-specific dag combines on floating-point adds/subs.
/// \p N must be an ISD::FADD or ISD::FSUB node (asserted). If its operands
/// match a horizontal pattern on a supported vector type, returns an
/// X86ISD::FHADD / X86ISD::FHSUB node (in the '+' side of this diff, followed
/// by a shuffle when a post-shuffle mask is required); otherwise returns an
/// empty SDValue.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
bool IsFadd = N->getOpcode() == ISD::FADD;
auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
// Try to synthesize horizontal add/sub from adds/subs of shuffles.
// 128-bit types need SSE3, 256-bit types need AVX. IsFadd is passed as the
// commutativity flag.
+ SmallVector<int, 8> PostShuffleMask;
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
- return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+ isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {
+ SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
// A non-empty mask means the horizontal-op result has to be permuted.
+ if (!PostShuffleMask.empty())
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+ DAG.getUNDEF(VT), PostShuffleMask);
+ return HorizBinOp;
+ }
// NOTE: isHorizontalBinOp may have changed LHS/RHS variables.
return SDValue();
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
/// anything that is guaranteed to be transformed by DAGCombiner.
/// \p N must be an ISD::TRUNCATE node (asserted). Returns the binop rebuilt on
/// truncated operands, or an empty SDValue if no transform was applied.
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
SDValue Src = N->getOperand(0);
unsigned SrcOpcode = Src.getOpcode();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
// Returns true if truncating Op to VT's element size is expected to fold
// away (an extend from a narrow enough type, or a constant build vector).
auto IsFreeTruncation = [VT](SDValue Op) {
unsigned TruncSizeInBits = VT.getScalarSizeInBits();
// See if this has been extended from a smaller/equal size to
// the truncation size, allowing a truncation to combine with the extend.
unsigned Opcode = Op.getOpcode();
if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
Opcode == ISD::ZERO_EXTEND) &&
Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
return true;
// See if this is a single use constant which can be constant folded.
// NOTE: We don't peek through bitcasts here because there is currently
// no support for constant folding truncate+bitcast+vector_of_constants. So
// we'll just end up with a truncate on both operands which will
// get turned back into (truncate (binop)) causing an infinite loop.
return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
// Rebuilds the source binop in type VT on truncated copies of N0 and N1.
auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
// Don't combine if the operation has other uses.
if (!Src.hasOneUse())
return SDValue();
// Only support vector truncation for now.
// TODO: i64 scalar math would benefit as well.
if (!VT.isVector())
return SDValue();
// In most cases it's only worth pre-truncating if we're only facing the cost
// of one truncation.
// i.e. if one of the inputs will constant fold or the input is repeated.
switch (SrcOpcode) {
case ISD::MUL:
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
// better to truncate if we have the chance.
if (SrcVT.getScalarType() == MVT::i64 &&
TLI.isOperationLegal(SrcOpcode, VT) &&
!TLI.isOperationLegal(SrcOpcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
// NOTE(review): no break is visible here, so MUL appears to fall through to
// the generic handling below - confirm against the full file.
case ISD::AND:
case ISD::XOR:
case ISD::OR:
case ISD::ADD:
case ISD::SUB: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
// Pre-truncate only when the narrow op is legal and at most one real
// truncation is needed (repeated operand, or one operand truncates for free).
if (TLI.isOperationLegal(SrcOpcode, VT) &&
(Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
return SDValue();
/// Truncate using ISD::AND mask and X86ISD::PACKUS.
/// e.g. trunc <8 x i32> X to <8 x i16> -->
/// MaskX = X & 0xffff (clear high bits to prevent saturation)
/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
/// \p N is the truncate node; its operand 0 is masked and then handed to
/// truncateVectorWithPACK to produce a value of N's result type.
static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
EVT OutVT = N->getValueType(0);
// Low-bits mask applied to every input element before packing.
// NOTE(review): the second argument of getLowBitsSet is cut off in this
// extract - confirm against the full file.
APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
/// \p N is the truncate node; its operand 0 is passed unmodified to
/// truncateVectorWithPACK to produce a value of N's result type.
static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = N->getOperand(0);
// InVT is not used in the lines visible here.
EVT InVT = In.getValueType();
EVT OutVT = N->getValueType(0);
return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
/// Returns an empty SDValue whenever the truncation is not one of the handled
/// shapes or the subtarget is outside the SSE2..pre-AVX2 range checked below.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT OutVT = N->getValueType(0);
if (!OutVT.isVector())
return SDValue();
SDValue In = N->getOperand(0);
if (!In.getValueType().isSimple())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = OutVT.getVectorNumElements();
// TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
// SSE2, and we need to take care of it specially.
// AVX512 provides vpmovdb.
if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
return SDValue();
EVT OutSVT = OutVT.getVectorElementType();
EVT InSVT = InVT.getVectorElementType();
// Only i32/i64 -> i8/i16 with a power-of-two element count of at least 8.
if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
(OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
NumElems >= 8))
return SDValue();
// SSSE3's pshufb results in fewer instructions in the cases below.
if (Subtarget.hasSSSE3() && NumElems == 8 &&
((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
(InSVT == MVT::i32 && OutSVT == MVT::i16)))
return SDValue();
SDLoc DL(N);
// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
// truncate 2 x v4i32 to v8i16.
if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
if (InSVT == MVT::i32)
return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
return SDValue();
/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
/// PACKUS is tried first (enough known leading zero bits), then PACKSS (enough
/// sign bits); an empty SDValue is returned if neither applies.
static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Requires SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
return SDValue();
SDValue In = N->getOperand(0);
if (!In.getValueType().isSimple())
return SDValue();
MVT VT = N->getValueType(0).getSimpleVT();
MVT SVT = VT.getScalarType();
MVT InVT = In.getValueType().getSimpleVT();
MVT InSVT = InVT.getScalarType();
// Check we have a truncation suited for PACKSS/PACKUS.
if (!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
return SDValue();
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
// Truncation to sub-128bit vXi32 can be better handled with shuffles.
if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
return SDValue();
// AVX512 has fast truncate, but if the input is already going to be split,
// there's no harm in trying pack.
// NOTE(review): the tail of this condition is cut off in this extract.
if (Subtarget.hasAVX512() &&
!(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
return SDValue();
// Bit-count thresholds used by the PACKSS / PACKUS checks below. Without
// SSE4.1 the zero-bit threshold is fixed at 8.
unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Use PACKUS if the input has zero-bits that extend all the way to the
// packed/truncated value. e.g. masks, zext_in_reg, etc.
KnownBits Known = DAG.computeKnownBits(In);
unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
// Use PACKSS if the input has sign-bits that extend all the way to the
// packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
unsigned NumSignBits = DAG.ComputeNumSignBits(In);
// Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
// a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
// on and combines/simplifications can't then use it.
if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
return SDValue();
if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
return SDValue();
// Try to form a MULHU or MULHS node by looking for
// (trunc (srl (mul ext, ext), 16))
// TODO: This is X86 specific because we want to be able to handle wide types
// before type legalization. But we can only do it if the vector will be
// legalized via widening/splitting. Type legalization can't handle promotion
// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
// combiner.
// Src is the value being truncated and VT the truncated result type. Returns
// ISD::MULHS for sign-extended inputs, ISD::MULHU for zero-extended inputs, or
// an empty SDValue if the pattern does not match.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// First instruction should be a right shift of a multiply.
if (Src.getOpcode() != ISD::SRL ||
Src.getOperand(0).getOpcode() != ISD::MUL)
return SDValue();
if (!Subtarget.hasSSE2())
return SDValue();
// Only handle vXi16 types that are at least 128-bits unless they will be
// widened.
if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
return SDValue();
// Input type should be at least vXi32.
EVT InVT = Src.getValueType();
if (InVT.getVectorElementType().getSizeInBits() < 32)
return SDValue();
// Need a shift by 16.
APInt ShiftAmt;
if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
ShiftAmt != 16)
return SDValue();
SDValue LHS = Src.getOperand(0).getOperand(0);
SDValue RHS = Src.getOperand(0).getOperand(1);
// Both multiply operands must be the same kind of extend (sign or zero).
unsigned ExtOpc = LHS.getOpcode();
if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
RHS.getOpcode() != ExtOpc)
return SDValue();
// Peek through the extends.
LHS = LHS.getOperand(0);
RHS = RHS.getOperand(0);
// Ensure the input types match.
if (LHS.getValueType() != VT || RHS.getValueType() != VT)
return SDValue();
unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
return DAG.getNode(Opc, DL, VT, LHS, RHS);
// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
// from one vector with signed bytes from another vector, adds together
// adjacent pairs of 16-bit products, and saturates the result before
// truncating to 16-bits.
// Which looks something like this:
// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
// In is the value being truncated and VT the truncated result type. Returns
// an empty SDValue if the pattern does not match.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (!VT.isVector() || !Subtarget.hasSSSE3())
return SDValue();
unsigned NumElems = VT.getVectorNumElements();
EVT ScalarVT = VT.getVectorElementType();
// Only vXi16 results with a power-of-two element count of at least 8.
if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
return SDValue();
SDValue SSatVal = detectSSatPattern(In, VT);
if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
return SDValue();
// Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
// of multiplies from even/odd elements.
SDValue N0 = SSatVal.getOperand(0);
SDValue N1 = SSatVal.getOperand(1);
if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
// TODO: Handle constant vectors and use knownbits/computenumsignbits?
// Canonicalize zero_extend to LHS.
if (N01.getOpcode() == ISD::ZERO_EXTEND)
std::swap(N00, N01);
if (N11.getOpcode() == ISD::ZERO_EXTEND)
std::swap(N10, N11);
// Ensure we have a zero_extend and a sign_extend.
if (N00.getOpcode() != ISD::ZERO_EXTEND ||
N01.getOpcode() != ISD::SIGN_EXTEND ||
N10.getOpcode() != ISD::ZERO_EXTEND ||
N11.getOpcode() != ISD::SIGN_EXTEND)
return SDValue();
// Peek through the extends.
N00 = N00.getOperand(0);
N01 = N01.getOperand(0);
N10 = N10.getOperand(0);
N11 = N11.getOperand(0);
// Ensure the extend is from vXi8.
if (N00.getValueType().getVectorElementType() != MVT::i8 ||
N01.getValueType().getVectorElementType() != MVT::i8 ||
N10.getValueType().getVectorElementType() != MVT::i8 ||
N11.getValueType().getVectorElementType() != MVT::i8)
return SDValue();
// All inputs should be build_vectors.
if (N00.getOpcode() != ISD::BUILD_VECTOR ||
N01.getOpcode() != ISD::BUILD_VECTOR ||
N10.getOpcode() != ISD::BUILD_VECTOR ||
N11.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
// N00/N10 are zero extended. N01/N11 are sign extended.
// For each element, we need to ensure we have an odd element from one vector
// multiplied by the odd element of another vector and the even element from
// one of the same vectors being multiplied by the even element from the
// other vector. So we need to make sure for each element i, this operator
// is being performed:
// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
// ZExtIn / SExtIn are the source vectors (A and B above) that every
// build_vector element must be extracted from.
SDValue ZExtIn, SExtIn;
for (unsigned i = 0; i != NumElems; ++i) {
SDValue N00Elt = N00.getOperand(i);
SDValue N01Elt = N01.getOperand(i);
SDValue N10Elt = N10.getOperand(i);
SDValue N11Elt = N11.getOperand(i);
// TODO: Be more tolerant to undefs.
if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// The extract indices must all be compile-time constants.
auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
return SDValue();
unsigned IdxN00 = ConstN00Elt->getZExtValue();
unsigned IdxN01 = ConstN01Elt->getZExtValue();
unsigned IdxN10 = ConstN10Elt->getZExtValue();
unsigned IdxN11 = ConstN11Elt->getZExtValue();
// Add is commutative so indices can be reordered.
if (IdxN00 > IdxN10) {
std::swap(IdxN00, IdxN10);
std::swap(IdxN01, IdxN11);
// N0 indices must be the even element. N1 indices must be the next odd
// element.
if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
return SDValue();
SDValue N00In = N00Elt.getOperand(0);
SDValue N01In = N01Elt.getOperand(0);
SDValue N10In = N10Elt.getOperand(0);
SDValue N11In = N11Elt.getOperand(0);
// First time we find an input capture it.
if (!ZExtIn) {
ZExtIn = N00In;
SExtIn = N01In;
// Every element must read from the same pair of source vectors.
if (ZExtIn != N00In || SExtIn != N01In ||
ZExtIn != N10In || SExtIn != N11In)
return SDValue();
// Builds one VPMADDUBSW node from a pair of i8 vectors; the result has i16
// elements and half as many of them as each input.
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
EVT InVT = Ops[0].getValueType();
assert(InVT.getScalarType() == MVT::i8 &&
"Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
InVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
// NOTE(review): the remaining arguments of this call are cut off in this
// extract.
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
/// Target-specific DAG combine for a truncate node \p N. Tries a sequence of
/// pattern matchers in order and returns the first non-empty result; the
/// final fallback is combineVectorTruncation.
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
SDLoc DL(N);
// Attempt to pre-truncate inputs to arithmetic ops instead.
if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
return V;
// Try to detect AVG pattern first.
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
// Try to detect PMADD
if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
return PMAdd;
// Try to combine truncation with signed/unsigned saturation.
if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
return Val;
// Try to combine PMULHUW/PMULHW for vXi16.
if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
return V;
// The bitcast source is a direct mmx result.
// Detect bitcasts from x86mmx that are then truncated to i32.
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
SDValue BCSrc = Src.getOperand(0);
if (BCSrc.getValueType() == MVT::x86mmx)
return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
// Try to truncate extended sign/zero bits with PACKSS/PACKUS.
if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
return V;
return combineVectorTruncation(N, DAG, Subtarget);
/// DAG combine for a vector truncate node \p N: fold a signed or unsigned
/// saturation pattern on the input into X86ISD::VTRUNCS / X86ISD::VTRUNCUS,
/// otherwise try to simplify N based on its demanded bits.
/// Returns the new node, SDValue(N, 0) if N was simplified, or an empty
/// SDValue if nothing changed.
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
SDLoc DL(N);
if (auto SSatVal = detectSSatPattern(In, VT))
return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// All bits of each result element are demanded.
APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
return SDValue();
/// Returns the negated value if the node \p N flips sign of FP value.
/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
/// or FSUB(0, x)
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
/// This also recognizes splat of a negated value and returns the splat of that
/// value.
/// \p Depth bounds the recursion. Returns an empty SDValue if \p N is not
/// recognised as a negation.
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
if (N->getOpcode() == ISD::FNEG)
return N->getOperand(0);
// Don't recurse exponentially.
if (Depth > SelectionDAG::MaxRecursionDepth)
return SDValue();
unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
SDValue Op = peekThroughBitcasts(SDValue(N, 0));
EVT VT = Op->getValueType(0);
// Make sure the element size doesn't change.
if (VT.getScalarSizeInBits() != ScalarSize)
return SDValue();
unsigned Opc = Op.getOpcode();
switch (Opc) {
// NOTE(review): the case label for this branch is not visible in this
// extract; the comment below suggests ISD::VECTOR_SHUFFLE - confirm.
// For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
// of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
if (!Op.getOperand(1).isUndef())
return SDValue();
if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
// NOTE(review): the case label for this branch is not visible in this
// extract; the code below suggests ISD::INSERT_VECTOR_ELT - confirm.
// Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
// -V, INDEX).
SDValue InsVector = Op.getOperand(0);
SDValue InsVal = Op.getOperand(1);
if (!InsVector.isUndef())
return SDValue();
if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
NegInsVal, Op.getOperand(2));
case ISD::FSUB:
case ISD::XOR:
case X86ISD::FXOR: {
SDValue Op1 = Op.getOperand(1);
SDValue Op0 = Op.getOperand(0);
// For XOR and FXOR, we want to check if constant
// bits of Op1 are sign bit masks. For FSUB, we
// have to check if constant bits of Op0 are sign
// bit masks and hence we swap the operands.
if (Opc == ISD::FSUB)
std::swap(Op0, Op1);
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
// Extract constant bits and see if they are all
// sign bit masks. Ignore the undef elements.
if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
/* AllowWholeUndefs */ true,
/* AllowPartialUndefs */ false)) {
for (unsigned I = 0, E = EltBits.size(); I < E; I++)
if (!UndefElts[I] && !EltBits[I].isSignMask())
return SDValue();
// Every defined element is a sign mask, so the other operand (seen through
// bitcasts) is the value being negated.
return peekThroughBitcasts(Op0);
return SDValue();
/// Map an FMA-family opcode to the opcode obtained after applying up to three
/// negations, selected by \p NegMul, \p NegAcc and \p NegRes (presumably the
/// multiply result, the accumulator and the final result - confirm with
/// callers). The steps are applied in that order, each to the opcode produced
/// by the previous step. Any opcode not listed in a selected table hits
/// llvm_unreachable.
/// \returns the remapped opcode, or \p Opcode unchanged if no flag is set.
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
bool NegRes) {
// Step 1: toggles between the FM* and FNM* forms.
if (NegMul) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FNMADD; break;
case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FNMADD: Opcode = ISD::FMA; break;
case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
// Step 2: toggles between the *ADD and *SUB forms (and FMADDSUB <-> FMSUBADD).
if (NegAcc) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FMSUB; break;
case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
case X86ISD::FMSUB: Opcode = ISD::FMA; break;
case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
// Step 3: flips both of the above at once; STRICT_* opcodes are deliberately
// absent from this table.
if (NegRes) {
switch (Opcode) {
// For accuracy reason, we never combine fneg and fma under strict FP.
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
return Opcode;
/// Do target-specific dag combines on floating point negations.
///
/// \p N is first matched by isFNEG(), which yields the value being negated
/// (or an empty SDValue, in which case nothing is done). The replacement is
/// always bitcast back to the original result type of \p N, since the matched
/// operand may have a different type.
/// \returns the replacement value, or an empty SDValue if no combine applies.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT OrigVT = N->getValueType(0);
SDValue Arg = isFNEG(DAG, N);
if (!Arg)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = Arg.getValueType();
EVT SVT = VT.getScalarType();
SDLoc DL(N);
// Let legalize expand this if it isn't a legal type yet.
if (!TLI.isTypeLegal(VT))
return SDValue();
// If we're negating a FMUL node on a target with FMA, then we can avoid the
// use of a constant by performing (-0 - A*B) instead.
// FIXME: Check rounding control flags as well once it becomes available.
// Only done for f32/f64 element types and when the FMUL carries the
// no-signed-zeros flag.
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
Arg.getOperand(1), Zero);
return DAG.getBitcast(OrigVT, NewNode);
// Otherwise ask the target lowering whether the operand itself can be
// rewritten into its negated form.
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOperations = !DCI.isBeforeLegalizeOps();
if (SDValue NegArg =
TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
return DAG.getBitcast(OrigVT, NegArg);
return SDValue();
/// X86 override that builds the negated form of \p Op where that is possible.
///
/// Handles three X86-specific situations and otherwise defers to the generic
/// TargetLowering implementation:
///  - \p Op is itself an fneg pattern (per isFNEG): return its operand.
///  - \p Op is an FMA-family node: negate the result by switching opcode via
///    negateFMAOpcode(), folding in any operand negations that are cheaper.
///  - \p Op is X86ISD::FRCP: push the negation into its operand.
/// \p Cost is written with the cost classification of the returned expression
/// in the fneg and FMA cases.
SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
bool LegalOperations,
bool ForCodeSize,
NegatibleCost &Cost,
unsigned Depth) const {
// fneg patterns are removable even if they have multiple uses.
if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
Cost = NegatibleCost::Cheaper;
return DAG.getBitcast(Op.getValueType(), Arg);
EVT VT = Op.getValueType();
EVT SVT = VT.getScalarType();
unsigned Opc = Op.getOpcode();
switch (Opc) {
// NOTE(review): the case list below looks shorter than the set of opcodes
// negateFMAOpcode() handles for a negated result; labels may have been lost
// from this copy - confirm against upstream.
case ISD::FMA:
case X86ISD::FMSUB:
case X86ISD::FNMADD:
case X86ISD::FNMSUB:
case X86ISD::FNMSUB_RND: {
if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
!(SVT == MVT::f32 || SVT == MVT::f64) ||
!isOperationLegal(ISD::FMA, VT))
// NOTE(review): no statement is visible under the legality check above; its
// bail-out statement appears to be missing from this copy - confirm.
// This is always negatible for free but we might be able to remove some
// extra operand negations as well.
SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
for (int i = 0; i != 3; ++i)
NewOps[i] = getCheaperNegatedExpression(
Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
// A non-empty entry means that operand was replaced by its negated form.
bool NegA = !!NewOps[0];
bool NegB = !!NewOps[1];
bool NegC = !!NewOps[2];
// The product changes sign only if exactly one multiplicand was negated.
unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
: NegatibleCost::Neutral;
// Fill in the non-negated ops with the original values.
for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
if (!NewOps[i])
NewOps[i] = Op.getOperand(i);
return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
// Negating FRCP is done by negating its operand, when that is possible.
case X86ISD::FRCP:
if (SDValue NegOp0 =
getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
ForCodeSize, Cost, Depth + 1))
return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
// Everything else is left to the target-independent implementation.
return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
ForCodeSize, Cost, Depth);
/// Replace a vector X86ISD::FOR/FXOR/FAND/FANDN node with the equivalent
/// integer logic op on bitcast operands.
///
/// Both operands are bitcast to an integer vector with the same element width
/// and total size, the integer opcode is applied, and the result is bitcast
/// back to the FP type.
/// \returns the replacement, or an empty SDValue for scalar types or when
/// SSE2 is unavailable.
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
// If we have integer vector types available, use the integer opcodes.
if (!VT.isVector() || !Subtarget.hasSSE2())
return SDValue();
SDLoc dl(N);
// Build the integer vector type with matching element width and element count.
unsigned IntBits = VT.getScalarSizeInBits();
MVT IntSVT = MVT::getIntegerVT(IntBits);
MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
unsigned IntOpcode;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected FP logic op");
case X86ISD::FOR: IntOpcode = ISD::OR; break;
case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
case X86ISD::FAND: IntOpcode = ISD::AND; break;
case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
return DAG.getBitcast(VT, IntOp);
/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
///
/// Only fires when \p N is an ISD::XOR whose second operand is the constant
/// one and whose first operand is an X86ISD::SETCC node.
/// \returns the new SETCC, or an empty SDValue if the pattern does not match.
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() != ISD::XOR)
return SDValue();
SDValue LHS = N->getOperand(0);
if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
return SDValue();
// NOTE(review): the argument of the call below is missing from this copy;
// presumably it is the condition code carried by LHS - confirm against
// upstream.
X86::CondCode NewCC = X86::GetOppositeBranchCondition(
SDLoc DL(N);
// Rebuild the SETCC with the inverted condition on LHS's operand 1.
return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
/// Do target-specific dag combines on XOR nodes.
///
/// Tries a sequence of folds and returns the first that succeeds. The folds
/// after the isBeforeLegalizeOps() check only run once operation legalization
/// has started.
/// \returns the replacement value, or an empty SDValue if nothing applies.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// If this is SSE1 only convert to FXOR to avoid scalarization.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
N->getValueType(0) == MVT::v4i32) {
return DAG.getBitcast(
MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
return Cmp;
if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
return R;
// The remaining folds are skipped before operation legalization.
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue SetCC = foldXor1SetCC(N, DAG))
return SetCC;
if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
return RV;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
// Last resort: treat the XOR as a possible floating point negation.
return combineFneg(N, DAG, DCI, Subtarget);
/// Do target-specific dag combines on BEXTR nodes.
///
/// Currently this only runs SimplifyDemandedBits on the node with every result
/// bit demanded.
/// \returns \p N itself if something was simplified, otherwise an empty
/// SDValue.
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
unsigned NumBits = VT.getSizeInBits();
// TODO - Constant Folding.
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getAllOnesValue(NumBits));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
return SDValue();
/// Return true if \p V is a null FP constant (per isNullFPConstant) or a
/// build vector that ISD::isBuildVectorAllZeros accepts.
static bool isNullFPScalarOrVectorConst(SDValue V) {
return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
/// If a value is a scalar FP zero or a vector FP zero (potentially including
/// undefined elements), return a zero constant that may be used to fold away
/// that value. In the case of a vector, the returned constant will not contain
/// undefined elements even if the input parameter does. This makes it suitable
/// to be used as a replacement operand with operations (eg, bitwise-and) where
/// an undef should not propagate.
/// \returns an empty SDValue when \p V is not a zero constant.
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!isNullFPScalarOrVectorConst(V))
return SDValue();
// Vectors get a freshly built zero vector; scalars are returned as-is.
if (V.getValueType().isVector())
return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
return V;
/// Fold an FAND whose operand is an FXOR with all-ones into a single FANDN.
///
/// Restricted to f32 with SSE1, f64 with SSE2, and v4f32 on SSE1-only targets.
/// \returns the FANDN node, or an empty SDValue if the pattern does not match.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
// Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::f64 && Subtarget.hasSSE2()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
return SDValue();
// True if V is an all-ones build vector, or a scalar FP constant whose value
// is all-ones.
auto isAllOnesConstantFP = [](SDValue V) {
if (V.getSimpleValueType().isVector())
return ISD::isBuildVectorAllOnes(V.getNode());
auto *C = dyn_cast<ConstantFPSDNode>(V);
return C && C->getConstantFPValue()->isAllOnesValue();
// fand (fxor X, -1), Y --> fandn X, Y
if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
// fand X, (fxor Y, -1) --> fandn Y, X
if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
return SDValue();
/// Do target-specific dag combines on X86ISD::FAND nodes.
///
/// Folds a zero operand to zero, then tries the FAND+FXOR -> FANDN fold, and
/// finally falls back to the integer logic-op lowering.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// FAND(0.0, x) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
return V;
// FAND(x, 0.0) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
return V;
if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
return V;
return lowerX86FPLogicOp(N, DAG, Subtarget);
/// Do target-specific dag combines on X86ISD::FANDN nodes.
///
/// The two zero folds differ because the first operand is the inverted one: a
/// zero first operand yields the second operand, a zero second operand yields
/// zero. Anything else falls back to the integer logic-op lowering.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// FANDN(0.0, x) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(0)))
return N->getOperand(1);
// FANDN(x, 0.0) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
return V;
return lowerX86FPLogicOp(N, DAG, Subtarget);
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
///
/// A zero operand folds to the other operand. Otherwise the node is tried as
/// a floating point negation, and finally handed to the integer logic-op
/// lowering.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
// F[X]OR(0.0, x) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(0)))
return N->getOperand(1);
// F[X]OR(x, 0.0) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(1)))
return N->getOperand(0);
if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
return NewVal;
return lowerX86FPLogicOp(N, DAG, Subtarget);
/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
///
/// When the target options permit it, the node is rebuilt with the
/// commutative opcode (FMINC/FMAXC) and the same two operands.
/// \returns the new node, or an empty SDValue when the options do not allow
/// the conversion.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
// FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
// NOTE(review): the right-hand side of the `||` below is missing from this
// copy; per the comment above it presumably tests the no-signed-zeros target
// option - confirm against upstream.
if (!DAG.getTarget().Options.NoNaNsFPMath ||
return SDValue();
// If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
// into FMINC and FMAXC, which are Commutative operations.
unsigned NewOp = 0;
switch (N->getOpcode()) {
default: llvm_unreachable("unknown opcode");
case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
N->getOperand(0), N->getOperand(1));
/// Lower ISD::FMAXNUM / ISD::FMINNUM to X86ISD::FMAX / X86ISD::FMIN.
///
/// Applies to f32 with SSE1, f64 with SSE2, and legal vector types, and never
/// with soft-float. When NaNs can be ignored, or one operand is known never to
/// be NaN, a single min/max node is emitted. Otherwise a min/max plus an
/// unordered self-compare and a select is emitted.
/// \returns the replacement, or an empty SDValue when the combine is not
/// applicable or not profitable (scalar code in a minsize function).
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Subtarget.useSoftFloat())
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
(Subtarget.hasSSE2() && VT == MVT::f64) ||
(VT.isVector() && TLI.isTypeLegal(VT))))
return SDValue();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDLoc DL(N);
auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
// If we don't have to respect NaN inputs, this is a direct translation to x86
// min/max instructions.
if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
// If one of the operands is known non-NaN use the native min/max instructions
// with the non-NaN input as second operand.
if (DAG.isKnownNeverNaN(Op1))
return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
if (DAG.isKnownNeverNaN(Op0))
return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
// If we have to respect NaN inputs, this takes at least 3 instructions.
// Favor a library call when operating on a scalar and minimizing code size.
if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
// NOTE(review): the final argument of the call below is missing from this
// copy; presumably it is the value type being compared (VT) - confirm.
EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
// There are 4 possibilities involving NaN inputs, and these are the required
// outputs:
// Op1
// Num NaN
// ----------------
// Num | Max | Op0 |
// Op0 ----------------
// NaN | Op1 | NaN |
// ----------------
// The SSE FP max/min instructions were not designed for this case, but rather
// to implement:
// Min = Op1 < Op0 ? Op1 : Op0
// Max = Op1 > Op0 ? Op1 : Op0
// So they always return Op0 if either input is a NaN. However, we can still
// use those instructions for fmaxnum by selecting away a NaN input.
// If either operand is NaN, the 2nd source operand (Op0) is passed through.
SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
// An unordered compare of Op0 with itself is true exactly when Op0 is NaN.
SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
/// Do target-specific dag combines on X86 vector int-to-fp conversion nodes.
///
/// First simplifies the node's demanded vector elements (all result elements
/// demanded). Then, if the result has fewer elements than the input and the
/// input is a single-use normal load, the load is narrowed to a vzload that
/// only reads the bits actually converted.
/// \returns \p N itself if anything changed, otherwise an empty SDValue.
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt KnownUndef, KnownZero;
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
// Convert a full vector load into vzload when not all bits are needed.
SDValue In = N->getOperand(0);
MVT InVT = In.getSimpleValueType();
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
// Number of input bits consumed: one input element per result element.
unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
MVT MemVT = MVT::getIntegerVT(NumBits);
MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
SDLoc dl(N);
SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
// Redirect users of the old load's chain result to the new load's chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
return SDValue(N, 0);
return SDValue();
/// Do target-specific dag combines on X86 vector fp-to-int conversion nodes
/// (CVTP2I / CVTTP2I, including their strict variants).
///
/// If the result has fewer elements than the input and the input is a
/// single-use normal load, the load is narrowed to a vzload covering only the
/// converted bits. For strict nodes the chain operand comes first, so the
/// source is operand 1, and the chain result is replaced as well.
/// \returns \p N itself if the load was narrowed, otherwise an empty SDValue.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
bool IsStrict = N->isTargetStrictFPOpcode();
EVT VT = N->getValueType(0);
// Convert a full vector load into vzload when not all bits are needed.
SDValue In = N->getOperand(IsStrict ? 1 : 0);
MVT InVT = In.getSimpleValueType();
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(In);
// Number of input bits consumed: one input element per result element.
unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
MVT MemVT = MVT::getFloatingPointVT(NumBits);
MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
SDLoc dl(N);
if (IsStrict) {
// Strict form: keep the incoming chain and produce a chain result too.
SDValue Convert =
DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
{N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
DCI.CombineTo(N, Convert, Convert.getValue(1));
} else {
SDValue Convert =
DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
// Redirect users of the old load's chain result to the new load's chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
return SDValue(N, 0);
return SDValue();
/// Do target-specific dag combines on X86ISD::ANDNP nodes.
///
/// Folds all-zero operands, turns ANDNP of an inverted value back into a
/// plain AND, and otherwise tries the recursive shuffle combiner on vectors
/// whose element size is a multiple of 8 bits.
/// \returns the replacement value, or an empty SDValue if nothing applies.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
// ANDNP(0, x) -> x
if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
return N->getOperand(1);
// ANDNP(x, 0) -> 0
if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
// Turn ANDNP back to AND if input is inverted.
// NOTE(review): the last argument of the getNode call below is missing from
// this copy; presumably it is the node's second operand - confirm.
if (SDValue Not = IsNOT(N->getOperand(0), DAG))
return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
// Attempt to recursively combine a bitmask ANDNP with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
/// Do target-specific dag combines on BT nodes.
///
/// Simplifies the bit-index operand (operand 1) given that only its low
/// Log2_32(BitWidth) bits are demanded.
/// \returns \p N itself when the operand was simplified, otherwise an empty
/// SDValue.
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue N1 = N->getOperand(1);
// BT ignores high bits in the bit index operand.
unsigned BitWidth = N1.getValueSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
// NOTE(review): as written, the return below reads as the body of the
// DELETED_NODE check; a statement may be missing between them in this copy -
// confirm against upstream.
if (N->getOpcode() != ISD::DELETED_NODE)
return SDValue(N, 0);
return SDValue();
/// Do target-specific dag combines on X86ISD::CVTPH2PS nodes (and the strict
/// variant, whose source is operand 1 because the chain comes first).
///
/// For a v8i16 -> v4f32 conversion only the low four source elements are
/// demanded, so the source is simplified accordingly, and a single-use normal
/// load feeding it is narrowed to a 64-bit vzload.
/// \returns \p N itself if anything changed, otherwise an empty SDValue.
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Only the low 4 of the 8 source elements are converted.
APInt DemandedElts = APInt::getLowBitsSet(8, 4);
if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
DCI)) {
// NOTE(review): as written, the return below reads as the body of the
// DELETED_NODE check; a statement may be missing between them in this copy -
// confirm against upstream.
if (N->getOpcode() != ISD::DELETED_NODE)
return SDValue(N, 0);
// Convert a full vector load into vzload when not all bits are needed.
if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
SDLoc dl(N);
if (IsStrict) {
// Strict form: keep the incoming chain and produce a chain result too.
SDValue Convert = DAG.getNode(
N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
{N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
DCI.CombineTo(N, Convert, Convert.getValue(1));
} else {
SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
DAG.getBitcast(MVT::v8i16, VZLoad));
DCI.CombineTo(N, Convert);
// Redirect users of the old load's chain result to the new load's chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
return SDValue(N, 0);
return SDValue();
// Try to combine sext_in_reg of a cmov of constants by extending the constants.
//
// Only handles extension from i8 or i16. The sext_in_reg (and any single-use
// any_extend/truncate looked through on the way) is applied to both CMOV
// value operands, and a new CMOV is built on the results with the original
// CMOV's operands 2 and 3. Returns an empty SDValue if the pattern does not
// match.
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
EVT DstVT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
return SDValue();
// Look through single use any_extends / truncs.
SDValue IntermediateBitwidthOp;
if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
N0.hasOneUse()) {
IntermediateBitwidthOp = N0;
N0 = N0.getOperand(0);
// See if we have a single use cmov.
if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
return SDValue();
SDValue CMovOp0 = N0.getOperand(0);
SDValue CMovOp1 = N0.getOperand(1);
// Make sure both operands are constants.
// NOTE(review): the right-hand side of the `||` below is missing from this
// copy; presumably it is the same constant test applied to CMovOp1 - confirm.
if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
return SDValue();
SDLoc DL(N);
// If we looked through an any_extend/trunc above, apply that same operation
// to the constants.
if (IntermediateBitwidthOp) {
unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
// We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
// NOTE(review): CMovVT is assigned and compared against DstVT below but its
// declaration is not visible in this copy; presumably it starts out equal to
// DstVT - confirm against upstream.
if (DstVT == MVT::i16) {
CMovVT = MVT::i32;
CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
N0.getOperand(2), N0.getOperand(3));
// Undo the i16 -> i32 promotion, if it happened.
if (CMovVT != DstVT)
CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
return CMov;
/// Do target-specific dag combines on ISD::SIGN_EXTEND_INREG nodes.
///
/// Tries combineSextInRegCmov() first. Otherwise, for a v4i64 result fed by an
/// any/sign extend, it either promotes mask arithmetic ahead of the node or
/// moves the sext_in_reg onto the narrower v4i32 source and sign-extends the
/// result.
/// \returns the replacement value, or an empty SDValue if nothing applies.
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
if (SDValue V = combineSextInRegCmov(N, DAG))
return V;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
SDLoc dl(N);
// The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
// both SSE and AVX2 since there is no sign-extended shift right
// operation on a vector with 64-bit elements.
//(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
// (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND)) {
SDValue N00 = N0.getOperand(0);
// EXTLOAD has a better solution on AVX2,
// it may be replaced with X86ISD::VSEXT node.
if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
if (!ISD::isNormalLoad(N00.getNode()))
return SDValue();
// Attempt to promote any comparison mask ops before moving the
// SIGN_EXTEND_INREG in the way.
if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
SDValue Tmp =
DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
return SDValue();
/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
///
/// Only applied when the extension result is i64, the add's second operand is
/// a constant, and at least one user of the extension is an ADD or SHL.
/// \returns the new wide add, or an empty SDValue if the pattern does not
/// match.
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
Ext->getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
// TODO: This should be valid for other integer types.
EVT VT = Ext->getValueType(0);
if (VT != MVT::i64)
return SDValue();
SDValue Add = Ext->getOperand(0);
if (Add.getOpcode() != ISD::ADD)
return SDValue();
bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
bool NSW = Add->getFlags().hasNoSignedWrap();
bool NUW = Add->getFlags().hasNoUnsignedWrap();
// We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
// into the 'zext'
if ((Sext && !NSW) || (!Sext && !NUW))
return SDValue();
// Having a constant operand to the 'add' ensures that we are not increasing
// the instruction count because the constant is extended for free below.
// A constant operand can also become the displacement field of an LEA.
auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
if (!AddOp1)
return SDValue();
// Don't make the 'add' bigger if there's no hope of combining it with some
// other 'add' or 'shl' instruction.
// TODO: It may be profitable to generate simpler LEA instructions in place
// of single 'add' instructions, but the cost model for selecting an LEA
// currently has a high threshold.
bool HasLEAPotential = false;
for (auto *User : Ext->uses()) {
if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
HasLEAPotential = true;
if (!HasLEAPotential)
return SDValue();
// Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
// The constant is widened with the same signedness as the extension.
int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
SDValue AddOp0 = Add.getOperand(0);
SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
// The wider add is guaranteed to not wrap because both operands are
// sign-extended.
// NOTE(review): Flags is passed default-constructed; nothing visible here
// copies NSW/NUW onto the new add even though the comment above suggests the
// no-wrap property was meant to be recorded - confirm against upstream.
SDNodeFlags Flags;
return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
// operands and the result of CMOV is not used anywhere else - promote CMOV
// itself instead of promoting its result. This could be beneficial, because:
// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
// (or more) pseudo-CMOVs only when they go one-after-another and
// getting rid of result extension code after CMOV will help that.
// 2) Promotion of constant CMOV arguments is free, hence the
// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
// promotion is also good in terms of code-size.
// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
// promotion).
//
// Returns the widened CMOV (extended once more to the target type when
// needed), or an empty SDValue if the pattern does not match.
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
SDValue CMovN = Extend->getOperand(0);
if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
return SDValue();
EVT TargetVT = Extend->getValueType(0);
unsigned ExtendOpcode = Extend->getOpcode();
SDLoc DL(Extend);
EVT VT = CMovN.getValueType();
SDValue CMovOp0 = CMovN.getOperand(0);
SDValue CMovOp1 = CMovN.getOperand(1);
// NOTE(review): the right-hand side of the `||` below is missing from this
// copy; presumably it is the same constant test applied to CMovOp1 - confirm.
if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
return SDValue();
// Only extend to i32 or i64.
if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
return SDValue();
// Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
// are free.
if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
return SDValue();
// If this is a zero extend to i64, we should only extend to i32 and use a
// free zero extend to finish.
EVT ExtendVT = TargetVT;
if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
ExtendVT = MVT::i32;
CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
// The new CMOV reuses operands 2 and 3 of the original CMOV unchanged.
SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
CMovN.getOperand(2), CMovN.getOperand(3));
// Finish extending if needed.
if (ExtendVT != TargetVT)
Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
return Res;
// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
// This is more or less the reverse of combineBitcastvxi1.
//
// Runs only before operation legalization, on SSE2 targets without AVX512,
// for sign/zero/any extends to vectors of i8/i16/i32/i64. The scalar is
// broadcast across the vector, each element is masked with its own bit and
// compared against that bit, and the compare result is sign-extended. For
// non-sign extends the result is then shifted right to leave 0 or 1 in each
// element. Returns an empty SDValue if the pattern does not match.
static SDValue
combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
Opcode != ISD::ANY_EXTEND)
return SDValue();
if (!DCI.isBeforeLegalizeOps())
return SDValue();
if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
return SDValue();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
EVT InSVT = N0.getValueType().getScalarType();
unsigned EltSizeInBits = SVT.getSizeInBits();
// Input type must be extending a bool vector (bit-casted from a scalar
// integer) to legal integer types.
if (!VT.isVector())
return SDValue();
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
return SDValue();
if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
return SDValue();
// N00 is the scalar integer that was bitcast to the vXi1 vector.
SDValue N00 = N0.getOperand(0);
EVT SclVT = N0.getOperand(0).getValueType();
if (!SclVT.isScalarInteger())
return SDValue();
SDLoc DL(N);
SDValue Vec;
SmallVector<int, 32> ShuffleMask;
unsigned NumElts = VT.getVectorNumElements();
assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
// Broadcast the scalar integer to the vector elements.
if (NumElts > EltSizeInBits) {
// If the scalar integer is greater than the vector element size, then we
// must split it down into sub-sections for broadcasting. For example:
// i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
// i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
unsigned Scale = NumElts / EltSizeInBits;
EVT BroadcastVT =
EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
Vec = DAG.getBitcast(VT, Vec);
// Sub-section i of the scalar is repeated EltSizeInBits times in the mask.
for (unsigned i = 0; i != Scale; ++i)
ShuffleMask.append(EltSizeInBits, i);
Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
} else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
(SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
// If we have register broadcast instructions, use the scalar size as the
// element type for the shuffle. Then cast to the wider element type. The
// widened bits won't be used, and this might allow the use of a broadcast
// load.
assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
unsigned Scale = EltSizeInBits / NumElts;
EVT BroadcastVT =
EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
ShuffleMask.append(NumElts * Scale, 0);
Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
Vec = DAG.getBitcast(VT, Vec);
} else {
// For smaller scalar integers, we can simply any-extend it to the vector
// element size (we don't care about the upper bits) and broadcast it to all
// elements.
SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
ShuffleMask.append(NumElts, 0);
Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
// Now, mask the relevant bit in each element.
SmallVector<SDValue, 32> Bits;
for (unsigned i = 0; i != NumElts; ++i) {
// Element i tests bit (i mod element-width) of its broadcast sub-section.
int BitIdx = (i % EltSizeInBits);
APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
Bits.push_back(DAG.getConstant(Bit, DL, SVT));
SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
// Compare against the bitmask and extend the result.
EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
// For SEXT, this is now done, otherwise shift the result down for
// zero-extension.
if (Opcode == ISD::SIGN_EXTEND)
return Vec;
return DAG.getNode(ISD::SRL, DL, VT, Vec,
DAG.getConstant(EltSizeInBits - 1, DL, VT));
// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
// result type.
//
// The setcc is re-emitted directly with the extended result type; for a
// zero-extend the result is additionally zero-extended in-register from the
// original setcc type. Returns an empty SDValue if the combine does not apply.
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
SDLoc dl(N);
// Only do this combine with AVX512 for vector extends.
if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
return SDValue();
// Only combine legal element types.
EVT SVT = VT.getVectorElementType();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
return SDValue();
// When 512-bit registers are in use, we can only do this if the vector size
// is 256 bits or less.
unsigned Size = VT.getSizeInBits();
if (Size > 256 && Subtarget.useAVX512Regs())
return SDValue();
// Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
// those are the only integer compares we have.
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
if (ISD::isUnsignedIntSetCC(CC))
return SDValue();
// Only do this combine if the extension will be fully consumed by the setcc.
EVT N00VT = N0.getOperand(0).getValueType();
EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
if (Size != MatchingVecType.getSizeInBits())
return SDValue();
SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
if (N->getOpcode() == ISD::ZERO_EXTEND)
Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
return Res;
/// Do target-specific dag combines on ISD::SIGN_EXTEND nodes.
///
/// Tries, in order: widening a SETCC_CARRY operand (only after operation
/// legalization has started), promoting a constant CMOV, and then - only
/// before operation legalization - the setcc, inverted-bool, bool-vector,
/// mask-arithmetic and ext-before-add folds.
/// \returns the replacement value (or \p N itself when CombineTo was used), or
/// an empty SDValue if nothing applies.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = N0.getValueType();
SDLoc DL(N);
// (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
// NOTE(review): the last argument of the getNode call below is missing from
// this copy; presumably it is N0's second operand - confirm against upstream.
if (!DCI.isBeforeLegalizeOps() &&
N0.getOpcode() == X86ISD::SETCC_CARRY) {
SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
// Sample the use count before CombineTo changes it.
bool ReplaceOtherUses = !N0.hasOneUse();
DCI.CombineTo(N, Setcc);
// Replace other uses with a truncate of the widened setcc_carry.
if (ReplaceOtherUses) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), Setcc);
DCI.CombineTo(N0.getNode(), Trunc);
return SDValue(N, 0);
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
// Everything below only runs before operation legalization.
if (!DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
// Invert and sign-extend a boolean is the same as zero-extend and subtract
// 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
// lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
// sext (xor Bool, -1) --> sub (zext Bool), 1
SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
if (VT.isVector())
if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
return R;
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
return NewAdd;
return SDValue();
/// Folds negations of the three FMA operands into the opcode of the FMA node
/// \p N. Each operand is replaced by a cheaper negated form when one exists,
/// and the node is rebuilt with the opcode chosen by negateFMAOpcode().
/// Returns an empty SDValue if the type is not legal, is not f32/f64 based,
/// the subtarget has no FMA support, or no operand could be negated.
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
// For strict nodes the value operands start at index 1 (see A/B/C below).
bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
// Let legalize expand this if it isn't a legal type yet.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(VT))
return SDValue();
EVT ScalarVT = VT.getScalarType();
if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
return SDValue();
SDValue A = N->getOperand(IsStrict ? 1 : 0);
SDValue B = N->getOperand(IsStrict ? 2 : 1);
SDValue C = N->getOperand(IsStrict ? 3 : 2);
// Tries to replace V in place with a cheaper negated expression; returns true
// when V was updated.
auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOperations = !DCI.isBeforeLegalizeOps();
if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
CodeSize)) {
V = NegV;
return true;
// Look through extract_vector_elts. If it comes from an FNEG, create a
// new extract from the FNEG input.
if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isNullConstant(V.getOperand(1))) {
SDValue Vec = V.getOperand(0);
if (SDValue NegV = TLI.getCheaperNegatedExpression(
Vec, DAG, LegalOperations, CodeSize)) {
V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
NegV, V.getOperand(1));
return true;
return false;
// Do not convert the passthru input of scalar intrinsics.
// FIXME: We could allow negations of the lower element only.
bool NegA = invertIfNegative(A);
bool NegB = invertIfNegative(B);
bool NegC = invertIfNegative(C);
if (!NegA && !NegB && !NegC)
return SDValue();
// NegA != NegB is true when exactly one of the two multiplicands was negated.
unsigned NewOpcode =
negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
if (IsStrict) {
// Strict form: operand 0 is passed through first and the node also produces
// an MVT::Other result.
assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
{N->getOperand(0), A, B, C});
} else {
// A fourth operand on a non-strict node is forwarded unchanged.
if (N->getNumOperands() == 4)
return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
return DAG.getNode(NewOpcode, dl, VT, A, B, C);
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
// Returns an empty SDValue when the third operand has no cheaper negated form.
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOperations = !DCI.isBeforeLegalizeOps();
// Only the third operand (the addend) is considered for negation.
SDValue N2 = N->getOperand(2);
SDValue NegN2 =
TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
if (!NegN2)
return SDValue();
unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
// A fourth operand, when present, is forwarded unchanged.
if (N->getNumOperands() == 4)
return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
NegN2, N->getOperand(3));
// NOTE(review): the argument list of this final getNode call looks cut off in
// this listing -- confirm against upstream.
return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
/// Tries a sequence of folds on the extension node \p N (the first fold is
/// specific to ISD::ANY_EXTEND) and returns the first replacement found, or an
/// empty SDValue if none applies.
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
// FIXME: Is this needed? We don't seem to have any tests for it.
if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
N0.getOpcode() == X86ISD::SETCC_CARRY) {
// NOTE(review): the argument list of this getNode call looks cut off in this
// listing -- confirm against upstream.
SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
// Recorded before N is replaced by the widened node below.
bool ReplaceOtherUses = !N0.hasOneUse();
DCI.CombineTo(N, Setcc);
// Replace other uses with a truncate of the widened setcc_carry.
if (ReplaceOtherUses) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), Setcc);
DCI.CombineTo(N0.getNode(), Trunc);
return SDValue(N, 0);
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
// The setcc-extension fold is only tried before operation legalization.
if (DCI.isBeforeLegalizeOps())
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
if (VT.isVector())
if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
return R;
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
return NewAdd;
if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
return R;
// TODO: Combine with any target/faux shuffle.
// Extension of a 128-bit PACKUS whose source element width equals the result
// element width: if the upper half of every source element is known zero (or
// the source is undef), the result is the concatenation of the two sources.
if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
// Mask covering the high half of each source element.
APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
(N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
return concatSubVectors(N00, N01, DAG, dl);
return SDValue();
/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
/// recognizable memcmp expansion.
///
/// Returns true when \p X is an OR whose operands are, recursively, ORs or
/// XORs. A bare XOR (or anything else) at the root is rejected.
/// \param Root True only for the outermost call; recursive calls pass false.
static bool isOrXorXorTree(SDValue X, bool Root = true) {
// Interior node: both operands must themselves be valid subtrees.
if (X.getOpcode() == ISD::OR)
return isOrXorXorTree(X.getOperand(0), false) &&
isOrXorXorTree(X.getOperand(1), false);
// The root must be an OR.
if (Root)
return false;
// Leaf: must be an XOR.
return X.getOpcode() == ISD::XOR;
/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
/// expansion.
///
/// Walks the OR/XOR tree rooted at \p X and emits the vector equivalent:
///  - VecVT != CmpVT: leaves become SETNE compares, combined with OR in CmpVT.
///  - HasPT:          leaves become XORs, combined with OR in VecVT.
///  - otherwise:      leaves become SETEQ compares, combined with AND in CmpVT.
/// \param SToV Callable that converts a scalar leaf operand to a vector value.
template<typename F>
static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
SDValue Op0 = X.getOperand(0);
SDValue Op1 = X.getOperand(1);
if (X.getOpcode() == ISD::OR) {
// Interior node: emit both subtrees, then combine them.
SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
if (VecVT != CmpVT)
return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
if (HasPT)
return DAG.getNode(ISD::OR, DL, VecVT, A, B);
// Equality results are combined with AND (all leaves must be equal).
return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
} else if (X.getOpcode() == ISD::XOR) {
// Leaf: convert both scalar operands and compare them.
SDValue A = SToV(Op0);
SDValue B = SToV(Op1);
if (VecVT != CmpVT)
return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
if (HasPT)
return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
///
/// \param SetCC A setcc node whose condition code is SETEQ or SETNE (asserted).
/// \returns The replacement value, or an empty SDValue when the operands are
///          not scalar integers of at least 128 bits, the compare is against
///          zero (other than the OR/XOR-tree pattern), or the operand size and
///          subtarget features do not match one of the supported combinations.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
// We're looking for an oversized integer equality comparison.
SDValue X = SetCC->getOperand(0);
SDValue Y = SetCC->getOperand(1);
EVT OpVT = X.getValueType();
unsigned OpSize = OpVT.getSizeInBits();
if (!OpVT.isScalarInteger() || OpSize < 128)
return SDValue();
// Ignore a comparison with zero because that gets special treatment in
// EmitTest(). But make an exception for the special case of a pair of
// logically-combined vector-sized operands compared to zero. This pattern may
// be generated by the memcmp expansion pass with oversized integer compares
// (see PR33325).
bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
return SDValue();
// Don't perform this combine if constructing the vector will be expensive.
// Considered cheap: after looking through bitcasts, the value is a constant,
// already has a vector type, or is a load.
auto IsVectorBitCastCheap = [](SDValue X) {
X = peekThroughBitcasts(X);
return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
X.getOpcode() == ISD::LOAD;
// NOTE(review): the rest of this condition looks cut off in this listing --
// confirm against upstream.
if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
return SDValue();
// Result type of the original setcc.
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
// Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
// Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
(OpSize == 256 && Subtarget.hasAVX()) ||
(OpSize == 512 && Subtarget.useAVX512Regs())) {
bool HasPT = Subtarget.hasSSE41();
// PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
// vector registers are essentially free. (Technically, widening registers
// prevents load folding, but the tradeoff is worth it.)
bool PreferKOT = Subtarget.preferMaskRegisters();
// Widening to 512 bits is needed when mask registers are preferred but VLX is
// unavailable and the operand is not already 512 bits.
bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
// VecVT: vector type the compare is performed in.
// CmpVT: result type of the compare (a vXi1 mask type when it differs).
EVT VecVT = MVT::v16i8;
EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
if (OpSize == 256) {
VecVT = MVT::v32i8;
CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
// CastVT: type the scalar operand is bitcast to before any widening.
EVT CastVT = VecVT;
bool NeedsAVX512FCast = false;
if (OpSize == 512 || NeedZExt) {
if (Subtarget.hasBWI()) {
VecVT = MVT::v64i8;
CmpVT = MVT::v64i1;
if (OpSize == 512)
CastVT = VecVT;
} else {
// Without BWI the compare is done on 32-bit elements.
VecVT = MVT::v16i32;
CmpVT = MVT::v16i1;
CastVT = OpSize == 512 ? VecVT :
OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
NeedsAVX512FCast = true;
// Converts a scalar operand to a vector: looks through a zero-extend from a
// 128- or 256-bit value, bitcasts, and widens when required.
auto ScalarToVector = [&](SDValue X) -> SDValue {
bool TmpZext = false;
EVT TmpCastVT = CastVT;
if (X.getOpcode() == ISD::ZERO_EXTEND) {
SDValue OrigX = X.getOperand(0);
unsigned OrigSize = OrigX.getScalarValueSizeInBits();
if (OrigSize < OpSize) {
if (OrigSize == 128) {
TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
X = OrigX;
TmpZext = true;
} else if (OrigSize == 256) {
TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
X = OrigX;
TmpZext = true;
X = DAG.getBitcast(TmpCastVT, X);
if (!NeedZExt && !TmpZext)
return X;
// NOTE(review): the start of this statement looks cut off in this listing.
// The visible arguments (a zero constant of VecVT, X, index 0) suggest X is
// placed into a zeroed VecVT value -- confirm against upstream.
DAG.getConstant(0, DL, VecVT), X,
DAG.getVectorIdxConstant(0, DL));
SDValue Cmp;
if (IsOrXorXorTreeCCZero) {
// This is a bitwise-combined equality comparison of 2 pairs of vectors:
// setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
// Use 2 vector equality compares and 'and' the results before testing the
// combined result below.
Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
} else {
// Plain X == Y / X != Y: same three strategies as emitOrXorXorTree's leaves.
SDValue VecX = ScalarToVector(X);
SDValue VecY = ScalarToVector(Y);
if (VecVT != CmpVT) {
Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
} else if (HasPT) {
Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
} else {
Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
// AVX512 should emit a setcc that will lower to kortest.
if (VecVT != CmpVT) {
// Bitcast the vXi1 mask to a scalar integer of the same width and compare it
// against zero using the original condition code.
EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
DAG.getConstant(0, DL, KRegVT), CC);
if (HasPT) {
// NOTE(review): the argument list of this getBitcast call looks cut off in
// this listing -- confirm against upstream.
SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
assert(Cmp.getValueType() == MVT::v16i8 &&
"Non 128-bit vector on pre-SSE41 target");
SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
return SDValue();
/// Target combines for the setcc node \p N (operands: LHS, RHS, condition
/// code). Returns the replacement value or an empty SDValue.
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
const SDValue LHS = N->getOperand(0);
const SDValue RHS = N->getOperand(1);
// VT is the result type, OpVT the type of the compared operands.
EVT VT = N->getValueType(0);
EVT OpVT = LHS.getValueType();
SDLoc DL(N);
if (CC == ISD::SETNE || CC == ISD::SETEQ) {
// Oversized scalar integer equality -> vector compare.
if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
return V;
// i1 equality against zero: try to match a vector all-zero test and emit an
// X86ISD::SETCC on the flags it produces.
if (VT == MVT::i1 && isNullConstant(RHS)) {
SDValue X86CC;
if (SDValue V =
MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
return DAG.getNode(ISD::TRUNCATE, DL, VT,
DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
// vXi1-result compare of sext(vXi1 X) against an all-zeros build_vector.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
// Using temporaries to avoid messing up operand ordering for later
// transformations if this doesn't work.
SDValue Op0 = LHS;
SDValue Op1 = RHS;
ISD::CondCode TmpCC = CC;
// Put build_vector on the right.
if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
std::swap(Op0, Op1);
TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
bool IsSEXT0 =
(Op0.getOpcode() == ISD::SIGN_EXTEND) &&
(Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
if (IsSEXT0 && IsVZero1) {
assert(VT == Op0.getOperand(0).getValueType() &&
"Unexpected operand type");
// Result per condition code, with X = Op0.getOperand(0):
//   SETGT -> constant 0, SETLE -> constant 1,
//   SETEQ / SETGE -> NOT X, SETNE / SETLT -> X.
if (TmpCC == ISD::SETGT)
return DAG.getConstant(0, DL, VT);
if (TmpCC == ISD::SETLE)
return DAG.getConstant(1, DL, VT);
if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
return DAG.getNOT(DL, Op0.getOperand(0), VT);
assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
"Unexpected condition code!");
return Op0.getOperand(0);
// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
// pre-promote its result type since vXi1 vectors don't get promoted
// during type legalization.
// NOTE: The element count check is to ignore operand types that need to
// go through type promotion to a 128-bit vector.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
VT.getVectorElementType() == MVT::i1 &&
(OpVT.getVectorElementType() == MVT::i8 ||
OpVT.getVectorElementType() == MVT::i16)) {
// Compare in the operand type, then truncate the result to the vXi1 type.
SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
// For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
// to avoid scalarization via legalization because v4i32 is not a legal type.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
LHS.getValueType() == MVT::v4f32)
return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
return SDValue();
/// Combines for an X86ISD::MOVMSK node \p N: constant folding, looking through
/// same-element-width bitcasts, hoisting a NOT out of the source, and
/// demanded-bits simplification. Returns the replacement or an empty SDValue.
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Src = N->getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = N->getSimpleValueType(0);
// NumBits: width of the scalar result; NumElts: number of source elements.
unsigned NumBits = VT.getScalarSizeInBits();
unsigned NumElts = SrcVT.getVectorNumElements();
// Perform constant folding.
if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
assert(VT == MVT::i32 && "Unexpected result type");
APInt Imm(32, 0);
for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
// NOTE(review): the rest of this condition and the loop body look cut off in
// this listing -- confirm against upstream.
if (!Src.getOperand(Idx).isUndef() &&
return DAG.getConstant(Imm, SDLoc(N), VT);
// Look through int->fp bitcasts that don't change the element width.
unsigned EltWidth = SrcVT.getScalarSizeInBits();
if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
// Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results
// with scalar comparisons.
if (SDValue NotSrc = IsNOT(Src, DAG)) {
SDLoc DL(N);
// The XOR mask has only the low NumElts bits set, so the inversion is limited
// to those bits.
APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
NotSrc = DAG.getBitcast(SrcVT, NotSrc);
return DAG.getNode(ISD::XOR, DL, VT,
DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
DAG.getConstant(NotMask, DL, VT));
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getAllOnesValue(NumBits));
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
return SDValue();
/// Simplifies the mask operand of an X86 masked gather/scatter node \p N.
/// Returns N itself when the mask was simplified, otherwise an empty SDValue.
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// With vector masks we only demand the upper bit of the mask.
SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
// Masks with 1-bit elements are left alone.
if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
// NOTE(review): the statement guarded by this check may be missing from this
// listing; as shown, it guards the return -- confirm against upstream.
if (N->getOpcode() != ISD::DELETED_NODE)
return SDValue(N, 0);
return SDValue();
/// Re-creates the masked gather or scatter \p GorS with the given \p Index,
/// \p Base and \p Scale; the chain, pass-through/value, mask, VT list and
/// memory VT are taken from the original node.
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
SDValue Index, SDValue Base, SDValue Scale,
SelectionDAG &DAG) {
SDLoc DL(GorS);
// Gather: operands are chain, pass-through, mask, base, index, scale.
if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
Gather->getMask(), Base, Index, Scale } ;
// NOTE(review): the argument list of this call looks cut off in this listing
// -- confirm against upstream.
return DAG.getMaskedGather(Gather->getVTList(),
Gather->getMemoryVT(), DL, Ops,
// Otherwise the node must be a scatter (cast<> asserts on mismatch):
// operands are chain, stored value, mask, base, index, scale.
auto *Scatter = cast<MaskedScatterSDNode>(GorS);
SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
Scatter->getMask(), Base, Index, Scale };
// NOTE(review): the argument list of this call looks cut off in this listing
// -- confirm against upstream.
return DAG.getMaskedScatter(Scatter->getVTList(),
Scatter->getMemoryVT(), DL,
Ops, Scatter->getMemOperand(),
/// Combines for a generic masked gather/scatter node \p N: narrows or
/// normalizes the index vector's element type (rebuilding the node), and
/// simplifies the mask operand. Returns the replacement or an empty SDValue.
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
auto *GorS = cast<MaskedGatherScatterSDNode>(N);
SDValue Index = GorS->getIndex();
SDValue Base = GorS->getBasePtr();
SDValue Scale = GorS->getScale();
if (DCI.isBeforeLegalize()) {
unsigned IndexWidth = Index.getScalarValueSizeInBits();
// Shrink constant indices if they are larger than 32-bits.
// Only do this before legalize types since v2i64 could become v2i32.
// FIXME: We could check that the type is legal if we're after legalize
// types, but then we would need to construct test cases where that happens.
// FIXME: We could support more than just constant vectors, but we need to be
// careful with costing. A truncate that can be optimized out would be fine.
// Otherwise we might only want to create a truncate if it avoids a split.
if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
// Enough sign bits means every element fits in 32 bits.
if (BV->isConstant() && IndexWidth > 32 &&
DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
unsigned NumElts = Index.getValueType().getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
// Shrink any sign/zero extends from 32 or smaller to larger than 32 if
// there are sufficient sign bits. Only do this before legalize types to
// avoid creating illegal types in truncate.
if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
Index.getOpcode() == ISD::ZERO_EXTEND) &&
IndexWidth > 32 &&
Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
unsigned NumElts = Index.getValueType().getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
if (DCI.isBeforeLegalizeOps()) {
unsigned IndexWidth = Index.getScalarValueSizeInBits();
// Make sure the index is either i32 or i64
if (IndexWidth != 32 && IndexWidth != 64) {
// Widths above 32 go to i64, anything else to i32.
MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
// NOTE(review): the argument list of this getVectorVT call looks cut off in
// this listing -- confirm against upstream.
EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
// With vector masks we only demand the upper bit of the mask.
SDValue Mask = GorS->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
// NOTE(review): the statement guarded by this check may be missing from this
// listing; as shown, it guards the return -- confirm against upstream.
if (N->getOpcode() != ISD::DELETED_NODE)
return SDValue(N, 0);
return SDValue();
/// Combine for an X86ISD::SETCC node \p N (operand 0: condition code constant,
/// operand 1: EFLAGS value). If combineSetCCEFLAGS() yields simplified flags,
/// a new SETCC is built from them; otherwise returns an empty SDValue.
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
// Try to simplify the EFLAGS and condition code operands.
// CC is read again after the call -- presumably combineSetCCEFLAGS may update
// it; verify against its signature.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
return getSETCC(CC, Flags, DL, DAG);
return SDValue();
/// Optimize branch condition evaluation.
///
/// \p N is an X86ISD::BRCOND-style node: operand 2 is the condition code
/// constant and operand 3 the EFLAGS value; operands 0 and 1 are forwarded
/// unchanged to the rebuilt node. Returns an empty SDValue when the flags
/// could not be simplified.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue EFLAGS = N->getOperand(3);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
// Try to simplify the EFLAGS and condition code operands.
// Make sure to not keep references to operands, as combineSetCCEFLAGS can
// RAUW them under us.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
// Operands 0 and 1 are re-read from N here rather than cached earlier.
SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
N->getOperand(1), Cond, Flags);
return SDValue();
// TODO: Could we move this to DAGCombine?
/// Folds UNARYOP(AND(VECTOR_CMP, constant)) into
/// AND(VECTOR_CMP, UNARYOP(constant)), where UNARYOP is \p N's opcode.
/// Returns the replacement (merged with the chain result for strict nodes) or
/// an empty SDValue when the pattern does not match.
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
SelectionDAG &DAG) {
// Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
// to optimize away operation when it's from a constant.
// The general transformation is:
// UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
// AND(VECTOR_CMP(x,y), constant2)
// constant2 = UNARYOP(constant)
// Early exit if this isn't a vector operation, the operand of the
// unary operation isn't a bitwise AND, or if the sizes of the operations
// aren't the same.
EVT VT = N->getValueType(0);
bool IsStrict = N->isStrictFPOpcode();
unsigned NumEltBits = VT.getScalarSizeInBits();
// For strict nodes operand 0 is passed through separately; the value operand
// is operand 1.
SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
// The sign-bit check requires every bit of each lane of the AND's first
// operand to equal the sign bit, i.e. each lane is 0 or -1.
if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
VT.getSizeInBits() != Op0.getValueSizeInBits())
return SDValue();
// Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
// Everything checks out. Build up the new and improved node.
SDLoc DL(N);
EVT IntVT = BV->getValueType(0);
// Create a new constant of the appropriate type for the transformed
// DAG.
SDValue SourceConst;
// NOTE(review): no `else` is visible between the two assignments below; as
// shown, the second one would overwrite the strict form -- confirm against
// upstream.
if (IsStrict)
SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
{N->getOperand(0), SDValue(BV, 0)});
SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
// NOTE(review): the argument list of this getNode call looks cut off in this
// listing -- confirm against upstream.
SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
SDValue Res = DAG.getBitcast(VT, NewAnd);
// Strict nodes also return result 1 of the node that converted the constant.
if (IsStrict)
return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
return Res;
return SDValue();
/// If we are converting a value to floating-point, try to replace scalar
/// truncate of an extracted vector element with a bitcast. This tries to keep
/// the sequence on XMM registers rather than moving between vector and GPRs.
///
/// \returns A node with \p N's opcode applied to an element extracted from the
///          bitcast source vector, or an empty SDValue if the pattern does not
///          match.
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
// TODO: This is currently only used by combineSIntToFP, but it is generalized
// to allow being called by any similar cast opcode.
// TODO: Consider merging this into lowering: vectorizeExtractedCast().
SDValue Trunc = N->getOperand(0);
// The truncate must be single-use.
if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
return SDValue();
SDValue ExtElt = Trunc.getOperand(0);
// NOTE(review): the rest of this condition looks cut off in this listing --
// confirm against upstream.
if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
return SDValue();
EVT TruncVT = Trunc.getValueType();
EVT SrcVT = ExtElt.getValueType();
unsigned DestWidth = TruncVT.getSizeInBits();
unsigned SrcWidth = SrcVT.getSizeInBits();
// The extracted element width must be a whole multiple of the truncated width.
if (SrcWidth % DestWidth != 0)
return SDValue();
// inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
unsigned VecWidth = SrcVecVT.getSizeInBits();
// Reinterpret the source vector as elements of the truncated type.
unsigned NumElts = VecWidth / DestWidth;
EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
SDLoc DL(N);
// The original extract index operand is reused unchanged.
SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
BitcastVec, ExtElt.getOperand(1));
return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
/// Combine for an unsigned int-to-FP node \p N (strict or non-strict):
/// rewrites it as a signed conversion when the input is a vector with elements
/// narrower than 32 bits (after zero-extending to i32 elements) or when the
/// sign bit of the input is known zero. Returns an empty SDValue otherwise.
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsStrict = N->isStrictFPOpcode();
// For strict nodes operand 0 is passed through separately; the integer input
// is operand 1.
SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
// UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
// UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
// UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
// NOTE(review): the argument list of this getVectorVT call looks cut off in
// this listing -- confirm against upstream.
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
// UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
{N->getOperand(0), P});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
// Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
// optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
// the optimization here.
if (DAG.SignBitIsZero(Op0)) {
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
{N->getOperand(0), Op0});
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
return SDValue();
/// Combine for a signed int-to-FP node \p N (strict or non-strict). Tries, in
/// order: folding through a masked vector compare, sign-extending narrow
/// vector inputs to i32 elements, truncating wide inputs whose upper bits are
/// sign bits, converting an i64 load with an x87-based sequence on 32-bit
/// targets, and (non-strict only) combineToFPTruncExtElt().
/// Returns an empty SDValue if nothing applies.
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
bool IsStrict = N->isStrictFPOpcode();
if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
return Res;
// Now move on to more general possibilities.
SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
// SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
// SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
// SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
SDLoc dl(N);
// NOTE(review): the argument list of this getVectorVT call looks cut off in
// this listing -- confirm against upstream.
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
{N->getOperand(0), P});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
// Without AVX512DQ we only support i64 to float scalar conversion. For both
// vectors and scalars, see if we know that the upper bits are all the sign
// bit, in which case we can truncate the input to i32 and convert from that.
if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
unsigned BitWidth = InVT.getScalarSizeInBits();
unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
// At least BitWidth - 31 sign bits means the value fits in a signed i32.
if (NumSignBits >= (BitWidth - 31)) {
EVT TruncVT = MVT::i32;
// NOTE(review): the argument list of this getVectorVT call looks cut off in
// this listing -- confirm against upstream.
if (InVT.isVector())
TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
SDLoc dl(N);
if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
if (IsStrict)
return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
{N->getOperand(0), Trunc});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
// If we're after legalize and the type is v2i32 we need to shuffle and
// use CVTSI2P.
assert(InVT == MVT::v2i64 && "Unexpected VT!");
SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
// Select v4i32 elements 0 and 2; the upper two result lanes are left undefined
// (-1 in the shuffle mask).
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
{ 0, 2, -1, -1 });
if (IsStrict)
return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
{N->getOperand(0), Shuf});
return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
// a 32-bit target where SSE doesn't support i64->FP operations.
if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
Op0.getOpcode() == ISD::LOAD) {
LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
// This transformation is not supported if the result type is f16 or f128.
if (VT == MVT::f16 || VT == MVT::f128)
return SDValue();
// If we have AVX512DQ we can use packed conversion instructions unless
// the VT is f80.
if (Subtarget.hasDQI() && VT != MVT::f80)
return SDValue();
// Requires a simple, normal, single-use i64 scalar load on a non-64-bit target.
if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
// NOTE(review): the callee of this initializer looks cut off in this listing;
// only its arguments are visible -- confirm against upstream.
std::pair<SDValue, SDValue> Tmp =
VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
// Redirect users of the load's result 1 to Tmp.second.
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
return Tmp.first;
// The remaining fold is not applied to strict nodes.
if (IsStrict)
return SDValue();
if (SDValue V = combineToFPTruncExtElt(N, DAG))
return V;
return SDValue();
/// Returns true if some user of the i32 flags value \p Flags may depend on the
/// carry or overflow flag (judging by the function name and the condition
/// codes tested below); returns false when no user does.
static bool needCarryOrOverflowFlag(SDValue Flags) {
assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
// Inspect every user of the flags value.
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
X86::CondCode CC;
// Fetch the user's condition code; the operand index depends on the opcode.
// NOTE(review): the `default:` label and the per-case `break;` statements are
// not visible in this listing -- confirm against upstream.
switch (User->getOpcode()) {
// Be conservative.
return true;
case X86ISD::SETCC:
CC = (X86::CondCode)User->getConstantOperandVal(0);
case X86ISD::BRCOND:
CC = (X86::CondCode)User->getConstantOperandVal(2);
case X86ISD::CMOV:
CC = (X86::CondCode)User->getConstantOperandVal(2);
// Condition codes for which this function reports a dependency.
switch (CC) {
default: break;
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
case X86::COND_O: case X86::COND_NO:
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
return true;
return false;
/// Returns true if every user of the i32 flags value \p Flags is a SETCC,
/// SETCC_CARRY, BRCOND or CMOV whose condition code is COND_E or COND_NE.
static bool onlyZeroFlagUsed(SDValue Flags) {
assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
// Index of the condition-code operand for this kind of user.
unsigned CCOpNo;
// NOTE(review): the `default:` label is not visible in this listing --
// confirm against upstream.
switch (User->getOpcode()) {
// Be conservative.
return false;
case X86ISD::SETCC: CCOpNo = 0; break;
case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
case X86ISD::BRCOND: CCOpNo = 2; break;
case X86ISD::CMOV: CCOpNo = 2; break;
X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
// Any condition other than equal / not-equal disqualifies the flags value.
if (CC != X86::COND_E && CC != X86::COND_NE)
return false;
return true;
/// Combine for an X86ISD::CMP-style node \p N that compares against zero.
/// Either turns a constant logical shift into an equivalent AND mask, or
/// narrows a truncated binary op so that its own flags result can be used.
/// Returns the new flags-producing value, or an empty SDValue.
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
// Only handle test patterns.
if (!isNullConstant(N->getOperand(1)))
return SDValue();
// If we have a CMP of a truncated binop, see if we can make a smaller binop
// and use its flags directly.
// TODO: Maybe we should try promoting compares that only use the zero flag
// first if we can prove the upper bits with computeKnownBits?
SDLoc dl(N);
SDValue Op = N->getOperand(0);
EVT VT = Op.getValueType();
// If we have a constant logical shift that's only used in a comparison
// against zero turn it into an equivalent AND. This allows turning it into
// a TEST instruction later.
if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
onlyZeroFlagUsed(SDValue(N, 0))) {
unsigned BitWidth = VT.getSizeInBits();
const APInt &ShAmt = Op.getConstantOperandAPInt(1);
if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
// The shifted value is zero iff the bits that survive the shift are zero:
// the high (BitWidth - ShAmt) bits for SRL, the low ones for SHL.
unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
APInt Mask = Op.getOpcode() == ISD::SRL
? APInt::getHighBitsSet(BitWidth, MaskBits)
: APInt::getLowBitsSet(BitWidth, MaskBits);
// Only used when the mask fits in a signed 32-bit value.
if (Mask.isSignedIntN(32)) {
Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
DAG.getConstant(Mask, dl, VT));
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, VT));
// Look for a truncate with a single use.
if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
return SDValue();
// From here on Op is the wide value feeding the truncate; VT is still the
// narrow (truncated) type.
Op = Op.getOperand(0);
// Arithmetic op can only have one use.
if (!Op.hasOneUse())
return SDValue();
unsigned NewOpc;
// NOTE(review): the `break;` statements after the AND, ADD and SUB cases are
// not visible in this listing -- confirm against upstream.
switch (Op.getOpcode()) {
default: return SDValue();
case ISD::AND:
// Skip and with constant. We have special handling for and with immediate
// during isel to generate test instructions.
if (isa<ConstantSDNode>(Op.getOperand(1)))
return SDValue();
NewOpc = X86ISD::AND;
case ISD::OR: NewOpc = X86ISD::OR; break;
case ISD::XOR: NewOpc = X86ISD::XOR; break;
case ISD::ADD:
// If the carry or overflow flag is used, we can't truncate.
if (needCarryOrOverflowFlag(SDValue(N, 0)))
return SDValue();
NewOpc = X86ISD::ADD;
case ISD::SUB:
// If the carry or overflow flag is used, we can't truncate.
if (needCarryOrOverflowFlag(SDValue(N, 0)))
return SDValue();
NewOpc = X86ISD::SUB;
// We found an op we can narrow. Truncate its inputs.
SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
// Use a X86 specific opcode to avoid DAG combine messing with it.
// The node produces two results: the VT value and an i32 flags value.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
// For AND, keep a CMP so that we can match the test pattern.
if (NewOpc == X86ISD::AND)
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, VT));
// Return the flags.
return Op.getValue(1);
/// Combine an X86ISD::ADD or X86ISD::SUB node.
/// If the flag result (value 1) has no uses, the node is rebuilt as a generic
/// ISD::ADD/ISD::SUB. Otherwise any existing generic ADD/SUB on the same
/// operands is redirected to this node's value result via DCI.CombineTo.
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
"Expected X86ISD::ADD or X86ISD::SUB");
SDLoc DL(N);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
MVT VT = LHS.getSimpleValueType();
// Generic opcode that corresponds to this X86-specific node.
unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
// If we don't use the flag result, simplify back to a generic ADD/SUB.
if (!N->hasAnyUseOfValue(1)) {
SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
// The unused flag result is replaced by a constant zero of type i32.
return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
// Fold any similar generic ADD/SUB opcodes to reuse this node.
// When Negate is set, the generic node is replaced by (0 - result of N).
auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
SDValue Ops[] = {N0, N1};
SDVTList VTs = DAG.getVTList(N->getValueType(0));
// Only look up an already existing node; no new generic node is created.
if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
SDValue Op(N, 0);
if (Negate)
Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
DCI.CombineTo(GenericAddSub, Op);
MatchGeneric(LHS, RHS, false);
// With swapped operands a SUB yields the negated value, an ADD the same one.
MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
return SDValue();
/// Combine an X86ISD::SBB node. First try to simplify the carry input
/// (operand 2) with combineCarryThroughADD, then try to fold a SUB that feeds
/// operand 0 into the SBB itself. Returns an empty SDValue if nothing applies.
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
// Result types of the rebuilt node: the value and the i32 flags.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
N->getOperand(0), N->getOperand(1),
// Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
// iff the flag result is dead.
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
Op0.getOperand(1), N->getOperand(2));
return SDValue();
/// Combine an X86ISD::ADC node. An ADC of two zero operands whose flag result
/// is unused is strength-reduced to a value derived from the carry alone;
/// otherwise the carry input (operand 2) is simplified with
/// combineCarryThroughADD when possible. Returns an empty SDValue if neither
/// applies.
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// If the LHS and RHS of the ADC node are zero, then it can't overflow and
// the result is either zero or one (depending on the input carry bit).
// Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
if (X86::isZeroNode(N->getOperand(0)) &&
X86::isZeroNode(N->getOperand(1)) &&
// We don't have a good way to replace an EFLAGS use, so only do this when
// dead right now.
SDValue(N, 1).use_empty()) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
// The (unused) flag result is replaced by a constant zero.
SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
SDValue Res1 =
DAG.getNode(ISD::AND, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
DAG.getConstant(1, DL, VT));
return DCI.CombineTo(N, Res1, CarryOut);
if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
MVT VT = N->getSimpleValueType(0);
// Result types of the rebuilt node: the value and the i32 flags.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
N->getOperand(0), N->getOperand(1),
return SDValue();
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
/// N is treated as a subtract when its opcode is ISD::SUB and as an add
/// otherwise. Returns an empty SDValue when no single-use X86ISD::SETCC
/// operand with a handled condition code is found.
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
bool IsSub = N->getOpcode() == ISD::SUB;
SDValue X = N->getOperand(0);
SDValue Y = N->getOperand(1);
// If this is an add, canonicalize a zext operand to the RHS.
// TODO: Incomplete? What if both sides are zexts?
if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
Y.getOpcode() != ISD::ZERO_EXTEND)
std::swap(X, Y);
// Look through a one-use zext.
bool PeekedThroughZext = false;
if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
Y = Y.getOperand(0);
PeekedThroughZext = true;
// If this is an add, canonicalize a setcc operand to the RHS.
// TODO: Incomplete? What if both sides are setcc?
// TODO: Should we allow peeking through a zext of the other operand?
if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
Y.getOpcode() != X86ISD::SETCC)
std::swap(X, Y);
// From here on Y must be the single-use SETCC and X the other operand.
if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
// Operand 0 of the SETCC is read as the condition code; operand 1 is used
// below as the flags it tests.
X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
// If X is -1 or 0, then we have an opportunity to avoid constants required in
// the general case below.
auto *ConstantX = dyn_cast<ConstantSDNode>(X);
if (ConstantX) {
if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
(IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
// This is a complicated way to get -1 or 0 from the carry flag:
// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
(IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
SDValue EFLAGS = Y->getOperand(1);
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
// Swap the operands of a SUB, and we have the same pattern as above.
// -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
// 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
SDValue NewSub = DAG.getNode(
X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
// Pick the same result number of the new SUB that EFLAGS referred to.
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
if (CC == X86::COND_B) {
// X + SETB Z --> adc X, 0
// X - SETB Z --> sbb X, 0
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(0, DL, VT), Y.getOperand(1));
if (CC == X86::COND_A) {
SDValue EFLAGS = Y.getOperand(1);
// Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
// Do not flip "e > c", where "c" is a constant, because Cmp instruction
// cannot take an immediate as its first operand.
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(0, DL, VT), NewEFLAGS);
if (CC == X86::COND_AE) {
// X + SETAE --> sbb X, -1
// X - SETAE --> adc X, -1
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(-1, DL, VT), Y.getOperand(1));
if (CC == X86::COND_BE) {
// X + SETBE --> sbb X, -1
// X - SETBE --> adc X, -1
SDValue EFLAGS = Y.getOperand(1);
// Try to convert COND_BE into COND_AE in an attempt to facilitate
// materializing "setae reg".
// Do not flip "e <= c", where "c" is a constant, because Cmp instruction
// cannot take an immediate as its first operand.
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
SDValue NewSub = DAG.getNode(
X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(-1, DL, VT), NewEFLAGS);
// Only equality / inequality conditions are handled past this point.
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
// The flags must come from a single-use X86ISD::CMP against zero.
SDValue Cmp = Y.getOperand(1);
if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
!X86::isZeroNode(Cmp.getOperand(1)) ||
return SDValue();
// Z is the value being tested against zero.
SDValue Z = Cmp.getOperand(0);
EVT ZVT = Z.getValueType();
// If X is -1 or 0, then we have an opportunity to avoid constants required in
// the general case below.
if (ConstantX) {
// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
// fake operands:
// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
SDValue Zero = DAG.getConstant(0, DL, ZVT);
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
SDValue(Neg.getNode(), 1));
// cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
// with fake operands:
// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
SDValue One = DAG.getConstant(1, DL, ZVT);
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
// General case: X is not one of the special constants handled above.
// (cmp Z, 1) sets the carry flag if Z is 0.
SDValue One = DAG.getConstant(1, DL, ZVT);
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
// Add the flags type for ADC/SBB nodes.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
/// Try to match an add of two BUILD_VECTORs (Op0 and Op1), whose elements are
/// extracted from the even and odd lanes of one wide MUL, and replace it with
/// X86ISD::VPMADDWD applied to the i16-truncated MUL operands.
/// Requires SSE2 and a vXi32 result type with at least 4 elements.
/// Returns an empty SDValue when the pattern is not matched.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
// Example of pattern we try to detect:
// t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
//(add (build_vector (extract_elt t, 0),
// (extract_elt t, 2),
// (extract_elt t, 4),
// (extract_elt t, 6)),
// (build_vector (extract_elt t, 1),
// (extract_elt t, 3),
// (extract_elt t, 5),
// (extract_elt t, 7)))
if (!Subtarget.hasSSE2())
return SDValue();
if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
Op1.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
VT.getVectorNumElements() < 4 ||
return SDValue();
// Check if one of Op0,Op1 is of the form:
// (build_vector (extract_elt Mul, 0),
// (extract_elt Mul, 2),
// (extract_elt Mul, 4),
// ...
// the other is of the form:
// (build_vector (extract_elt Mul, 1),
// (extract_elt Mul, 3),
// (extract_elt Mul, 5),
// ...
// and identify Mul.
SDValue Mul;
// Elements are examined two at a time (i and i + 1) from both build_vectors.
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
// TODO: Be more tolerant to undefs.
if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// All extract indices must be compile-time constants.
auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
if (!Const0L || !Const1L || !Const0H || !Const1H)
return SDValue();
unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
// Commutativity of mul allows factors of a product to reorder.
if (Idx0L > Idx1L)
std::swap(Idx0L, Idx1L);
if (Idx0H > Idx1H)
std::swap(Idx0H, Idx1H);
// Commutativity of add allows pairs of factors to reorder.
if (Idx0L > Idx0H) {
std::swap(Idx0L, Idx0H);
std::swap(Idx1L, Idx1H);
// After canonicalization the four indices must be 2i, 2i+1, 2i+2, 2i+3.
if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
Idx1H != 2 * i + 3)
return SDValue();
if (!Mul) {
// First time an extract_elt's source vector is visited. Must be a MUL
// with 2X number of vector elements than the BUILD_VECTOR.
// Both extracts must be from same MUL.
Mul = Op0L->getOperand(0);
if (Mul->getOpcode() != ISD::MUL ||
Mul.getValueType().getVectorNumElements() != 2 * e)
return SDValue();
// Check that the extract is from the same MUL previously seen.
if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
return SDValue();
// Check if the Mul source can be safely shrunk.
// ShrinkMode::MULU16 is rejected.
ShrinkMode Mode;
if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
Mode == ShrinkMode::MULU16)
return SDValue();
// Truncate both MUL operands to i16 vectors with twice as many elements as
// the result type.
EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
VT.getVectorNumElements() * 2);
SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
// Builds one VPMADDWD; its i32 result has half as many elements as its
// inputs.
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
EVT InVT = Ops[0].getValueType();
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
// Attempt to turn this pattern into PMADDWD.
// (add (mul (sext (build_vector)), (sext (build_vector))),
// (mul (sext (build_vector)), (sext (build_vector)))
// N0 and N1 are the two operands of the add. Requires SSE2 and a vXi32
// result type with at least 4 elements. Returns an empty SDValue when the
// pattern is not matched.
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasSSE2())
return SDValue();
if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
return SDValue();
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
VT.getVectorNumElements() < 4 ||
return SDValue();
// The four multiplication operands: N0 = N00 * N01, N1 = N10 * N11.
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
SDValue N10 = N1.getOperand(0);
SDValue N11 = N1.getOperand(1);
// All inputs need to be sign extends.
// TODO: Support ZERO_EXTEND from known positive?
if (N00.getOpcode() != ISD::SIGN_EXTEND ||
N01.getOpcode() != ISD::SIGN_EXTEND ||
N10.getOpcode() != ISD::SIGN_EXTEND ||
N11.getOpcode() != ISD::SIGN_EXTEND)
return SDValue();
// Peek through the extends.
N00 = N00.getOperand(0);
N01 = N01.getOperand(0);
N10 = N10.getOperand(0);
N11 = N11.getOperand(0);
// Must be extending from vXi16.
EVT InVT = N00.getValueType();
if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
N10.getValueType() != InVT || N11.getValueType() != InVT)
return SDValue();
// All inputs should be build_vectors.
if (N00.getOpcode() != ISD::BUILD_VECTOR ||
N01.getOpcode() != ISD::BUILD_VECTOR ||
N10.getOpcode() != ISD::BUILD_VECTOR ||
N11.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
// For each element, we need to ensure we have an odd element from one vector
// multiplied by the odd element of another vector and the even element from
// one of the same vectors being multiplied by the even element from the
// other vector. So we need to make sure for each element i, this operator
// is being performed:
// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
// In0 and In1 end up holding the two source vectors A and B.
SDValue In0, In1;
for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
SDValue N00Elt = N00.getOperand(i);
SDValue N01Elt = N01.getOperand(i);
SDValue N10Elt = N10.getOperand(i);
SDValue N11Elt = N11.getOperand(i);
// TODO: Be more tolerant to undefs.
if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// All extract indices must be compile-time constants.
auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
return SDValue();
unsigned IdxN00 = ConstN00Elt->getZExtValue();
unsigned IdxN01 = ConstN01Elt->getZExtValue();
unsigned IdxN10 = ConstN10Elt->getZExtValue();
unsigned IdxN11 = ConstN11Elt->getZExtValue();
// Add is commutative so indices can be reordered.
if (IdxN00 > IdxN10) {
std::swap(IdxN00, IdxN10);
std::swap(IdxN01, IdxN11);
// N0 indices must be the even element. N1 indices must be the next odd
// element.
if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
return SDValue();
// Source vectors the four elements were extracted from.
SDValue N00In = N00Elt.getOperand(0);
SDValue N01In = N01Elt.getOperand(0);
SDValue N10In = N10Elt.getOperand(0);
SDValue N11In = N11Elt.getOperand(0);
// First time we find an input capture it.
if (!In0) {
In0 = N00In;
In1 = N01In;
// Mul is commutative so the input vectors can be in any order.
// Canonicalize to make the compares easier.
if (In0 != N00In)
std::swap(N00In, N01In);
if (In0 != N10In)
std::swap(N10In, N11In);
// Every element must come from the same pair of source vectors.
if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
return SDValue();
// Builds one VPMADDWD; its i32 result has half as many elements as its
// i16 inputs.
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
EVT OpVT = Ops[0].getValueType();
assert(OpVT.getScalarType() == MVT::i16 &&
"Unexpected scalar element type");
assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
OpVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
/// Try to turn an ISD::ADD or ISD::SUB of v8i16/v4i32/v16i16/v8i32 into
/// X86ISD::HADD or X86ISD::HSUB when SSSE3 is available and
/// isHorizontalBinOp accepts the operands. Returns an empty SDValue
/// otherwise.
/// NOTE(review): this span is shown as a diff hunk; lines prefixed with '+'
/// and '-' are the added and removed versions of the same statements.
static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
bool IsAdd = N->getOpcode() == ISD::ADD;
assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
+ SmallVector<int, 8> PostShuffleMask;
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
Subtarget.hasSSSE3() &&
- isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) {
+ isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) {
auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB,
- DL, Ops[0].getValueType(), Ops);
+ return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
+ Ops[0].getValueType(), Ops);
- return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
- HOpBuilder);
+ SDValue HorizBinOp =
+ SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
+ if (!PostShuffleMask.empty())
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+ DAG.getUNDEF(VT), PostShuffleMask);
+ return HorizBinOp;
return SDValue();
/// Combine an ISD::ADD node. The rewrites are tried in this order: the two
/// PMADDWD matchers, horizontal-add synthesis, the (add (zext vXi1), Y) to
/// (sub Y, (sext vXi1)) fold for vector types, and finally
/// combineAddOrSubToADCOrSBB.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
return MAdd;
if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
return MAdd;
// Try to synthesize horizontal adds from adds of shuffles.
if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget))
return V;
// If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
// (sub Y, (sext (vXi1 X))).
// FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
// generic DAG combine without a legal type check, but adding this there
// caused regressions.
if (VT.isVector()) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Case 1: the zero-extended i1 vector is the first operand.
if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
SDLoc DL(N);
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
// Case 2: the zero-extended i1 vector is the second operand.
if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
SDLoc DL(N);
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
return combineAddOrSubToADCOrSBB(N, DAG);
/// Try to turn a vector subtract of the form umax(a,b) - b or a - umin(a,b)
/// (optionally with a truncated UMIN) into an unsigned saturating subtract
/// (ISD::USUBSAT). For v8i32/v8i64/v16i32 the operation is performed on a
/// narrower type and the result is extended back. Returns an empty SDValue
/// when the pattern or the subtarget requirements are not met.
static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
if (!VT.isVector())
return SDValue();
// PSUBUS is supported, starting from SSE2, but truncation for v8i32
// is only worth it with SSSE3 (PSHUFB).
EVT EltVT = VT.getVectorElementType();
if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16)) &&
!(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
!(Subtarget.useBWIRegs() && (VT == MVT::v16i32)))
return SDValue();
// Operands of the saturating subtract that will be created.
SDValue SubusLHS, SubusRHS;
// Try to find umax(a,b) - b or a - umin(a,b) patterns
// they may be converted to subus(a,b).
// TODO: Need to add IR canonicalization for this code.
if (Op0.getOpcode() == ISD::UMAX) {
SubusRHS = Op1;
SDValue MaxLHS = Op0.getOperand(0);
SDValue MaxRHS = Op0.getOperand(1);
// The UMAX operand that is not Op1 becomes the LHS.
if (MaxLHS == Op1)
SubusLHS = MaxRHS;
else if (MaxRHS == Op1)
SubusLHS = MaxLHS;
return SDValue();
} else if (Op1.getOpcode() == ISD::UMIN) {
SubusLHS = Op0;
SDValue MinLHS = Op1.getOperand(0);
SDValue MinRHS = Op1.getOperand(1);
// The UMIN operand that is not Op0 becomes the RHS.
if (MinLHS == Op0)
SubusRHS = MinRHS;
else if (MinRHS == Op0)
SubusRHS = MinLHS;
return SDValue();
} else if (Op1.getOpcode() == ISD::TRUNCATE &&
Op1.getOperand(0).getOpcode() == ISD::UMIN &&
(EltVT == MVT::i8 || EltVT == MVT::i16)) {
// Special case where the UMIN has been truncated. Try to push the truncate
// further up. This is similar to the i32/i64 special processing.
SubusLHS = Op0;
SDValue MinLHS = Op1.getOperand(0).getOperand(0);
SDValue MinRHS = Op1.getOperand(0).getOperand(1);
// Type of the UMIN before truncation.
EVT TruncVT = Op1.getOperand(0).getValueType();
if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 ||
TruncVT == MVT::v8i64)) &&
!(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32)))
return SDValue();
// One UMIN operand must be a zero-extension of Op0; the other one is the
// value to saturate.
SDValue OpToSaturate;
if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
MinLHS.getOperand(0) == Op0)
OpToSaturate = MinRHS;
else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
MinRHS.getOperand(0) == Op0)
OpToSaturate = MinLHS;
return SDValue();
// Saturate the non-extended input and then truncate it.
SDLoc DL(N);
SDValue SaturationConst =
DL, TruncVT);
SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate,
SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
} else
return SDValue();
// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
// special preprocessing in some cases.
if (EltVT == MVT::i8 || EltVT == MVT::i16)
return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS);
assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) &&
"Unexpected VT!");
// Special preprocessing case can be only applied
// if the value was zero extended from 16 bit,
// so we require first 16 bits to be zeros for 32 bit
// values, or first 48 bits for 64 bit values.
KnownBits Known = DAG.computeKnownBits(SubusLHS);
unsigned NumZeros = Known.countMinLeadingZeros();
if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
return SDValue();
EVT ExtType = SubusLHS.getValueType();
// Narrow type in which the saturating subtract is actually performed.
EVT ShrinkedType;
if (VT == MVT::v8i32 || VT == MVT::v8i64)
ShrinkedType = MVT::v8i16;
ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
// If SubusLHS is zeroextended - truncate SubusRHS to its
// size SubusRHS = umin(0xFFF.., SubusRHS).
SDValue SaturationConst =
SDLoc(SubusLHS), ExtType);
SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
SDValue NewSubusLHS =
DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType,
NewSubusLHS, NewSubusRHS);
// Zero extend the result, it may be used somewhere as 32 bit,
// if not zext and following trunc will shrink.
return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
/// Combine an ISD::SUB node. The rewrites are tried in this order: folding a
/// constant LHS into a single-use XOR-with-constant RHS, horizontal-sub
/// synthesis, PSUBUS formation, and finally combineAddOrSubToADCOrSBB.
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
// X86 can't encode an immediate LHS of a sub. See if we can push the
// negation into a preceding instruction.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
// If the RHS of the sub is a XOR with one use and a constant, invert the
// immediate. Then add one to the LHS of the sub so we can turn
// X-Y -> X+~Y+1, saving one register.
if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
isa<ConstantSDNode>(Op1.getOperand(1))) {
const APInt &XorC = Op1.getConstantOperandAPInt(1);
EVT VT = Op0.getValueType();
// New XOR uses the bitwise complement of the original XOR constant.
SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
DAG.getConstant(~XorC, SDLoc(Op1), VT));
// The result is NewXor + (C + 1).
return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
// Try to synthesize horizontal subs from subs of shuffles.
if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget))
return V;
// Try to create PSUBUS if SUB's argument is max/min
if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
return V;
return combineAddOrSubToADCOrSBB(N, DAG);
/// Fold a vector compare whose two operands are the same value:
/// X86ISD::PCMPEQ becomes the constant -1 and X86ISD::PCMPGT becomes the
/// constant 0. Returns an empty SDValue in every other case.
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
// Both operands are the same node/result.
if (N->getOperand(0) == N->getOperand(1)) {
if (N->getOpcode() == X86ISD::PCMPEQ)
return DAG.getConstant(-1, DL, VT);
if (N->getOpcode() == X86ISD::PCMPGT)
return DAG.getConstant(0, DL, VT);
return SDValue();
/// Helper that combines an array of subvector ops as if they were the operands
/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
/// VT is the type of the concatenated result. Asserts that the subtarget has
/// AVX. Returns an empty SDValue when no combine applies.
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ArrayRef<SDValue> Ops, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
unsigned EltSizeInBits = VT.getScalarSizeInBits();
// Concatenating only undefs gives undef.
if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
return DAG.getUNDEF(VT);
// Concatenating only all-zero build_vectors gives a zero vector.
if (llvm::all_of(Ops, [](SDValue Op) {
return ISD::isBuildVectorAllZeros(Op.getNode());
return getZeroVector(VT, Subtarget, DAG, DL);
SDValue Op0 = Ops[0];
// True when every operand is the same value as the first one.
bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
// Fold subvector loads into one.
// If needed, look through bitcasts to get to the load.
if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
bool Fast;
const X86TargetLowering *TLI = Subtarget.getTargetLowering();
// Only merge when a VT-sized access through the first load's memory
// operand is both allowed and reported as fast.
if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
*FirstLd->getMemOperand(), &Fast) &&
Fast) {
if (SDValue Ld =
EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
return Ld;
// Repeated subvectors.
if (IsSplat) {
// If this broadcast/subv_broadcast is inserted into both halves, use a
// larger broadcast/subv_broadcast.
if (Op0.getOpcode() == X86ISD::VBROADCAST ||
Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
// If this broadcast_load is inserted into both halves, use a larger
// broadcast_load. Update other uses to use an extracted subvector.
if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
// The new node produces the wide value and a chain.
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
SDValue BcastLd = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
// Users of the old load's chain result now use the new load's chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
return BcastLd;
// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
(Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getIntPtrConstant(0, DL)));
// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Subtarget.hasAVX2() ||
(EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
Op0.getOperand(0).getValueType() == VT.getScalarType())
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
// concat_vectors(extract_subvector(broadcast(x)),
// extract_subvector(broadcast(x))) -> broadcast(x)
if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Op0.getOperand(0).getValueType() == VT) {
if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
return Op0.getOperand(0);
// Repeated opcode.
// TODO - combineX86ShufflesRecursively should handle shuffle concatenation
// but it currently struggles with different vector widths.
// Every operand shares Op0's opcode; dispatch on that opcode.
if (llvm::all_of(Ops, [Op0](SDValue Op) {
return Op.getOpcode() == Op0.getOpcode();
})) {
unsigned NumOps = Ops.size();
switch (Op0.getOpcode()) {
case X86ISD::SHUFP: {
// Add SHUFPD support if/when necessary.
// All operands must also agree on operand 2.
if (!IsSplat && VT.getScalarType() == MVT::f32 &&
llvm::all_of(Ops, [Op0](SDValue Op) {
return Op.getOperand(2) == Op0.getOperand(2);
})) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
return DAG.getNode(Op0.getOpcode(), DL, VT,
case X86ISD::PSHUFD:
// Both operands must agree on operand 1.
if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
return DAG.getNode(Op0.getOpcode(), DL, VT,
// TODO - add support for vXf64/vXi64 shuffles.
if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
// Concatenate the sources as v4f32 halves, apply VPERMILPI once on the
// v8f32 value, then bitcast back to VT.
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
return DAG.getBitcast(VT, Res);
case X86ISD::VSHLI:
case X86ISD::VSRAI:
case X86ISD::VSRLI:
// All operands must agree on operand 1.
if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
(VT.is512BitVector() && Subtarget.useAVX512Regs() &&
(EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
llvm::all_of(Ops, [Op0](SDValue Op) {
return Op0.getOperand(1) == Op.getOperand(1);
})) {
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
return DAG.getNode(Op0.getOpcode(), DL, VT,
case X86ISD::VPERMI:
case X86ISD::VROTLI:
case X86ISD::VROTRI:
// Only for 512-bit vectors; all operands must agree on operand 1.
if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
llvm::all_of(Ops, [Op0](SDValue Op) {
return Op0.getOperand(1) == Op.getOperand(1);
})) {
SmallVector<SDValue, 2> Src;
for (unsigned i = 0; i != NumOps; ++i)
return DAG.getNode(Op0.getOpcode(), DL, VT,
case X86ISD::PACKSS:
case X86ISD::PACKUS:
if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
Subtarget.hasInt256()) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
// Source type widened to NumOps times as many elements.
MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
NumOps * SrcVT.getVectorNumElements());
return DAG.getNode(Op0.getOpcode(), DL, VT,
// All operands must agree on operand 2.
if (!IsSplat &&
((VT.is256BitVector() && Subtarget.hasInt256()) ||
(VT.is512BitVector() && Subtarget.useBWIRegs())) &&
llvm::all_of(Ops, [Op0](SDValue Op) {
return Op0.getOperand(2) == Op.getOperand(2);
})) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
return DAG.getNode(Op0.getOpcode(), DL, VT,
return SDValue();
/// DAG combine entry point that forwards all operands of \p N, as a list, to
/// combineConcatVectorOps.
///
/// \returns the folded value produced by combineConcatVectorOps, or an empty
/// SDValue when the node is an i1 vector, the preconditions below do not hold,
/// or no fold was found.
static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
EVT SrcVT = N->getOperand(0).getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Don't do anything for i1 vectors.
if (VT.getVectorElementType() == MVT::i1)
return SDValue();
// Only try the fold with AVX available and when both the result type and the
// type of the first operand are legal; VT is converted with getSimpleVT()
// for the helper call.
if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
// Copy the operand list so it can be handed to the helper as one argument.
SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
DCI, Subtarget))
return R;
return SDValue();
/// DAG combine for a subvector-insert node \p N.
///
/// The node is read as: operand 0 = the vector being inserted into (Vec),
/// operand 1 = the subvector (SubVec), operand 2 = a constant insertion index
/// (IdxVal). Nothing is done before operation legalization.
///
/// \returns a replacement value when one of the folds below applies, or an
/// empty SDValue otherwise.
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
MVT OpVT = N->getSimpleValueType(0);
bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
SDLoc dl(N);
SDValue Vec = N->getOperand(0);
SDValue SubVec = N->getOperand(1);
uint64_t IdxVal = N->getConstantOperandVal(2);
MVT SubVecVT = SubVec.getSimpleValueType();
// Inserting undef into undef stays undef.
if (Vec.isUndef() && SubVec.isUndef())
return DAG.getUNDEF(OpVT);
// Inserting undefs/zeros into zeros/undefs is a zero vector.
if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
(SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
return getZeroVector(OpVT, Subtarget, DAG, dl);
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
// If we're inserting into a zero vector and then into a larger zero vector,
// just insert into the larger zero vector directly.
if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
// The two insertion indices are summed to address the outer vector.
uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
// If we're inserting into a zero vector and our input was extracted from an
// insert into a zero vector of the same type and the extraction was at
// least as large as the original insertion. Just insert the original
// subvector into a zero vector.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
isNullConstant(SubVec.getOperand(1)) &&
SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Ins = SubVec.getOperand(0);
if (isNullConstant(Ins.getOperand(2)) &&
ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
Ins.getOperand(1), N->getOperand(2));
// Stop here if this is an i1 vector.
if (IsI1Vector)
return SDValue();
// If this is an insert of an extract, combine to a shuffle. Don't do this
// if the insert or extract can be represented with a subregister operation.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
SubVec.getOperand(0).getSimpleValueType() == OpVT &&
(IdxVal != 0 ||
!(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
int ExtIdxVal = SubVec.getConstantOperandVal(1);
if (ExtIdxVal != 0) {
int VecNumElts = OpVT.getVectorNumElements();
int SubVecNumElts = SubVecVT.getVectorNumElements();
SmallVector<int, 64> Mask(VecNumElts);
// First create an identity shuffle mask.
for (int i = 0; i != VecNumElts; ++i)
Mask[i] = i;
// Now insert the extracted portion.
// The indices are offset by VecNumElts, presumably so that they address
// the second shuffle input (the extract's source vector) - confirm.
for (int i = 0; i != SubVecNumElts; ++i)
Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
// Match concat_vector style patterns.
SmallVector<SDValue, 2> SubVectorOps;
if (collectConcatOps(N, SubVectorOps)) {
if (SDValue Fold =
combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
return Fold;
// If we're inserting all zeros into the upper half, change this to
// a concat with zero. We will match this to a move
// with implicit upper bit zeroing during isel.
// We do this here because we don't want combineConcatVectorOps to
if (SubVectorOps.size() == 2 &&
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
// If this is a broadcast insert into an upper undef, use a larger broadcast.
if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
// If this is a broadcast load inserted into an upper undef, use a larger
// broadcast load.
if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
// The wider load produces both a value of type OpVT and a second result of
// type MVT::Other.
SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
// Redirect users of the old load's second result to the new load's.
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
return BcastLd;
return SDValue();
/// If we are extracting a subvector of a vector select and the select condition
/// is composed of concatenated vectors, try to narrow the select width. This
/// is a common pattern for AVX1 integer code because 256-bit selects may be
/// legal, but there is almost no integer math/logic available for 256-bit.
/// This function should only be called with legal types (otherwise, the calls
/// to get simple value types will assert).
///
/// \param Ext the extracting node; operand 0 is the (possibly bitcast) select
///            and operand 1 is read as a constant extraction index.
/// \returns a bitcast of the narrowed select, or an empty SDValue when the
///          pattern does not match.
static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
SmallVector<SDValue, 4> CatOps;
// Require a VSELECT whose condition operand can be collected as concat ops.
if (Sel.getOpcode() != ISD::VSELECT ||
!collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
return SDValue();
// Note: We assume simple value types because this should only be called with
// legal operations/types.
// TODO: This can be extended to handle extraction to 256-bits.
MVT VT = Ext->getSimpleValueType(0);
if (!VT.is128BitVector())
return SDValue();
MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
return SDValue();
// WideVT is the type seen by the extract (before looking through bitcasts);
// SelVT is the type of the select itself. Their element counts may differ.
MVT WideVT = Ext->getOperand(0).getSimpleValueType();
MVT SelVT = Sel.getSimpleValueType();
assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
"Unexpected vector type with legal operations");
unsigned SelElts = SelVT.getVectorNumElements();
unsigned CastedElts = WideVT.getVectorNumElements();
unsigned ExtIdx = Ext->getConstantOperandVal(1);
if (SelElts % CastedElts == 0) {
// The select has the same or more (narrower) elements than the extract
// operand. The extraction index gets scaled by that factor.
ExtIdx *= (SelElts / CastedElts);
} else if (CastedElts % SelElts == 0) {
// The select has fewer (wider) elements than the extract operand. Make sure
// that the extraction index can be divided evenly.
unsigned IndexDivisor = CastedElts / SelElts;
if (ExtIdx % IndexDivisor != 0)
return SDValue();
ExtIdx /= IndexDivisor;
} else {
llvm_unreachable("Element count of simple vector types are not divisible?");
// The narrowed select keeps SelVT's element type but has the element count
// divided by the ratio of the wide size to the extracted size.
unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
unsigned NarrowElts = SelElts / NarrowingFactor;
MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
SDLoc DL(Ext);
// Extract the same 128-bit piece from the condition and from both select
// inputs, then rebuild the select on the narrow pieces.
SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
return DAG.getBitcast(VT, NarrowSel);
/// DAG combine for a subvector-extract node \p N.
///
/// Operand 0 is the vector being extracted from (InVec) and operand 1 is read
/// as a constant extraction index (IdxVal). The function tries a sequence of
/// independent folds and returns the first one that applies.
///
/// \returns the replacement value, or an empty SDValue when the result type is
/// not simple or no fold applies.
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// For AVX1 only, if we are extracting from a 256-bit and+not (which will
// eventually get combined/lowered into ANDNP) with a concatenated operand,
// split the 'and' into 128-bit ops to avoid the concatenate and extract.
// We let generic combining take over from there to simplify the
// insert/extract and 'not'.
// This pattern emerges during AVX1 legalization. We handle it before lowering
// to avoid complications like splitting constant vector loads.
// Capture the original wide type in the likely case that we need to bitcast
// back to this type.
if (!N->getValueType(0).isSimple())
return SDValue();
MVT VT = N->getSimpleValueType(0);
SDValue InVec = N->getOperand(0);
unsigned IdxVal = N->getConstantOperandVal(1);
SDValue InVecBC = peekThroughBitcasts(InVec);
EVT InVecVT = InVec.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
TLI.isTypeLegal(InVecVT) &&
InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) {
// True when V (through bitcasts) is a bitwise-not whose operand (through
// bitcasts) is a CONCAT_VECTORS.
auto isConcatenatedNot = [] (SDValue V) {
V = peekThroughBitcasts(V);
if (!isBitwiseNot(V))
return false;
SDValue NotOp = V->getOperand(0);
return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
if (isConcatenatedNot(InVecBC.getOperand(0)) ||
isConcatenatedNot(InVecBC.getOperand(1))) {
// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
// The remaining folds only run once operations have been legalized.
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue V = narrowExtractedVectorSelect(N, DAG))
return V;
// Extracting from an all-zeros build vector gives a zero vector of type VT.
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
// Extracting from an all-ones build vector: i1 vectors become the constant 1
// of type VT, every other type uses getOnesVector.
if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
if (VT.getScalarType() == MVT::i1)
return DAG.getConstant(1, SDLoc(N), VT);
return getOnesVector(VT, DAG, SDLoc(N));
// Extracting from a BUILD_VECTOR: build a new vector from the slice of its
// operands starting at IdxVal.
if (InVec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(
VT, SDLoc(N),
InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
// If we are extracting from an insert into a zero vector, replace with a
// smaller insert into zero if we don't access less than the original
// subvector. Don't do this for i1 vectors.
if (VT.getVectorElementType() != MVT::i1 &&
InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) {
SDLoc DL(N);
getZeroVector(VT, Subtarget, DAG, DL),
InVec.getOperand(1), InVec.getOperand(2));
// If we're extracting from a broadcast then we're better off just
// broadcasting to the smaller type directly, assuming this is the only use.
// As it's a broadcast we don't care about the extraction index.
if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
// Same idea for a single-use broadcast load whose memory type is no wider
// than VT: emit a broadcast load of type VT instead.
if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
// Redirect users of the old load's second result to the new load's.
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
return BcastLd;
// If we're extracting an upper subvector from a broadcast we should just
// extract the lowest subvector instead which should allow
// SimplifyDemandedVectorElts do more simplifications.
if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
return extractSubVector(InVec, 0, DAG, SDLoc(N), VT.getSizeInBits());
// If we're extracting a broadcasted subvector, just use the source.
if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST &&
InVec.getOperand(0).getValueType() == VT)
return InVec.getOperand(0);
// Attempt to extract from the source of a shuffle vector.
// Requires the input size to be a multiple of the extracted size and the
// index to be a multiple of the extracted element count.
if ((InVecVT.getSizeInBits() % VT.getSizeInBits()) == 0 &&
(IdxVal % VT.getVectorNumElements()) == 0) {
SmallVector<int, 32> ShuffleMask;
SmallVector<int, 32> ScaledMask;
SmallVector<SDValue, 2> ShuffleInputs;
unsigned NumSubVecs = InVecVT.getSizeInBits() / VT.getSizeInBits();
// Decode the shuffle mask and scale it so it's shuffling subvectors.
if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
// Sentinel mask entries map directly to undef / zero results.
if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
return DAG.getUNDEF(VT);
if (ScaledMask[SubVecIdx] == SM_SentinelZero)
return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
// Otherwise the entry is split into (input index, subvector index within
// that input) by dividing / taking the remainder with NumSubVecs.
SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
if (Src.getValueSizeInBits() == InVecVT.getSizeInBits()) {
unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
SDLoc(N), VT.getSizeInBits());
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) {
unsigned InOpcode = InVec.getOpcode();
if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
if (InOpcode == ISD::SINT_TO_FP &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
// v2f64 CVTUDQ2PD(v4i32).
if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
// v2f64 CVTPS2PD(v4f32).
if (InOpcode == ISD::FP_EXTEND &&
InVec.getOperand(0).getValueType() == MVT::v4f32) {
return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
// A 128-bit extract of an extend whose source is itself 128 bits wide is
// rewritten to the opcode returned by getOpcode_EXTEND_VECTOR_INREG.
if ((InOpcode == ISD::ANY_EXTEND ||
InOpcode == ISD::ZERO_EXTEND ||
InOpcode == ISD::SIGN_EXTEND ||
VT.is128BitVector() &&
InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
// A VSELECT with all three operands 256 bits wide: select on the low
// 128 bits of each operand instead.
if (InOpcode == ISD::VSELECT &&
InVec.getOperand(0).getValueType().is256BitVector() &&
InVec.getOperand(1).getValueType().is256BitVector() &&
InVec.getOperand(2).getValueType().is256BitVector()) {
SDLoc DL(N);
SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
return SDValue();
/// DAG combine for a scalar-to-vector node \p N, whose single scalar source is
/// operand 0.
///
/// \returns a replacement value when one of the folds below applies, or an
/// empty SDValue otherwise.
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
SDLoc DL(N);
// If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
// This occurs frequently in our masked scalar intrinsic code and our
// floating point select lowering with AVX512.
// TODO: SimplifyDemandedBits instead?
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->getAPIntValue().isOneValue())
return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
// Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
// Only applies to a single-use extract of constant element 0 from a vector
// of i1 elements.
if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
if (C->isNullValue())
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
// Reduce v2i64 to v4i32 if we don't need the upper bits.
// TODO: Move to DAGCombine/SimplifyDemandedBits?
if (VT == MVT::v2i64 || VT == MVT::v2f64) {
// Returns the value to any-extend/truncate to i32 when Op is a single-use
// i64 that is either an ANY_EXTEND of a source of at most 32 bits (returns
// that source) or an EXTLOAD whose memory type is at most 32 bits (returns
// Op itself); returns an empty SDValue otherwise.
auto IsAnyExt64 = [](SDValue Op) {
if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
return SDValue();
if (Op.getOpcode() == ISD::ANY_EXTEND &&
Op.getOperand(0).getScalarValueSizeInBits() <= 32)
return Op.getOperand(0);
if (auto *Ld = dyn_cast<LoadSDNode>(Op))
if (Ld->getExtensionType() == ISD::EXTLOAD &&
Ld->getMemoryVT().getScalarSizeInBits() <= 32)
return Op;
return SDValue();
if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
return DAG.getBitcast(
DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
// Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
Src.getOperand(0).getValueType() == MVT::x86mmx)
return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
return SDValue();
// Simplify PMULDQ and PMULUDQ operations.
//
// N is a two-operand node (LHS = operand 0, RHS = operand 1). Returns a
// replacement value, SDValue(N, 0) when N was simplified in place by
// SimplifyDemandedBits, or an empty SDValue when nothing changed.
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
// Canonicalize constant to RHS.
if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
// Multiply by zero.
// Don't return RHS as it may contain UNDEFs.
if (ISD::isBuildVectorAllZeros(RHS.getNode()))
return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
return SDValue(N, 0);
// If the input is an extend_invec and the SimplifyDemandedBits call didn't
// convert it to any_extend_invec, due to the LegalOperations check, do the
// conversion directly to a vector shuffle manually. This exposes combine
// opportunities missed by combineExtInVec not calling
// combineX86ShufflesRecursively on SSE4.1 targets.
// FIXME: This is basically a hack around several other issues related to
// Left operand: shuffle its v4i32 source with mask {0, -1, 1, -1}, bitcast
// the result to v2i64 and rebuild the node with it.
if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
LHS.getOperand(0).getValueType() == MVT::v4i32) {
SDLoc dl(N);
LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
LHS.getOperand(0), { 0, -1, 1, -1 });
LHS = DAG.getBitcast(MVT::v2i64, LHS);
return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
// Right operand: same rewrite as for the left operand above.
if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
RHS.getOperand(0).getValueType() == MVT::v4i32) {
SDLoc dl(N);
RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
RHS.getOperand(0), { 0, -1, 1, -1 });
RHS = DAG.getBitcast(MVT::v2i64, RHS);
return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
return SDValue();
/// DAG combine for an extend-vector-in-register node \p N with input operand 0.
///
/// Tries to merge a single-use simple load feeding the node into an extending
/// load and, failing that, to combine the node as a shuffle.
///
/// \returns the replacement value, or an empty SDValue when no fold applies.
static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Try to merge vector loads and extend_inreg to an extload.
if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
In.hasOneUse()) {
auto *Ld = cast<LoadSDNode>(In);
if (Ld->isSimple()) {
MVT SVT = In.getSimpleValueType().getVectorElementType();
ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG
EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorNumElements());
// Only form the extending load when the target reports it as legal.
if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
SDValue Load =
DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), MemVT,
// Redirect users of the old load's second result to the new load's.
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
return Load;
// Attempt to combine as a shuffle.
// TODO: SSE41 support
// Sign-extending nodes are excluded from this path, and both the result and
// input types must be legal.
if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
SDValue Op(N, 0);
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
return Res;
return SDValue();
/// DAG combine for a KSHIFT node \p N.
///
/// \returns constant 0 of the result type when operand 0 is an all-zeros
/// build vector, SDValue(N, 0) when SimplifyDemandedVectorElts simplified the
/// node, or an empty SDValue otherwise.
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
// Shifting an all-zeros vector yields zero.
if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Every result element is marked as demanded for the simplification query.
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
KnownZero, DCI))
return SDValue(N, 0);
return SDValue();
// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produce
// extra instructions between the conversion due to going to scalar and back.
//
// Returns an empty SDValue when soft-float is in use, F16C is unavailable,
// the operand is not an FP_TO_FP16 node, or either end of the round trip is
// not f32.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
return SDValue();
if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
return SDValue();
if (N->getValueType(0) != MVT::f32 ||
N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
return SDValue();
SDLoc dl(N);
// Build a v4f32, convert it to v8i16 and back to v4f32, then read element 0.
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
// NOTE(review): the immediate 4 is presumably the rounding-control operand of
// the conversion - confirm against the instruction definition.
Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
DAG.getTargetConstant(4, dl, MVT::i32));
Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
DAG.getIntPtrConstant(0, dl));
/// DAG combine for an FP extend (strict or non-strict) from a vector of f16 to
/// a vector of f32 or f64, implemented through CVTPH2PS.
///
/// For strict nodes the source is operand 1 and operand 0 is forwarded as the
/// first operand of the strict conversion; otherwise the source is operand 0.
///
/// \returns the converted value (merged with the output chain for strict
/// nodes), or an empty SDValue when F16C is unavailable, soft-float is in use,
/// the types do not match, or the element count is 1 or not a power of two.
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
return SDValue();
bool IsStrict = N->isStrictFPOpcode();
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
return SDValue();
if (VT.getVectorElementType() != MVT::f32 &&
VT.getVectorElementType() != MVT::f64)
return SDValue();
unsigned NumElts = VT.getVectorNumElements();
if (NumElts == 1 || !isPowerOf2_32(NumElts))
return SDValue();
SDLoc dl(N);
// Convert the input to vXi16.
EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
Src = DAG.getBitcast(IntVT, Src);
// Widen to at least 8 input elements.
if (NumElts < 8) {
unsigned NumConcats = 8 / NumElts;
// The padding pieces are undef when NumElts == 4 and zero otherwise.
SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
: DAG.getConstant(0, dl, IntVT);
SmallVector<SDValue, 4> Ops(NumConcats, Fill);
Ops[0] = Src;
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
// Destination is vXf32 with at least 4 elements.
EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
std::max(4U, NumElts));
SDValue Cvt, Chain;
if (IsStrict) {
Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
{N->getOperand(0), Src});
Chain = Cvt.getValue(1);
} else {
Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
// Fewer than 4 elements means exactly 2 here (see the assert): keep only the
// low v2f32 of the converted vector.
if (NumElts < 4) {
assert(NumElts == 2 && "Unexpected size");
Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
DAG.getIntPtrConstant(0, dl));
if (IsStrict) {
// Extend to the original VT if necessary.
if (Cvt.getValueType() != VT) {
Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
{Chain, Cvt});
Chain = Cvt.getValue(1);
// Strict nodes return both the value and the updated chain.
return DAG.getMergeValues({Cvt, Chain}, dl);
// Extend to the original VT if necessary.
return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to
// cases where the loads have the same input chain and the output chains are
// unused. This avoids any memory ordering issues.
//
// Returns the result of DCI.CombineTo when a wider matching broadcast load is
// found among the users of the base pointer, or an empty SDValue otherwise.
static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// Only do this if the chain result is unused.
if (N->hasAnyUseOfValue(1))
return SDValue();
auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
SDValue Ptr = MemIntrin->getBasePtr();
SDValue Chain = MemIntrin->getChain();
EVT VT = N->getSimpleValueType(0);
EVT MemVT = MemIntrin->getMemoryVT();
// Look at other users of our base pointer and try to find a wider broadcast.
// The input chain and the size of the memory VT must match.
// A candidate must also have its own second result unused and produce a
// value strictly wider than VT.
for (SDNode *User : Ptr->uses())
if (User != N && User->getOpcode() == X86ISD::VBROADCAST_LOAD &&
cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
MemVT.getSizeInBits() &&
!User->hasAnyUseOfValue(1) &&
User->getValueSizeInBits(0) > VT.getSizeInBits()) {
// Take the low subvector of the wider broadcast and bitcast it to VT.
SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
Extract = DAG.getBitcast(VT, Extract);
return DCI.CombineTo(N, Extract, SDValue(User, 1));
return SDValue();
/// DAG combine for an FP round from a vector of f32 to a vector of f16,
/// implemented through CVTPS2PH.
///
/// \returns the converted value bitcast to the original result type, or an
/// empty SDValue when F16C is unavailable, soft-float is in use, the types do
/// not match, or the element count is 1 or not a power of two.
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
return SDValue();
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
EVT SrcVT = Src.getValueType();
if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
SrcVT.getVectorElementType() != MVT::f32)
return SDValue();
unsigned NumElts = VT.getVectorNumElements();
if (NumElts == 1 || !isPowerOf2_32(NumElts))
return SDValue();
SDLoc dl(N);
// Widen to at least 4 input elements.
// The source is concatenated with a zero FP constant of its own type.
if (NumElts < 4)
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getConstantFP(0.0, dl, SrcVT));
// Destination is v8i16 with at least 8 elements.
EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
std::max(8U, NumElts));
// NOTE(review): the immediate 4 is presumably the rounding-control operand of
// the conversion - confirm against the instruction definition.
SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
DAG.getTargetConstant(4, dl, MVT::i32));
// Extract down to real number of elements.
if (NumElts < 8) {
EVT IntVT = VT.changeVectorElementTypeToInteger();
Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(VT, Cvt);
/// DAG combine for a MOVDQ2Q node \p N whose operand 0 is a single-use, normal,
/// simple load: replaces the pair with a load of type x86mmx.
///
/// \returns the new load, or an empty SDValue when the pattern does not match.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
SDValue Src = N->getOperand(0);
// Turn MOVDQ2Q+simple_load into an mmx load.
if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
if (LN->isSimple()) {
SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
// Redirect users of the old load's second result to the new load's.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
return NewLd;
return SDValue();
/// Target DAG-combine hook: switches on the opcode of \p N and returns the
/// result of the matching per-opcode combine helper.
///
/// Opcodes that hit the default label break out of the switch, and an empty
/// SDValue is returned for them.
///
/// NOTE(review): several `return combine...` statements below appear without
/// a `case` label directly in front of them in this listing - confirm the
/// labels against the upstream file before editing the dispatch table.
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default: break;
return combineScalarToVector(N, DAG);
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
return combineConcatVectors(N, DAG, DCI, Subtarget);
return combineInsertSubvector(N, DAG, DCI, Subtarget);
return combineExtractSubvector(N, DAG, DCI, Subtarget);
case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
case X86ISD::CMP: return combineCMP(N, DAG);
// Integer arithmetic, shifts and bitwise logic.
case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
case X86ISD::SBB: return combineSBB(N, DAG);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
case ISD::SHL: return combineShiftLeft(N, DAG);
case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
// Loads and stores, including the masked forms.
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
return combineSIntToFP(N, DAG, DCI, Subtarget);
return combineUIntToFP(N, DAG, Subtarget);
// Floating-point arithmetic and logic.
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::FXOR:
case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
case X86ISD::FMIN:
case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
// Conversions.
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
case X86ISD::CVTP2SI:
case X86ISD::CVTP2UI:
case X86ISD::CVTTP2SI:
case X86ISD::CVTTP2UI:
return combineCVTP2I_CVTTP2I(N, DAG, DCI);
case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
case X86ISD::BT: return combineBT(N, DAG, DCI);
// Extensions.
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
// Vector packs, shifts and inserts.
case X86ISD::PACKSS:
case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
case X86ISD::VSHL:
case X86ISD::VSRA:
case X86ISD::VSRL:
return combineVectorShiftVar(N, DAG, DCI, Subtarget);
case X86ISD::VSHLI:
case X86ISD::VSRAI:
case X86ISD::VSRLI:
return combineVectorShiftImm(N, DAG, DCI, Subtarget);
case X86ISD::PINSRB:
case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::EXTRQI:
case X86ISD::VALIGN:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::BLENDI:
case X86ISD::UNPCKH:
case X86ISD::UNPCKL:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
case X86ISD::VPPERM:
case X86ISD::VPERMI:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VPERMIL2:
case X86ISD::VPERM2X128:
case X86ISD::SHUF128:
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
// Fused multiply-add variants.
case X86ISD::FMSUB:
case X86ISD::FNMADD:
case X86ISD::FNMSUB:
case ISD::FMA:
case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
// No helper matched (or the default label was taken): report no change.
return SDValue();
/// Reports whether \p VT is a desirable type for opcode \p Opc.
///
/// \returns false for illegal types, for SHL on vectors of i8, for MUL/SHL on
/// scalar i8, and for the i16 opcodes listed in the switch below; true for
/// any other legal type.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
if (!isTypeLegal(VT))
return false;
// There are no vXi8 shifts.
if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
return false;
// TODO: Almost no 8-bit ops are desirable because they have no actual
// size/speed advantages vs. 32-bit ops, but they do have a major
// potential disadvantage by causing partial register stalls.
// 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
// we have specializations to turn 32-bit multiply/shl into LEA or other ops.
// Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
// check for a constant operand to the multiply.
if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
return false;
// i16 instruction encodings are longer and some i16 instructions are slow,
// so those are not desirable.
if (VT == MVT::i16) {
switch (Opc) {
case ISD::LOAD:
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::SUB:
case ISD::ADD:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
return false;
// Any legal type not explicitly accounted for above here is desirable.
return true;
/// Expands an indirect jump-table branch.
///
/// When the module carries the "cf-protection-branch" flag, an
/// X86ISD::NT_BRIND node is built from \p Value and \p Addr; otherwise the
/// generic TargetLowering expansion is used.
SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
SDValue Value, SDValue Addr,
SelectionDAG &DAG) const {
const Module *M = DAG.getMachineFunction().getMMI().getModule();
// A non-null flag is treated as "branch protection enabled".
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
if (IsCFProtectionSupported) {
// In case control-flow branch protection is enabled, we need to add
// notrack prefix to the indirect branch.
// In order to do that we create NT_BRIND SDNode.
// Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
/// Decide whether \p Op should be promoted to a wider type.
///
/// Only i16 operations and the i8 multiplies flagged by Is8BitMulByConstant
/// are considered. Promotion is declined when it could prevent folding a load
/// into the operation, or folding a load/op/store sequence on one address.
/// On success \p PVT is set to MVT::i32 and true is returned.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
EVT VT = Op.getValueType();
bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
// i16 is legal, but undesirable since i16 instruction encodings are longer
// and some i16 instructions are slow.
// 8-bit multiply-by-constant can usually be expanded to something cheaper
// using LEA and/or other ALU ops.
if (VT != MVT::i16 && !Is8BitMulByConstant)
return false;
// True when Op has exactly one use, that use is a normal store, and the
// store's base pointer equals the base pointer of Load.
auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
if (!Op.hasOneUse())
return false;
SDNode *User = *Op->use_begin();
if (!ISD::isNormalStore(User))
return false;
auto *Ld = cast<LoadSDNode>(Load);
auto *St = cast<StoreSDNode>(User);
return Ld->getBasePtr() == St->getBasePtr();
// Same check for the atomic form: a single-use ATOMIC_LOAD whose result feeds
// Op, with Op's only user being an ATOMIC_STORE to the same base pointer.
auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
return false;
if (!Op.hasOneUse())
return false;
SDNode *User = *Op->use_begin();
if (User->getOpcode() != ISD::ATOMIC_STORE)
return false;
auto *Ld = cast<AtomicSDNode>(Load);
auto *St = cast<AtomicSDNode>(User);
return Ld->getBasePtr() == St->getBasePtr();
// Set for the commutative opcodes below so that either operand may be the
// foldable load.
bool Commute = false;
switch (Op.getOpcode()) {
default: return false;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
return false;
case ISD::ADD:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
Commute = true;
case ISD::SUB: {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
// Avoid disabling potential load folding opportunities.
if (MayFoldLoad(N1) &&
(!Commute || !isa<ConstantSDNode>(N0) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
return false;
if (MayFoldLoad(N0) &&
((Commute && !isa<ConstantSDNode>(N1)) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
return false;
if (IsFoldableAtomicRMW(N0, Op) ||
(Commute && IsFoldableAtomicRMW(N1, Op)))
return false;
// Promotion always targets i32.
PVT = MVT::i32;
return true;
// X86 Inline Assembly Support
// Helper to match a string separated by whitespace.
// Returns true when S, after skipping leading spaces/tabs, consists of exactly
// the entries of Pieces in order, each followed by spaces/tabs or by the end
// of the string, with nothing left over.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
for (StringRef Piece : Pieces) {
if (!S.startswith(Piece)) // Check if the piece matches.
return false;
S = S.substr(Piece.size());
StringRef::size_type Pos = S.find_first_not_of(" \t");
// Pos == 0 means a non-whitespace character directly follows the piece, so
// the piece only matched the front of a longer token.
if (Pos == 0) // We matched a prefix.
return false;
S = S.substr(Pos);
// Anything remaining after the last piece is a mismatch.
return S.empty();
// Returns true when AsmPieces is one of the accepted clobber lists: exactly
// three entries containing "~{cc}", "~{flags}" and "~{fpsr}", or exactly four
// entries containing those three plus "~{dirflag}".
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
// std::count is used as a membership test here; a non-zero count is true.
if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
if (AsmPieces.size() == 3)
return true;
else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
return true;
return false;
/// Try to replace a recognized byte-swap inline-asm idiom at call \p CI with
/// the lowering produced by IntrinsicLowering::LowerToByteSwap.
///
/// Only integer results whose bit width is a multiple of 16 are considered.
/// The asm string is split on ';' and newlines and matched piece by piece.
/// The rotate-based idioms additionally require the constraint string to start
/// with "=r,0," and the remaining constraints to satisfy
/// clobbersFlagRegisters. Returns false when no idiom is recognized.
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
const std::string &AsmStr = IA->getAsmString();
IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
// TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
SmallVector<StringRef, 4> AsmPieces;
SplitString(AsmStr, AsmPieces, ";\n");
// Dispatch on the number of instructions in the asm string.
switch (AsmPieces.size()) {
default: return false;
case 1:
// FIXME: this should verify that we are targeting a 486 or better. If not,
// we will turn this bswap into something that will be lowered to logical
// ops instead of emitting the bswap asm. For now, we don't support 486 or
// lower so don't worry about this.
// bswap $0
if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
// No need to check constraints, nothing other than the equivalent of
// "=r,0" would be valid here.
return IntrinsicLowering::LowerToByteSwap(CI);
// rorw $$8, ${0:w} --> llvm.bswap.i16
if (CI->getType()->isIntegerTy(16) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
(matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
// The constraints following the "=r,0," prefix are split into AsmPieces and
// sorted before being checked as a clobber list.
StringRef ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (clobbersFlagRegisters(AsmPieces))
return IntrinsicLowering::LowerToByteSwap(CI);
case 3:
// rorw $$8, ${0:w} / rorl $$16, $0 / rorw $$8, ${0:w} on a 32-bit value.
if (CI->getType()->isIntegerTy(32) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
StringRef ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (clobbersFlagRegisters(AsmPieces))
return IntrinsicLowering::LowerToByteSwap(CI);
if (CI->getType()->isIntegerTy(64)) {
// The 64-bit idiom requires the first constraint to be "A" and the second
// to be tied to operand 0.
InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
if (Constraints.size() >= 2 &&
Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
// bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
return IntrinsicLowering::LowerToByteSwap(CI);
return false;
/// Map an inline-asm flag-output constraint of the form "{@cc<cond>}" to the
/// X86 condition code it names.
///
/// Synonymous spellings share one code (for example "c", "b" and "nae" all
/// yield COND_B). A string that is not a recognized flag-output constraint
/// yields X86::COND_INVALID, which callers compare against to detect "not a
/// @cc constraint".
static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
  X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
                           .Case("{@cca}", X86::COND_A)
                           .Case("{@ccae}", X86::COND_AE)
                           .Case("{@ccb}", X86::COND_B)
                           .Case("{@ccbe}", X86::COND_BE)
                           .Case("{@ccc}", X86::COND_B)
                           .Case("{@cce}", X86::COND_E)
                           .Case("{@ccz}", X86::COND_E)
                           .Case("{@ccg}", X86::COND_G)
                           .Case("{@ccge}", X86::COND_GE)
                           .Case("{@ccl}", X86::COND_L)
                           .Case("{@ccle}", X86::COND_LE)
                           .Case("{@ccna}", X86::COND_BE)
                           .Case("{@ccnae}", X86::COND_B)
                           .Case("{@ccnb}", X86::COND_AE)
                           .Case("{@ccnbe}", X86::COND_A)
                           .Case("{@ccnc}", X86::COND_AE)
                           .Case("{@ccne}", X86::COND_NE)
                           .Case("{@ccnz}", X86::COND_NE)
                           .Case("{@ccng}", X86::COND_LE)
                           .Case("{@ccnge}", X86::COND_L)
                           .Case("{@ccnl}", X86::COND_GE)
                           .Case("{@ccnle}", X86::COND_G)
                           .Case("{@ccno}", X86::COND_NO)
                           // "np" is "not parity": it must be the negation of
                           // "{@ccp}", not an alias for it.
                           .Case("{@ccnp}", X86::COND_NP)
                           .Case("{@ccns}", X86::COND_NS)
                           .Case("{@cco}", X86::COND_O)
                           .Case("{@ccp}", X86::COND_P)
                           .Case("{@ccs}", X86::COND_S)
                           .Default(X86::COND_INVALID);
  return Cond;
}
/// Given a constraint letter, return the type of constraint for this target.
///
/// One-letter constraints are classified as register class, specific
/// register, immediate or "other" by the switch below. Two-letter constraints
/// starting with 'Y' are classified by their second letter, and anything
/// parseConstraintCode accepts is C_Other. Everything else is delegated to
/// TargetLowering::getConstraintType.
X86TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'R':
case 'q':
case 'Q':
case 'f':
case 't':
case 'u':
case 'y':
case 'x':
case 'v':
case 'l':
case 'k': // AVX512 masking registers.
return C_RegisterClass;
case 'a':
case 'b':
case 'c':
case 'd':
case 'S':
case 'D':
case 'A':
return C_Register;
case 'I':
case 'J':
case 'K':
case 'N':
case 'G':
case 'L':
case 'M':
return C_Immediate;
case 'C':
case 'e':
case 'Z':
return C_Other;
else if (Constraint.size() == 2) {
switch (Constraint[0]) {
case 'Y':
// 'Y' constraints are distinguished by their second character.
switch (Constraint[1]) {
case 'z':
return C_Register;
case 'i':
case 'm':
case 'k':
case 't':
case '2':
return C_RegisterClass;
// Flag-output ("{@cc...}") constraints.
} else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
return C_Other;
return TargetLowering::getConstraintType(Constraint);
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
///
/// \p info supplies the call operand being matched and \p constraint points at
/// the constraint letter under consideration. The result starts as CW_Invalid
/// and is raised when the operand's type (or constant value) fits the
/// constraint. A missing operand value yields CW_Default.
AsmOperandInfo &info, const char *constraint) const {
ConstraintWeight weight = CW_Invalid;
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
// Integer register constraints.
case 'R':
case 'q':
case 'Q':
case 'a':
case 'b':
case 'c':
case 'd':
case 'S':
case 'D':
case 'A':
if (CallOperandVal->getType()->isIntegerTy())
weight = CW_SpecificReg;
// Constraints accepted for floating-point operands.
case 'f':
case 't':
case 'u':
if (type->isFloatingPointTy())
weight = CW_SpecificReg;
// MMX operands, only when the subtarget has MMX.
case 'y':
if (type->isX86_MMXTy() && Subtarget.hasMMX())
weight = CW_SpecificReg;
// Two-letter 'Y' constraints; the second letter selects the register kind.
case 'Y':
if (StringRef(constraint).size() != 2)
switch (constraint[1]) {
return CW_Invalid;
// XMM0
case 'z':
if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
return CW_SpecificReg;
return CW_Invalid;
// Conditional OpMask regs (AVX512)
case 'k':
if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
return CW_Register;
return CW_Invalid;
// Any MMX reg
case 'm':
if (type->isX86_MMXTy() && Subtarget.hasMMX())
return weight;
return CW_Invalid;
// Any SSE reg when ISA >= SSE2, same as 'x'
case 'i':
case 't':
case '2':
if (!Subtarget.hasSSE2())
return CW_Invalid;
// 512-bit operands with AVX512.
case 'v':
if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
weight = CW_Register;
// 128-bit operands with SSE1, or 256-bit operands with AVX.
case 'x':
if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
weight = CW_Register;
case 'k':
// Enable conditional vector operations using %k<#> registers.
if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
weight = CW_Register;
// Integer constant in [0, 31].
case 'I':
if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
if (C->getZExtValue() <= 31)
weight = CW_Constant;
// Integer constant in [0, 63].
case 'J':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 63)
weight = CW_Constant;
// Integer constant that fits in a signed 8-bit value.
case 'K':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
weight = CW_Constant;
// Integer constant equal to 0xff or 0xffff.
case 'L':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
weight = CW_Constant;
// Integer constant in [0, 3].
case 'M':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 3)
weight = CW_Constant;
// Integer constant in [0, 0xff].
case 'N':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xff)
weight = CW_Constant;
// Any floating-point constant.
case 'G':
case 'C':
if (isa<ConstantFP>(CallOperandVal)) {
weight = CW_Constant;
// Integer constant that fits in a signed 32-bit value.
case 'e':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80000000LL) &&
(C->getSExtValue() <= 0x7fffffffLL))
weight = CW_Constant;
// Integer constant that fits in an unsigned 32-bit value.
case 'Z':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xffffffff)
weight = CW_Constant;
return weight;
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
///
/// For floating-point \p ConstraintVT on a subtarget with SSE1 this returns
/// "x"; all other cases are delegated to TargetLowering::LowerXConstraint.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
if (Subtarget.hasSSE1())
return "x";
return TargetLowering::LowerXConstraint(ConstraintVT);
// Lower @cc targets via setcc.
// Returns an empty SDValue when the constraint code is not a flag-output
// constraint. Otherwise reads EFLAGS, materializes the requested condition
// with getSETCC and zero-extends it to the constraint's value type. Chain and
// Flag are updated in place as described below.
SDValue X86TargetLowering::LowerAsmOutputForConstraint(
SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
SelectionDAG &DAG) const {
X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
if (Cond == X86::COND_INVALID)
return SDValue();
// Check that return type is valid.
// Only scalar integer types of at least 8 bits are accepted.
if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
OpInfo.ConstraintVT.getSizeInBits() < 8)
report_fatal_error("Flag output operand is of invalid type");
// Get EFLAGS register. Only update chain when copyfrom is glued.
if (Flag.getNode()) {
Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
Chain = Flag.getValue(1);
} else
Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
// Extract CC code.
SDValue CC = getSETCC(Cond, Flag, DL, DAG);
// Zero-extend the condition result to the constraint's value type.
SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
return Result;
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
///
/// Constraints longer than one character are ignored here. For the immediate
/// constraint letters handled below, a constant operand within the letter's
/// range is turned into a target constant; constraints not handled here are
/// passed to TargetLowering::LowerAsmOperandForConstraint.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
SelectionDAG &DAG) const {
SDValue Result;
// Only support length 1 constraints for now.
if (Constraint.length() > 1) return;
char ConstraintLetter = Constraint[0];
switch (ConstraintLetter) {
default: break;
// Constant in [0, 31].
case 'I':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 31) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
// Constant in [0, 63].
case 'J':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 63) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
// Constant that fits in a signed 8-bit value.
case 'K':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (isInt<8>(C->getSExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
// Constant equal to 0xff or 0xffff, or 0xffffffff on 64-bit subtargets.
// NOTE(review): the range check uses the zero-extended value while the
// constant is built from the sign-extended one - confirm this is intended.
case 'L':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
// Constant in [0, 3].
case 'M':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 3) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
// Constant in [0, 255].
case 'N':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 255) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
// Constant in [0, 127].
case 'O':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 127) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
case 'e': {
// 32-bit signed value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getSExtValue())) {
// Widen to 64 bits here to get it sign extended.
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
// FIXME gcc accepts some relocatable values here too, but only in certain
// memory models; it's complicated.
case 'Z': {
// 32-bit unsigned value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getZExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
// FIXME gcc accepts some relocatable values here too, but only in certain
// memory models; it's complicated.
case 'i': {
// Literal immediates are always ok.
if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
// 1-bit constants are extended according to the target's boolean contents;
// the extension kind then selects between the zero- and sign-extended value.
bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
BooleanContent BCont = getBooleanContents(MVT::i64);
ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
: CST->getSExtValue();
Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
// In any sort of PIC mode addresses need to be computed at runtime by
// adding in a register or some sort of table lookup. These can't
// be used as immediates.
if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
// If we are in non-pic codegen mode, we allow the address of a global (with
// an optional displacement) to be used with 'i'.
if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
// If we require an extra load to get this address, as in PIC mode, we
// can't accept it.
if (isGlobalStubReference(
if (Result.getNode()) {
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
/// The test is a chain of hasSuperClassEq queries against the 8-, 16-, 32-
/// and 64-bit GR classes listed below.
static bool isGRClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::GR8RegClass) ||
RC.hasSuperClassEq(&X86::GR16RegClass) ||
RC.hasSuperClassEq(&X86::GR32RegClass) ||
RC.hasSuperClassEq(&X86::GR64RegClass) ||
/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
/// The test is a chain of hasSuperClassEq queries against the FR32X, FR64X,
/// VR128X and VR256X classes listed below.
static bool isFRClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
RC.hasSuperClassEq(&X86::FR64XRegClass) ||
RC.hasSuperClassEq(&X86::VR128XRegClass) ||
RC.hasSuperClassEq(&X86::VR256XRegClass) ||
/// Check if \p RC is a mask register class.
/// I.e., VK* or one of their variants.
/// The test is a chain of hasSuperClassEq queries against the VK1 through
/// VK32 classes listed below.
static bool isVKClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::VK1RegClass) ||
RC.hasSuperClassEq(&X86::VK2RegClass) ||
RC.hasSuperClassEq(&X86::VK4RegClass) ||
RC.hasSuperClassEq(&X86::VK8RegClass) ||
RC.hasSuperClassEq(&X86::VK16RegClass) ||
RC.hasSuperClassEq(&X86::VK32RegClass) ||
/// Resolve an inline-asm register constraint to a (register, register class)
/// pair for value type \p VT.
///
/// Single-letter GCC constraints and the two-letter 'Y' family are mapped
/// directly to X86 register classes, and flag-output ("{@cc...}") constraints
/// get GR32. Everything else goes through TargetLowering's generic lookup.
/// That result is then patched for a few named registers (st, st(N), flags,
/// dirflag, fpsr), checked against the subtarget, and retargeted to a class
/// that matches \p VT. A pair of (0, nullptr) reports that no suitable
/// register was found.
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
if (Constraint.size() == 1) {
// GCC Constraint Letters
switch (Constraint[0]) {
default: break;
// 'A' means [ER]AX + [ER]DX.
case 'A':
if (Subtarget.is64Bit())
return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
"Expecting 64, 32 or 16 bit subtarget");
return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
// TODO: Slight differences here in allocation order and leaving
// RIP in the class. Do they matter any more here than they do
// in the normal allocation?
// Mask registers: the class is picked by VT; i32/i64 additionally need BWI.
case 'k':
if (Subtarget.hasAVX512()) {
if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1RegClass);
if (VT == MVT::i8)
return std::make_pair(0U, &X86::VK8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::VK16RegClass);
if (Subtarget.hasBWI()) {
if (VT == MVT::i32)
return std::make_pair(0U, &X86::VK32RegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64RegClass);
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget.is64Bit()) {
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32)
return std::make_pair(0U, &X86::GR32RegClass);
if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64RegClass);
// 32-bit fallthrough
case 'Q': // Q_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_ABCDRegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32_ABCDRegClass);
if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64_ABCDRegClass);
case 'r': // GENERAL_REGS
case 'l': // INDEX_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32RegClass);
if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64RegClass);
case 'R': // LEGACY_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_NOREXRegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32_NOREXRegClass);
if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64_NOREXRegClass);
case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP32RegClass);
if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP64RegClass);
if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
return std::make_pair(0U, &X86::RFP80RegClass);
case 'y': // MMX_REGS if MMX allowed.
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'v':
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
if (!Subtarget.hasSSE1()) break;
// With 'v' and VLX the extended ("X") register classes are used instead.
bool VConstraint = (Constraint[0] == 'v');
switch (VT.SimpleTy) {
default: break;
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR32XRegClass);
return std::make_pair(0U, &X86::FR32RegClass);
case MVT::f64:
case MVT::i64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR64XRegClass);
return std::make_pair(0U, &X86::FR64RegClass);
case MVT::i128:
if (Subtarget.is64Bit()) {
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR128XRegClass);
return std::make_pair(0U, &X86::VR128RegClass);
// Vector types and fp128.
case MVT::f128:
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR128XRegClass);
return std::make_pair(0U, &X86::VR128RegClass);
// AVX types.
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64:
case MVT::v8f32:
case MVT::v4f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR256XRegClass);
if (Subtarget.hasAVX())
return std::make_pair(0U, &X86::VR256RegClass);
// 512-bit vector types; these need AVX512.
case MVT::v64i8:
case MVT::v32i16:
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8i64:
if (!Subtarget.hasAVX512()) break;
if (VConstraint)
return std::make_pair(0U, &X86::VR512RegClass);
return std::make_pair(0U, &X86::VR512_0_15RegClass);
} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
switch (Constraint[1]) {
// These are resolved exactly like the one-letter 'x' constraint.
case 'i':
case 't':
case '2':
return getRegForInlineAsmConstraint(TRI, "x", VT);
case 'm':
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
// 'Yz' pins the operand to register 0 of the matching vector width
// (XMM0 / YMM0 / ZMM0).
case 'z':
if (!Subtarget.hasSSE1()) break;
switch (VT.SimpleTy) {
default: break;
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
return std::make_pair(X86::XMM0, &X86::FR32RegClass);
case MVT::f64:
case MVT::i64:
return std::make_pair(X86::XMM0, &X86::FR64RegClass);
case MVT::f128:
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
return std::make_pair(X86::XMM0, &X86::VR128RegClass);
// AVX types.
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64:
case MVT::v8f32:
case MVT::v4f64:
if (Subtarget.hasAVX())
return std::make_pair(X86::YMM0, &X86::VR256RegClass);
case MVT::v64i8:
case MVT::v32i16:
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8i64:
if (Subtarget.hasAVX512())
return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
case 'k':
// This register class doesn't allocate k0 for masked vector operation.
if (Subtarget.hasAVX512()) {
if (VT == MVT::i1)
return std::make_pair(0U, &X86::VK1WMRegClass);
if (VT == MVT::i8)
return std::make_pair(0U, &X86::VK8WMRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::VK16WMRegClass);
if (Subtarget.hasBWI()) {
if (VT == MVT::i32)
return std::make_pair(0U, &X86::VK32WMRegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::VK64WMRegClass);
// Flag-output ("{@cc...}") constraints are given a GR32 register.
if (parseConstraintCode(Constraint) != X86::COND_INVALID)
return std::make_pair(0U, &X86::GR32RegClass);
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<Register, const TargetRegisterClass*> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
// Map st(0) -> st(7) -> ST0
if (Constraint.size() == 7 && Constraint[0] == '{' &&
tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
Constraint[3] == '(' &&
(Constraint[4] >= '0' && Constraint[4] <= '7') &&
Constraint[5] == ')' && Constraint[6] == '}') {
// st(7) is not allocatable and thus not a member of RFP80. Return
// singleton class in cases where we have a reference to it.
if (Constraint[4] == '7')
return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
return std::make_pair(X86::FP0 + Constraint[4] - '0',
// GCC allows "st(0)" to be called just plain "st".
if (StringRef("{st}").equals_lower(Constraint))
return std::make_pair(X86::FP0, &X86::RFP80RegClass);
// flags -> EFLAGS
if (StringRef("{flags}").equals_lower(Constraint))
return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
// dirflag -> DF
if (StringRef("{dirflag}").equals_lower(Constraint))
return std::make_pair(X86::DF, &X86::DFCCRRegClass);
// fpsr -> FPSW
if (StringRef("{fpsr}").equals_lower(Constraint))
return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
return Res;
// Make sure it isn't a register that requires 64-bit mode.
if (!Subtarget.is64Bit() &&
(isFRClass(*Res.second) || isGRClass(*Res.second)) &&
TRI->getEncodingValue(Res.first) >= 8) {
// Register requires REX prefix, but we're in 32-bit mode.
return std::make_pair(0, nullptr);
// Make sure it isn't a register that requires AVX512.
if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
TRI->getEncodingValue(Res.first) & 0x10) {
// Register requires EVEX prefix.
return std::make_pair(0, nullptr);
// Otherwise, check to see if this is a register class of the wrong value
// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
// turn into {ax},{dx}.
// MVT::Other is used to specify clobber names.
if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
return Res; // Correct type already, nothing to do.
// Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
// return "eax". This should even work for things like getting 64bit integer
// registers when given an f64 type.
const TargetRegisterClass *Class = Res.second;
// The generic code will match the first register class that contains the
// given register. Thus, based on the ordering of the tablegened file,
// the "plain" GR classes might not come first.
// Therefore, use a helper method.
if (isGRClass(*Class)) {
unsigned Size = VT.getSizeInBits();
// 1-bit values are handled as 8-bit registers.
if (Size == 1) Size = 8;
Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
if (DestReg > 0) {
bool is64Bit = Subtarget.is64Bit();
// Pick the GR class for the requested size; outside 64-bit mode the NOREX
// variants are used and there is no 64-bit class.
const TargetRegisterClass *RC =
Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
: Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
: Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
: Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
: nullptr;
if (Size == 64 && !is64Bit) {
// Model GCC's behavior here and select a fixed pair of 32-bit
// registers.
switch (DestReg) {
case X86::RAX:
return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
case X86::RDX:
return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
case X86::RCX:
return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
case X86::RBX:
return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
case X86::RSI:
return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
case X86::RDI:
return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
case X86::RBP:
return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
return std::make_pair(0, nullptr);
if (RC && RC->contains(DestReg))
return std::make_pair(DestReg, RC);
return Res;
// No register found/type mismatch.
return std::make_pair(0, nullptr);
} else if (isFRClass(*Class)) {
// Handle references to XMM physical registers that got mapped into the
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
// find, ignoring the required type.
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
if (VT == MVT::f32 || VT == MVT::i32)
Res.second = &X86::FR32XRegClass;
else if (VT == MVT::f64 || VT == MVT::i64)
Res.second = &X86::FR64XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
Res.second = &X86::VR128XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
Res.second = &X86::VR256XRegClass;
else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
Res.second = &X86::VR512RegClass;
else {
// Type mismatch and not a clobber: Return an error;
Res.first = 0;
Res.second = nullptr;
} else if (isVKClass(*Class)) {
// Mask registers: keep the register and pick the VK class matching VT.
if (VT == MVT::i1)
Res.second = &X86::VK1RegClass;
else if (VT == MVT::i8)
Res.second = &X86::VK8RegClass;
else if (VT == MVT::i16)
Res.second = &X86::VK16RegClass;
else if (VT == MVT::i32)
Res.second = &X86::VK32RegClass;
else if (VT == MVT::i64)
Res.second = &X86::VK64RegClass;
else {
// Type mismatch and not a clobber: Return an error;
Res.first = 0;
Res.second = nullptr;
return Res;
/// Return the cost of using the scaling factor of addressing mode \p AM for an
/// access of type \p Ty in address space \p AS.
///
/// For a legal addressing mode the result is 0 when AM.Scale is zero and 1
/// otherwise; an illegal addressing mode yields -1.
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
// Scaling factors are not free at all.
// An indexed folded instruction, i.e., inst (reg1, reg2, scale),
// will take 2 allocations in the out of order engine instead of 1
// for plain addressing mode, i.e. inst (reg1).
// E.g.,
// vaddps (%rsi,%rdx), %ymm0, %ymm1
// Requires two allocations (one for the load, one for the computation)
// whereas:
// vaddps (%rsi), %ymm0, %ymm1
// Requires just 1 allocation, i.e., freeing allocations for other operations
// and having less micro operations to execute.
// For some X86 architectures, this is even worse because for instance for
// stores, the complex addressing mode forces the instruction to use the
// "load" ports instead of the dedicated "store" port.
// E.g., on Haswell:
// vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
// vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
if (isLegalAddressingMode(DL, AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1
// as soon as we use a second register.
return AM.Scale != 0;
return -1;
/// Report whether an integer division of type \p VT should be treated as
/// cheap. This is true only for non-vector types in functions whose
/// attributes \p Attr include MinSize.
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on x86 is expensive. However, when aggressively optimizing
// for code size, we prefer to use a div instruction, as it is usually smaller
// than the alternative sequence.
// The exception to this is vector division. Since x86 doesn't have vector
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
return OptSize && !VT.isVector();
/// Set up split callee-saved-register handling for the function containing
/// \p Entry.
/// NOTE(review): the statement guarded by the 64-bit check and the
/// initializer of AFI are not visible here - confirm against the full source.
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
if (!Subtarget.is64Bit())
// Update IsSplitCSR in X86MachineFunctionInfo.
X86MachineFunctionInfo *AFI =
/// Insert COPY instructions for the callee-saved registers that are handled
/// via copies.
///
/// For each register in the zero-terminated list returned by
/// getCalleeSavedRegsViaCopy, a fresh virtual register is created and a COPY
/// into it is built at the start of \p Entry. A COPY back into the physical
/// register is then built before the first terminator of every block in
/// \p Exits.
void X86TargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
// The register list is terminated by a zero entry.
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
// Only GR64 registers are expected in this list.
if (X86::GR64RegClass.contains(*I))
RC = &X86::GR64RegClass;
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
/// Reports whether swifterror is supported.
/// \returns true exactly when the subtarget is 64-bit.
bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}
/// Returns true if stack probing through a function call is requested.
///
/// \param MF Machine function being queried.
/// \returns true when getStackProbeSymbolName(MF) yields a non-empty name.
bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
  return !getStackProbeSymbolName(MF).empty();
}
/// Returns true if stack probing through inline assembly is requested.
///
/// \param MF Machine function whose "probe-stack" attribute is inspected.
// NOTE(review): this extract is incomplete -- the second operand of the `||`
// in the first condition, the right-hand side of the `==` comparison, and the
// closing brace are not present here. Restore from upstream before use.
bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
// No inline stack probe for Windows, they have their own mechanism.
if (Subtarget.isOSWindows() ||
return false;
// If the function specifically requests inline stack probes, emit them.
// The answer comes from comparing the "probe-stack" attribute's string value.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
return false;
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
///
/// Order of the visible checks: inline stack probes win (empty name), then an
/// explicit "probe-stack" function attribute (its string value is returned),
/// then the platform test, and finally a fixed symbol chosen by 64-bit-ness
/// and isTargetCygMing().
// NOTE(review): this extract is incomplete -- the return-type line, the tail
// of the platform condition after the second `||`, and the closing brace are
// not present here. Restore from upstream before use.
X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
// Inline Stack probes disable stack probe call
if (hasInlineStackProbe(MF))
return "";
// If the function specifically requests stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
// Generally, if we aren't on Windows, the platform ABI does not include
// support for stack probes, so don't emit them.
if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
return "";
// We need a stack probe to conform to the Windows ABI. Choose the right
// symbol.
if (Subtarget.is64Bit())
return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
/// Returns the stack probe size for \p MF: 4096 unless the function carries a
/// "stack-probe-size" attribute, in which case the value is parsed into
/// StackProbeSize via getAsInteger().
// NOTE(review): this extract is incomplete -- the return-type line, the
// expression on which `.getAsInteger` is called, and the closing brace are not
// present here. Restore from upstream before use.
X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
// The default stack probe size is 4096 if the function has no stackprobesize
// attribute.
unsigned StackProbeSize = 4096;
const Function &Fn = MF.getFunction();
if (Fn.hasFnAttribute("stack-probe-size"))
.getAsInteger(0, StackProbeSize);
return StackProbeSize;
diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
index c40901255424..cd39428b9c38 100644
--- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
+++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
@@ -1,385 +1,387 @@
//===- LibDriver.cpp - lib.exe-compatible driver --------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Defines an interface to a lib.exe-compatible driver that also understands
// bitcode files. Used by llvm-lib and lld-link /lib.
#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/WindowsMachineFlag.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/StringSaver.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
// File-local option plumbing for the driver: the OPT_* identifiers, the
// prefix arrays, the opt::OptTable::Info table, and the LibOptTable wrapper
// that hands that table to opt::OptTable.
// NOTE(review): the three `#include ""` directives have lost their file name,
// and the enum/array/class terminators, the `public:` specifier and the
// closing of the namespace are absent from this extract. Restore from upstream.
namespace {
// One OPT_<ID> enumerator is generated per OPTION(...) entry.
enum {
#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11, _12) OPT_##ID,
#include ""
#undef OPTION
// One `const char *const NAME[]` array is generated per PREFIX(...) entry.
#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE;
#include ""
#undef PREFIX
// One Info initializer is generated per OPTION(...) entry.
static const opt::OptTable::Info InfoTable[] = {
#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X7, X8, X9, X10, X11, X12) \
{X1, X2, X10, X11, OPT_##ID, opt::Option::KIND##Class, \
X9, X8, OPT_##GROUP, OPT_##ALIAS, X7, X12},
#include ""
#undef OPTION
// Option table built from InfoTable.
class LibOptTable : public opt::OptTable {
LibOptTable() : OptTable(InfoTable, true) {}
/// Derives the default output path from the first archive member.
///
/// \param FirstMember Member whose buffer identifier supplies the base name.
/// \returns the buffer identifier with its extension replaced by ".lib".
static std::string getDefaultOutputPath(const NewArchiveMember &FirstMember) {
  SmallString<128> Val = StringRef(FirstMember.Buf->getBufferIdentifier());
  sys::path::replace_extension(Val, ".lib");
  return std::string(Val.str());
}
/// Builds the list of directories used to look up input files.
/// According to the comments in the body, the list consists of the current
/// directory, the /libpath arguments, and the ';'-separated entries of the
/// LIB environment variable, in that order.
// NOTE(review): this extract is incomplete -- the statements that append to
// `Ret` (after the first comment, inside the /libpath loop, inside the while
// loop) are absent, `StringRef Env =*EnvOpt);` has lost the call it belonged
// to, and closing braces are missing. Restore from upstream before use.
static std::vector<StringRef> getSearchPaths(opt::InputArgList *Args,
StringSaver &Saver) {
std::vector<StringRef> Ret;
// Add current directory as first item of the search path.
// Add /libpath flags.
for (auto *Arg : Args->filtered(OPT_libpath))
// Add $LIB.
Optional<std::string> EnvOpt = sys::Process::GetEnv("LIB");
// Without a LIB variable the list built so far is the final result.
if (!EnvOpt.hasValue())
return Ret;
StringRef Env =*EnvOpt);
// Peel one ';'-delimited entry off the front of Env per iteration.
while (!Env.empty()) {
StringRef Path;
std::tie(Path, Env) = Env.split(';');
return Ret;
/// Looks for \p File in each directory of \p Paths, in order.
///
/// \param File  Name appended to every candidate directory.
/// \param Paths Directories to probe; earlier entries take precedence.
/// \returns the first joined path for which sys::fs::exists() is true, or an
///          empty string when none of the candidates exists.
static std::string findInputFile(StringRef File, ArrayRef<StringRef> Paths) {
  for (StringRef Dir : Paths) {
    SmallString<128> Path = Dir;
    sys::path::append(Path, File);
    if (sys::fs::exists(Path))
      return std::string(Path);
  }
  return "";
}
/// Reports an error that occurred while opening \p File.
/// When \p E holds an error, each contained ErrorInfoBase is printed to
/// llvm::errs() as "error opening '<File>': <message>".
// NOTE(review): this extract is incomplete -- the statement guarded by
// `if (!E)`, everything after the message is printed, and the closing of the
// lambda and of the function are absent. Restore from upstream.
static void fatalOpenError(llvm::Error E, Twine File) {
if (!E)
handleAllErrors(std::move(E), [&](const llvm::ErrorInfoBase &EIB) {
llvm::errs() << "error opening '" << File << "': " << EIB.message() << '\n';
/// Prints the member names of an input archive to llvm::outs(), one per line.
/// Each OPT_INPUT argument is opened; a buffer whose magic identifies it as an
/// archive is moved into `B`, and the children of that archive are listed.
// NOTE(review): this extract is incomplete -- the loop exit after `B` is
// assigned, the statement guarded by `if (!B)`, and all closing braces are
// absent. Restore from upstream before use.
static void doList(opt::InputArgList& Args) {
// lib.exe prints the contents of the first archive file.
std::unique_ptr<MemoryBuffer> B;
for (auto *Arg : Args.filtered(OPT_INPUT)) {
// Create or open the archive object.
ErrorOr<std::unique_ptr<MemoryBuffer>> MaybeBuf =
MemoryBuffer::getFile(Arg->getValue(), -1, false);
fatalOpenError(errorCodeToError(MaybeBuf.getError()), Arg->getValue());
// Keep the buffer only if its magic says it is an archive.
if (identify_magic(MaybeBuf.get()->getBuffer()) == file_magic::archive) {
B = std::move(MaybeBuf.get());
// lib.exe doesn't print an error if no .lib files are passed.
if (!B)
Error Err = Error::success();
object::Archive Archive(B.get()->getMemBufferRef(), Err);
fatalOpenError(std::move(Err), B->getBufferIdentifier());
// Print the name of every archive member.
for (auto &C : Archive.children(Err)) {
Expected<StringRef> NameOrErr = C.getName();
fatalOpenError(NameOrErr.takeError(), B->getBufferIdentifier());
StringRef Name = NameOrErr.get();
llvm::outs() << Name << '\n';
// Err is checked once more after the children() iteration.
fatalOpenError(std::move(Err), B->getBufferIdentifier());
/// Returns the machine type recorded in the COFF object held by \p MB.
/// A failure to open the object, or a machine value rejected by the check
/// below, is reported on llvm::errs() prefixed with the buffer identifier.
// NOTE(review): this extract is incomplete -- what happens after each error
// message, the remaining operands of the machine-type condition, and the
// closing braces are absent. `EC` is not used in the visible lines.
static COFF::MachineTypes getCOFFFileMachine(MemoryBufferRef MB) {
std::error_code EC;
auto Obj = object::COFFObjectFile::create(MB);
if (!Obj) {
llvm::errs() << MB.getBufferIdentifier()
<< ": failed to open: " << Obj.takeError() << '\n';
uint16_t Machine = (*Obj)->getMachine();
if (Machine != COFF::IMAGE_FILE_MACHINE_I386 &&
llvm::errs() << MB.getBufferIdentifier() << ": unknown machine: " << Machine
<< '\n';
return static_cast<COFF::MachineTypes>(Machine);
/// Determines a COFF machine type for the bitcode file in \p MB by switching
/// on the architecture of its target triple. The visible cases are x86,
/// x86_64, arm and aarch64; problems are reported on llvm::errs().
// NOTE(review): this extract is incomplete -- the value returned for each
// case, the label in front of the "unknown arch" message, what follows each
// error message, and the closing braces are absent. Restore from upstream.
static COFF::MachineTypes getBitcodeFileMachine(MemoryBufferRef MB) {
Expected<std::string> TripleStr = getBitcodeTargetTriple(MB);
if (!TripleStr) {
llvm::errs() << MB.getBufferIdentifier()
<< ": failed to get target triple from bitcode\n";
switch (Triple(*TripleStr).getArch()) {
case Triple::x86:
case Triple::x86_64:
case Triple::arm:
case Triple::aarch64:
llvm::errs() << MB.getBufferIdentifier()
<< ": unknown arch in target triple " << *TripleStr << '\n';
/// Processes one input buffer for the output library.
///
/// \param Members          Archive member list, passed through on recursion.
/// \param LibMachine       Machine type of the library; assigned from the
///                         file's machine type in one branch below.
/// \param LibMachineSource Text appended to the conflict diagnostic that says
///                         where LibMachine came from.
/// \param MB               Buffer to classify via identify_magic().
///
/// Archives are expanded by calling appendFile() on each child; COFF objects
/// and bitcode files have their machine type compared against LibMachine.
// NOTE(review): this extract is incomplete and still carries diff markers
// (`-`/`+` lines). What follows each error message, the condition in front of
// `LibMachine = FileMachine;`, the tail of the LibMachineSource expression,
// the code that adds to `Members`, and closing braces are absent.
static void appendFile(std::vector<NewArchiveMember> &Members,
COFF::MachineTypes &LibMachine,
std::string &LibMachineSource, MemoryBufferRef MB) {
file_magic Magic = identify_magic(MB.getBuffer());
// Reject anything that is not one of the accepted file kinds.
if (Magic != file_magic::coff_object && Magic != file_magic::bitcode &&
- Magic != file_magic::archive && Magic != file_magic::windows_resource) {
+ Magic != file_magic::archive && Magic != file_magic::windows_resource &&
+ Magic != file_magic::coff_import_library) {
llvm::errs() << MB.getBufferIdentifier()
- << ": not a COFF object, bitcode, archive or resource file\n";
+ << ": not a COFF object, bitcode, archive, import library or "
+ "resource file\n";
// If a user attempts to add an archive to another archive, llvm-lib doesn't
// handle the first archive file as a single file. Instead, it extracts all
// members from the archive and add them to the second archive. This behavior
// is for compatibility with Microsoft's lib command.
if (Magic == file_magic::archive) {
Error Err = Error::success();
object::Archive Archive(MB, Err);
fatalOpenError(std::move(Err), MB.getBufferIdentifier());
for (auto &C : Archive.children(Err)) {
Expected<MemoryBufferRef> ChildMB = C.getMemoryBufferRef();
if (!ChildMB) {
handleAllErrors(ChildMB.takeError(), [&](const ErrorInfoBase &EIB) {
llvm::errs() << MB.getBufferIdentifier() << ": " << EIB.message()
<< "\n";
// Recurse so that each child goes through the same checks.
appendFile(Members, LibMachine, LibMachineSource, *ChildMB);
fatalOpenError(std::move(Err), MB.getBufferIdentifier());
// Check that all input files have the same machine type.
// Mixing normal objects and LTO bitcode files is fine as long as they
// have the same machine type.
// Doing this here duplicates the header parsing work that writeArchive()
// below does, but it's not a lot of work and it's a bit awkward to do
// in writeArchive() which needs to support many tools, can't assume the
// input is COFF, and doesn't have a good way to report errors.
if (Magic == file_magic::coff_object || Magic == file_magic::bitcode) {
COFF::MachineTypes FileMachine = (Magic == file_magic::coff_object)
? getCOFFFileMachine(MB)
: getBitcodeFileMachine(MB);
// FIXME: Once lld-link rejects multiple resource .obj files:
// Call convertResToCOFF() on .res files and add the resulting
// COFF file to the .lib output instead of adding the .res file, and remove
// this check. See PR42180.
LibMachine = FileMachine;
LibMachineSource =
(" (inferred from earlier file '" + MB.getBufferIdentifier() + "')")
} else if (LibMachine != FileMachine) {
llvm::errs() << MB.getBufferIdentifier() << ": file machine type "
<< machineToStr(FileMachine)
<< " conflicts with library machine type "
<< machineToStr(LibMachine) << LibMachineSource << '\n';
/// Entry point of the lib.exe-compatible driver.
///
/// \param ArgsArr Command-line arguments; element 0 is skipped when parsing
///                (see `ArgsArr.slice(1)`).
/// \returns 0 on the success paths visible below and 1 after the diagnostics
///          for a missing argument value, an unknown /machine value, a missing
///          input file, an undeterminable output path, or a writeArchive()
///          failure.
///
/// Visible phases: expand response files, parse options, handle /help and the
/// no-input case, handle the list option, resolve each input through the
/// search paths and pass it to appendFile(), pick the output path, rewrite
/// relative member names, and call writeArchive().
// NOTE(review): this extract is incomplete -- among others, the call made for
// OPT_lst, the declaration of `LibMachine`, the condition in front of the
// "unknown /machine" message, the statement guarded by the `Seen.insert`
// check, the statement that stores the buffer in `MBs`, the call wrapped
// around `*PathOrErr`, and closing braces are absent. Restore from upstream.
int llvm::libDriverMain(ArrayRef<const char *> ArgsArr) {
BumpPtrAllocator Alloc;
StringSaver Saver(Alloc);
// Parse command line arguments.
SmallVector<const char *, 20> NewArgs(ArgsArr.begin(), ArgsArr.end());
cl::ExpandResponseFiles(Saver, cl::TokenizeWindowsCommandLine, NewArgs);
ArgsArr = NewArgs;
LibOptTable Table;
unsigned MissingIndex;
unsigned MissingCount;
opt::InputArgList Args =
Table.ParseArgs(ArgsArr.slice(1), MissingIndex, MissingCount);
if (MissingCount) {
llvm::errs() << "missing arg value for \""
<< Args.getArgString(MissingIndex) << "\", expected "
<< MissingCount
<< (MissingCount == 1 ? " argument.\n" : " arguments.\n");
return 1;
// Unknown arguments only produce a warning.
for (auto *Arg : Args.filtered(OPT_UNKNOWN))
llvm::errs() << "ignoring unknown argument: " << Arg->getAsString(Args)
<< "\n";
// Handle /help
if (Args.hasArg(OPT_help)) {
Table.PrintHelp(outs(), "llvm-lib [options] file...", "LLVM Lib");
return 0;
// If no input files and not told otherwise, silently do nothing to match
// lib.exe
if (!Args.hasArgNoClaim(OPT_INPUT) && !Args.hasArg(OPT_llvmlibempty))
return 0;
if (Args.hasArg(OPT_lst)) {
return 0;
std::vector<StringRef> SearchPaths = getSearchPaths(&Args, Saver);
std::string LibMachineSource;
// An explicit /machine argument sets the library machine type.
if (auto *Arg = Args.getLastArg(OPT_machine)) {
LibMachine = getMachineType(Arg->getValue());
llvm::errs() << "unknown /machine: arg " << Arg->getValue() << '\n';
return 1;
LibMachineSource =
std::string(" (from '/machine:") + Arg->getValue() + "' flag)";
std::vector<std::unique_ptr<MemoryBuffer>> MBs;
StringSet<> Seen;
std::vector<NewArchiveMember> Members;
// Create a NewArchiveMember for each input file.
for (auto *Arg : Args.filtered(OPT_INPUT)) {
// Find a file
std::string Path = findInputFile(Arg->getValue(), SearchPaths);
if (Path.empty()) {
llvm::errs() << Arg->getValue() << ": no such file or directory\n";
return 1;
// Input files are uniquified by pathname. If you specify the exact same
// path more than once, all but the first one are ignored.
// Note that there's a loophole in the rule; you can prepend `.\` or
// something like that to a path to make it look different, and they are
// handled as if they were different files. This behavior is compatible with
// Microsoft lib.exe.
if (!Seen.insert(Path).second)
// Open a file.
ErrorOr<std::unique_ptr<MemoryBuffer>> MOrErr =
MemoryBuffer::getFile(Path, -1, false);
fatalOpenError(errorCodeToError(MOrErr.getError()), Path);
MemoryBufferRef MBRef = (*MOrErr)->getMemBufferRef();
// Append a file.
appendFile(Members, LibMachine, LibMachineSource, MBRef);
// Take the ownership of the file buffer to keep the file open.
// Create an archive file.
std::string OutputPath;
// Output path: explicit OPT_out value first, otherwise derived from the
// first member, otherwise an error.
if (auto *Arg = Args.getLastArg(OPT_out)) {
OutputPath = Arg->getValue();
} else if (!Members.empty()) {
OutputPath = getDefaultOutputPath(Members[0]);
} else {
llvm::errs() << "no output path given, and cannot infer with no inputs\n";
return 1;
// llvm-lib uses relative paths for both regular and thin archives, unlike
// standard GNU ar, which only uses relative paths for thin archives and
// basenames for regular archives.
for (NewArchiveMember &Member : Members) {
if (sys::path::is_relative(Member.MemberName)) {
Expected<std::string> PathOrErr =
computeArchiveRelativePath(OutputPath, Member.MemberName);
if (PathOrErr)
Member.MemberName =*PathOrErr);
// Write the archive; any error is printed prefixed with the output path.
if (Error E =
writeArchive(OutputPath, Members,
/*WriteSymtab=*/true, object::Archive::K_GNU,
/*Deterministic*/ true, Args.hasArg(OPT_llvmlibthin))) {
handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
llvm::errs() << OutputPath << ": " << EI.message() << "\n";
return 1;
return 0;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 7cfe4c8b5892..c7f2f4ec3ca1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1,1876 +1,1875 @@
//===- InstCombineSimplifyDemanded.cpp ------------------------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file contains logic for simplifying instructions based on information
// about how they are used.
#include "InstCombineInternal.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "instcombine"
// File-local record type for the table pulled in through the
// GET_AMDGPUImageDMaskIntrinsicTable_IMPL include below; the only visible
// field is `Intr`.
// NOTE(review): the struct terminator is absent and the `#include ""`
// directive has lost its file name in this extract. Restore from upstream.
namespace {
struct AMDGPUImageDMaskIntrinsic {
unsigned Intr;
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include ""
} // end anonymous namespace
/// Check to see if the specified operand of the specified instruction is a
/// constant integer. If so, check to see if there are any bits set in the
/// constant that are not demanded. If so, shrink the constant and return true.
///
/// \param I        Instruction whose operand is inspected; must be non-null.
/// \param OpNo     Operand index; must be below I->getNumOperands().
/// \param Demanded Mask of the bits that are demanded.
/// \returns true if the operand was replaced by `C & Demanded`, false if the
///          operand does not match m_APInt or has no undemanded bits set.
static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
                                   const APInt &Demanded) {
  assert(I && "No instruction?");
  assert(OpNo < I->getNumOperands() && "Operand index too large");

  // The operand must be a constant integer or splat integer.
  Value *Op = I->getOperand(OpNo);
  const APInt *C;
  if (!match(Op, m_APInt(C)))
    return false;

  // If there are no bits set that aren't demanded, nothing to do.
  if (C->isSubsetOf(Demanded))
    return false;

  // This instruction is producing bits that are not demanded. Shrink the RHS.
  I->setOperand(OpNo, ConstantInt::get(Op->getType(), *C & Demanded));
  return true;
}
/// Inst is an integer instruction that SimplifyDemandedBits knows about. See if
/// the instruction has any properties that allow us to simplify its operands.
///
/// All bits of the result are treated as demanded and the search starts at
/// depth 0 with \p Inst as the context instruction.
///
/// \returns false when SimplifyDemandedUseBits() returns null; true otherwise.
///          If the returned value is not \p Inst itself, the uses of \p Inst
///          are replaced with it.
bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
  unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
  KnownBits Known(BitWidth);
  APInt DemandedMask(APInt::getAllOnesValue(BitWidth));

  Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, Known,
                                     0, &Inst);
  if (!V)
    return false;
  // The instruction was simplified in place; there is nothing to replace.
  if (V == &Inst)
    return true;
  replaceInstUsesWith(Inst, V);
  return true;
}
/// This form of SimplifyDemandedBits simplifies the specified instruction
/// operand if possible, updating it in place. It returns true if it made any
/// change and false otherwise.
///
/// \param I            Instruction that owns the operand.
/// \param OpNo         Index of the operand to simplify.
/// \param DemandedMask Bits of the operand that are demanded.
/// \param Known        Forwarded to SimplifyDemandedUseBits().
/// \param Depth        Recursion depth forwarded to SimplifyDemandedUseBits().
// NOTE(review): this extract is incomplete -- the statement guarded by the
// `if (Instruction* OpInst = ...)` line and the closing brace are absent, so
// as written `replaceUse` reads as the guarded statement and `OpInst` is
// unused. Restore the missing line from upstream before use.
bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo,
const APInt &DemandedMask,
KnownBits &Known,
unsigned Depth) {
Use &U = I->getOperandUse(OpNo);
Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known,
Depth, I);
// A null result means nothing could be simplified.
if (!NewVal) return false;
if (Instruction* OpInst = dyn_cast<Instruction>(U))
replaceUse(U, NewVal);
return true;
/// This function attempts to replace V with a simpler value based on the
/// demanded bits. When this function is called, it is known that only the bits
/// set in DemandedMask of the result of V are ever used downstream.
/// Consequently, depending on the mask and V, it may be possible to replace V
/// with a constant or one of its operands. In such cases, this function does
/// the replacement and returns true. In all other cases, it returns false after
/// analyzing the expression and setting Known.One to all the bits that are
/// known to be one in the expression. Known.Zero contains all the bits that
/// are known to be zero in
/// the expression. These are provided to potentially allow the caller (which
/// might recursively be SimplifyDemandedBits itself) to simplify the
/// expression.
/// Known.One and Known.Zero always follow the invariant that:
/// Known.One & Known.Zero == 0.
/// That is, a bit can't be both 1 and 0. Note that the bits in Known.One and
/// Known.Zero may only be accurate for those bits set in DemandedMask. Note
/// also that the bitwidth of V, DemandedMask, Known.Zero and Known.One must all
/// be the same.
/// This returns null if it did not change anything and it permits no
/// simplification. This returns V itself if it did some simplification of V's
/// operands based on the information about what bits are demanded. This returns
/// some other non-null value if it found out that V is equal to another value
/// in the context where the specified bits are demanded, but not for all users.
Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
KnownBits &Known, unsigned Depth,
Instruction *CxtI) {
assert(V != nullptr && "Null pointer of Value???");
assert(Depth <= 6 && "Limit Search Depth");
uint32_t BitWidth = DemandedMask.getBitWidth();
Type *VTy = V->getType();
(!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) &&
Known.getBitWidth() == BitWidth &&
"Value *V, DemandedMask and Known must have same BitWidth");
if (isa<Constant>(V)) {
computeKnownBits(V, Known, Depth, CxtI);
return nullptr;
if (DemandedMask.isNullValue()) // Not demanding any bits from V.
return UndefValue::get(VTy);
if (Depth == 6) // Limit search depth.
return nullptr;
Instruction *I = dyn_cast<Instruction>(V);
if (!I) {
computeKnownBits(V, Known, Depth, CxtI);
return nullptr; // Only analyze instructions.
// If there are multiple uses of this value and we aren't at the root, then
// we can't do any simplifications of the operands, because DemandedMask
// only reflects the bits demanded by *one* of the users.
if (Depth != 0 && !I->hasOneUse())
return SimplifyMultipleUseDemandedBits(I, DemandedMask, Known, Depth, CxtI);
KnownBits LHSKnown(BitWidth), RHSKnown(BitWidth);
// If this is the root being simplified, allow it to have multiple uses,
// just set the DemandedMask to all bits so that we can try to simplify the
// operands. This allows visitTruncInst (for example) to simplify the
// operand of a trunc without duplicating all the logic below.
if (Depth == 0 && !V->hasOneUse())
switch (I->getOpcode()) {
computeKnownBits(I, Known, Depth, CxtI);
case Instruction::And: {
// If either the LHS or the RHS are Zero, the result is zero.
if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown,
Depth + 1))
return I;
assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
Known = LHSKnown & RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
return Constant::getIntegerValue(VTy, Known.One);
// If all of the demanded bits are known 1 on one side, return the other.
// These bits cannot contribute to the result of the 'and'.
if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
return I->getOperand(0);
if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
return I->getOperand(1);
// If the RHS is a constant, see if we can simplify it.
if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnown.Zero))
return I;
case Instruction::Or: {
// If either the LHS or the RHS are One, the result is One.
if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown,
Depth + 1))
return I;
assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
Known = LHSKnown | RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
return Constant::getIntegerValue(VTy, Known.One);
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'or'.
if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
return I->getOperand(0);
if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
return I->getOperand(1);
// If the RHS is a constant, see if we can simplify it.
if (ShrinkDemandedConstant(I, 1, DemandedMask))
return I;
case Instruction::Xor: {
if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1))
return I;
assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
Known = LHSKnown ^ RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
return Constant::getIntegerValue(VTy, Known.One);
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'xor'.
if (DemandedMask.isSubsetOf(RHSKnown.Zero))
return I->getOperand(0);
if (DemandedMask.isSubsetOf(LHSKnown.Zero))
return I->getOperand(1);
// If all of the demanded bits are known to be zero on one side or the
// other, turn this into an *inclusive* or.
// e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero)) {
Instruction *Or =
BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
return InsertNewInstWith(Or, *I);
// If all of the demanded bits on one side are known, and all of the set
// bits on that side are also known to be set on the other side, turn this
// into an AND, as we know the bits will be cleared.
// e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
if (DemandedMask.isSubsetOf(RHSKnown.Zero|RHSKnown.One) &&
RHSKnown.One.isSubsetOf(LHSKnown.One)) {
Constant *AndC = Constant::getIntegerValue(VTy,
~RHSKnown.One & DemandedMask);
Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
return InsertNewInstWith(And, *I);
// If the RHS is a constant, see if we can simplify it.
// FIXME: for XOR, we prefer to force bits to 1 if they will make a -1.
if (ShrinkDemandedConstant(I, 1, DemandedMask))
return I;
// If our LHS is an 'and' and if it has one use, and if any of the bits we
// are flipping are known to be set, then the xor is just resetting those
// bits to zero. We can just knock out bits from the 'and' and the 'xor',
// simplifying both of them.
if (Instruction *LHSInst = dyn_cast<Instruction>(I->getOperand(0)))
if (LHSInst->getOpcode() == Instruction::And && LHSInst->hasOneUse() &&
isa<ConstantInt>(I->getOperand(1)) &&
isa<ConstantInt>(LHSInst->getOperand(1)) &&
(LHSKnown.One & RHSKnown.One & DemandedMask) != 0) {
ConstantInt *AndRHS = cast<ConstantInt>(LHSInst->getOperand(1));
ConstantInt *XorRHS = cast<ConstantInt>(I->getOperand(1));
APInt NewMask = ~(LHSKnown.One & RHSKnown.One & DemandedMask);
Constant *AndC =
ConstantInt::get(I->getType(), NewMask & AndRHS->getValue());
Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
InsertNewInstWith(NewAnd, *I);
Constant *XorC =
ConstantInt::get(I->getType(), NewMask & XorRHS->getValue());
Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC);
return InsertNewInstWith(NewXor, *I);
case Instruction::Select: {
Value *LHS, *RHS;
SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor;
if (SPF == SPF_UMAX) {
// UMax(A, C) == A if ...
// The lowest non-zero bit of DemandMask is higher than the highest
// non-zero bit of C.
const APInt *C;
unsigned CTZ = DemandedMask.countTrailingZeros();
if (match(RHS, m_APInt(C)) && CTZ >= C->getActiveBits())
return LHS;
} else if (SPF == SPF_UMIN) {
// UMin(A, C) == A if ...
// The lowest non-zero bit of DemandMask is higher than the highest
// non-one bit of C.
// This comes from using DeMorgans on the above umax example.
const APInt *C;
unsigned CTZ = DemandedMask.countTrailingZeros();
if (match(RHS, m_APInt(C)) &&
CTZ >= C->getBitWidth() - C->countLeadingOnes())
return LHS;
// If this is a select as part of any other min/max pattern, don't simplify
// any further in case we break the structure.
return nullptr;
if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) ||
SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1))
return I;
assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
// If the operands are constants, see if we can simplify them.
// This is similar to ShrinkDemandedConstant, but for a select we want to
// try to keep the selected constants the same as icmp value constants, if
// we can. This helps not break apart (or helps put back together)
// canonical patterns like min and max.
auto CanonicalizeSelectConstant = [](Instruction *I, unsigned OpNo,
APInt DemandedMask) {
const APInt *SelC;
if (!match(I->getOperand(OpNo), m_APInt(SelC)))
return false;
// Get the constant out of the ICmp, if there is one.
const APInt *CmpC;
ICmpInst::Predicate Pred;
if (!match(I->getOperand(0), m_c_ICmp(Pred, m_APInt(CmpC), m_Value())) ||
CmpC->getBitWidth() != SelC->getBitWidth())
return ShrinkDemandedConstant(I, OpNo, DemandedMask);
// If the constant is already the same as the ICmp, leave it as-is.
if (*CmpC == *SelC)
return false;
// If the constants are not already the same, but can be with the demand
// mask, use the constant value from the ICmp.
if ((*CmpC & DemandedMask) == (*SelC & DemandedMask)) {
I->setOperand(OpNo, ConstantInt::get(I->getType(), *CmpC));
return true;
return ShrinkDemandedConstant(I, OpNo, DemandedMask);
if (CanonicalizeSelectConstant(I, 1, DemandedMask) ||
CanonicalizeSelectConstant(I, 2, DemandedMask))
return I;
// Only known if known in both the LHS and RHS.
Known.One = RHSKnown.One & LHSKnown.One;
Known.Zero = RHSKnown.Zero & LHSKnown.Zero;
case Instruction::ZExt:
case Instruction::Trunc: {
unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth);
KnownBits InputKnown(SrcBitWidth);
if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1))
return I;
assert(InputKnown.getBitWidth() == SrcBitWidth && "Src width changed?");
Known = InputKnown.zextOrTrunc(BitWidth);
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
case Instruction::BitCast:
if (!I->getOperand(0)->getType()->isIntOrIntVectorTy())
return nullptr; // vector->int or fp->int?
if (VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) {
if (VectorType *SrcVTy =
dyn_cast<VectorType>(I->getOperand(0)->getType())) {
if (DstVTy->getNumElements() != SrcVTy->getNumElements())
// Don't touch a bitcast between vectors of different element counts.
return nullptr;
} else
// Don't touch a scalar-to-vector bitcast.
return nullptr;
} else if (I->getOperand(0)->getType()->isVectorTy())
// Don't touch a vector-to-scalar bitcast.
return nullptr;
if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1))
return I;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
case Instruction::SExt: {
// Compute the bits in the result that are not present in the input.
unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
APInt InputDemandedBits = DemandedMask.trunc(SrcBitWidth);
// If any of the sign extended bits are demanded, we know that the sign
// bit is demanded.
if (DemandedMask.getActiveBits() > SrcBitWidth)
KnownBits InputKnown(SrcBitWidth);
if (SimplifyDemandedBits(I, 0, InputDemandedBits, InputKnown, Depth + 1))
return I;
// If the input sign bit is known zero, or if the NewBits are not demanded
// convert this into a zero extension.
if (InputKnown.isNonNegative() ||
DemandedMask.getActiveBits() <= SrcBitWidth) {
// Convert to ZExt cast.
CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
return InsertNewInstWith(NewCast, *I);
// If the sign bit of the input is known set or clear, then we know the
// top bits of the result.
Known = InputKnown.sext(BitWidth);
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
case Instruction::Add:
if ((DemandedMask & 1) == 0) {
// If we do not need the low bit, try to convert bool math to logic:
// add iN (zext i1 X), (sext i1 Y) --> sext (~X & Y) to iN
Value *X, *Y;
if (match(I, m_c_Add(m_OneUse(m_ZExt(m_Value(X))),
m_OneUse(m_SExt(m_Value(Y))))) &&
X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
// Truth table for inputs and output signbits:
// X:0 | X:1
// ----------
// Y:0 | 0 | 0 |
// Y:1 | -1 | 0 |
// ----------
IRBuilderBase::InsertPointGuard Guard(Builder);
Value *AndNot = Builder.CreateAnd(Builder.CreateNot(X), Y);
return Builder.CreateSExt(AndNot, VTy);
// add iN (sext i1 X), (sext i1 Y) --> sext (X | Y) to iN
// TODO: Relax the one-use checks because we are removing an instruction?
if (match(I, m_Add(m_OneUse(m_SExt(m_Value(X))),
m_OneUse(m_SExt(m_Value(Y))))) &&
X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
// Truth table for inputs and output signbits:
// X:0 | X:1
// -----------
// Y:0 | -1 | -1 |
// Y:1 | -1 | 0 |
// -----------
IRBuilderBase::InsertPointGuard Guard(Builder);
Value *Or = Builder.CreateOr(X, Y);
return Builder.CreateSExt(Or, VTy);
case Instruction::Sub: {
/// If the high-bits of an ADD/SUB are not demanded, then we do not care
/// about the high bits of the operands.
unsigned NLZ = DemandedMask.countLeadingZeros();
// Right fill the mask of bits for this ADD/SUB to demand the most
// significant bit and all those below it.
APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
if (ShrinkDemandedConstant(I, 0, DemandedFromOps) ||
SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnown, Depth + 1) ||
ShrinkDemandedConstant(I, 1, DemandedFromOps) ||
SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnown, Depth + 1)) {
if (NLZ > 0) {
// Disable the nsw and nuw flags here: We can no longer guarantee that
// we won't wrap after simplification. Removing the nsw/nuw flags is
// legal here because the top bit is not demanded.
BinaryOperator &BinOP = *cast<BinaryOperator>(I);
return I;
// If we are known to be adding/subtracting zeros to every bit below
// the highest demanded bit, we just return the other side.
if (DemandedFromOps.isSubsetOf(RHSKnown.Zero))
return I->getOperand(0);
// We can't do this with the LHS for subtraction, unless we are only
// demanding the LSB.
if ((I->getOpcode() == Instruction::Add ||
DemandedFromOps.isOneValue()) &&
return I->getOperand(1);
// Otherwise just compute the known bits of the result.
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
Known = KnownBits::computeForAddSub(I->getOpcode() == Instruction::Add,
NSW, LHSKnown, RHSKnown);
case Instruction::Shl: {
const APInt *SA;
if (match(I->getOperand(1), m_APInt(SA))) {
const APInt *ShrAmt;
if (match(I->getOperand(0), m_Shr(m_Value(), m_APInt(ShrAmt))))
if (Instruction *Shr = dyn_cast<Instruction>(I->getOperand(0)))
if (Value *R = simplifyShrShlDemandedBits(Shr, *ShrAmt, I, *SA,
DemandedMask, Known))
return R;
uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
// If the shift is NUW/NSW, then it does demand the high bits.
ShlOperator *IOp = cast<ShlOperator>(I);
if (IOp->hasNoSignedWrap())
else if (IOp->hasNoUnsignedWrap())
if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
return I;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
bool SignBitZero = Known.Zero.isSignBitSet();
bool SignBitOne = Known.One.isSignBitSet();
Known.Zero <<= ShiftAmt;
Known.One <<= ShiftAmt;
// low bits known zero.
if (ShiftAmt)
// If this shift has "nsw" keyword, then the result is either a poison
// value or has the same sign bit as the first operand.
if (IOp->hasNoSignedWrap()) {
if (SignBitZero)
else if (SignBitOne)
if (Known.hasConflict())
return UndefValue::get(I->getType());
} else {
computeKnownBits(I, Known, Depth, CxtI);
case Instruction::LShr: {
const APInt *SA;
if (match(I->getOperand(1), m_APInt(SA))) {
uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
// Unsigned shift right.
APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (cast<LShrOperator>(I)->isExact())
if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
return I;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
if (ShiftAmt)
Known.Zero.setHighBits(ShiftAmt); // high bits known zero.
} else {
computeKnownBits(I, Known, Depth, CxtI);
case Instruction::AShr: {
// If this is an arithmetic shift right and only the low-bit is set, we can
// always convert this into a logical shr, even if the shift amount is
// variable. The low bit of the shift cannot be an input sign bit unless
// the shift amount is >= the size of the datatype, which is undefined.
if (DemandedMask.isOneValue()) {
// Perform the logical shift right.
Instruction *NewVal = BinaryOperator::CreateLShr(
I->getOperand(0), I->getOperand(1), I->getName());
return InsertNewInstWith(NewVal, *I);
// If the sign bit is the only bit demanded by this ashr, then there is no
// need to do it, the shift doesn't change the high bit.
if (DemandedMask.isSignMask())
return I->getOperand(0);
const APInt *SA;
if (match(I->getOperand(1), m_APInt(SA))) {
uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
// Signed shift right.
APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
// If any of the high bits are demanded, we should set the sign bit as
// demanded.
if (DemandedMask.countLeadingZeros() <= ShiftAmt)
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (cast<AShrOperator>(I)->isExact())
if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
return I;
unsigned SignBits = ComputeNumSignBits(I->getOperand(0), Depth + 1, CxtI);
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
// Compute the new bits that are at the top now plus sign bits.
APInt HighBits(APInt::getHighBitsSet(
BitWidth, std::min(SignBits + ShiftAmt - 1, BitWidth)));
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
assert(BitWidth > ShiftAmt && "Shift amount not saturated?");
if (Known.Zero[BitWidth-ShiftAmt-1] ||
!DemandedMask.intersects(HighBits)) {
BinaryOperator *LShr = BinaryOperator::CreateLShr(I->getOperand(0),
return InsertNewInstWith(LShr, *I);
} else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one.
Known.One |= HighBits;
} else {
computeKnownBits(I, Known, Depth, CxtI);
case Instruction::UDiv: {
// UDiv doesn't demand low bits that are zero in the divisor.
const APInt *SA;
if (match(I->getOperand(1), m_APInt(SA))) {
// If the udiv is exact, then it does demand the low bits.
if (cast<UDivOperator>(I)->isExact())
// FIXME: Take the demanded mask of the result into account.
unsigned RHSTrailingZeros = SA->countTrailingZeros();
APInt DemandedMaskIn =
APInt::getHighBitsSet(BitWidth, BitWidth - RHSTrailingZeros);
if (SimplifyDemandedBits(I, 0, DemandedMaskIn, LHSKnown, Depth + 1))
return I;
// Propagate zero bits from the input.
BitWidth, LHSKnown.Zero.countLeadingOnes() + RHSTrailingZeros));
} else {
computeKnownBits(I, Known, Depth, CxtI);
case Instruction::SRem:
if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
// X % -1 demands all the bits because we don't want to introduce
// INT_MIN % -1 (== undef) by accident.
if (Rem->isMinusOne())
APInt RA = Rem->getValue().abs();
if (RA.isPowerOf2()) {
if (DemandedMask.ult(RA)) // srem won't affect demanded bits
return I->getOperand(0);
APInt LowBits = RA - 1;
APInt Mask2 = LowBits | APInt::getSignMask(BitWidth);
if (SimplifyDemandedBits(I, 0, Mask2, LHSKnown, Depth + 1))
return I;
// The low bits of LHS are unchanged by the srem.
Known.Zero = LHSKnown.Zero & LowBits;
Known.One = LHSKnown.One & LowBits;
// If LHS is non-negative or has all low bits zero, then the upper bits
// are all zero.
if (LHSKnown.isNonNegative() || LowBits.isSubsetOf(LHSKnown.Zero))
Known.Zero |= ~LowBits;
// If LHS is negative and not all low bits are zero, then the upper bits
// are all one.
if (LHSKnown.isNegative() && LowBits.intersects(LHSKnown.One))
Known.One |= ~LowBits;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
// The sign bit is the LHS's sign bit, except when the result of the
// remainder is zero.
if (DemandedMask.isSignBitSet()) {
computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI);
// If it's known zero, our sign bit is also zero.
if (LHSKnown.isNonNegative())
case Instruction::URem: {
KnownBits Known2(BitWidth);
APInt AllOnes = APInt::getAllOnesValue(BitWidth);
if (SimplifyDemandedBits(I, 0, AllOnes, Known2, Depth + 1) ||
SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))
return I;
unsigned Leaders = Known2.countMinLeadingZeros();
Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
case Instruction::Call: {
bool KnownBitsComputed = false;
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::bswap: {
// If the only bits demanded come from one byte of the bswap result,
// just shift the input byte into position to eliminate the bswap.
unsigned NLZ = DemandedMask.countLeadingZeros();
unsigned NTZ = DemandedMask.countTrailingZeros();
// Round NTZ down to the next byte. If we have 11 trailing zeros, then
// we need all the bits down to bit 8. Likewise, round NLZ. If we
// have 14 leading zeros, round to 8.
NLZ &= ~7;
NTZ &= ~7;
// If we need exactly one byte, we can do this transformation.
if (BitWidth-NLZ-NTZ == 8) {
unsigned ResultBit = NTZ;
unsigned InputBit = BitWidth-NTZ-8;
// Replace this with either a left or right shift to get the byte into
// the right place.
Instruction *NewVal;
if (InputBit > ResultBit)
NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0),
ConstantInt::get(I->getType(), InputBit-ResultBit));
NewVal = BinaryOperator::CreateShl(II->getArgOperand(0),
ConstantInt::get(I->getType(), ResultBit-InputBit));
return InsertNewInstWith(NewVal, *I);
case Intrinsic::fshr:
case Intrinsic::fshl: {
const APInt *SA;
if (!match(I->getOperand(2), m_APInt(SA)))
// Normalize to funnel shift left. APInt shifts of BitWidth are well-
// defined, so no need to special-case zero shifts here.
uint64_t ShiftAmt = SA->urem(BitWidth);
if (II->getIntrinsicID() == Intrinsic::fshr)
ShiftAmt = BitWidth - ShiftAmt;
APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt));
APInt DemandedMaskRHS(DemandedMask.shl(BitWidth - ShiftAmt));
if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) ||
SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1))
return I;
Known.Zero = LHSKnown.Zero.shl(ShiftAmt) |
RHSKnown.Zero.lshr(BitWidth - ShiftAmt);
Known.One = LHSKnown.One.shl(ShiftAmt) |
RHSKnown.One.lshr(BitWidth - ShiftAmt);
KnownBitsComputed = true;
case Intrinsic::x86_mmx_pmovmskb:
case Intrinsic::x86_sse_movmsk_ps:
case Intrinsic::x86_sse2_movmsk_pd:
case Intrinsic::x86_sse2_pmovmskb_128:
case Intrinsic::x86_avx_movmsk_ps_256:
case Intrinsic::x86_avx_movmsk_pd_256:
case Intrinsic::x86_avx2_pmovmskb: {
// MOVMSK copies the vector elements' sign bits to the low bits
// and zeros the high bits.
unsigned ArgWidth;
if (II->getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
} else {
auto Arg = II->getArgOperand(0);
auto ArgType = cast<VectorType>(Arg->getType());
ArgWidth = ArgType->getNumElements();
// If we don't need any of low bits then return zero,
// we know that DemandedMask is non-zero already.
APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
if (DemandedElts.isNullValue())
return ConstantInt::getNullValue(VTy);
// We know that the upper bits are set to zero.
KnownBitsComputed = true;
case Intrinsic::x86_sse42_crc32_64_64:
KnownBitsComputed = true;
if (!KnownBitsComputed)
computeKnownBits(V, Known, Depth, CxtI);
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
return Constant::getIntegerValue(VTy, Known.One);
return nullptr;
/// Helper routine of SimplifyDemandedUseBits. It computes the Known bits of
/// \p I and also tries simplifications that can be done based on
/// DemandedMask, but without modifying the Instruction.
///
/// \param I            Instruction whose value is being queried.
/// \param DemandedMask Bits of the result that the querying user needs; its
///                     width is used as the bit width of every KnownBits
///                     object created here.
/// \param Known        Output: known-zero / known-one bits computed for \p I.
/// \param Depth        Current recursion depth; operand queries use Depth + 1.
/// \param CxtI         Context instruction forwarded to computeKnownBits.
/// \returns a constant when every demanded bit is known, one of the operands
/// of \p I when that operand agrees with \p I on every demanded bit, or
/// nullptr when no simpler value was found.
///
/// NOTE(review): in this excerpt several lines of the function are not
/// visible (closing braces, the ends of some multi-line calls, and the
/// statements that terminate each case). Comments below describe only what
/// is visible.
Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
const APInt &DemandedMask,
KnownBits &Known,
unsigned Depth,
Instruction *CxtI) {
unsigned BitWidth = DemandedMask.getBitWidth();
Type *ITy = I->getType();
KnownBits LHSKnown(BitWidth);
KnownBits RHSKnown(BitWidth);
// Despite the fact that we can't simplify this instruction in every user's
// context, we can at least compute the known bits, and we can
// do simplifications that apply to *just* the one user if we know that
// this instruction has a simpler value in that context.
switch (I->getOpcode()) {
case Instruction::And: {
// If either the LHS or the RHS are Zero, the result is zero.
computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
// NOTE(review): the next call is cut off after "Depth + 1," in this
// excerpt; its last argument and closing parenthesis are not visible.
computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
Known = LHSKnown & RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
return Constant::getIntegerValue(ITy, Known.One);
// If all of the demanded bits are known 1 on one side, return the other.
// These bits cannot contribute to the result of the 'and' in this
// context.
// Operand 0 can stand in for the 'and' when each demanded bit is either
// known zero in the LHS (the result is 0, same as the LHS) or known one in
// the RHS (the result equals the LHS bit).
if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
return I->getOperand(0);
// Symmetric check for returning operand 1.
if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
return I->getOperand(1);
case Instruction::Or: {
// We can simplify (X|Y) -> X or Y in the user's context if we know that
// only bits from X or Y are demanded.
// If either the LHS or the RHS are One, the result is One.
computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
// NOTE(review): call truncated in this excerpt, as in the 'and' case.
computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
Known = LHSKnown | RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
return Constant::getIntegerValue(ITy, Known.One);
// If all of the demanded bits are known zero on one side, return the
// other. These bits cannot contribute to the result of the 'or' in this
// context.
// Operand 0 can stand in for the 'or' when each demanded bit is either
// known one in the LHS (the result is 1, same as the LHS) or known zero in
// the RHS (the result equals the LHS bit).
if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
return I->getOperand(0);
// Symmetric check for returning operand 1.
if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
return I->getOperand(1);
case Instruction::Xor: {
// We can simplify (X^Y) -> X or Y in the user's context if we know that
// only bits from X or Y are demanded.
computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
// NOTE(review): call truncated in this excerpt, as in the 'and' case.
computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
Known = LHSKnown ^ RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
return Constant::getIntegerValue(ITy, Known.One);
// If all of the demanded bits are known zero on one side, return the
// other.
// XOR with a known-zero bit leaves the other operand's bit unchanged.
if (DemandedMask.isSubsetOf(RHSKnown.Zero))
return I->getOperand(0);
if (DemandedMask.isSubsetOf(LHSKnown.Zero))
return I->getOperand(1);
// Compute the Known bits to simplify things downstream.
computeKnownBits(I, Known, Depth, CxtI);
// If this user is only demanding bits that we know, return the known
// constant.
if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
return Constant::getIntegerValue(ITy, Known.One);
// No simpler value was found for this user.
return nullptr;
/// Helper routine of SimplifyDemandedUseBits. It tries to simplify
/// "E1 = (X lsr C1) << C2", where C1 and C2 are constants, into
/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign
/// of "C2-C1". The right shift may be logical or arithmetic; the code below
/// distinguishes the two via the opcode of \p Shr.
///
/// Suppose E1 and E2 are generally different in bits S={bm, bm+1,
/// ..., bn}, without considering the specific value X is holding.
/// This transformation is legal iff one of the following conditions holds:
/// 1) All the bits in S are 0; in this case E1 == E2.
/// 2) We don't care about those bits in S, per the input DemandedMask.
/// 3) Combination of 1) and 2). Some bits in S are 0, and we don't care about
/// the remaining bits.
/// Currently we only test condition 2).
///
/// \param Shr          The inner right-shift instruction; its operand 0 is X.
/// \param ShrOp1       Constant right-shift amount (C1).
/// \param Shl          The outer left-shift instruction; the replacement is
///                     inserted relative to it via InsertNewInstWith.
/// \param ShlOp1       Constant left-shift amount (C2).
/// \param DemandedMask Bits of the result the caller cares about.
/// \param Known        Updated with known-zero low bits of the result.
///
/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was
/// not successful.
///
/// NOTE(review): some lines of this function are not visible in this excerpt
/// (closing braces and a few statements); see the notes below.
Value *
InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
Instruction *Shl, const APInt &ShlOp1,
const APInt &DemandedMask,
KnownBits &Known) {
// A zero shift amount on either side is rejected here.
if (!ShlOp1 || !ShrOp1)
return nullptr; // No-op.
Value *VarX = Shr->getOperand(0);
Type *Ty = VarX->getType();
unsigned BitWidth = Ty->getScalarSizeInBits();
// Shift amounts at or beyond the bit width are rejected as well.
if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth))
return nullptr; // Undef.
unsigned ShlAmt = ShlOp1.getZExtValue();
unsigned ShrAmt = ShrOp1.getZExtValue();
// Record low result bits as known zero, restricted to the demanded bits.
// NOTE(review): a left shift by ShlAmt clears ShlAmt low bits, but only
// ShlAmt - 1 bits are marked here - confirm the "- 1" is intentional.
Known.Zero.setLowBits(ShlAmt - 1);
Known.Zero &= DemandedMask;
// BitMask1 is the result of applying both original shifts to an all-ones
// value; BitMask2 is the result of applying the single combined shift.
APInt BitMask1(APInt::getAllOnesValue(BitWidth));
APInt BitMask2(APInt::getAllOnesValue(BitWidth));
bool isLshr = (Shr->getOpcode() == Instruction::LShr);
BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
(BitMask1.ashr(ShrAmt) << ShlAmt);
if (ShrAmt <= ShlAmt) {
BitMask2 <<= (ShlAmt - ShrAmt);
} else {
BitMask2 = isLshr ? BitMask2.lshr(ShrAmt - ShlAmt):
BitMask2.ashr(ShrAmt - ShlAmt);
// Check if condition-2 (see the comment to this function) is satisfied.
if ((BitMask1 & DemandedMask) == (BitMask2 & DemandedMask)) {
// Equal shift amounts cancel out on the demanded bits: X itself will do.
if (ShrAmt == ShlAmt)
return VarX;
// A new instruction is only created when Shr has a single use.
if (!Shr->hasOneUse())
return nullptr;
BinaryOperator *New;
if (ShrAmt < ShlAmt) {
Constant *Amt = ConstantInt::get(VarX->getType(), ShlAmt - ShrAmt);
New = BinaryOperator::CreateShl(VarX, Amt);
// NOTE(review): Orig is not used in the visible lines; presumably flags of
// the original shl were copied onto New here - lines appear to be missing.
BinaryOperator *Orig = cast<BinaryOperator>(Shl);
} else {
Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt);
New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) :
BinaryOperator::CreateAShr(VarX, Amt);
// NOTE(review): the statement guarded by this isExact() check is not
// visible in this excerpt; as shown, the check would guard the return below.
if (cast<BinaryOperator>(Shr)->isExact())
return InsertNewInstWith(New, *Shl);
return nullptr;
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
///
/// \param II           The intrinsic call; its return type is cast to
///                     VectorType below.
/// \param DemandedElts Bit i set means element i of the result is used.
/// \param DMaskIdx     Negative for the buffer case; otherwise the index of
///                     the dmask argument of an image intrinsic.
/// \returns nullptr when nothing is rewritten, an undef value when no element
/// is demanded, or a value built from a new call with a narrower return type
/// (an insertelement for one element, a shufflevector otherwise).
///
/// NOTE(review): this excerpt is part of a unified diff. Lines that start
/// with "- " are removed by the patch and lines that start with "+ " are
/// added by it. Several other lines (closing braces, case terminators, and
/// the tails of some statements) are not visible here.
Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
APInt DemandedElts,
int DMaskIdx) {
// Removed by the patch: early bail-out for buffer intrinsics whose scalar
// size is not 32 bits when exactly three low elements are active.
- // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported.
- if (DMaskIdx < 0 &&
- II->getType()->getScalarSizeInBits() != 32 &&
- DemandedElts.getActiveBits() == 3)
- return nullptr;
auto *IIVTy = cast<VectorType>(II->getType());
unsigned VWidth = IIVTy->getNumElements();
// Nothing to narrow for a single-element vector.
if (VWidth == 1)
return nullptr;
IRBuilderBase::InsertPointGuard Guard(Builder);
// Assume the arguments are unchanged and later override them, if needed.
SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end());
if (DMaskIdx < 0) {
// Buffer case.
const unsigned ActiveBits = DemandedElts.getActiveBits();
const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
// Start assuming the prefix of elements is demanded, but possibly clear
// some other bits if there are trailing zeros (unused components at front)
// and update offset.
DemandedElts = (1 << ActiveBits) - 1;
if (UnusedComponentsAtFront > 0) {
// Sentinel meaning "do not adjust the offset for this intrinsic".
static const unsigned InvalidOffsetIdx = 0xf;
unsigned OffsetIdx;
// Select which argument holds the offset, per intrinsic.
// NOTE(review): the statements that end each case (and what appears to be
// an else/default label) are not visible in this excerpt.
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_raw_buffer_load:
OffsetIdx = 1;
case Intrinsic::amdgcn_s_buffer_load:
// If resulting type is vec3, there is no point in trimming the
// load with updated offset, as the vec3 would most likely be widened to
// vec4 anyway during lowering.
if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
OffsetIdx = InvalidOffsetIdx;
OffsetIdx = 1;
case Intrinsic::amdgcn_struct_buffer_load:
OffsetIdx = 2;
// TODO: handle tbuffer* intrinsics.
OffsetIdx = InvalidOffsetIdx;
if (OffsetIdx != InvalidOffsetIdx) {
// Clear demanded bits and update the offset.
DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
auto *Offset = II->getArgOperand(OffsetIdx);
// NOTE(review): the initializer of SingleComponentSizeInBits is not visible
// in this excerpt.
unsigned SingleComponentSizeInBits =
// Byte offset that skips the unused leading components.
unsigned OffsetAdd =
UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
Args[OffsetIdx] = Builder.CreateAdd(Offset, OffsetAddVal);
} else {
// Image case.
ConstantInt *DMask = cast<ConstantInt>(II->getArgOperand(DMaskIdx));
// Only the low four bits of the dmask are considered.
unsigned DMaskVal = DMask->getZExtValue() & 0xf;
// Mask off values that are undefined because the dmask doesn't cover them
DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
// Rebuild the dmask, keeping only bits whose loaded element is demanded.
// NOTE(review): OrigLoadIdx is never advanced in the visible lines; the
// increment appears to be among the missing lines.
unsigned NewDMaskVal = 0;
unsigned OrigLoadIdx = 0;
for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
const unsigned Bit = 1 << SrcIdx;
if (!!(DMaskVal & Bit)) {
if (!!DemandedElts[OrigLoadIdx])
NewDMaskVal |= Bit;
if (DMaskVal != NewDMaskVal)
Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
unsigned NewNumElts = DemandedElts.countPopulation();
// No element is demanded at all: the whole result can be undef.
if (!NewNumElts)
return UndefValue::get(II->getType());
// Added by the patch: bail out for 16-bit element types whenever the
// narrowed result would have three elements (buffer and image cases alike).
+ // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are
+ // fully supported.
+ if (II->getType()->getScalarSizeInBits() == 16 && NewNumElts == 3)
+ return nullptr;
// The vector cannot be narrowed; at most the dmask argument is updated in
// place on the existing call.
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
if (DMaskIdx >= 0)
II->setArgOperand(DMaskIdx, Args[DMaskIdx]);
return nullptr;
// Validate function argument and return types, extracting overloaded types
// along the way.
SmallVector<Type *, 6> OverloadTys;
if (!Intrinsic::getIntrinsicSignature(II->getCalledFunction(), OverloadTys))
return nullptr;
Module *M = II->getParent()->getParent()->getParent();
Type *EltTy = IIVTy->getElementType();
// A single remaining element becomes a scalar; otherwise a shorter vector.
Type *NewTy =
(NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
// Overloaded type 0 is replaced with the narrowed type.
OverloadTys[0] = NewTy;
Function *NewIntrin =
Intrinsic::getDeclaration(M, II->getIntrinsicID(), OverloadTys);
CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
if (NewNumElts == 1) {
// NOTE(review): the index argument of this insertelement is cut off in this
// excerpt.
return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall,
// Build a shuffle mask that places the narrowed result's elements back at
// their original positions.
// NOTE(review): the statements that fill EltMask and advance NewLoadIdx are
// not visible in this excerpt.
SmallVector<int, 8> EltMask;
unsigned NewLoadIdx = 0;
for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
if (!!DemandedElts[OrigLoadIdx])
Value *Shuffle =
Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);
return Shuffle;
/// The specified value produces a vector with any number of elements.
/// This method analyzes which elements of the operand are undef and returns
/// that information in UndefElts.
/// DemandedElts contains the set of elements that are actually used by the
/// caller, and by default (AllowMultipleUsers equals false) the value is
/// simplified only if it has a single caller. If AllowMultipleUsers is set
/// to true, DemandedElts refers to the union of sets of elements that are
/// used by all callers.
/// If the information about demanded elements can be used to simplify the
/// operation, the operation is simplified, then the resultant value is
/// returned. This returns null if no change was made.
Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt &UndefElts,
unsigned Depth,
bool AllowMultipleUsers) {
// Cannot analyze scalable type. The number of vector elements is not a
// compile-time constant.
if (isa<ScalableVectorType>(V->getType()))
return nullptr;
unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
APInt EltMask(APInt::getAllOnesValue(VWidth));
assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
if (isa<UndefValue>(V)) {
// If the entire vector is undefined, just return this info.
UndefElts = EltMask;
return nullptr;
if (DemandedElts.isNullValue()) { // If nothing is demanded, provide undef.
UndefElts = EltMask;
return UndefValue::get(V->getType());
UndefElts = 0;
if (auto *C = dyn_cast<Constant>(V)) {
// Check if this is identity. If so, return 0 since we are not simplifying
// anything.
if (DemandedElts.isAllOnesValue())
return nullptr;
Type *EltTy = cast<VectorType>(V->getType())->getElementType();
Constant *Undef = UndefValue::get(EltTy);
SmallVector<Constant*, 16> Elts;
for (unsigned i = 0; i != VWidth; ++i) {
if (!DemandedElts[i]) { // If not demanded, set to undef.
Constant *Elt = C->getAggregateElement(i);
if (!Elt) return nullptr;
if (isa<UndefValue>(Elt)) { // Already undef.
} else { // Otherwise, defined.
// If we changed the constant, return it.
Constant *NewCV = ConstantVector::get(Elts);
return NewCV != C ? NewCV : nullptr;
// Limit search depth.
if (Depth == 10)
return nullptr;
if (!AllowMultipleUsers) {
// If multiple users are using the root value, proceed with
// simplification conservatively assuming that all elements
// are needed.
if (!V->hasOneUse()) {
// Quit if we find multiple users of a non-root value though.
// They'll be handled when it's their turn to be visited by
// the main instcombine process.
if (Depth != 0)
// TODO: Just compute the UndefElts information recursively.
return nullptr;
// Conservatively assume that all elements are needed.
DemandedElts = EltMask;
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return nullptr; // Only analyze instructions.
bool MadeChange = false;
auto simplifyAndSetOp = [&](Instruction *Inst, unsigned OpNum,
APInt Demanded, APInt &Undef) {
auto *II = dyn_cast<IntrinsicInst>(Inst);
Value *Op = II ? II->getArgOperand(OpNum) : Inst->getOperand(OpNum);
if (Value *V = SimplifyDemandedVectorElts(Op, Demanded, Undef, Depth + 1)) {
replaceOperand(*Inst, OpNum, V);
MadeChange = true;
APInt UndefElts2(VWidth, 0);
APInt UndefElts3(VWidth, 0);
switch (I->getOpcode()) {
default: break;
case Instruction::GetElementPtr: {
// The LangRef requires that struct geps have all constant indices. As
// such, we can't convert any operand to partial undef.
auto mayIndexStructType = [](GetElementPtrInst &GEP) {
for (auto I = gep_type_begin(GEP), E = gep_type_end(GEP);
I != E; I++)
if (I.isStruct())
return true;;
return false;
if (mayIndexStructType(cast<GetElementPtrInst>(*I)))
// Conservatively track the demanded elements back through any vector
// operands we may have. We know there must be at least one, or we
// wouldn't have a vector result to get here. Note that we intentionally
// merge the undef bits here since gepping with either an undef base or
// index results in undef.
for (unsigned i = 0; i < I->getNumOperands(); i++) {
if (isa<UndefValue>(I->getOperand(i))) {
// If the entire vector is undefined, just return this info.
UndefElts = EltMask;
return nullptr;
if (I->getOperand(i)->getType()->isVectorTy()) {
APInt UndefEltsOp(VWidth, 0);
simplifyAndSetOp(I, i, DemandedElts, UndefEltsOp);
UndefElts |= UndefEltsOp;
case Instruction::InsertElement: {
// If this is a variable index, we don't know which element it overwrites.
// demand exactly the same input as we produce.
ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2));
if (!Idx) {
// Note that we can't propagate undef elt info, because we don't know
// which elt is getting updated.
simplifyAndSetOp(I, 0, DemandedElts, UndefElts2);
// The element inserted overwrites whatever was there, so the input demanded
// set is simpler than the output set.
unsigned IdxNo = Idx->getZExtValue();
APInt PreInsertDemandedElts = DemandedElts;
if (IdxNo < VWidth)
simplifyAndSetOp(I, 0, PreInsertDemandedElts, UndefElts);
// If this is inserting an element that isn't demanded, remove this
// insertelement.
if (IdxNo >= VWidth || !DemandedElts[IdxNo]) {
return I->getOperand(0);
// The inserted element is defined.
case Instruction::ShuffleVector: {
auto *Shuffle = cast<ShuffleVectorInst>(I);
assert(Shuffle->getOperand(0)->getType() ==
Shuffle->getOperand(1)->getType() &&
"Expected shuffle operands to have same type");
unsigned OpWidth =
// Handle trivial case of a splat. Only check the first element of LHS
// operand.
if (all_of(Shuffle->getShuffleMask(), [](int Elt) { return Elt == 0; }) &&
DemandedElts.isAllOnesValue()) {
if (!isa<UndefValue>(I->getOperand(1))) {
I->setOperand(1, UndefValue::get(I->getOperand(1)->getType()));
MadeChange = true;
APInt LeftDemanded(OpWidth, 1);
APInt LHSUndefElts(OpWidth, 0);
simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
if (LHSUndefElts[0])
UndefElts = EltMask;
APInt LeftDemanded(OpWidth, 0), RightDemanded(OpWidth, 0);
for (unsigned i = 0; i < VWidth; i++) {
if (DemandedElts[i]) {
unsigned MaskVal = Shuffle->getMaskValue(i);
if (MaskVal != -1u) {
assert(MaskVal < OpWidth * 2 &&
"shufflevector mask index out of range!");
if (MaskVal < OpWidth)
RightDemanded.setBit(MaskVal - OpWidth);
APInt LHSUndefElts(OpWidth, 0);
simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
APInt RHSUndefElts(OpWidth, 0);
simplifyAndSetOp(I, 1, RightDemanded, RHSUndefElts);
// If this shuffle does not change the vector length and the elements
// demanded by this shuffle are an identity mask, then this shuffle is
// unnecessary.
// We are assuming canonical form for the mask, so the source vector is
// operand 0 and operand 1 is not used.
// Note that if an element is demanded and this shuffle mask is undefined
// for that element, then the shuffle is not considered an identity
// operation. The shuffle prevents poison from the operand vector from
// leaking to the result by replacing poison with an undefined value.
if (VWidth == OpWidth) {
bool IsIdentityShuffle = true;
for (unsigned i = 0; i < VWidth; i++) {
unsigned MaskVal = Shuffle->getMaskValue(i);
if (DemandedElts[i] && i != MaskVal) {
IsIdentityShuffle = false;
if (IsIdentityShuffle)
return Shuffle->getOperand(0);
bool NewUndefElts = false;
unsigned LHSIdx = -1u, LHSValIdx = -1u;
unsigned RHSIdx = -1u, RHSValIdx = -1u;
bool LHSUniform = true;
bool RHSUniform = true;
for (unsigned i = 0; i < VWidth; i++) {
unsigned MaskVal = Shuffle->getMaskValue(i);
if (MaskVal == -1u) {
} else if (!DemandedElts[i]) {
NewUndefElts = true;
} else if (MaskVal < OpWidth) {
if (LHSUndefElts[MaskVal]) {
NewUndefElts = true;
} else {
LHSIdx = LHSIdx == -1u ? i : OpWidth;
LHSValIdx = LHSValIdx == -1u ? MaskVal : OpWidth;
LHSUniform = LHSUniform && (MaskVal == i);
} else {
if (RHSUndefElts[MaskVal - OpWidth]) {
NewUndefElts = true;
} else {
RHSIdx = RHSIdx == -1u ? i : OpWidth;
RHSValIdx = RHSValIdx == -1u ? MaskVal - OpWidth : OpWidth;
RHSUniform = RHSUniform && (MaskVal - OpWidth == i);
// Try to transform shuffle with constant vector and single element from
// this constant vector to single insertelement instruction.
// shufflevector V, C, <v1, v2, .., ci, .., vm> ->
// insertelement V, C[ci], ci-n
if (OpWidth == Shuffle->getType()->getNumElements()) {
Value *Op = nullptr;
Constant *Value = nullptr;
unsigned Idx = -1u;
// Find constant vector with the single element in shuffle (LHS or RHS).
if (LHSIdx < OpWidth && RHSUniform) {
if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) {
Op = Shuffle->getOperand(1);
Value = CV->getOperand(LHSValIdx);
Idx = LHSIdx;
if (RHSIdx < OpWidth && LHSUniform) {
if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) {
Op = Shuffle->getOperand(0);
Value = CV->getOperand(RHSValIdx);
Idx = RHSIdx;
// Found constant vector with single element - convert to insertelement.
if (Op && Value) {
Instruction *New = InsertElementInst::Create(
Op, Value, ConstantInt::get(Type::getInt32Ty(I->getContext()), Idx),
InsertNewInstWith(New, *Shuffle);
return New;
if (NewUndefElts) {
// Add additional discovered undefs.
SmallVector<int, 16> Elts;
for (unsigned i = 0; i < VWidth; ++i) {
if (UndefElts[i])
MadeChange = true;
case Instruction::Select: {
// If this is a vector select, try to transform the select condition based
// on the current demanded elements.
SelectInst *Sel = cast<SelectInst>(I);
if (Sel->getCondition()->getType()->isVectorTy()) {
// TODO: We are not doing anything with UndefElts based on this call.
// It is overwritten below based on the other select operands. If an
// element of the select condition is known undef, then we are free to
// choose the output value from either arm of the select. If we know that
// one of those values is undef, then the output can be undef.
simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
// Next, see if we can transform the arms of the select.
APInt DemandedLHS(DemandedElts), DemandedRHS(DemandedElts);
if (auto *CV = dyn_cast<ConstantVector>(Sel->getCondition())) {
for (unsigned i = 0; i < VWidth; i++) {
// isNullValue() always returns false when called on a ConstantExpr.
// Skip constant expressions to avoid propagating incorrect information.
Constant *CElt = CV->getAggregateElement(i);
if (isa<ConstantExpr>(CElt))
// TODO: If a select condition element is undef, we can demand from
// either side. If one side is known undef, choosing that side would
// propagate undef.
if (CElt->isNullValue())
simplifyAndSetOp(I, 1, DemandedLHS, UndefElts2);
simplifyAndSetOp(I, 2, DemandedRHS, UndefElts3);
// Output elements are undefined if the element from each arm is undefined.
// TODO: This can be improved. See comment in select condition handling.
UndefElts = UndefElts2 & UndefElts3;
case Instruction::BitCast: {
// Vector->vector casts only.
VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
if (!VTy) break;
unsigned InVWidth = VTy->getNumElements();
APInt InputDemandedElts(InVWidth, 0);
UndefElts2 = APInt(InVWidth, 0);
unsigned Ratio;
if (VWidth == InVWidth) {
// If we are converting from <4 x i32> -> <4 x f32>, we demand the same
// elements as are demanded of us.
Ratio = 1;
InputDemandedElts = DemandedElts;
} else if ((VWidth % InVWidth) == 0) {
// If the number of elements in the output is a multiple of the number of
// elements in the input then an input element is live if any of the
// corresponding output elements are live.
Ratio = VWidth / InVWidth;
for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
if (DemandedElts[OutIdx])
InputDemandedElts.setBit(OutIdx / Ratio);
} else if ((InVWidth % VWidth) == 0) {
// If the number of elements in the input is a multiple of the number of
// elements in the output then an input element is live if the
// corresponding output element is live.
Ratio = InVWidth / VWidth;
for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
if (DemandedElts[InIdx / Ratio])
} else {
// Unsupported so far.
simplifyAndSetOp(I, 0, InputDemandedElts, UndefElts2);
if (VWidth == InVWidth) {
UndefElts = UndefElts2;
} else if ((VWidth % InVWidth) == 0) {
// If the number of elements in the output is a multiple of the number of
// elements in the input then an output element is undef if the
// corresponding input element is undef.
for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
if (UndefElts2[OutIdx / Ratio])
} else if ((InVWidth % VWidth) == 0) {
// If the number of elements in the input is a multiple of the number of
// elements in the output then an output element is undef if all of the
// corresponding input elements are undef.
for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio);
if (SubUndef.countPopulation() == Ratio)
} else {
case Instruction::FPTrunc:
case Instruction::FPExt:
simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
case Instruction::Call: {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
if (!II) break;
switch (II->getIntrinsicID()) {
case Intrinsic::masked_gather: // fallthrough
case Intrinsic::masked_load: {
// Subtlety: If we load from a pointer, the pointer must be valid
// regardless of whether the element is demanded. Doing otherwise risks
// segfaults which didn't exist in the original program.
APInt DemandedPtrs(APInt::getAllOnesValue(VWidth)),
if (auto *CV = dyn_cast<ConstantVector>(II->getOperand(2)))
for (unsigned i = 0; i < VWidth; i++) {
Constant *CElt = CV->getAggregateElement(i);
if (CElt->isNullValue())
else if (CElt->isAllOnesValue())
if (II->getIntrinsicID() == Intrinsic::masked_gather)
simplifyAndSetOp(II, 0, DemandedPtrs, UndefElts2);
simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3);
// Output elements are undefined if the element from both sources are.
// TODO: can strengthen via mask as well.
UndefElts = UndefElts2 & UndefElts3;
case Intrinsic::x86_xop_vfrcz_ss:
case Intrinsic::x86_xop_vfrcz_sd:
// The instructions for these intrinsics are speced to zero upper bits not
// pass them through like other scalar intrinsics. So we shouldn't just
// use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
// Instead we should return a zero vector.
if (!DemandedElts[0]) {
return ConstantAggregateZero::get(II->getType());
// Only the lower element is used.
DemandedElts = 1;
simplifyAndSetOp(II, 0, DemandedElts, UndefElts);
// Only the lower element is undefined. The high elements are zero.
UndefElts = UndefElts[0];
// Unary scalar-as-vector operations that work column-wise.
case Intrinsic::x86_sse_rcp_ss:
case Intrinsic::x86_sse_rsqrt_ss:
simplifyAndSetOp(II, 0, DemandedElts, UndefElts);
// If lowest element of a scalar op isn't used then use Arg0.
if (!DemandedElts[0]) {
return II->getArgOperand(0);
// TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
// checks).
// Binary scalar-as-vector operations that work column-wise. The high
// elements come from operand 0. The low element is a function of both
// operands.
case Intrinsic::x86_sse_min_ss:
case Intrinsic::x86_sse_max_ss:
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse2_min_sd:
case Intrinsic::x86_sse2_max_sd:
case Intrinsic::x86_sse2_cmp_sd: {
simplifyAndSetOp(II, 0, DemandedElts, UndefElts);
// If lowest element of a scalar op isn't used then use Arg0.
if (!DemandedElts[0]) {
return II->getArgOperand(0);
// Only lower element is used for operand 1.
DemandedElts = 1;
simplifyAndSetOp(II, 1, DemandedElts, UndefElts2);
// Lower element is undefined if both lower elements are undefined.
// Consider things like undef&0. The result is known zero, not undef.
if (!UndefElts2[0])
// Binary scalar-as-vector operations that work column-wise. The high
// elements come from operand 0 and the low element comes from operand 1.
case Intrinsic::x86_sse41_round_ss:
case Intrinsic::x86_sse41_round_sd: {
// Don't use the low element of operand 0.
APInt DemandedElts2 = DemandedElts;
simplifyAndSetOp(II, 0, DemandedElts2, UndefElts);
// If lowest element of a scalar op isn't used then use Arg0.
if (!DemandedElts[0]) {
return II->getArgOperand(0);
// Only lower element is used for operand 1.
DemandedElts = 1;
simplifyAndSetOp(II, 1, DemandedElts, UndefElts2);
// Take the high undef elements from operand 0 and take the lower element
// from operand 1.
UndefElts |= UndefElts2[0];
// Three input scalar-as-vector operations that work column-wise. The high
// elements come from operand 0 and the low element is a function of all
// three inputs.
case Intrinsic::x86_avx512_mask_add_ss_round:
case Intrinsic::x86_avx512_mask_div_ss_round:
case Intrinsic::x86_avx512_mask_mul_ss_round:
case Intrinsic::x86_avx512_mask_sub_ss_round:
case Intrinsic::x86_avx512_mask_max_ss_round:
case Intrinsic::x86_avx512_mask_min_ss_round:
case Intrinsic::x86_avx512_mask_add_sd_round:
case Intrinsic::x86_avx512_mask_div_sd_round:
case Intrinsic::x86_avx512_mask_mul_sd_round:
case Intrinsic::x86_avx512_mask_sub_sd_round:
case Intrinsic::x86_avx512_mask_max_sd_round:
case Intrinsic::x86_avx512_mask_min_sd_round:
simplifyAndSetOp(II, 0, DemandedElts, UndefElts);
// If lowest element of a scalar op isn't used then use Arg0.
if (!DemandedElts[0]) {
return II->getArgOperand(0);
// Only lower element is used for operand 1 and 2.
DemandedElts = 1;
simplifyAndSetOp(II, 1, DemandedElts, UndefElts2);
simplifyAndSetOp(II, 2, DemandedElts, UndefElts3);
// Lower element is undefined if all three lower elements are undefined.
// Consider things like undef&0. The result is known zero, not undef.
if (!UndefElts2[0] || !UndefElts3[0])
case Intrinsic::x86_sse2_packssdw_128:
case Intrinsic::x86_sse2_packsswb_128:
case Intrinsic::x86_sse2_packuswb_128:
case Intrinsic::x86_sse41_packusdw:
case Intrinsic::x86_avx2_packssdw:
case Intrinsic::x86_avx2_packsswb:
case Intrinsic::x86_avx2_packusdw:
case Intrinsic::x86_avx2_packuswb:
case Intrinsic::x86_avx512_packssdw_512:
case Intrinsic::x86_avx512_packsswb_512:
case Intrinsic::x86_avx512_packusdw_512:
case Intrinsic::x86_avx512_packuswb_512: {
auto *Ty0 = II->getArgOperand(0)->getType();
unsigned InnerVWidth = cast<VectorType>(Ty0)->getNumElements();
assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
unsigned VWidthPerLane = VWidth / NumLanes;
unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
// Per lane, pack the elements of the first input and then the second.
// e.g.
// v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
// v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
for (int OpNum = 0; OpNum != 2; ++OpNum) {
APInt OpDemandedElts(InnerVWidth, 0);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
unsigned LaneIdx = Lane * VWidthPerLane;
for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
if (DemandedElts[Idx])
OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
// Demand elements from the operand.
APInt OpUndefElts(InnerVWidth, 0);
simplifyAndSetOp(II, OpNum, OpDemandedElts, OpUndefElts);
// Pack the operand's UNDEF elements, one lane at a time.
OpUndefElts = OpUndefElts.zext(VWidth);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
UndefElts |= LaneElts;
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:
case Intrinsic::x86_avx512_pshuf_b_512:
case Intrinsic::x86_avx_vpermilvar_ps:
case Intrinsic::x86_avx_vpermilvar_ps_256:
case Intrinsic::x86_avx512_vpermilvar_ps_512:
case Intrinsic::x86_avx_vpermilvar_pd:
case Intrinsic::x86_avx_vpermilvar_pd_256:
case Intrinsic::x86_avx512_vpermilvar_pd_512:
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps: {
simplifyAndSetOp(II, 1, DemandedElts, UndefElts);
// SSE4A instructions leave the upper 64-bits of the 128-bit result
// in an undefined state.
case Intrinsic::x86_sse4a_extrq:
case Intrinsic::x86_sse4a_extrqi:
case Intrinsic::x86_sse4a_insertq:
case Intrinsic::x86_sse4a_insertqi:
UndefElts.setHighBits(VWidth / 2);
case Intrinsic::amdgcn_buffer_load:
case Intrinsic::amdgcn_buffer_load_format:
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_s_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format:
case Intrinsic::amdgcn_struct_tbuffer_load:
case Intrinsic::amdgcn_tbuffer_load:
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts);
default: {
if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
} // switch on IntrinsicID
} // case Call
} // switch on Opcode
// TODO: We bail completely on integer div/rem and shifts because they have
// UB/poison potential, but that should be refined.
BinaryOperator *BO;
if (match(I, m_BinOp(BO)) && !BO->isIntDivRem() && !BO->isShift()) {
simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
simplifyAndSetOp(I, 1, DemandedElts, UndefElts2);
// Any change to an instruction with potential poison must clear those flags
// because we can not guarantee those constraints now. Other analysis may
// determine that it is safe to re-apply the flags.
if (MadeChange)
// Output elements are undefined if both are undefined. Consider things
// like undef & 0. The result is known zero, not undef.
UndefElts &= UndefElts2;
// If we've proven all of the lanes undef, return an undef value.
// TODO: Intersect w/demanded lanes
if (UndefElts.isAllOnesValue())
return UndefValue::get(I->getType());;
return MadeChange ? I : nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index b3254c10a0b2..17a5ec3f87fa 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1,3893 +1,3893 @@
//===- InstructionCombining.cpp - Combine multiple instructions -----------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// InstructionCombining - Combine instructions to form fewer, simple
// instructions. This pass does not modify the CFG. This pass is where
// algebraic simplification happens.
// This pass combines things like:
// %Y = add i32 %X, 1
// %Z = add i32 %Y, 1
// into:
// %Z = add i32 %X, 2
// This is a simple worklist driven algorithm.
// This pass guarantees that the following canonicalizations are performed on
// the program:
// 1. If a binary operator has a constant operand, it is moved to the RHS
// 2. Bitwise operators with constant operands are always grouped so that
// shifts are performed first, then or's, then and's, then xor's.
// 3. Compare instructions are converted from <,>,<=,>= to ==,!= if possible
// 4. All cmp instructions on boolean values are replaced with logical ops
// 5. add X, X is represented as (X*2) => (X << 1)
// 6. Multiplies with a power-of-two constant argument are transformed into
// shifts.
// ... etc.
#include "InstCombineInternal.h"
#include "llvm-c/Initialization.h"
#include "llvm-c/Transforms/InstCombine.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CBindingWrapping.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
using namespace llvm;
using namespace llvm::PatternMatch;
// Debug/statistics category name used by the STATISTIC and DEBUG_COUNTER
// declarations below.
#define DEBUG_TYPE "instcombine"
// Pass statistics; the string attached to each one states what it counts.
STATISTIC(NumCombined , "Number of insts combined");
STATISTIC(NumConstProp, "Number of constant folds");
STATISTIC(NumDeadInst , "Number of dead inst eliminated");
STATISTIC(NumSunkInst , "Number of instructions sunk");
STATISTIC(NumExpand, "Number of expansions");
STATISTIC(NumFactor , "Number of factorizations");
STATISTIC(NumReassoc , "Number of reassociations");
// Debug counter registered under the name "instcombine-visit".
DEBUG_COUNTER(VisitCounter, "instcombine-visit",
"Controls which instructions are visited");
// Default iteration limits. InstCombineDefaultInfiniteLoopThreshold is the
// cl::init value of the InfiniteLoopDetectionThreshold option declared below.
static constexpr unsigned InstCombineDefaultMaxIterations = 1000;
static constexpr unsigned InstCombineDefaultInfiniteLoopThreshold = 1000;
// NOTE(review): the first three declarations were truncated in this copy
// (missing option-name / cl::init arguments and closing parentheses). They are
// completed here to match the upstream LLVM declarations; confirm the option
// names against upstream before relying on them.

/// Enables or disables sinking of instructions during instcombine.
static cl::opt<bool>
    EnableCodeSinking("instcombine-code-sinking",
                      cl::desc("Enable code sinking"), cl::init(true));

/// Upper bound on the number of combining iterations.
static cl::opt<unsigned> LimitMaxIterations(
    "instcombine-max-iterations",
    cl::desc("Limit the maximum number of instruction combining iterations"),
    cl::init(InstCombineDefaultMaxIterations));

/// Iteration count beyond which the combiner is considered to be looping
/// forever.
static cl::opt<unsigned> InfiniteLoopDetectionThreshold(
    "instcombine-infinite-loop-threshold",
    cl::desc("Number of instruction combining iterations considered an "
             "infinite loop"),
    cl::init(InstCombineDefaultInfiniteLoopThreshold), cl::Hidden);

/// Largest array size considered when doing a combine.
static cl::opt<unsigned>
    MaxArraySize("instcombine-maxarray-size", cl::init(1024),
                 cl::desc("Maximum array size considered when doing a combine"));

// FIXME: Remove this flag when it is no longer necessary to convert
// llvm.dbg.declare to avoid inaccurate debug info. Setting this to false
// increases variable availability at the cost of accuracy. Variables that
// cannot be promoted by mem2reg or SROA will be described as living in memory
// for their entire lifetime. However, passes like DSE and instcombine can
// delete stores to the alloca, leading to misleading and inaccurate debug
// information. This flag can be removed when those passes are fixed.
//
// NOTE(review): declared as cl::opt<unsigned> but initialised with `true`
// (i.e. 1); kept as-is to preserve command-line parsing behaviour.
static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare",
                                               cl::Hidden, cl::init(true));
/// Forward to llvm::EmitGEPOffset, passing this combiner's `Builder` and `DL`
/// together with \p GEP, and return its result.
Value *InstCombiner::EmitGEPOffset(User *GEP) {
  return llvm::EmitGEPOffset(&Builder, DL, GEP);
}
/// Return true if it is desirable to convert an integer computation from a
/// given bit width to a new bit width.
/// We don't want to convert from a legal to an illegal type or from a smaller
/// to a larger illegal type. A width of '1' is always treated as a legal type
/// because i1 is a fundamental type in IR, and there are many specialized
/// optimizations for i1 types. Widths of 8, 16 or 32 are equally treated as
/// legal to convert to, in order to open up more combining opportunities.
/// NOTE: this treats i8, i16 and i32 specially, due to them being so common
/// from frontend languages.
///
/// \param FromWidth bit width of the existing computation.
/// \param ToWidth   bit width of the proposed computation.
bool InstCombiner::shouldChangeType(unsigned FromWidth,
                                    unsigned ToWidth) const {
  bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
  bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);

  // Convert to widths of 8, 16 or 32 even if they are not legal types. Only
  // shrink types, to prevent infinite loops.
  if (ToWidth < FromWidth && (ToWidth == 8 || ToWidth == 16 || ToWidth == 32))
    return true;

  // If this is a legal integer from type, and the result would be an illegal
  // type, don't do the transformation.
  if (FromLegal && !ToLegal)
    return false;

  // Otherwise, if both are illegal, do not increase the size of the result. We
  // do allow things like i160 -> i64, but not i64 -> i160.
  if (!FromLegal && !ToLegal && ToWidth > FromWidth)
    return false;

  return true;
}
/// Return true if it is desirable to convert a computation from 'From' to 'To'.
/// We don't want to convert from a legal to an illegal type or from a smaller
/// to a larger illegal type. i1 is always treated as a legal type because it is
/// a fundamental type in IR, and there are many specialized optimizations for
/// i1 types.
///
/// Only integer types are considered; any other type yields false. The
/// decision is delegated to the bit-width overload.
bool InstCombiner::shouldChangeType(Type *From, Type *To) const {
  // TODO: This could be extended to allow vectors. Datalayout changes might be
  // needed to properly support that.
  if (!From->isIntegerTy() || !To->isIntegerTy())
    return false;

  unsigned FromWidth = From->getPrimitiveSizeInBits();
  unsigned ToWidth = To->getPrimitiveSizeInBits();
  return shouldChangeType(FromWidth, ToWidth);
}
// Return true, if No Signed Wrap should be maintained for I.
// The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C",
// where both B and C should be ConstantInts, results in a constant that does
// not overflow. This function only handles the Add and Sub opcodes. For
// all other opcodes, the function conservatively returns false.
static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) {
  auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
  if (!OBO || !OBO->hasNoSignedWrap())
    return false;

  // We reason about Add and Sub Only.
  Instruction::BinaryOps Opcode = I.getOpcode();
  if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
    return false;

  const APInt *BVal, *CVal;
  if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal)))
    return false;

  // Evaluate exactly one of the two signed operations. Running the subtract
  // check unconditionally would overwrite the overflow result computed for
  // an Add.
  bool Overflow = false;
  if (Opcode == Instruction::Add)
    (void)BVal->sadd_ov(*CVal, Overflow);
  else
    (void)BVal->ssub_ov(*CVal, Overflow);

  return !Overflow;
}
/// Return true if \p I is an OverflowingBinaryOperator that carries the
/// no-unsigned-wrap flag.
static bool hasNoUnsignedWrap(BinaryOperator &I) {
  auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
  return OBO && OBO->hasNoUnsignedWrap();
}
/// Return true if \p I is an OverflowingBinaryOperator that carries the
/// no-signed-wrap flag.
static bool hasNoSignedWrap(BinaryOperator &I) {
  auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
  return OBO && OBO->hasNoSignedWrap();
}
/// Conservatively clears subclassOptionalData after a reassociation or
/// commutation. We preserve fast-math flags when applicable as they can be
/// preserved.
static void ClearSubclassDataAfterReassociation(BinaryOperator &I) {
  FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I);
  if (!FPMO) {
    // Not a floating-point operation: there are no fast-math flags to keep.
    I.clearSubclassOptionalData();
    return;
  }

  // Save the fast-math flags, wipe all optional data, then put them back.
  FastMathFlags FMF = I.getFastMathFlags();
  I.clearSubclassOptionalData();
  I.setFastMathFlags(FMF);
}
/// Combine constant operands of associative operations either before or after a
/// cast to eliminate one of the associative operations:
/// (op (cast (op X, C2)), C1) --> (cast (op X, op (C1, C2)))
/// (op (cast (op X, C2)), C1) --> (op (cast X), op (C1, C2))
///
/// Only the second form is implemented here, and only for a single-use zext
/// feeding a bitwise logic op whose inner single-use operation has the same
/// opcode. Returns true when \p BinOp1 and its cast operand were rewritten.
static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1, InstCombiner &IC) {
  auto *Cast = dyn_cast<CastInst>(BinOp1->getOperand(0));
  if (!Cast || !Cast->hasOneUse())
    return false;

  // TODO: Enhance logic for other casts and remove this check.
  auto CastOpcode = Cast->getOpcode();
  if (CastOpcode != Instruction::ZExt)
    return false;

  // TODO: Enhance logic for other BinOps and remove this check.
  if (!BinOp1->isBitwiseLogicOp())
    return false;

  auto AssocOpcode = BinOp1->getOpcode();
  auto *BinOp2 = dyn_cast<BinaryOperator>(Cast->getOperand(0));
  if (!BinOp2 || !BinOp2->hasOneUse() || BinOp2->getOpcode() != AssocOpcode)
    return false;

  Constant *C1, *C2;
  if (!match(BinOp1->getOperand(1), m_Constant(C1)) ||
      !match(BinOp2->getOperand(1), m_Constant(C2)))
    return false;

  // TODO: This assumes a zext cast.
  // Eg, if it was a trunc, we'd cast C1 to the source type because casting C2
  // to the destination type might lose bits.

  // Fold the constants together in the destination type:
  // (op (cast (op X, C2)), C1) --> (op (cast X), FoldedC)
  Type *DestTy = C1->getType();
  Constant *CastC2 = ConstantExpr::getCast(CastOpcode, C2, DestTy);
  Constant *FoldedC = ConstantExpr::get(AssocOpcode, C1, CastC2);
  IC.replaceOperand(*Cast, 0, BinOp2->getOperand(0));
  IC.replaceOperand(*BinOp1, 1, FoldedC);
  return true;
}
/// This performs a few simplifications for operators that are associative or
/// commutative:
///
///  Commutative operators:
///
///  1. Order operands such that they are listed from right (least complex) to
///     left (most complex).  This puts constants before unary operators before
///     binary operators.
///
///  Associative operators:
///
///  2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
///  3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
///
///  Associative and commutative operators:
///
///  4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
///  5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
///  6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
///     if C1 and C2 are constants.
///
/// The transforms are retried in a loop until none applies. Returns true if
/// \p I was modified.
///
/// NOTE(review): several statements of this function were missing from this
/// copy (flag clearing/setting, statistic updates, loop `continue`s). They
/// are restored here to match the surrounding comments and the upstream
/// implementation; confirm against upstream.
bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
  Instruction::BinaryOps Opcode = I.getOpcode();
  bool Changed = false;

  do {
    // Order operands such that they are listed from right (least complex) to
    // left (most complex).  This puts constants before unary operators before
    // binary operators.
    if (I.isCommutative() && getComplexity(I.getOperand(0)) <
                                 getComplexity(I.getOperand(1)))
      Changed = !I.swapOperands();

    BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0));
    BinaryOperator *Op1 = dyn_cast<BinaryOperator>(I.getOperand(1));

    if (I.isAssociative()) {
      // Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
      if (Op0 && Op0->getOpcode() == Opcode) {
        Value *A = Op0->getOperand(0);
        Value *B = Op0->getOperand(1);
        Value *C = I.getOperand(1);

        // Does "B op C" simplify?
        if (Value *V = SimplifyBinOp(Opcode, B, C, SQ.getWithInstruction(&I))) {
          // It simplifies to V.  Form "A op V".
          replaceOperand(I, 0, A);
          replaceOperand(I, 1, V);
          bool IsNUW = hasNoUnsignedWrap(I) && hasNoUnsignedWrap(*Op0);
          bool IsNSW = maintainNoSignedWrap(I, B, C) && hasNoSignedWrap(*Op0);

          // Conservatively clear all optional flags since they may not be
          // preserved by the reassociation. Reset nsw/nuw based on the above
          // analysis.
          ClearSubclassDataAfterReassociation(I);

          // Note: this is only valid because SimplifyBinOp doesn't look at
          // the operands to Op0.
          if (IsNUW)
            I.setHasNoUnsignedWrap(true);

          if (IsNSW)
            I.setHasNoSignedWrap(true);

          Changed = true;
          ++NumReassoc;
          continue;
        }
      }

      // Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
      if (Op1 && Op1->getOpcode() == Opcode) {
        Value *A = I.getOperand(0);
        Value *B = Op1->getOperand(0);
        Value *C = Op1->getOperand(1);

        // Does "A op B" simplify?
        if (Value *V = SimplifyBinOp(Opcode, A, B, SQ.getWithInstruction(&I))) {
          // It simplifies to V.  Form "V op C".
          replaceOperand(I, 0, V);
          replaceOperand(I, 1, C);
          // Conservatively clear the optional flags, since they may not be
          // preserved by the reassociation.
          ClearSubclassDataAfterReassociation(I);
          Changed = true;
          ++NumReassoc;
          continue;
        }
      }
    }

    if (I.isAssociative() && I.isCommutative()) {
      if (simplifyAssocCastAssoc(&I, *this)) {
        Changed = true;
        ++NumReassoc;
        continue;
      }

      // Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
      if (Op0 && Op0->getOpcode() == Opcode) {
        Value *A = Op0->getOperand(0);
        Value *B = Op0->getOperand(1);
        Value *C = I.getOperand(1);

        // Does "C op A" simplify?
        if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
          // It simplifies to V.  Form "V op B".
          replaceOperand(I, 0, V);
          replaceOperand(I, 1, B);
          // Conservatively clear the optional flags, since they may not be
          // preserved by the reassociation.
          ClearSubclassDataAfterReassociation(I);
          Changed = true;
          ++NumReassoc;
          continue;
        }
      }

      // Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
      if (Op1 && Op1->getOpcode() == Opcode) {
        Value *A = I.getOperand(0);
        Value *B = Op1->getOperand(0);
        Value *C = Op1->getOperand(1);

        // Does "C op A" simplify?
        if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
          // It simplifies to V.  Form "B op V".
          replaceOperand(I, 0, B);
          replaceOperand(I, 1, V);
          // Conservatively clear the optional flags, since they may not be
          // preserved by the reassociation.
          ClearSubclassDataAfterReassociation(I);
          Changed = true;
          ++NumReassoc;
          continue;
        }
      }

      // Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
      // if C1 and C2 are constants.
      Value *A, *B;
      Constant *C1, *C2;
      if (Op0 && Op1 &&
          Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode &&
          match(Op0, m_OneUse(m_BinOp(m_Value(A), m_Constant(C1)))) &&
          match(Op1, m_OneUse(m_BinOp(m_Value(B), m_Constant(C2))))) {
        // nuw survives only if all three original operations carried it.
        bool IsNUW = hasNoUnsignedWrap(I) &&
                     hasNoUnsignedWrap(*Op0) &&
                     hasNoUnsignedWrap(*Op1);
        BinaryOperator *NewBO = (IsNUW && Opcode == Instruction::Add) ?
            BinaryOperator::CreateNUW(Opcode, A, B) :
            BinaryOperator::Create(Opcode, A, B);

        if (isa<FPMathOperator>(NewBO)) {
          // The new operation may only keep flags common to all originals.
          FastMathFlags Flags = I.getFastMathFlags();
          Flags &= Op0->getFastMathFlags();
          Flags &= Op1->getFastMathFlags();
          NewBO->setFastMathFlags(Flags);
        }
        InsertNewInstWith(NewBO, I);
        NewBO->takeName(Op1);
        replaceOperand(I, 0, NewBO);
        replaceOperand(I, 1, ConstantExpr::get(Opcode, C1, C2));
        // Conservatively clear the optional flags, since they may not be
        // preserved by the reassociation.
        ClearSubclassDataAfterReassociation(I);
        if (IsNUW)
          I.setHasNoUnsignedWrap(true);

        Changed = true;
        continue;
      }
    }

    // No further simplifications.
    return Changed;
  } while (true);
}
/// Return whether "X LOp (Y ROp Z)" is always equal to
/// "(X LOp Y) ROp (X LOp Z)".
static bool leftDistributesOverRight(Instruction::BinaryOps LOp,
                                     Instruction::BinaryOps ROp) {
  // X & (Y | Z) <--> (X & Y) | (X & Z)
  // X & (Y ^ Z) <--> (X & Y) ^ (X & Z)
  if (LOp == Instruction::And)
    return ROp == Instruction::Or || ROp == Instruction::Xor;

  // X | (Y & Z) <--> (X | Y) & (X | Z)
  if (LOp == Instruction::Or)
    return ROp == Instruction::And;

  // X * (Y + Z) <--> (X * Y) + (X * Z)
  // X * (Y - Z) <--> (X * Y) - (X * Z)
  if (LOp == Instruction::Mul)
    return ROp == Instruction::Add || ROp == Instruction::Sub;

  return false;
}
/// Return whether "(X LOp Y) ROp Z" is always equal to
/// "(X ROp Z) LOp (Y ROp Z)".
static bool rightDistributesOverLeft(Instruction::BinaryOps LOp,
                                     Instruction::BinaryOps ROp) {
  // For a commutative ROp the question reduces to left-distributivity with
  // the roles of the two opcodes exchanged.
  if (Instruction::isCommutative(ROp))
    return leftDistributesOverRight(ROp, LOp);

  // (X {&|^} Y) >> Z <--> (X >> Z) {&|^} (Y >> Z) for all shifts.
  return Instruction::isBitwiseLogicOp(LOp) && Instruction::isShift(ROp);

  // TODO: It would be nice to handle division, aka "(X + Y)/Z = X/Z + Y/Z",
  // but this requires knowing that the addition does not overflow and other
  // such subtleties.
}
/// This function returns identity value for given opcode, which can be used to
/// factor patterns like (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1).
///
/// Returns nullptr when \p V is itself a constant; otherwise returns
/// whatever ConstantExpr::getBinOpIdentity yields for \p Opcode and the type
/// of \p V.
static Value *getIdentityValue(Instruction::BinaryOps Opcode, Value *V) {
  if (isa<Constant>(V))
    return nullptr;

  return ConstantExpr::getBinOpIdentity(Opcode, V->getType());
}
/// This function predicates factorization using distributive laws. By default,
/// it just returns the 'Op' inputs. But for special-cases like
/// 'add(shl(X, 5), ...)', this function will have TopOpcode == Instruction::Add
/// and Op = shl(X, 5). The 'shl' is treated as the more general 'mul X, 32' to
/// allow more factorization opportunities.
///
/// \param TopOpcode opcode of the instruction whose operand \p Op is.
/// \param Op        the operand being inspected; must be non-null.
/// \param[out] LHS  first operand of \p Op.
/// \param[out] RHS  second operand of \p Op, or the constant `1 << C` when a
///                  constant shift-left is reinterpreted as a multiply.
/// \returns the opcode to factor with (Mul for the shl special case, the
///          opcode of \p Op otherwise).
static Instruction::BinaryOps
getBinOpsForFactorization(Instruction::BinaryOps TopOpcode, BinaryOperator *Op,
                          Value *&LHS, Value *&RHS) {
  assert(Op && "Expected a binary operator");
  LHS = Op->getOperand(0);
  RHS = Op->getOperand(1);
  if (TopOpcode == Instruction::Add || TopOpcode == Instruction::Sub) {
    Constant *C;
    if (match(Op, m_Shl(m_Value(), m_Constant(C)))) {
      // X << C --> X * (1 << C)
      RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), C);
      return Instruction::Mul;
    }
    // TODO: We can add other conversions e.g. shr => div etc.
  }
  return Op->getOpcode();
}
/// This tries to simplify binary operations by factorizing out common terms
/// (e. g. "(A*B)+(A*C)" -> "A*(B+C)").
///
/// \p I has the form "(A op' B) op (C op' D)", where op' is \p InnerOpcode
/// and op is the opcode of \p I. All four values must be non-null.
/// Returns the factored value, or nullptr if no factorization was performed.
///
/// NOTE(review): the statistic update, the takeName call and the statements
/// that apply HasNSW/HasNUW were missing from this copy and are restored here
/// to match the surrounding comments and the upstream implementation;
/// confirm against upstream.
Value *InstCombiner::tryFactorization(BinaryOperator &I,
                                      Instruction::BinaryOps InnerOpcode,
                                      Value *A, Value *B, Value *C, Value *D) {
  assert(A && B && C && D && "All values must be provided");

  Value *V = nullptr;
  Value *SimplifiedInst = nullptr;
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Instruction::BinaryOps TopLevelOpcode = I.getOpcode();

  // Does "X op' Y" always equal "Y op' X"?
  bool InnerCommutative = Instruction::isCommutative(InnerOpcode);

  // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"?
  if (leftDistributesOverRight(InnerOpcode, TopLevelOpcode)) {
    // Does the instruction have the form "(A op' B) op (A op' D)" or, in the
    // commutative case, "(A op' B) op (C op' A)"?
    if (A == C || (InnerCommutative && A == D)) {
      if (A != C)
        std::swap(C, D);
      // Consider forming "A op' (B op D)".
      // If "B op D" simplifies then it can be formed with no cost.
      V = SimplifyBinOp(TopLevelOpcode, B, D, SQ.getWithInstruction(&I));
      // If "B op D" doesn't simplify then only go on if both of the existing
      // operations "A op' B" and "C op' D" will be zapped as no longer used.
      if (!V && LHS->hasOneUse() && RHS->hasOneUse())
        V = Builder.CreateBinOp(TopLevelOpcode, B, D, RHS->getName());
      if (V)
        SimplifiedInst = Builder.CreateBinOp(InnerOpcode, A, V);
    }
  }

  // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"?
  if (!SimplifiedInst &&
      rightDistributesOverLeft(TopLevelOpcode, InnerOpcode)) {
    // Does the instruction have the form "(A op' B) op (C op' B)" or, in the
    // commutative case, "(A op' B) op (B op' D)"?
    if (B == D || (InnerCommutative && B == C)) {
      if (B != D)
        std::swap(C, D);
      // Consider forming "(A op C) op' B".
      // If "A op C" simplifies then it can be formed with no cost.
      V = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));

      // If "A op C" doesn't simplify then only go on if both of the existing
      // operations "A op' B" and "C op' D" will be zapped as no longer used.
      if (!V && LHS->hasOneUse() && RHS->hasOneUse())
        V = Builder.CreateBinOp(TopLevelOpcode, A, C, LHS->getName());
      if (V)
        SimplifiedInst = Builder.CreateBinOp(InnerOpcode, V, B);
    }
  }

  if (SimplifiedInst) {
    ++NumFactor;
    SimplifiedInst->takeName(&I);

    // Check if we can add NSW/NUW flags to SimplifiedInst. If so, set them.
    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(SimplifiedInst)) {
      if (isa<OverflowingBinaryOperator>(SimplifiedInst)) {
        bool HasNSW = false;
        bool HasNUW = false;
        if (isa<OverflowingBinaryOperator>(&I)) {
          HasNSW = I.hasNoSignedWrap();
          HasNUW = I.hasNoUnsignedWrap();
        }

        if (auto *LOBO = dyn_cast<OverflowingBinaryOperator>(LHS)) {
          HasNSW &= LOBO->hasNoSignedWrap();
          HasNUW &= LOBO->hasNoUnsignedWrap();
        }

        if (auto *ROBO = dyn_cast<OverflowingBinaryOperator>(RHS)) {
          HasNSW &= ROBO->hasNoSignedWrap();
          HasNUW &= ROBO->hasNoUnsignedWrap();
        }

        if (TopLevelOpcode == Instruction::Add &&
            InnerOpcode == Instruction::Mul) {
          // We can propagate 'nsw' if we know that
          //  %Y = mul nsw i16 %X, C
          //  %Z = add nsw i16 %Y, %X
          // =>
          //  %Z = mul nsw i16 %X, C+1
          //
          // iff C+1 isn't INT_MIN
          const APInt *CInt;
          if (match(V, m_APInt(CInt))) {
            if (!CInt->isMinSignedValue())
              BO->setHasNoSignedWrap(HasNSW);
          }

          // nuw can be propagated with any constant or nuw value.
          BO->setHasNoUnsignedWrap(HasNUW);
        }
      }
    }
  }
  return SimplifiedInst;
}
/// This tries to simplify binary operations which some other binary operation
/// distributes over either by factorizing out common terms
/// (e.g. "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in
/// simplifications (e.g. "A & (B | C) -> (A&B) | (A&C)" if this is a win).
/// Returns the simplified value, or null if it didn't simplify.
///
/// \param I The binary operator to simplify. Its two operands are inspected to
///          see whether they are themselves binary operators.
/// \returns The value produced by the first strategy that succeeds
///          (factorization first, then expansion); otherwise whatever
///          SimplifySelectsFeedingBinaryOp yields for the same operands.
// NOTE(review): this copy of the function contains no brace-only lines (the
// blocks opened below are never visibly closed), so the nesting has to be
// inferred from the statements themselves -- compare with the upstream file
// before editing.
Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
// Factorization.
Value *A, *B, *C, *D;
// LHSOpcode / RHSOpcode are only assigned when Op0 / Op1 is non-null; every
// read below is guarded by the matching Op0 / Op1 check.
Instruction::BinaryOps LHSOpcode, RHSOpcode;
if (Op0)
LHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op0, A, B);
if (Op1)
RHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op1, C, D);
// The instruction has the form "(A op' B) op (C op' D)". Try to factorize
// a common term.
if (Op0 && Op1 && LHSOpcode == RHSOpcode)
if (Value *V = tryFactorization(I, LHSOpcode, A, B, C, D))
return V;
// The instruction has the form "(A op' B) op (C)". Try to factorize common
// term.
if (Op0)
if (Value *Ident = getIdentityValue(LHSOpcode, RHS))
if (Value *V = tryFactorization(I, LHSOpcode, A, B, RHS, Ident))
return V;
// The instruction has the form "(B) op (C op' D)". Try to factorize common
// term.
if (Op1)
if (Value *Ident = getIdentityValue(RHSOpcode, LHS))
if (Value *V = tryFactorization(I, RHSOpcode, LHS, Ident, C, D))
return V;
// Expansion.
if (Op0 && rightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
// The instruction has the form "(A op' B) op C". See if expanding it out
// to "(A op C) op' (B op C)" results in simplifications.
// These A/B/C are fresh locals that shadow the function-scope A/B/C used by
// the factorization step above.
Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op'
Value *L = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
Value *R = SimplifyBinOp(TopLevelOpcode, B, C, SQ.getWithInstruction(&I));
// Do "A op C" and "B op C" both simplify?
if (L && R) {
// They do! Return "L op' R".
// The local C is reused to hold the result.
C = Builder.CreateBinOp(InnerOpcode, L, R);
return C;
// Does "A op C" simplify to the identity value for the inner opcode?
if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
// They do! Return "B op C".
C = Builder.CreateBinOp(TopLevelOpcode, B, C);
return C;
// Does "B op C" simplify to the identity value for the inner opcode?
if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
// They do! Return "A op C".
C = Builder.CreateBinOp(TopLevelOpcode, A, C);
return C;
if (Op1 && leftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
// The instruction has the form "A op (B op' C)". See if expanding it out
// to "(A op B) op' (A op C)" results in simplifications.
Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op'
Value *L = SimplifyBinOp(TopLevelOpcode, A, B, SQ.getWithInstruction(&I));
Value *R = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
// Do "A op B" and "A op C" both simplify?
if (L && R) {
// They do! Return "L op' R".
// The local A is reused to hold the result.
A = Builder.CreateBinOp(InnerOpcode, L, R);
return A;
// Does "A op B" simplify to the identity value for the inner opcode?
if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
// They do! Return "A op C".
A = Builder.CreateBinOp(TopLevelOpcode, A, C);
return A;
// Does "A op C" simplify to the identity value for the inner opcode?
if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
// They do! Return "A op B".
A = Builder.CreateBinOp(TopLevelOpcode, A, B);
return A;
// Neither factorization nor expansion applied: defer to the select-based
// simplification on the original operands.
return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
/// Try to fold a binary operator that has a select as one or both operands by
/// applying the operation to the select arms:
///   (A ? B : C) op (A ? E : F) -> A ? (B op E) : (C op F)
///   (A ? B : C) op Y           -> A ? (B op Y) : (C op Y)
///   X op (D ? E : F)           -> D ? (X op E) : (X op F)
///
/// \param I   The binary operator being simplified; supplies the opcode, the
///            fast-math flags (when it is an FPMathOperator) and the
///            instruction used to build the SimplifyQuery.
/// \param LHS First operand to inspect for a select.
/// \param RHS Second operand to inspect for a select.
/// \returns A select created through Builder, or nullptr when neither operand
///          is a select or when a value for either arm is unavailable.
Value *InstCombiner::SimplifySelectsFeedingBinaryOp(BinaryOperator &I,
Value *LHS, Value *RHS) {
Value *A, *B, *C, *D, *E, *F;
bool LHSIsSelect = match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C)));
bool RHSIsSelect = match(RHS, m_Select(m_Value(D), m_Value(E), m_Value(F)));
if (!LHSIsSelect && !RHSIsSelect)
return nullptr;
// FMF is only overwritten from I when I is a floating-point operator.
// NOTE(review): the guard object is declared here, but no statement that
// changes Builder's flags is visible in this copy -- confirm with upstream.
FastMathFlags FMF;
BuilderTy::FastMathFlagGuard Guard(Builder);
if (isa<FPMathOperator>(&I)) {
FMF = I.getFastMathFlags();
Instruction::BinaryOps Opcode = I.getOpcode();
SimplifyQuery Q = SQ.getWithInstruction(&I);
// True / False start as null; the final check below rejects the fold unless
// both arms were obtained.
Value *Cond, *True = nullptr, *False = nullptr;
if (LHSIsSelect && RHSIsSelect && A == D) {
// (A ? B : C) op (A ? E : F) -> A ? (B op E) : (C op F)
Cond = A;
True = SimplifyBinOp(Opcode, B, E, FMF, Q);
False = SimplifyBinOp(Opcode, C, F, FMF, Q);
// When exactly one arm simplified and both selects have a single use, the
// other arm is materialized as a new binop.
if (LHS->hasOneUse() && RHS->hasOneUse()) {
if (False && !True)
True = Builder.CreateBinOp(Opcode, B, E);
else if (True && !False)
False = Builder.CreateBinOp(Opcode, C, F);
} else if (LHSIsSelect && LHS->hasOneUse()) {
// (A ? B : C) op Y -> A ? (B op Y) : (C op Y)
Cond = A;
True = SimplifyBinOp(Opcode, B, RHS, FMF, Q);
False = SimplifyBinOp(Opcode, C, RHS, FMF, Q);
} else if (RHSIsSelect && RHS->hasOneUse()) {
// X op (D ? E : F) -> D ? (X op E) : (X op F)
Cond = D;
True = SimplifyBinOp(Opcode, LHS, E, FMF, Q);
False = SimplifyBinOp(Opcode, LHS, F, FMF, Q);
if (!True || !False)
return nullptr;
Value *SI = Builder.CreateSelect(Cond, True, False);
return SI;
/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a
/// constant zero (which is the 'negate' form).
///
/// Constants are also accepted: a ConstantInt, a ConstantDataVector with an
/// integer element type, or a ConstantVector that passes the per-element
/// checks below is returned negated via ConstantExpr::getNeg.
///
/// \param V The value to inspect.
/// \returns The negated operand / negated constant, or nullptr if \p V matches
///          none of the handled forms.
Value *InstCombiner::dyn_castNegVal(Value *V) const {
Value *NegV;
if (match(V, m_Neg(m_Value(NegV))))
return NegV;
// Constants can be considered to be negated values if they can be folded.
if (ConstantInt *C = dyn_cast<ConstantInt>(V))
return ConstantExpr::getNeg(C);
if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V))
if (C->getType()->getElementType()->isIntegerTy())
return ConstantExpr::getNeg(C);
if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) {
for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
Constant *Elt = CV->getAggregateElement(i);
if (!Elt)
return nullptr;
// NOTE(review): in this copy the undef check is followed directly by the
// ConstantInt check, which would reject every undef element. A statement
// (presumably one that skips undef lanes) seems to be missing -- confirm
// against upstream.
if (isa<UndefValue>(Elt))
if (!isa<ConstantInt>(Elt))
return nullptr;
return ConstantExpr::getNeg(CV);
return nullptr;
/// Apply the operation \p I to a single select arm \p SO.
///
/// \param I       Either a cast or a binary operator (asserted). For a binary
///                operator, one operand must be a Constant, since it is
///                fetched with cast<Constant>.
/// \param SO      The select operand that takes the place of I's non-constant
///                operand.
/// \param Builder Builder used to create any new cast / binop.
/// \returns For a cast: the cast of \p SO to I's type. For a binary operator:
///          a constant expression when \p SO is itself a Constant, otherwise
///          the binop created through \p Builder.
static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
InstCombiner::BuilderTy &Builder) {
if (auto *Cast = dyn_cast<CastInst>(&I))
return Builder.CreateCast(Cast->getOpcode(), SO, I.getType());
assert(I.isBinaryOp() && "Unexpected opcode for select folding");
// Figure out if the constant is the left or the right argument.
bool ConstIsRHS = isa<Constant>(I.getOperand(1));
// The bool doubles as the operand index: 1 when the constant is on the right,
// 0 otherwise.
Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS));
if (auto *SOC = dyn_cast<Constant>(SO)) {
if (ConstIsRHS)
return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand);
return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC);
// Keep the original operand order: SO goes wherever the non-constant operand
// was.
Value *Op0 = SO, *Op1 = ConstOperand;
if (!ConstIsRHS)
std::swap(Op0, Op1);
auto *BO = cast<BinaryOperator>(&I);
Value *RI = Builder.CreateBinOp(BO->getOpcode(), Op0, Op1,
SO->getName() + ".op");
auto *FPInst = dyn_cast<Instruction>(RI);
// NOTE(review): in this copy the FP check has no statement of its own before
// the return; the action taken for FP results is not visible here -- confirm
// against upstream.
if (FPInst && isa<FPMathOperator>(FPInst))
return RI;
/// Fold the operation \p Op into the arms of the select \p SI that feeds it:
/// the result is a new select on SI's condition whose arms are \p Op applied
/// to SI's true and false values (via foldOperationIntoSelectOperand).
///
/// \param Op The cast or binary operation to push into the select.
/// \param SI The select whose arms receive the operation.
/// \returns The newly created SelectInst, or nullptr when the fold is
///          rejected: SI has more than one use, neither arm is a Constant, SI
///          has i1 / vector-of-i1 type, \p Op is a bitcast that changes
///          vector-ness or element count, or SI's single-use compare
///          condition compares the same values the select chooses between.
Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {
// Don't modify shared select instructions.
if (!SI->hasOneUse())
return nullptr;
Value *TV = SI->getTrueValue();
Value *FV = SI->getFalseValue();
if (!(isa<Constant>(TV) || isa<Constant>(FV)))
return nullptr;
// Bool selects with constant operands can be folded to logical ops.
if (SI->getType()->isIntOrIntVectorTy(1))
return nullptr;
// If it's a bitcast involving vectors, make sure it has the same number of
// elements on both sides.
if (auto *BC = dyn_cast<BitCastInst>(&Op)) {
VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy());
VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
// Verify that either both or neither are vectors.
if ((SrcTy == nullptr) != (DestTy == nullptr))
return nullptr;
// If vectors, verify that they have the same number of elements.
// (DestTy is non-null whenever SrcTy is, because of the check just above.)
if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements())
return nullptr;
// Test if a CmpInst instruction is used exclusively by a select as
// part of a minimum or maximum operation. If so, refrain from doing
// any other folding. This helps out other analyses which understand
// non-obfuscated minimum and maximum idioms, such as ScalarEvolution
// and CodeGen. And in this case, at least one of the comparison
// operands has at least one user besides the compare (the select),
// which would often largely negate the benefit of folding anyway.
if (auto *CI = dyn_cast<CmpInst>(SI->getCondition())) {
if (CI->hasOneUse()) {
Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
// FIXME: This is a hack to avoid infinite looping with min/max patterns.
// We have to ensure that vector constants that only differ with
// undef elements are treated as equivalent.
auto areLooselyEqual = [](Value *A, Value *B) {
if (A == B)
return true;
// Test for vector constants.
Constant *ConstA, *ConstB;
if (!match(A, m_Constant(ConstA)) || !match(B, m_Constant(ConstB)))
return false;
// TODO: Deal with FP constants?
if (!A->getType()->isIntOrIntVectorTy() || A->getType() != B->getType())
return false;
// Compare for equality including undefs as equal.
auto *Cmp = ConstantExpr::getCompare(ICmpInst::ICMP_EQ, ConstA, ConstB);
const APInt *C;
return match(Cmp, m_APIntAllowUndef(C)) && C->isOneValue();
// Bail out if the select arms are (loosely) the compare operands, in either
// order.
if ((areLooselyEqual(TV, Op0) && areLooselyEqual(FV, Op1)) ||
(areLooselyEqual(FV, Op0) && areLooselyEqual(TV, Op1)))
return nullptr;
// All checks passed: apply Op to each arm and select between the results.
Value *NewTV = foldOperationIntoSelectOperand(Op, TV, Builder);
Value *NewFV = foldOperationIntoSelectOperand(Op, FV, Builder);
return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI);
/// Apply the binary operator \p I to one incoming PHI value \p InV, in place of
/// I's non-constant operand.
///
/// \param I       Binary operator with one Constant operand (it is fetched with
///                cast<Constant>).
/// \param InV     The incoming value substituted for the non-constant operand.
/// \param Builder Builder used when a new binop has to be created.
/// \returns A constant expression when \p InV is a Constant, otherwise the
///          binop created through \p Builder.
static Value *foldOperationIntoPhiValue(BinaryOperator *I, Value *InV,
InstCombiner::BuilderTy &Builder) {
bool ConstIsRHS = isa<Constant>(I->getOperand(1));
// The bool doubles as the operand index: 1 when the constant is on the right,
// 0 otherwise.
Constant *C = cast<Constant>(I->getOperand(ConstIsRHS));
if (auto *InC = dyn_cast<Constant>(InV)) {
if (ConstIsRHS)
return ConstantExpr::get(I->getOpcode(), InC, C);
return ConstantExpr::get(I->getOpcode(), C, InC);
// Preserve the original operand order.
Value *Op0 = InV, *Op1 = C;
if (!ConstIsRHS)
std::swap(Op0, Op1);
Value *RI = Builder.CreateBinOp(I->getOpcode(), Op0, Op1, "");
auto *FPInst = dyn_cast<Instruction>(RI);
// NOTE(review): in this copy the FP check has no statement of its own before
// the return; the action taken for FP results is not visible here -- confirm
// against upstream.
if (FPInst && isa<FPMathOperator>(FPInst))
return RI;
/// Fold the instruction \p I, which uses the PHI node \p PN, into PN's
/// incoming values: a new PHI of I's type is created whose incoming values are
/// \p I applied to each of PN's incoming values.
///
/// Four shapes of \p I are handled below: a select, a compare whose operand 1
/// is a Constant, a binary operator, and otherwise a cast (enforced with
/// cast<CastInst>).
///
/// \param I  The instruction to fold.
/// \param PN The PHI node feeding \p I.
/// \returns nullptr when the fold is rejected; otherwise the result of
///          replaceInstUsesWith(I, NewPN).
// NOTE(review): several statements in this copy are visibly truncated or have
// lost their bodies (flagged inline); treat this text as a lossy excerpt and
// compare with upstream before changing behavior.
Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) {
unsigned NumPHIValues = PN->getNumIncomingValues();
if (NumPHIValues == 0)
return nullptr;
// We normally only transform phis with a single use. However, if a PHI has
// multiple uses and they are all the same operation, we can fold *all* of the
// uses into the PHI.
if (!PN->hasOneUse()) {
// Walk the use list for the instruction, comparing them to I.
for (User *U : PN->users()) {
Instruction *UI = cast<Instruction>(U);
if (UI != &I && !I.isIdenticalTo(UI))
return nullptr;
// Otherwise, we can replace *all* users with the new PHI we form.
// Check to see if all of the operands of the PHI are simple constants
// (constantint/constantfp/undef). If there is one non-constant value,
// remember the BB it is in. If there is more than one or if *it* is a PHI,
// bail out. We don't do arbitrary constant expressions here because moving
// their computation can be expensive without a cost model.
BasicBlock *NonConstBB = nullptr;
for (unsigned i = 0; i != NumPHIValues; ++i) {
Value *InVal = PN->getIncomingValue(i);
// NOTE(review): the statement controlled by this check is not present in this
// copy (presumably it moves on to the next incoming value) -- confirm.
if (isa<Constant>(InVal) && !isa<ConstantExpr>(InVal))
if (isa<PHINode>(InVal)) return nullptr; // Itself a phi.
if (NonConstBB) return nullptr; // More than one non-const value.
NonConstBB = PN->getIncomingBlock(i);
// If the InVal is an invoke at the end of the pred block, then we can't
// insert a computation after it without breaking the edge.
if (isa<InvokeInst>(InVal))
if (cast<Instruction>(InVal)->getParent() == NonConstBB)
return nullptr;
// If the incoming non-constant value is in I's block, we will remove one
// instruction, but insert another equivalent one, leading to infinite
// instcombine.
if (isPotentiallyReachable(I.getParent(), NonConstBB, &DT, LI))
return nullptr;
// If there is exactly one non-constant value, we can insert a copy of the
// operation in that block. However, if this is a critical edge, we would be
// inserting the computation on some other paths (e.g. inside a loop). Only
// do this if the pred block is unconditionally branching into the phi block.
if (NonConstBB != nullptr) {
BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator());
if (!BI || !BI->isUnconditional()) return nullptr;
// Okay, we can do the transformation: create the new PHI node.
PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues());
InsertNewInstBefore(NewPN, *PN);
// If we are going to have to insert a new computation, do so right before the
// predecessor's terminator.
// NOTE(review): the statement controlled by `if (NonConstBB)` is missing from
// this copy (per the comment above it should position the builder) -- confirm.
if (NonConstBB)
// Next, add all of the operands to the PHI.
if (SelectInst *SI = dyn_cast<SelectInst>(&I)) {
// We only currently try to fold the condition of a select when it is a phi,
// not the true/false values.
Value *TrueV = SI->getTrueValue();
Value *FalseV = SI->getFalseValue();
BasicBlock *PhiTransBB = PN->getParent();
for (unsigned i = 0; i != NumPHIValues; ++i) {
BasicBlock *ThisBB = PN->getIncomingBlock(i);
Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB);
Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB);
Value *InV = nullptr;
// Beware of ConstantExpr: it may eventually evaluate to getNullValue,
// even if currently isNullValue gives false.
Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i));
// For vector constants, we cannot use isNullValue to fold into
// FalseVInPred versus TrueVInPred. When we have individual nonzero
// elements in the vector, we will incorrectly fold InC to
// `TrueVInPred`.
if (InC && !isa<ConstantExpr>(InC) && isa<ConstantInt>(InC))
InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
else {
// Generate the select in the same block as PN's current incoming block.
// Note: ThisBB need not be the NonConstBB because vector constants
// which are constants by definition are handled here.
// FIXME: This can lead to an increase in IR generation because we might
// generate selects for vector constant phi operand, that could not be
// folded to TrueVInPred or FalseVInPred as done for ConstantInt. For
// non-vector phis, this transformation was always profitable because
// the select would be generated exactly once in the NonConstBB.
InV = Builder.CreateSelect(PN->getIncomingValue(i), TrueVInPred,
FalseVInPred, "phi.sel");
NewPN->addIncoming(InV, ThisBB);
} else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) {
// The compare's second operand must be a Constant (cast<> enforces it).
Constant *C = cast<Constant>(I.getOperand(1));
for (unsigned i = 0; i != NumPHIValues; ++i) {
Value *InV = nullptr;
// NOTE(review): as written here the CreateCmp assignment follows the
// constant-fold unconditionally; an `else` seems to have been lost -- confirm.
if (auto *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
InV = Builder.CreateCmp(CI->getPredicate(), PN->getIncomingValue(i),
C, "phi.cmp");
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
} else if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
for (unsigned i = 0; i != NumPHIValues; ++i) {
// NOTE(review): the call below is cut off after its second argument in this
// copy; the remaining argument(s) are missing -- confirm against upstream.
Value *InV = foldOperationIntoPhiValue(BO, PN->getIncomingValue(i),
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
} else {
CastInst *CI = cast<CastInst>(&I);
Type *RetTy = CI->getType();
for (unsigned i = 0; i != NumPHIValues; ++i) {
Value *InV;
// NOTE(review): same pattern as the compare case -- the CreateCast assignment
// is not visibly guarded by an `else` in this copy.
if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
InV = Builder.CreateCast(CI->getOpcode(), PN->getIncomingValue(i),
I.getType(), "phi.cast");
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
// Redirect every other user of PN to the new PHI. The iterator is advanced
// (UI++) before the user is touched, so the replacement happens after the
// iterator has already moved past that user.
for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) {
Instruction *User = cast<Instruction>(*UI++);
if (User == &I) continue;
replaceInstUsesWith(*User, NewPN);
return replaceInstUsesWith(I, NewPN);
/// Fold a binary operator with a Constant second operand into its first
/// operand when that operand is a select (FoldOpIntoSelect) or a PHI node
/// (foldOpIntoPhi).
///
/// \param I The binary operator to fold.
/// \returns The instruction produced by the chosen fold, or nullptr when
///          operand 1 is not a Constant, operand 0 is neither a select nor a
///          PHI, or the chosen fold itself returned null.
Instruction *InstCombiner::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
if (!isa<Constant>(I.getOperand(1)))
return nullptr;
if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
return NewSel;
} else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) {
if (Instruction *NewPhi = foldOpIntoPhi(I, PN))
return NewPhi;
return nullptr;
/// Given a pointer type and a constant offset, determine whether or not there
/// is a sequence of GEP indices into the pointed type that will land us at the
/// specified offset. If so, fill them into NewIndices and return the resultant
/// element type, otherwise return null.
///
/// \param PtrTy      Pointer type whose element type is indexed into.
/// \param Offset     Byte offset to resolve. Taken by value and reduced as each
///                   index level is chosen; the walk stops when it reaches 0.
/// \param NewIndices Receives the outermost index (of the DataLayout index type
///                   for \p PtrTy) before the walk into the element type.
/// \returns The type reached when Offset has been reduced to zero, or nullptr
///          if the element type is unsized, the offset lands in padding, or a
///          non-struct / non-array type would have to be split.
// NOTE(review): only the push of the outermost index is visible in this copy;
// no per-level pushes appear inside the struct / array branches -- verify
// against upstream.
Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
SmallVectorImpl<Value *> &NewIndices) {
Type *Ty = PtrTy->getElementType();
if (!Ty->isSized())
return nullptr;
// Start with the index over the outer type. Note that the type size
// might be zero (even if the offset isn't zero) if the indexed type
// is something like [0 x {int, int}]
Type *IndexTy = DL.getIndexType(PtrTy);
int64_t FirstIdx = 0;
if (int64_t TySize = DL.getTypeAllocSize(Ty)) {
FirstIdx = Offset/TySize;
Offset -= FirstIdx*TySize;
// Handle hosts where % returns negative instead of values [0..TySize).
if (Offset < 0) {
Offset += TySize;
assert(Offset >= 0);
assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
NewIndices.push_back(ConstantInt::get(IndexTy, FirstIdx));
// Index into the types. If we fail, set OrigBase to null.
while (Offset) {
// Indexing into tail padding between struct/array elements.
// (Offset is in bytes; it is scaled by 8 to compare against a size in bits.)
if (uint64_t(Offset * 8) >= DL.getTypeSizeInBits(Ty))
return nullptr;
if (StructType *STy = dyn_cast<StructType>(Ty)) {
const StructLayout *SL = DL.getStructLayout(STy);
assert(Offset < (int64_t)SL->getSizeInBytes() &&
"Offset must stay within the indexed type");
unsigned Elt = SL->getElementContainingOffset(Offset);
Offset -= SL->getElementOffset(Elt);
Ty = STy->getElementType(Elt);
} else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType());
assert(EltSize && "Cannot index into a zero-sized array");
// Keep only the offset within a single array element.
Offset %= EltSize;
Ty = AT->getElementType();
} else {
// Otherwise, we can't index into the middle of this atomic type, bail.
return nullptr;
return Ty;
/// Decide whether the indices of \p GEP should be combined with those of its
/// source GEP \p Src.
///
/// \param GEP The outer GEP.
/// \param Src The GEP that \p GEP is based on.
/// \returns false when the condition below holds (GEP has all-zero indices
///          while Src does not, plus a further term), true otherwise.
static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) {
// If this GEP has only 0 indices, it is the same pointer as
// Src. If Src is not a trivial GEP too, don't combine
// the indices.
// NOTE(review): the condition ends with a dangling `&&` in this copy; its last
// operand is missing -- recover it from upstream before relying on this text.
if (GEP.hasAllZeroIndices() && !Src.hasAllZeroIndices() &&
return false;
return true;
/// Return a value X such that Val = X * Scale, or null if none.
/// If the multiplication is known not to overflow, then NoSignedWrap is set.
///
/// \param Val          Integer-typed value to descale (asserted). Its bit width
///                     must equal that of \p Scale (asserted).
/// \param Scale        The factor to divide out. Taken by value: it is
///                     truncated / sign-extended locally when the walk drills
///                     through sext / trunc casts.
/// \param NoSignedWrap Output flag. It is assigned at several points during the
///                     walk, so it may have been modified even when null is
///                     returned.
/// \returns \p Val itself for the trivial cases (Val is zero or Scale is one)
///          and after an in-place rewrite that reaches the top, the descaled
///          term when the expression had a single term, or nullptr when no
///          descaling was found.
// NOTE(review): this copy has lost its brace-only lines and at least one
// continuation line (flagged inline); the loop / branch nesting is therefore
// not visible -- compare with upstream before editing.
Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!");
assert(cast<IntegerType>(Val->getType())->getBitWidth() ==
Scale.getBitWidth() && "Scale not compatible with value!");
// If Val is zero or Scale is one then Val = Val * Scale.
if (match(Val, m_Zero()) || Scale == 1) {
NoSignedWrap = true;
return Val;
// If Scale is zero then it does not divide Val.
if (Scale.isMinValue())
return nullptr;
// Look through chains of multiplications, searching for a constant that is
// divisible by Scale. For example, descaling X*(Y*(Z*4)) by a factor of 4
// will find the constant factor 4 and produce X*(Y*Z). Descaling X*(Y*8) by
// a factor of 4 will produce X*(Y*2). The principle of operation is to bore
// down from Val:
// Val = M1 * X || Analysis starts here and works down
// M1 = M2 * Y || Doesn't descend into terms with more
// M2 = Z * 4 \/ than one use
// Then to modify a term at the bottom:
// Val = M1 * X
// M1 = Z * Y || Replaced M2 with Z
// Then to work back up correcting nsw flags.
// Op - the term we are currently analyzing. Starts at Val then drills down.
// Replaced with its descaled value before exiting from the drill down loop.
Value *Op = Val;
// Parent - initially null, but after drilling down notes where Op came from.
// In the example above, Parent is (Val, 0) when Op is M1, because M1 is the
// 0'th operand of Val.
std::pair<Instruction *, unsigned> Parent;
// Set if the transform requires a descaling at deeper levels that doesn't
// overflow.
bool RequireNoSignedWrap = false;
// Log base 2 of the scale. Negative if not a power of 2.
int32_t logScale = Scale.exactLogBase2();
for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// If Op is a constant divisible by Scale then descale to the quotient.
APInt Quotient(Scale), Remainder(Scale); // Init ensures right bitwidth.
APInt::sdivrem(CI->getValue(), Scale, Quotient, Remainder);
if (!Remainder.isMinValue())
// Not divisible by Scale.
return nullptr;
// Replace with the quotient in the parent.
Op = ConstantInt::get(CI->getType(), Quotient);
NoSignedWrap = true;
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op)) {
if (BO->getOpcode() == Instruction::Mul) {
// Multiplication.
NoSignedWrap = BO->hasNoSignedWrap();
if (RequireNoSignedWrap && !NoSignedWrap)
return nullptr;
// There are three cases for multiplication: multiplication by exactly
// the scale, multiplication by a constant different to the scale, and
// multiplication by something else.
Value *LHS = BO->getOperand(0);
Value *RHS = BO->getOperand(1);
if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
// Multiplication by a constant.
if (CI->getValue() == Scale) {
// Multiplication by exactly the scale, replace the multiplication
// by its left-hand side in the parent.
Op = LHS;
// Otherwise drill down into the constant.
if (!Op->hasOneUse())
return nullptr;
Parent = std::make_pair(BO, 1);
// Multiplication by something else. Drill down into the left-hand side
// since that's where the reassociate pass puts the good stuff.
if (!Op->hasOneUse())
return nullptr;
Parent = std::make_pair(BO, 0);
if (logScale > 0 && BO->getOpcode() == Instruction::Shl &&
isa<ConstantInt>(BO->getOperand(1))) {
// Multiplication by a power of 2.
NoSignedWrap = BO->hasNoSignedWrap();
if (RequireNoSignedWrap && !NoSignedWrap)
return nullptr;
Value *LHS = BO->getOperand(0);
// NOTE(review): the initializer of Amt is cut off after `->` in this copy; the
// member call that extracts the shift amount is missing -- confirm upstream.
int32_t Amt = cast<ConstantInt>(BO->getOperand(1))->
// Op = LHS << Amt.
if (Amt == logScale) {
// Multiplication by exactly the scale, replace the multiplication
// by its left-hand side in the parent.
Op = LHS;
if (Amt < logScale || !Op->hasOneUse())
return nullptr;
// Multiplication by more than the scale. Reduce the multiplying amount
// by the scale in the parent.
Parent = std::make_pair(BO, 1);
Op = ConstantInt::get(BO->getType(), Amt - logScale);
if (!Op->hasOneUse())
return nullptr;
if (CastInst *Cast = dyn_cast<CastInst>(Op)) {
if (Cast->getOpcode() == Instruction::SExt) {
// Op is sign-extended from a smaller type, descale in the smaller type.
unsigned SmallSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
APInt SmallScale = Scale.trunc(SmallSize);
// Suppose Op = sext X, and we descale X as Y * SmallScale. We want to
// descale Op as (sext Y) * Scale. In order to have
// sext (Y * SmallScale) = (sext Y) * Scale
// some conditions need to hold however: SmallScale must sign-extend to
// Scale and the multiplication Y * SmallScale should not overflow.
if (SmallScale.sext(Scale.getBitWidth()) != Scale)
// SmallScale does not sign-extend to Scale.
return nullptr;
assert(SmallScale.exactLogBase2() == logScale);
// Require that Y * SmallScale must not overflow.
RequireNoSignedWrap = true;
// Drill down through the cast.
Parent = std::make_pair(Cast, 0);
Scale = SmallScale;
if (Cast->getOpcode() == Instruction::Trunc) {
// Op is truncated from a larger type, descale in the larger type.
// Suppose Op = trunc X, and we descale X as Y * sext Scale. Then
// trunc (Y * sext Scale) = (trunc Y) * Scale
// always holds. However (trunc Y) * Scale may overflow even if
// trunc (Y * sext Scale) does not, so nsw flags need to be cleared
// from this point up in the expression (see later).
if (RequireNoSignedWrap)
return nullptr;
// Drill down through the cast.
unsigned LargeSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
Parent = std::make_pair(Cast, 0);
Scale = Scale.sext(LargeSize);
// When logScale + 1 equals the bit width of the truncated type, logScale is
// reset to -1 (the "not a power of 2" value) to stay consistent with the
// sign-extended Scale, as the assert below checks.
if (logScale + 1 == (int32_t)Cast->getType()->getPrimitiveSizeInBits())
logScale = -1;
assert(Scale.exactLogBase2() == logScale);
// Unsupported expression, bail out.
return nullptr;
// If Op is zero then Val = Op * Scale.
if (match(Op, m_Zero())) {
NoSignedWrap = true;
return Op;
// We know that we can successfully descale, so from here on we can safely
// modify the IR. Op holds the descaled version of the deepest term in the
// expression. NoSignedWrap is 'true' if multiplying Op by Scale is known
// not to overflow.
if (!Parent.first)
// The expression only had one term.
return Op;
// Rewrite the parent using the descaled version of its operand.
assert(Parent.first->hasOneUse() && "Drilled down when more than one use!");
assert(Op != Parent.first->getOperand(Parent.second) &&
"Descaling was a no-op?");
replaceOperand(*Parent.first, Parent.second, Op);
// Now work back up the expression correcting nsw flags. The logic is based
// on the following observation: if X * Y is known not to overflow as a signed
// multiplication, and Y is replaced by a value Z with smaller absolute value,
// then X * Z will not overflow as a signed multiplication either. As we work
// our way up, having NoSignedWrap 'true' means that the descaled value at the
// current level has strictly smaller absolute value than the original.
Instruction *Ancestor = Parent.first;
do {
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Ancestor)) {
// If the multiplication wasn't nsw then we can't say anything about the
// value of the descaled multiplication, and we have to clear nsw flags
// from this point on up.
bool OpNoSignedWrap = BO->hasNoSignedWrap();
NoSignedWrap &= OpNoSignedWrap;
// NOTE(review): the body of this `if` is not present in this copy (per the
// comment above it should clear the nsw flag on BO) -- confirm upstream.
if (NoSignedWrap != OpNoSignedWrap) {
} else if (Ancestor->getOpcode() == Instruction::Trunc) {
// The fact that the descaled input to the trunc has smaller absolute
// value than the original input doesn't tell us anything useful about
// the absolute values of the truncations.
NoSignedWrap = false;
assert((Ancestor->getOpcode() != Instruction::SExt || NoSignedWrap) &&
"Failed to keep proper track of nsw flags while drilling down?");
if (Ancestor == Val)
// Got to the top, all done!
return Val;
// Move up one level in the expression.
assert(Ancestor->hasOneUse() && "Drilled down when more than one use!");
Ancestor = Ancestor->user_back();
} while (true);
/// Try to simplify a binary operator on fixed-width vectors by moving shuffles
/// relative to the binop. The transforms visible below, in order:
///   1. binop of two concatenating shuffles with the same mask -> concatenation
///      of two narrower binops;
///   2. binop of two single-source shuffles with the same mask -> shuffle of
///      the binop;
///   3. commutative binop of two select-shuffles with commuted operands and the
///      same mask -> plain binop of the sources;
///   4. binop of a one-use single-source shuffle and a constant -> shuffle of a
///      binop with a remapped constant;
///   5. associative + commutative binop with a splat shuffle -> reassociated so
///      the splat is applied after the first binop.
///
/// \param Inst The vector binary operator to fold.
/// \returns A newly created instruction (built with `new ShuffleVectorInst` or
///          BinaryOperator::Create; no insertion point is passed here), or
///          nullptr if Inst is not a fixed vector operation or nothing applies.
// NOTE(review): this text is a lossy excerpt of a unified diff: brace-only
// lines are gone, some statements are truncated (flagged inline), and two
// lines still carry '-' / '+' diff markers. Compare with upstream before
// editing.
Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {
// FIXME: some of this is likely fine for scalable vectors
if (!isa<FixedVectorType>(Inst.getType()))
return nullptr;
BinaryOperator::BinaryOps Opcode = Inst.getOpcode();
Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1);
// NOTE(review): both asserts below are cut off after `==` in this copy; their
// right-hand sides are missing.
assert(cast<VectorType>(LHS->getType())->getElementCount() ==
assert(cast<VectorType>(RHS->getType())->getElementCount() ==
// If both operands of the binop are vector concatenations, then perform the
// narrow binop on each pair of the source operands followed by concatenation
// of the results.
Value *L0, *L1, *R0, *R1;
ArrayRef<int> Mask;
if (match(LHS, m_Shuffle(m_Value(L0), m_Value(L1), m_Mask(Mask))) &&
match(RHS, m_Shuffle(m_Value(R0), m_Value(R1), m_SpecificMask(Mask))) &&
LHS->hasOneUse() && RHS->hasOneUse() &&
cast<ShuffleVectorInst>(LHS)->isConcat() &&
cast<ShuffleVectorInst>(RHS)->isConcat()) {
// This transform does not have the speculative execution constraint as
// below because the shuffle is a concatenation. The new binops are
// operating on exactly the same elements as the existing binop.
// TODO: We could ease the mask requirement to allow different undef lanes,
// but that requires an analysis of the binop-with-undef output value.
// NOTE(review): the statements controlled by the two dyn_cast checks below are
// missing from this copy.
Value *NewBO0 = Builder.CreateBinOp(Opcode, L0, R0);
if (auto *BO = dyn_cast<BinaryOperator>(NewBO0))
Value *NewBO1 = Builder.CreateBinOp(Opcode, L1, R1);
if (auto *BO = dyn_cast<BinaryOperator>(NewBO1))
return new ShuffleVectorInst(NewBO0, NewBO1, Mask);
// It may not be safe to reorder shuffles and things like div, urem, etc.
// because we may trap when executing those ops on unknown vector elements.
// See PR20059.
if (!isSafeToSpeculativelyExecute(&Inst))
return nullptr;
// Helper: build "Opcode X, Y" and wrap it in a shuffle with mask M whose second
// source is undef.
auto createBinOpShuffle = [&](Value *X, Value *Y, ArrayRef<int> M) {
Value *XY = Builder.CreateBinOp(Opcode, X, Y);
if (auto *BO = dyn_cast<BinaryOperator>(XY))
return new ShuffleVectorInst(XY, UndefValue::get(XY->getType()), M);
// If both arguments of the binary operation are shuffles that use the same
// mask and shuffle within a single vector, move the shuffle after the binop.
Value *V1, *V2;
if (match(LHS, m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))) &&
match(RHS, m_Shuffle(m_Value(V2), m_Undef(), m_SpecificMask(Mask))) &&
V1->getType() == V2->getType() &&
(LHS->hasOneUse() || RHS->hasOneUse() || LHS == RHS)) {
// Op(shuffle(V1, Mask), shuffle(V2, Mask)) -> shuffle(Op(V1, V2), Mask)
return createBinOpShuffle(V1, V2, Mask);
// If both arguments of a commutative binop are select-shuffles that use the
// same mask with commuted operands, the shuffles are unnecessary.
// NOTE(review): the line that opens the match on RHS is missing from the
// condition below in this copy.
if (Inst.isCommutative() &&
match(LHS, m_Shuffle(m_Value(V1), m_Value(V2), m_Mask(Mask))) &&
m_Shuffle(m_Specific(V2), m_Specific(V1), m_SpecificMask(Mask)))) {
auto *LShuf = cast<ShuffleVectorInst>(LHS);
auto *RShuf = cast<ShuffleVectorInst>(RHS);
// TODO: Allow shuffles that contain undefs in the mask?
// That is legal, but it reduces undef knowledge.
// TODO: Allow arbitrary shuffles by shuffling after binop?
// That might be legal, but we have to deal with poison.
if (LShuf->isSelect() &&
!is_contained(LShuf->getShuffleMask(), UndefMaskElem) &&
RShuf->isSelect() &&
!is_contained(RShuf->getShuffleMask(), UndefMaskElem)) {
// Example:
// LHS = shuffle V1, V2, <0, 5, 6, 3>
// RHS = shuffle V2, V1, <0, 5, 6, 3>
// LHS + RHS --> (V10+V20, V21+V11, V22+V12, V13+V23) --> V1 + V2
Instruction *NewBO = BinaryOperator::Create(Opcode, V1, V2);
return NewBO;
// If one argument is a shuffle within one vector and the other is a constant,
// try moving the shuffle after the binary operation. This canonicalization
// intends to move shuffles closer to other shuffles and binops closer to
// other binops, so they can be folded. It may also enable demanded elements
// transforms.
unsigned NumElts = cast<FixedVectorType>(Inst.getType())->getNumElements();
Constant *C;
// NOTE(review): the two lines below that start with '-' and '+' are unified-
// diff markers, not C++: the patch replaces the old condition line with one
// that additionally requires `!isa<ConstantExpr>(C)`.
if (match(&Inst,
m_c_BinOp(m_OneUse(m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))),
- m_Constant(C))) &&
+ m_Constant(C))) && !isa<ConstantExpr>(C) &&
cast<FixedVectorType>(V1->getType())->getNumElements() <= NumElts) {
assert(Inst.getType()->getScalarType() == V1->getType()->getScalarType() &&
"Shuffle should not change scalar type");
// Find constant NewC that has property:
// shuffle(NewC, ShMask) = C
// If such constant does not exist (example: ShMask=<0,0> and C=<1,2>)
// reorder is not possible. A 1-to-1 mapping is not required. Example:
// ShMask = <1,1,2,2> and C = <5,5,6,6> --> NewC = <undef,5,6,undef>
bool ConstOp1 = isa<Constant>(RHS);
ArrayRef<int> ShMask = Mask;
// NOTE(review): the initializer of SrcVecNumElts is missing in this copy.
unsigned SrcVecNumElts =
UndefValue *UndefScalar = UndefValue::get(C->getType()->getScalarType());
// NewVecC starts out all-undef and is filled in lane by lane below.
SmallVector<Constant *, 16> NewVecC(SrcVecNumElts, UndefScalar);
bool MayChange = true;
for (unsigned I = 0; I < NumElts; ++I) {
Constant *CElt = C->getAggregateElement(I);
if (ShMask[I] >= 0) {
assert(ShMask[I] < (int)NumElts && "Not expecting narrowing shuffle");
Constant *NewCElt = NewVecC[ShMask[I]];
// Bail out if:
// 1. The constant vector contains a constant expression.
// 2. The shuffle needs an element of the constant vector that can't
// be mapped to a new constant vector.
// 3. This is a widening shuffle that copies elements of V1 into the
// extended elements (extending with undef is allowed).
if (!CElt || (!isa<UndefValue>(NewCElt) && NewCElt != CElt) ||
I >= SrcVecNumElts) {
MayChange = false;
NewVecC[ShMask[I]] = CElt;
// If this is a widening shuffle, we must be able to extend with undef
// elements. If the original binop does not produce an undef in the high
// lanes, then this transform is not safe.
// Similarly for undef lanes due to the shuffle mask, we can only
// transform binops that preserve undef.
// TODO: We could shuffle those non-undef constant values into the
// result by using a constant vector (rather than an undef vector)
// as operand 1 of the new binop, but that might be too aggressive
// for target-independent shuffle creation.
if (I >= SrcVecNumElts || ShMask[I] < 0) {
// Constant-fold the binop with undef in the shuffled operand's position and
// require the result to be undef.
Constant *MaybeUndef =
ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt)
: ConstantExpr::get(Opcode, CElt, UndefScalar);
if (!isa<UndefValue>(MaybeUndef)) {
MayChange = false;
if (MayChange) {
Constant *NewC = ConstantVector::get(NewVecC);
// It may not be safe to execute a binop on a vector with undef elements
// because the entire instruction can be folded to undef or create poison
// that did not exist in the original code.
if (Inst.isIntDivRem() || (Inst.isShift() && ConstOp1))
NewC = getSafeVectorConstantForBinop(Opcode, NewC, ConstOp1);
// Op(shuffle(V1, Mask), C) -> shuffle(Op(V1, NewC), Mask)
// Op(C, shuffle(V1, Mask)) -> shuffle(Op(NewC, V1), Mask)
Value *NewLHS = ConstOp1 ? V1 : NewC;
Value *NewRHS = ConstOp1 ? NewC : V1;
return createBinOpShuffle(NewLHS, NewRHS, Mask);
// Try to reassociate to sink a splat shuffle after a binary operation.
if (Inst.isAssociative() && Inst.isCommutative()) {
// Canonicalize shuffle operand as LHS.
if (isa<ShuffleVectorInst>(RHS))
std::swap(LHS, RHS);
Value *X;
ArrayRef<int> MaskC;
int SplatIndex;
BinaryOperator *BO;
// Required shape: LHS is a one-use splat (or undef) shuffle of X, with X of
// Inst's type, and RHS is a one-use binop with the same opcode.
if (!match(LHS,
m_OneUse(m_Shuffle(m_Value(X), m_Undef(), m_Mask(MaskC)))) ||
!match(MaskC, m_SplatOrUndefMask(SplatIndex)) ||
X->getType() != Inst.getType() || !match(RHS, m_OneUse(m_BinOp(BO))) ||
BO->getOpcode() != Opcode)
return nullptr;
// FIXME: This may not be safe if the analysis allows undef elements. By
// moving 'Y' before the splat shuffle, we are implicitly assuming
// that it is not undef/poison at the splat index.
Value *Y, *OtherOp;
if (isSplatValue(BO->getOperand(0), SplatIndex)) {
Y = BO->getOperand(0);
OtherOp = BO->getOperand(1);
} else if (isSplatValue(BO->getOperand(1), SplatIndex)) {
Y = BO->getOperand(1);
OtherOp = BO->getOperand(0);
} else {
return nullptr;
// X and Y are splatted values, so perform the binary operation on those
// values followed by a splat followed by the 2nd binary operation:
// bo (splat X), (bo Y, OtherOp) --> bo (splat (bo X, Y)), OtherOp
Value *NewBO = Builder.CreateBinOp(Opcode, X, Y);
UndefValue *Undef = UndefValue::get(Inst.getType());
// The new mask selects SplatIndex in every lane.
SmallVector<int, 8> NewMask(MaskC.size(), SplatIndex);
Value *NewSplat = Builder.CreateShuffleVector(NewBO, Undef, NewMask);
Instruction *R = BinaryOperator::Create(Opcode, NewSplat, OtherOp);
// Intersect FMF on both new binops. Other (poison-generating) flags are
// dropped to be safe.
// NOTE(review): the statements that perform the FMF intersection are not
// present in this copy.
if (isa<FPMathOperator>(R)) {
if (auto *NewInstBO = dyn_cast<BinaryOperator>(NewBO))
return R;
return nullptr;
/// Try to narrow the width of a binop if at least 1 operand is an extend
/// of a value. This requires a potentially expensive known bits check to make
/// sure the narrow op does not overflow.
///
/// \param BO the wide binary operator whose operands are inspected.
/// \returns a cast (sext or zext, chosen by the extension kind matched on the
/// first inspected operand) of a newly built narrow binop back to BO's type,
/// or nullptr on the bail-out paths below.
Instruction *InstCombiner::narrowMathIfNoOverflow(BinaryOperator &BO) {
// We need at least one extended operand.
Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1);
// If this is a sub, we swap the operands since we always want an extension
// on the RHS. The LHS can be an extension or a constant.
if (BO.getOpcode() == Instruction::Sub)
std::swap(Op0, Op1);
// X receives the narrow source of the extension on Op0; IsSext records
// whether that extension was a sext (true) or a zext (false).
Value *X;
bool IsSext = match(Op0, m_SExt(m_Value(X)));
if (!IsSext && !match(Op0, m_ZExt(m_Value(X))))
return nullptr;
// If both operands are the same extension from the same source type and we
// can eliminate at least one (hasOneUse), this might work.
CastInst::CastOps CastOpc = IsSext ? Instruction::SExt : Instruction::ZExt;
Value *Y;
if (!(match(Op1, m_ZExtOrSExt(m_Value(Y))) && X->getType() == Y->getType() &&
cast<Operator>(Op1)->getOpcode() == CastOpc &&
(Op0->hasOneUse() || Op1->hasOneUse()))) {
// If that did not match, see if we have a suitable constant operand.
// Truncating and extending must produce the same constant.
Constant *WideC;
if (!Op0->hasOneUse() || !match(Op1, m_Constant(WideC)))
return nullptr;
// Round-trip check: truncate to X's type, re-extend with the same cast
// opcode, and require the result to be the original wide constant.
Constant *NarrowC = ConstantExpr::getTrunc(WideC, X->getType());
if (ConstantExpr::getCast(CastOpc, NarrowC, BO.getType()) != WideC)
return nullptr;
Y = NarrowC;
// Swap back now that we found our operands.
if (BO.getOpcode() == Instruction::Sub)
std::swap(X, Y);
// Both operands have narrow versions. Last step: the math must not overflow
// in the narrow width.
if (!willNotOverflow(BO.getOpcode(), X, Y, BO, IsSext))
return nullptr;
// bo (ext X), (ext Y) --> ext (bo X, Y)
// bo (ext X), C --> ext (bo X, C')
Value *NarrowBO = Builder.CreateBinOp(BO.getOpcode(), X, Y, "narrow");
// NOTE(review): the statements that use NewBinOp under `if (IsSext)` (and the
// closing braces of this function) are not visible in this listing; as shown,
// the final return reads as the body of `if (IsSext)`. Confirm against the
// upstream file before relying on the control flow here.
if (auto *NewBinOp = dyn_cast<BinaryOperator>(NarrowBO)) {
if (IsSext)
return CastInst::Create(CastOpc, NarrowBO, BO.getType());
/// Decide whether the GEP formed by merging \p GEP1 and \p GEP2 may carry the
/// 'inbounds' flag.
///
/// The merged GEP is inbounds only when at least one of the inputs is inbounds
/// and each input is either inbounds itself or has all-zero indices.
static bool isMergedGEPInBounds(GEPOperator &GEP1, GEPOperator &GEP2) {
  const bool FirstInBounds = GEP1.isInBounds();
  const bool SecondInBounds = GEP2.isInBounds();

  // Neither input is inbounds: nothing to propagate.
  if (!(FirstInBounds || SecondInBounds))
    return false;

  // Each side must be inbounds on its own or contribute no offset at all.
  const bool FirstOk = FirstInBounds || GEP1.hasAllZeroIndices();
  if (!FirstOk)
    return false;
  return SecondInBounds || GEP2.hasAllZeroIndices();
}
/// Thread a GEP operation with constant indices through the constant true/false
/// arms of a select.
///
/// \param GEP the getelementptr to fold; it must have all-constant indices.
/// \param Builder used to create the GEPs applied to each select arm.
/// \returns a new select over the two re-indexed constants, or nullptr if the
/// pattern does not match.
static Instruction *foldSelectGEP(GetElementPtrInst &GEP,
InstCombiner::BuilderTy &Builder) {
if (!GEP.hasAllConstantIndices())
return nullptr;
Instruction *Sel;
Value *Cond;
Constant *TrueC, *FalseC;
// NOTE(review): the parentheses of the condition below do not balance in this
// listing; a line between the two shown (presumably a second match applied to
// Sel with the m_Select pattern) appears to be missing. Confirm upstream.
if (!match(GEP.getPointerOperand(), m_Instruction(Sel)) ||
m_Select(m_Value(Cond), m_Constant(TrueC), m_Constant(FalseC))))
return nullptr;
// gep (select Cond, TrueC, FalseC), IndexC --> select Cond, TrueC', FalseC'
// Propagate 'inbounds' and metadata from existing instructions.
// Note: using IRBuilder to create the constants for efficiency.
SmallVector<Value *, 4> IndexC(GEP.idx_begin(), GEP.idx_end());
bool IsInBounds = GEP.isInBounds();
// Apply the same index list to each arm, keeping the original GEP's inbounds
// setting on both.
Value *NewTrueC = IsInBounds ? Builder.CreateInBoundsGEP(TrueC, IndexC)
: Builder.CreateGEP(TrueC, IndexC);
Value *NewFalseC = IsInBounds ? Builder.CreateInBoundsGEP(FalseC, IndexC)
: Builder.CreateGEP(FalseC, IndexC);
// Sel is passed as the last argument so the new select can take over
// properties of the original select (see "metadata" in the comment above).
return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel);
// Combine/simplify a getelementptr instruction.
//
// Tries a sequence of folds in order and returns as soon as one fires. The
// visible return values are: the result of replaceInstUsesWith(), &GEP when the
// instruction was updated in place, a newly created instruction, or nullptr on
// the paths where no fold applies.
//
// NOTE(review): this listing has lost lines (closing braces, some `continue`
// statements, and several initializer/continuation lines, e.g. the initializer
// of NewScalarIndexTy below). Comments added here describe only what is
// visible; confirm control flow against the upstream file.
Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Snapshot of all operands (pointer operand followed by the indices).
SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
Type *GEPType = GEP.getType();
Type *GEPEltType = GEP.getSourceElementType();
// Folds that need the alloc size of the source element type are skipped when
// that type is a scalable vector (see the uses of this flag below).
bool IsGEPSrcEleScalable = isa<ScalableVectorType>(GEPEltType);
if (Value *V = SimplifyGEPInst(GEPEltType, Ops, SQ.getWithInstruction(&GEP)))
return replaceInstUsesWith(GEP, V);
// For vector geps, use the generic demanded vector support.
// Skip if GEP return type is scalable. The number of elements is unknown at
// compile-time.
if (auto *GEPFVTy = dyn_cast<FixedVectorType>(GEPType)) {
auto VWidth = GEPFVTy->getNumElements();
APInt UndefElts(VWidth, 0);
APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
if (Value *V = SimplifyDemandedVectorElts(&GEP, AllOnesEltMask,
UndefElts)) {
if (V != &GEP)
return replaceInstUsesWith(GEP, V);
return &GEP;
// TODO: 1) Scalarize splat operands, 2) scalarize entire instruction if
// possible (decide on canonical form for pointer broadcast), 3) exploit
// undef elements to decrease demanded bits
Value *PtrOp = GEP.getOperand(0);
// Eliminate unneeded casts for indices, and replace indices which displace
// by multiples of a zero size type with zero.
bool MadeChange = false;
// Index width may not be the same width as pointer width.
// Data layout chooses the right type based on supported integer types.
// NOTE(review): the initializer of NewScalarIndexTy is missing from this
// listing.
Type *NewScalarIndexTy =
gep_type_iterator GTI = gep_type_begin(GEP);
for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E;
++I, ++GTI) {
// Skip indices into struct types.
if (GTI.isStruct())
Type *IndexTy = (*I)->getType();
// NOTE(review): the condition of the conditional expression below and the
// second argument of VectorType::get are not visible in this listing.
Type *NewIndexType =
? VectorType::get(NewScalarIndexTy,
: NewScalarIndexTy;
// If the element type has zero size then any index over it is equivalent
// to an index of zero, so replace it with zero if it is not zero already.
Type *EltTy = GTI.getIndexedType();
if (EltTy->isSized() && DL.getTypeAllocSize(EltTy).isZero())
if (!isa<Constant>(*I) || !match(I->get(), m_Zero())) {
*I = Constant::getNullValue(NewIndexType);
MadeChange = true;
if (IndexTy != NewIndexType) {
// If we are using a wider index than needed for this platform, shrink
// it to what we need. If narrower, sign-extend it to what we need.
// This explicit cast can make subsequent optimizations more obvious.
*I = Builder.CreateIntCast(*I, NewIndexType, true);
MadeChange = true;
if (MadeChange)
return &GEP;
// Check to see if the inputs to the PHI node are getelementptr instructions.
if (auto *PN = dyn_cast<PHINode>(PtrOp)) {
auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
if (!Op1)
return nullptr;
// Don't fold a GEP into itself through a PHI node. This can only happen
// through the back-edge of a loop. Folding a GEP into itself means that
// the value of the previous iteration needs to be stored in the meantime,
// thus requiring an additional register variable to be live, but not
// actually achieving anything (the GEP still needs to be executed once per
// loop iteration).
if (Op1 == &GEP)
return nullptr;
// DI is the single operand position at which the incoming GEPs are allowed
// to differ; -1 means no difference has been seen so far.
int DI = -1;
for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
auto *Op2 = dyn_cast<GetElementPtrInst>(*I);
if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands())
return nullptr;
// As for Op1 above, don't try to fold a GEP into itself.
if (Op2 == &GEP)
return nullptr;
// Keep track of the type as we walk the GEP.
Type *CurTy = nullptr;
for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) {
if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType())
return nullptr;
if (Op1->getOperand(J) != Op2->getOperand(J)) {
if (DI == -1) {
// We have not seen any differences in the GEPs feeding the
// PHI yet, so we record this one if it is allowed to be a
// variable.
// The first two arguments can vary for any GEP, the rest have to be
// static for struct slots
if (J > 1) {
assert(CurTy && "No current type?");
if (CurTy->isStructTy())
return nullptr;
DI = J;
} else {
// The GEP is different by more than one input. While this could be
// extended to support GEPs that vary by more than one variable it
// doesn't make sense since it greatly increases the complexity and
// would result in an R+R+R addressing mode which no backend
// directly supports and would need to be broken into several
// simpler instructions anyway.
return nullptr;
// Sink down a layer of the type for the next iteration.
if (J > 0) {
if (J == 1) {
CurTy = Op1->getSourceElementType();
} else {
CurTy =
GetElementPtrInst::getTypeAtIndex(CurTy, Op1->getOperand(J));
// If not all GEPs are identical we'll have to create a new PHI node.
// Check that the old PHI node has only one use so that it will get
// removed.
if (DI != -1 && !PN->hasOneUse())
return nullptr;
auto *NewGEP = cast<GetElementPtrInst>(Op1->clone());
if (DI == -1) {
// All the GEPs feeding the PHI are identical. Clone one down into our
// BB so that it can be merged with the current GEP.
} else {
// All the GEPs feeding the PHI differ at a single offset. Clone a GEP
// into the current block so it can be merged, and create a new PHI to
// set that index.
// NOTE(review): several lines of this branch (the rest of the CreatePHI
// call, the body of the operand loop, and the start of the statement that
// inserts NewGEP) are not visible in this listing.
PHINode *NewPN;
IRBuilderBase::InsertPointGuard Guard(Builder);
NewPN = Builder.CreatePHI(Op1->getOperand(DI)->getType(),
for (auto &I : PN->operands())
NewGEP->setOperand(DI, NewPN);
GEP.getParent()->getFirstInsertionPt(), NewGEP);
// From here on the cloned GEP is the pointer operand being analyzed.
replaceOperand(GEP, 0, NewGEP);
PtrOp = NewGEP;
// Combine Indices - If the source pointer to this getelementptr instruction
// is a getelementptr instruction, combine the indices of the two
// getelementptr instructions into a single instruction.
if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) {
if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
return nullptr;
// Try to reassociate loop invariant GEP chains to enable LICM.
if (LI && Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
Src->hasOneUse()) {
if (Loop *L = LI->getLoopFor(GEP.getParent())) {
Value *GO1 = GEP.getOperand(1);
Value *SO1 = Src->getOperand(1);
// Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
// invariant: this breaks the dependence between GEPs and allows LICM
// to hoist the invariant part out of the loop.
if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
// We have to be careful here.
// We have something like:
// %src = getelementptr <ty>, <ty>* %base, <ty> %idx
// %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2
// If we just swap idx & idx2 then we could inadvertently
// change %src from a vector to a scalar, or vice versa.
// Cases:
// 1) %base a scalar & idx a scalar & idx2 a vector
// => Swapping idx & idx2 turns %src into a vector type.
// 2) %base a scalar & idx a vector & idx2 a scalar
// => Swapping idx & idx2 turns %src into a scalar type
// 3) %base, %idx, and %idx2 are scalars
// => %src & %gep are scalars
// => swapping idx & idx2 is safe
// 4) %base a vector
// => %src is a vector
// => swapping idx & idx2 is safe.
auto *SO0 = Src->getOperand(0);
auto *SO0Ty = SO0->getType();
if (!isa<VectorType>(GEPType) || // case 3
isa<VectorType>(SO0Ty)) { // case 4
Src->setOperand(1, GO1);
GEP.setOperand(1, SO1);
return &GEP;
} else {
// Case 1 or 2
// -- have to recreate %src & %gep
// put NewSrc at same location as %src
auto *NewSrc = cast<GetElementPtrInst>(
Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName()));
auto *NewGEP = GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1});
return NewGEP;
// Note that if our source is a gep chain itself then we wait for that
// chain to be resolved before we perform this transformation. This
// avoids us creating a TON of code in some cases.
if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0)))
if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
return nullptr; // Wait until our source is folded to completion.
// Index list of the merged GEP; left empty when neither merge form applies.
SmallVector<Value*, 8> Indices;
// Find out whether the last index in the source GEP is a sequential idx.
bool EndsWithSequential = false;
for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
I != E; ++I)
EndsWithSequential = I.isSequential();
// Can we combine the two pointer arithmetics offsets?
if (EndsWithSequential) {
// Replace: gep (gep %P, long B), long A, ...
// With: T = long A+B; gep %P, T, ...
Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
Value *GO1 = GEP.getOperand(1);
// If they aren't the same type, then the input hasn't been processed
// by the loop above yet (which canonicalizes sequential index types to
// intptr_t). Just avoid transforming this until the input has been
// normalized.
if (SO1->getType() != GO1->getType())
return nullptr;
Value *Sum =
SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
// Only do the combine when we are sure the cost after the
// merge is never more than that before the merge.
if (Sum == nullptr)
return nullptr;
// Update the GEP in place if possible.
if (Src->getNumOperands() == 2) {
GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)));
replaceOperand(GEP, 0, Src->getOperand(0));
replaceOperand(GEP, 1, Sum);
return &GEP;
Indices.append(Src->op_begin()+1, Src->op_end()-1);
Indices.append(GEP.op_begin()+2, GEP.op_end());
} else if (isa<Constant>(*GEP.idx_begin()) &&
cast<Constant>(*GEP.idx_begin())->isNullValue() &&
Src->getNumOperands() != 1) {
// Otherwise we can do the fold if the first index of the GEP is a zero
Indices.append(Src->op_begin()+1, Src->op_end());
Indices.append(GEP.idx_begin()+1, GEP.idx_end());
// NOTE(review): the trailing argument(s) and closing parentheses of the two
// Create calls below are not visible in this listing.
if (!Indices.empty())
return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))
? GetElementPtrInst::CreateInBounds(
Src->getSourceElementType(), Src->getOperand(0), Indices,
: GetElementPtrInst::Create(Src->getSourceElementType(),
Src->getOperand(0), Indices,
// Skip if GEP source element type is scalable. The type alloc size is unknown
// at compile-time.
if (GEP.getNumIndices() == 1 && !IsGEPSrcEleScalable) {
unsigned AS = GEP.getPointerAddressSpace();
if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
DL.getIndexSizeInBits(AS)) {
uint64_t TyAllocSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
// Matched is set when the index is V itself (element size 1), or V shifted
// right / divided by a constant that equals the element alloc size.
bool Matched = false;
uint64_t C;
Value *V = nullptr;
if (TyAllocSize == 1) {
V = GEP.getOperand(1);
Matched = true;
} else if (match(GEP.getOperand(1),
m_AShr(m_Value(V), m_ConstantInt(C)))) {
if (TyAllocSize == 1ULL << C)
Matched = true;
} else if (match(GEP.getOperand(1),
m_SDiv(m_Value(V), m_ConstantInt(C)))) {
if (TyAllocSize == C)
Matched = true;
if (Matched) {
// Canonicalize (gep i8* X, -(ptrtoint Y))
// to (inttoptr (sub (ptrtoint X), (ptrtoint Y)))
// The GEP pattern is emitted by the SCEV expander for certain kinds of
// pointer arithmetic.
if (match(V, m_Neg(m_PtrToInt(m_Value())))) {
Operator *Index = cast<Operator>(V);
Value *PtrToInt = Builder.CreatePtrToInt(PtrOp, Index->getType());
Value *NewSub = Builder.CreateSub(PtrToInt, Index->getOperand(1));
return CastInst::Create(Instruction::IntToPtr, NewSub, GEPType);
// Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X))
// to (bitcast Y)
// NOTE(review): the second operand of the m_Sub pattern below is not
// visible in this listing.
Value *Y;
if (match(V, m_Sub(m_PtrToInt(m_Value(Y)),
return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y, GEPType);
// We do not handle pointer-vector geps here.
if (GEPType->isVectorTy())
return nullptr;
// Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
Value *StrippedPtr = PtrOp->stripPointerCasts();
PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType());
if (StrippedPtr != PtrOp) {
bool HasZeroPointerIndex = false;
Type *StrippedPtrEltTy = StrippedPtrTy->getElementType();
if (auto *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
HasZeroPointerIndex = C->isZero();
// Transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ...
// into : GEP [10 x i8]* X, i32 0, ...
// Likewise, transform: GEP (bitcast i8* X to [0 x i8]*), i32 0, ...
// into : GEP i8* X, ...
// This occurs when the program declares an array extern like "int X[];"
if (HasZeroPointerIndex) {
if (auto *CATy = dyn_cast<ArrayType>(GEPEltType)) {
// GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
if (CATy->getElementType() == StrippedPtrEltTy) {
// -> GEP i8* X, ...
SmallVector<Value*, 8> Idx(GEP.idx_begin()+1, GEP.idx_end());
GetElementPtrInst *Res = GetElementPtrInst::Create(
StrippedPtrEltTy, StrippedPtr, Idx, GEP.getName());
if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace())
return Res;
// Insert Res, and create an addrspacecast.
// e.g.,
// GEP (addrspacecast i8 addrspace(1)* X to [0 x i8]*), i32 0, ...
// ->
// %0 = GEP i8 addrspace(1)* X, ...
// addrspacecast i8 addrspace(1)* %0 to i8*
return new AddrSpaceCastInst(Builder.Insert(Res), GEPType);
if (auto *XATy = dyn_cast<ArrayType>(StrippedPtrEltTy)) {
// GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
if (CATy->getElementType() == XATy->getElementType()) {
// -> GEP [10 x i8]* X, i32 0, ...
// At this point, we know that the cast source type is a pointer
// to an array of the same type as the destination pointer
// array. Because the array type is never stepped over (there
// is a leading zero) we can fold the cast into this GEP.
if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) {
return replaceOperand(GEP, 0, StrippedPtr);
// Cannot replace the base pointer directly because StrippedPtr's
// address space is different. Instead, create a new GEP followed by
// an addrspacecast.
// e.g.,
// GEP (addrspacecast [10 x i8] addrspace(1)* X to [0 x i8]*),
// i32 0, ...
// ->
// %0 = GEP [10 x i8] addrspace(1)* X, ...
// addrspacecast i8 addrspace(1)* %0 to i8*
// NOTE(review): the condition selecting between the two builder calls
// below is not visible in this listing.
SmallVector<Value*, 8> Idx(GEP.idx_begin(), GEP.idx_end());
Value *NewGEP =
? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
Idx, GEP.getName())
: Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx,
return new AddrSpaceCastInst(NewGEP, GEPType);
} else if (GEP.getNumOperands() == 2 && !IsGEPSrcEleScalable) {
// Skip if GEP source element type is scalable. The type alloc size is
// unknown at compile-time.
// Transform things like: %t = getelementptr i32*
// bitcast ([2 x i32]* %str to i32*), i32 %V into: %t1 = getelementptr [2
// x i32]* %str, i32 0, i32 %V; bitcast
if (StrippedPtrEltTy->isArrayTy() &&
DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType()) ==
DL.getTypeAllocSize(GEPEltType)) {
Type *IdxType = DL.getIndexType(GEPType);
Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
Value *NewGEP =
? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, Idx,
: Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx,
// V and GEP are both pointer types --> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, GEPType);
// Transform things like:
// %V = mul i64 %N, 4
// %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V
// into: %t1 = getelementptr i32* %arr, i32 %N; bitcast
if (GEPEltType->isSized() && StrippedPtrEltTy->isSized()) {
// Check that changing the type amounts to dividing the index by a scale
// factor.
uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
uint64_t SrcSize = DL.getTypeAllocSize(StrippedPtrEltTy).getFixedSize();
if (ResSize && SrcSize % ResSize == 0) {
Value *Idx = GEP.getOperand(1);
unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
uint64_t Scale = SrcSize / ResSize;
// Earlier transforms ensure that the index has the right type
// according to Data Layout, which considerably simplifies the
// logic by eliminating implicit casts.
assert(Idx->getType() == DL.getIndexType(GEPType) &&
"Index type does not match the Data Layout preferences");
bool NSW;
if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
// Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
// If the multiplication NewIdx * Scale may overflow then the new
// GEP may not be "inbounds".
Value *NewGEP =
GEP.isInBounds() && NSW
? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
NewIdx, GEP.getName())
: Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, NewIdx,
// The NewGEP must be pointer typed, so must the old one -> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
// Similarly, transform things like:
// getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
// (where tmp = 8*tmp2) into:
// getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
if (GEPEltType->isSized() && StrippedPtrEltTy->isSized() &&
StrippedPtrEltTy->isArrayTy()) {
// Check that changing to the array element type amounts to dividing the
// index by a scale factor.
// NOTE(review): the initializer of ArrayEltSize is not visible in this
// listing.
uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
uint64_t ArrayEltSize =
if (ResSize && ArrayEltSize % ResSize == 0) {
Value *Idx = GEP.getOperand(1);
unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
uint64_t Scale = ArrayEltSize / ResSize;
// Earlier transforms ensure that the index has the right type
// according to the Data Layout, which considerably simplifies
// the logic by eliminating implicit casts.
assert(Idx->getType() == DL.getIndexType(GEPType) &&
"Index type does not match the Data Layout preferences");
bool NSW;
if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
// Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
// If the multiplication NewIdx * Scale may overflow then the new
// GEP may not be "inbounds".
Type *IndTy = DL.getIndexType(GEPType);
Value *Off[2] = {Constant::getNullValue(IndTy), NewIdx};
Value *NewGEP =
GEP.isInBounds() && NSW
? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
Off, GEP.getName())
: Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Off,
// The NewGEP must be pointer typed, so must the old one -> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
// addrspacecast between types is canonicalized as a bitcast, then an
// addrspacecast. To take advantage of the below bitcast + struct GEP, look
// through the addrspacecast.
Value *ASCStrippedPtrOp = PtrOp;
if (auto *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) {
// X = bitcast A addrspace(1)* to B addrspace(1)*
// Y = addrspacecast A addrspace(1)* to B addrspace(2)*
// Z = gep Y, <...constant indices...>
// Into an addrspacecasted GEP of the struct.
if (auto *BC = dyn_cast<BitCastInst>(ASC->getOperand(0)))
ASCStrippedPtrOp = BC;
if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) {
Value *SrcOp = BCI->getOperand(0);
PointerType *SrcType = cast<PointerType>(BCI->getSrcTy());
Type *SrcEltType = SrcType->getElementType();
// GEP directly using the source operand if this GEP is accessing an element
// of a bitcasted pointer to vector or array of the same dimensions:
// gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z
// gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z
// The helper requires equal element type, equal element count, and equal
// alloc size for the array and vector types.
auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy,
const DataLayout &DL) {
auto *VecVTy = cast<VectorType>(VecTy);
return ArrTy->getArrayElementType() == VecVTy->getElementType() &&
ArrTy->getArrayNumElements() == VecVTy->getNumElements() &&
DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy);
if (GEP.getNumOperands() == 3 &&
((GEPEltType->isArrayTy() && SrcEltType->isVectorTy() &&
areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) ||
(GEPEltType->isVectorTy() && SrcEltType->isArrayTy() &&
areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) {
// Create a new GEP here, as using `setOperand()` followed by
// `setSourceElementType()` won't actually update the type of the
// existing GEP Value. Causing issues if this Value is accessed when
// constructing an AddrSpaceCastInst
Value *NGEP =
? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]})
: Builder.CreateGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]});
// Preserve GEP address space to satisfy users
if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
return new AddrSpaceCastInst(NGEP, GEPType);
return replaceInstUsesWith(GEP, NGEP);
// See if we can simplify:
// X = bitcast A* to B*
// Y = gep X, <...constant indices...>
// into a gep of the original struct. This is important for SROA and alias
// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEPType);
APInt Offset(OffsetBits, 0);
if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset)) {
// If this GEP instruction doesn't move the pointer, just replace the GEP
// with a bitcast of the real input to the dest type.
if (!Offset) {
// If the bitcast is of an allocation, and the allocation will be
// converted to match the type of the cast, don't touch this.
if (isa<AllocaInst>(SrcOp) || isAllocationFn(SrcOp, &TLI)) {
// See if the bitcast simplifies, if so, don't nuke this GEP yet.
if (Instruction *I = visitBitCast(*BCI)) {
if (I != BCI) {
BCI->getParent()->getInstList().insert(BCI->getIterator(), I);
replaceInstUsesWith(*BCI, I);
return &GEP;
if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace())
return new AddrSpaceCastInst(SrcOp, GEPType);
return new BitCastInst(SrcOp, GEPType);
// Otherwise, if the offset is non-zero, we need to find out if there is a
// field at Offset in 'A's type. If so, we can pull the cast through the
// GEP.
SmallVector<Value*, 8> NewIndices;
if (FindElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices)) {
Value *NGEP =
? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices)
: Builder.CreateGEP(SrcEltType, SrcOp, NewIndices);
// Pick the cheapest way to get back to the GEP's result type: none, an
// addrspacecast if the address space differs, otherwise a bitcast.
if (NGEP->getType() == GEPType)
return replaceInstUsesWith(GEP, NGEP);
if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
return new AddrSpaceCastInst(NGEP, GEPType);
return new BitCastInst(NGEP, GEPType);
// For a GEP that is not already inbounds: if the underlying pointer is an
// alloca and the constant offset is non-negative and no larger than
// AllocSize, re-create the GEP as inbounds.
// NOTE(review): the initializers of IdxWidth, UnderlyingPtrOp and AllocSize,
// and the tail of the CreateInBounds call, are not visible in this listing.
if (!GEP.isInBounds()) {
unsigned IdxWidth =
APInt BasePtrOffset(IdxWidth, 0);
Value *UnderlyingPtrOp =
if (auto *AI = dyn_cast<AllocaInst>(UnderlyingPtrOp)) {
if (GEP.accumulateConstantOffset(DL, BasePtrOffset) &&
BasePtrOffset.isNonNegative()) {
APInt AllocSize(
if (BasePtrOffset.ule(AllocSize)) {
return GetElementPtrInst::CreateInBounds(
GEP.getSourceElementType(), PtrOp, makeArrayRef(Ops).slice(1),
// Last resort: try threading the GEP through a select of constants.
if (Instruction *R = foldSelectGEP(GEP, Builder))
return R;
return nullptr;
/// Return true if \p V is known to never compare equal to the allocation
/// \p AI, provided that allocation has not escaped.
///
/// Three kinds of value qualify: a null pointer constant, a load whose address
/// is a global variable, and an alloc-like call that is not \p AI itself.
static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI,
                                         Instruction *AI) {
  if (isa<ConstantPointerNull>(V))
    return true;

  // A load is accepted only when it reads directly from a global variable.
  auto *Load = dyn_cast<LoadInst>(V);
  if (Load != nullptr) {
    Value *LoadedFrom = Load->getPointerOperand();
    return isa<GlobalVariable>(LoadedFrom);
  }

  // Two distinct allocations will never be equal. This depends on
  // isAllocLikeFn not looking through bitcasts (LookThroughBitCast being
  // false): otherwise a bitcast chain of AI (e.g. i8* -> i32* -> i8*) could
  // be classified as a different allocation even though it is the same one.
  const bool IsAllocation = isAllocLikeFn(V, TLI);
  return IsAllocation && V != AI;
}
// Decide whether the allocation AI can be removed by inspecting its
// (transitive) users with a worklist.
//
// Returns false as soon as a user is found that cannot be handled: a
// non-equality icmp, an icmp against something that might equal the
// allocation, a volatile mem intrinsic or one whose destination is not the
// current pointer, a volatile store or a store that does not store *to* the
// current pointer, and the other `return false` paths below. Returns true once
// the worklist drains.
//
// NOTE(review): this listing has lost lines (the initial push onto Worklist,
// the `default:` labels, `break`/`continue` statements, and presumably the
// statements that record users in `Users` and push derived pointers back onto
// the worklist). Confirm the exact control flow against the upstream file.
static bool isAllocSiteRemovable(Instruction *AI,
SmallVectorImpl<WeakTrackingVH> &Users,
const TargetLibraryInfo *TLI) {
SmallVector<Instruction*, 4> Worklist;
do {
// PI is the pointer-producing instruction whose users are examined in this
// round.
Instruction *PI = Worklist.pop_back_val();
for (User *U : PI->users()) {
Instruction *I = cast<Instruction>(U);
switch (I->getOpcode()) {
// Give up the moment we see something we can't handle.
return false;
case Instruction::AddrSpaceCast:
case Instruction::BitCast:
case Instruction::GetElementPtr:
case Instruction::ICmp: {
ICmpInst *ICI = cast<ICmpInst>(I);
// We can fold eq/ne comparisons with null to false/true, respectively.
// We also fold comparisons in some conditions provided the alloc has
// not escaped (see isNeverEqualToUnescapedAlloc).
if (!ICI->isEquality())
return false;
// Pick the operand that is not the pointer derived from the allocation.
unsigned OtherIndex = (ICI->getOperand(0) == PI) ? 1 : 0;
if (!isNeverEqualToUnescapedAlloc(ICI->getOperand(OtherIndex), TLI, AI))
return false;
case Instruction::Call:
// Ignore no-op and store intrinsics.
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
return false;
case Intrinsic::memmove:
case Intrinsic::memcpy:
case Intrinsic::memset: {
MemIntrinsic *MI = cast<MemIntrinsic>(II);
// Only non-volatile writes whose destination is the pointer itself are
// acceptable.
if (MI->isVolatile() || MI->getRawDest() != PI)
return false;
case Intrinsic::assume:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
case Intrinsic::objectsize:
if (isFreeCall(I, TLI)) {
return false;
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(I);
// The pointer must be the store's address operand, and the store must not
// be volatile.
if (SI->isVolatile() || SI->getPointerOperand() != PI)
return false;
llvm_unreachable("missing a return?");
} while (!Worklist.empty());
return true;
// Try to delete the allocation MI together with its users.
//
// When isAllocSiteRemovable() accepts MI, the collected users are processed in
// two passes (objectsize intrinsics first, then everything else) and MI is
// erased via eraseInstFromFunction(); otherwise nullptr is returned.
//
// NOTE(review): this listing has lost lines (e.g. the bodies of the
// `if (!Users[i])` checks, the ICmpInst branch, the per-user erase, and the
// body of the final DII loop). Comments describe only what is visible.
Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
// If we have a malloc call which is only used in any amount of comparisons to
// null and free calls, delete the calls and replace the comparisons with true
// or false as appropriate.
// This is based on the principle that we can substitute our own allocation
// function (which will never return null) rather than knowledge of the
// specific function being called. In some sense this can change the permitted
// outputs of a program (when we convert a malloc to an alloca, the fact that
// the allocation is now on the stack is potentially visible, for example),
// but we believe in a permissible manner.
SmallVector<WeakTrackingVH, 64> Users;
// If we are removing an alloca with a dbg.declare, insert dbg.value calls
// before each store.
TinyPtrVector<DbgVariableIntrinsic *> DIIs;
std::unique_ptr<DIBuilder> DIB;
if (isa<AllocaInst>(MI)) {
DIIs = FindDbgAddrUses(&MI);
DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
if (isAllocSiteRemovable(&MI, Users, &TLI)) {
// First pass: lower objectsize intrinsics.
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
// Lowering all @llvm.objectsize calls first because they may
// use a bitcast/GEP of the alloca we are removing.
if (!Users[i])
Instruction *I = cast<Instruction>(&*Users[i]);
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
if (II->getIntrinsicID() == Intrinsic::objectsize) {
Value *Result =
lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/true);
replaceInstUsesWith(*I, Result);
Users[i] = nullptr; // Skip examining in the next loop.
// Second pass: handle the remaining users (compares, stores, everything
// else).
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
if (!Users[i])
Instruction *I = cast<Instruction>(&*Users[i]);
if (ICmpInst *C = dyn_cast<ICmpInst>(I)) {
} else if (auto *SI = dyn_cast<StoreInst>(I)) {
// Keep debug info alive: turn each dbg.declare-style use into a dbg.value
// for this store.
for (auto *DII : DIIs)
ConvertDebugDeclareToDebugValue(DII, SI, *DIB);
} else {
// Casts, GEP, or anything else: we're about to delete this instruction,
// so it can not have any valid uses.
replaceInstUsesWith(*I, UndefValue::get(I->getType()));
if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) {
// Replace invoke with a NOP intrinsic to maintain the original CFG
Module *M = II->getModule();
Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing);
InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(),
None, "", II->getParent());
for (auto *DII : DIIs)
return eraseInstFromFunction(MI);
return nullptr;
/// Move the call to free before a NULL test.
/// Check if this free is accessed after its argument has been tested
/// against NULL (property 0).
/// If yes, it is legal to move this call in its predecessor block.
/// The move is performed only if the block containing the call to free
/// will be removed, i.e.:
/// 1. it has only one predecessor P, and P has two successors
/// 2. it contains the call, noops, and an unconditional branch
/// 3. its successor is the same as its predecessor's successor
/// The profitability is out-of concern here and this function should
/// be called only if the caller knows this transformation would be
/// profitable (e.g., for code size).
static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
const DataLayout &DL) {
Value *Op = FI.getArgOperand(0);
BasicBlock *FreeInstrBB = FI.getParent();
BasicBlock *PredBB = FreeInstrBB->getSinglePredecessor();
// Validate part of constraint #1: Only one predecessor
// FIXME: We can extend the number of predecessor, but in that case, we
// would duplicate the call to free in each predecessor and it may
// not be profitable even for code size.
if (!PredBB)
return nullptr;
// Validate constraint #2: Does this block contains only the call to
// free, noops, and an unconditional branch?
BasicBlock *SuccBB;
Instruction *FreeInstrBBTerminator = FreeInstrBB->getTerminator();
if (!match(FreeInstrBBTerminator, m_UnconditionalBr(SuccBB)))
return nullptr;
// If there are only 2 instructions in the block, at this point,
// this is the call to free and unconditional.
// If there are more than 2 instructions, check that they are noops
// i.e., they won't hurt the performance of the generated code.
if (FreeInstrBB->size() != 2) {
for (const Instruction &Inst : FreeInstrBB->instructionsWithoutDebug()) {
if (&Inst == &FI || &Inst == FreeInstrBBTerminator)
auto *Cast = dyn_cast<CastInst>(&Inst);
if (!Cast || !Cast->isNoopCast(DL))
return nullptr;
// Validate the rest of constraint #1 by matching on the pred branch.
Instruction *TI = PredBB->getTerminator();
BasicBlock *TrueBB, *FalseBB;
ICmpInst::Predicate Pred;
if (!match(TI, m_Br(m_ICmp(Pred,
TrueBB, FalseBB)))
return nullptr;
if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
return nullptr;
// Validate constraint #3: Ensure the null case just falls through.
if (SuccBB != (Pred == ICmpInst::ICMP_EQ ? TrueBB : FalseBB))
return nullptr;
assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) &&
"Broken CFG: missing edge from predecessor to successor");
// At this point, we know that everything in FreeInstrBB can be moved
// before TI.
for (BasicBlock::iterator It = FreeInstrBB->begin(), End = FreeInstrBB->end();
It != End;) {
Instruction &Instr = *It++;
if (&Instr == FreeInstrBBTerminator)
assert(FreeInstrBB->size() == 1 &&
"Only the branch instruction should remain");
return &FI;
/// Simplify a call to free.
/// A free of undef or of a constant null pointer is erased. When MinimizeSize
/// is set and the callee is recognised by TLI as the library function free,
/// the call is additionally offered to tryToMoveFreeBeforeNullTest().
/// \returns the result of the transform that fired, or nullptr if none did.
// NOTE(review): closing braces (and at least one statement) are missing from
// this copy of the function.
Instruction *InstCombiner::visitFree(CallInst &FI) {
// Op is the pointer being freed.
Value *Op = FI.getArgOperand(0);
// free undef -> unreachable.
if (isa<UndefValue>(Op)) {
// Leave a marker since we can't modify the CFG here.
// NOTE(review): the statement that creates that marker is not present in
// this copy; only the erase below is visible.
return eraseInstFromFunction(FI);
// If we have 'free null' delete the instruction. This can happen in stl code
// when lots of inlining happens.
if (isa<ConstantPointerNull>(Op))
return eraseInstFromFunction(FI);
// If we optimize for code size, try to move the call to free before the null
// test so that simplify cfg can remove the empty block and dead code
// elimination the branch. I.e., helps to turn something like:
// if (foo) free(foo);
// into
// free(foo);
// Note that we can only do this for 'free' and not for any flavor of
// 'operator delete'; there is no 'operator delete' symbol for which we are
// permitted to invent a call, even if we're passing in a null pointer.
if (MinimizeSize) {
LibFunc Func;
// Require that TLI both recognises the callee and reports it as available,
// and that it is exactly LibFunc_free.
if (TLI.getLibFunc(FI, Func) && TLI.has(Func) && Func == LibFunc_free)
if (Instruction *I = tryToMoveFreeBeforeNullTest(FI, DL))
return I;
return nullptr;
/// Return true if \p V is a call instruction marked as a musttail call.
///
/// \param V  Value to inspect; any value that is not a CallInst yields false.
/// \returns the CallInst's own isMustTailCall() answer, or false otherwise.
static bool isMustTailCall(Value *V) {
  // dyn_cast yields null for non-call values, which short-circuits to false.
  auto *Call = dyn_cast<CallInst>(V);
  return Call && Call->isMustTailCall();
}
/// Try to constant-fold the operand of a return instruction.
///
/// Only non-constant integer results that are not produced by a musttail call
/// are considered. If the known-bits analysis, evaluated at \p RI, determines
/// every bit of the returned value, the operand is replaced by that constant.
///
/// \returns the result of replaceOperand() when the operand was folded, or
/// nullptr when nothing changed.
Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) {
  if (RI.getNumOperands() == 0) // ret void
    return nullptr;

  Value *ResultOp = RI.getOperand(0);
  Type *VTy = ResultOp->getType();
  if (!VTy->isIntegerTy() || isa<Constant>(ResultOp))
    return nullptr;

  // Don't replace result of musttail calls.
  if (isMustTailCall(ResultOp))
    return nullptr;

  // There might be assume intrinsics dominating this return that completely
  // determine the value. If so, constant fold it.
  KnownBits Known = computeKnownBits(ResultOp, 0, &RI);
  if (!Known.isConstant())
    return nullptr;

  return replaceOperand(RI, 0,
                        Constant::getIntegerValue(VTy, Known.getConstant()));
}
/// Handle an unconditional branch: look backwards from the branch for a store
/// and, if one is found, offer it to mergeStoreIntoSuccessor().
/// \returns the address of \p BI if the store was merged, nullptr otherwise.
// NOTE(review): this copy has lost the lambda terminators and the body of the
// `if` inside the do/while loop (presumably the iterator decrement).
Instruction *InstCombiner::visitUnconditionalBranchInst(BranchInst &BI) {
assert(BI.isUnconditional() && "Only for unconditional branches.");
// If this store is the second-to-last instruction in the basic block
// (excluding debug info and bitcasts of pointers) and if the block ends with
// an unconditional branch, try to move the store to the successor block.
auto GetLastSinkableStore = [](BasicBlock::iterator BBI) {
// Instructions the backwards scan is allowed to step over: debug-info
// intrinsics and bitcasts whose result type is a pointer.
auto IsNoopInstrForStoreMerging = [](BasicBlock::iterator BBI) {
return isa<DbgInfoIntrinsic>(BBI) ||
(isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy());
BasicBlock::iterator FirstInstr = BBI->getParent()->begin();
// The loop stops at the first instruction of the block or at the first
// instruction that is not one of the no-ops above.
do {
if (BBI != FirstInstr)
} while (BBI != FirstInstr && IsNoopInstrForStoreMerging(BBI));
// Null unless the instruction the scan stopped on is a store.
return dyn_cast<StoreInst>(BBI);
if (StoreInst *SI = GetLastSinkableStore(BasicBlock::iterator(BI)))
if (mergeStoreIntoSuccessor(*SI))
return &BI;
return nullptr;
/// Canonicalize a branch instruction.
/// Unconditional branches are forwarded to visitUnconditionalBranchInst().
/// For conditional branches the visible transforms are: dropping a `not` on
/// the condition, replacing the condition with false when both successors are
/// the same block, and canonicalizing a single-use fcmp predicate.
/// \returns the modified instruction, or nullptr if nothing changed.
// NOTE(review): the statements that actually swap the successors / invert the
// predicate are not present in this copy, nor are the closing braces.
Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
if (BI.isUnconditional())
return visitUnconditionalBranchInst(BI);
// Change br (not X), label True, label False to: br X, label False, True
Value *X = nullptr;
if (match(&BI, m_Br(m_Not(m_Value(X)), m_BasicBlock(), m_BasicBlock())) &&
!isa<Constant>(X)) {
// Swap Destinations and condition...
return replaceOperand(BI, 0, X);
// If the condition is irrelevant, remove the use so that other
// transforms on the condition become more effective.
if (!isa<ConstantInt>(BI.getCondition()) &&
BI.getSuccessor(0) == BI.getSuccessor(1))
return replaceOperand(
BI, 0, ConstantInt::getFalse(BI.getCondition()->getType()));
// Canonicalize, for example, fcmp_one -> fcmp_oeq.
CmpInst::Predicate Pred;
// m_OneUse restricts this to compares whose only user is this branch.
if (match(&BI, m_Br(m_OneUse(m_FCmp(Pred, m_Value(), m_Value())),
m_BasicBlock(), m_BasicBlock())) &&
!isCanonicalPredicate(Pred)) {
// Swap destinations and condition.
CmpInst *Cond = cast<CmpInst>(BI.getCondition());
return &BI;
return nullptr;
/// Simplify a switch instruction.
/// Two transforms are visible: folding an `add X, C` condition into the case
/// values, and truncating the condition and case values when known bits show
/// that leading bits carry no information.
/// \returns the result of replaceOperand() if the condition was rewritten,
/// nullptr otherwise.
// NOTE(review): closing braces and the statement that stores NewCase back into
// the case are not present in this copy.
Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
Value *Cond = SI.getCondition();
Value *Op0;
ConstantInt *AddRHS;
if (match(Cond, m_Add(m_Value(Op0), m_ConstantInt(AddRHS)))) {
// Change 'switch (X+4) case 1:' into 'switch (X) case -3'.
for (auto Case : SI.cases()) {
Constant *NewCase = ConstantExpr::getSub(Case.getCaseValue(), AddRHS);
assert(isa<ConstantInt>(NewCase) &&
"Result of expression should be constant");
return replaceOperand(SI, 0, Op0);
KnownBits Known = computeKnownBits(Cond, 0, &SI);
unsigned LeadingKnownZeros = Known.countMinLeadingZeros();
unsigned LeadingKnownOnes = Known.countMinLeadingOnes();
// Compute the number of leading bits we can ignore.
// TODO: A better way to determine this would use ComputeNumSignBits().
// Each count is clamped to the smallest value seen over all case constants.
for (auto &C : SI.cases()) {
LeadingKnownZeros = std::min(
LeadingKnownZeros, C.getCaseValue()->getValue().countLeadingZeros());
LeadingKnownOnes = std::min(
LeadingKnownOnes, C.getCaseValue()->getValue().countLeadingOnes());
// Width left after dropping the larger of the two leading-bit counts.
unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes);
// Shrink the condition operand if the new type is smaller than the old type.
// But do not shrink to a non-standard type, because backend can't generate
// good code for that yet.
// TODO: We can make it aggressive again after fixing PR39569.
if (NewWidth > 0 && NewWidth < Known.getBitWidth() &&
shouldChangeType(Known.getBitWidth(), NewWidth)) {
IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
Value *NewCond = Builder.CreateTrunc(Cond, Ty, "trunc");
// Every case constant is truncated to the same width as the new condition.
for (auto Case : SI.cases()) {
APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth);
Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
return replaceOperand(SI, 0, NewCond);
return nullptr;
/// Simplify an extractvalue instruction.
/// The visible cases are, in order: an extract with no indices, a generic
/// simplification via SimplifyExtractValueInst, an extract from an
/// insertvalue, an extract from a single-use with-overflow intrinsic, and an
/// extract from a simple single-use load.
/// \returns a replacement instruction, the result of replaceInstUsesWith(), or
/// nullptr if no transform applied.
// NOTE(review): several call continuations, loop-body statements and closing
// braces are missing from this copy; the comments describe only what is
// visible here.
Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
Value *Agg = EV.getAggregateOperand();
// With no indices the extract is replaced by the aggregate itself.
if (!EV.hasIndices())
return replaceInstUsesWith(EV, Agg);
if (Value *V = SimplifyExtractValueInst(Agg, EV.getIndices(),
return replaceInstUsesWith(EV, V);
if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
// We're extracting from an insertvalue instruction, compare the indices
const unsigned *exti, *exte, *insi, *inse;
// Walk both index lists in lock-step until one ends or they diverge.
for (exti = EV.idx_begin(), insi = IV->idx_begin(),
exte = EV.idx_end(), inse = IV->idx_end();
exti != exte && insi != inse;
++exti, ++insi) {
if (*insi != *exti)
// The insert and extract both reference distinctly different elements.
// This means the extract is not influenced by the insert, and we can
// replace the aggregate operand of the extract with the aggregate
// operand of the insert. i.e., replace
// %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
// %E = extractvalue { i32, { i32 } } %I, 0
// with
// %E = extractvalue { i32, { i32 } } %A, 0
return ExtractValueInst::Create(IV->getAggregateOperand(),
if (exti == exte && insi == inse)
// Both iterators are at the end: Index lists are identical. Replace
// %B = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
// %C = extractvalue { i32, { i32 } } %B, 1, 0
// with "i32 42"
return replaceInstUsesWith(EV, IV->getInsertedValueOperand());
if (exti == exte) {
// The extract list is a prefix of the insert list. i.e. replace
// %I = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
// %E = extractvalue { i32, { i32 } } %I, 1
// with
// %X = extractvalue { i32, { i32 } } %A, 1
// %E = insertvalue { i32 } %X, i32 42, 0
// by switching the order of the insert and extract (though the
// insertvalue should be left in, since it may have other uses).
Value *NewEV = Builder.CreateExtractValue(IV->getAggregateOperand(),
return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(),
makeArrayRef(insi, inse));
if (insi == inse)
// The insert list is a prefix of the extract list
// We can simply remove the common indices from the extract and make it
// operate on the inserted value instead of the insertvalue result.
// i.e., replace
// %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
// %E = extractvalue { i32, { i32 } } %I, 1, 0
// with
// %E = extractvalue { i32 } { i32 42 }, 0
return ExtractValueInst::Create(IV->getInsertedValueOperand(),
makeArrayRef(exti, exte));
if (WithOverflowInst *WO = dyn_cast<WithOverflowInst>(Agg)) {
// We're extracting from an overflow intrinsic, see if we're the only user,
// which allows us to simplify multiple result intrinsics to simpler
// things that just get one value.
if (WO->hasOneUse()) {
// Check if we're grabbing only the result of a 'with overflow' intrinsic
// and replace it with a traditional binary instruction.
if (*EV.idx_begin() == 0) {
Instruction::BinaryOps BinOp = WO->getBinaryOp();
Value *LHS = WO->getLHS(), *RHS = WO->getRHS();
// The intrinsic's uses are redirected to undef before the plain binary
// operator is returned in place of this extract.
replaceInstUsesWith(*WO, UndefValue::get(WO->getType()));
return BinaryOperator::Create(BinOp, LHS, RHS);
// If the normal result of the add is dead, and the RHS is a constant,
// we can transform this into a range comparison.
// overflow = uadd a, -4 --> overflow = icmp ugt a, 3
if (WO->getIntrinsicID() == Intrinsic::uadd_with_overflow)
if (ConstantInt *CI = dyn_cast<ConstantInt>(WO->getRHS()))
return new ICmpInst(ICmpInst::ICMP_UGT, WO->getLHS(),
if (LoadInst *L = dyn_cast<LoadInst>(Agg))
// If the (non-volatile) load only has one use, we can rewrite this to a
// load from a GEP. This reduces the size of the load. If a load is used
// only by extractvalue instructions then this either must have been
// optimized before, or it is a struct with padding, in which case we
// don't want to do the transformation as it loses padding knowledge.
if (L->isSimple() && L->hasOneUse()) {
// extractvalue has integer indices, getelementptr has Value*s. Convert.
SmallVector<Value*, 4> Indices;
// Prefix an i32 0 since we need the first element.
for (ExtractValueInst::idx_iterator I = EV.idx_begin(), E = EV.idx_end();
I != E; ++I)
// We need to insert these at the location of the old load, not at that of
// the extractvalue.
Value *GEP = Builder.CreateInBoundsGEP(L->getType(),
L->getPointerOperand(), Indices);
Instruction *NL = Builder.CreateLoad(EV.getType(), GEP);
// Whatever aliasing information we had for the original load must also
// hold for the smaller load, so propagate the annotations.
AAMDNodes Nodes;
// Returning the load directly will cause the main loop to insert it in
// the wrong spot, so use replaceInstUsesWith().
return replaceInstUsesWith(EV, NL);
// We could simplify extracts from other values. Note that nested extracts may
// already be simplified implicitly by the above: extract (extract (insert) )
// will be translated into extract ( insert ( extract ) ) first and then just
// the value inserted, if appropriate. Similarly for extracts from single-use
// loads: extract (extract (load)) will be translated to extract (load (gep))
// and if again single-use then via load (gep (gep)) to load (gep).
// However, double extracts from e.g. function arguments or return values
// aren't handled yet.
return nullptr;
/// Return 'true' if the given typeinfo will match anything.
///
/// \param Personality  EH personality that decides how \p TypeInfo is read.
/// \param TypeInfo     Typeinfo constant taken from a landingpad clause.
/// \returns true only for the personalities in the final case group, and only
/// when \p TypeInfo is a null value; every other personality yields false.
static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
  switch (Personality) {
  case EHPersonality::GNU_C:
  case EHPersonality::GNU_C_SjLj:
  case EHPersonality::Rust:
    // The GCC C EH and Rust personality only exists to support cleanups, so
    // it's not clear what the semantics of catch clauses are.
    return false;
  case EHPersonality::Unknown:
    return false;
  case EHPersonality::GNU_Ada:
    // While __gnat_all_others_value will match any Ada exception, it doesn't
    // match foreign exceptions (or didn't, before gcc-4.7).
    return false;
  case EHPersonality::GNU_CXX:
  case EHPersonality::GNU_CXX_SjLj:
  case EHPersonality::GNU_ObjC:
  case EHPersonality::MSVC_X86SEH:
  case EHPersonality::MSVC_Win64SEH:
  case EHPersonality::MSVC_CXX:
  case EHPersonality::CoreCLR:
  case EHPersonality::Wasm_CXX:
    // For these personalities a null typeinfo is the catch-all marker.
    return TypeInfo->isNullValue();
  }
  // Every enumerator returns above; reaching here means a corrupt value.
  llvm_unreachable("invalid enum");
}
/// Ordering predicate over two landingpad filter clauses; visitLandingPadInst
/// calls it on pairs of clauses when checking whether consecutive filters are
/// sorted.
// NOTE(review): the body of this function is not present in this copy.
// Presumably it compares the element counts of the two filters' array types
// (shorter first) -- confirm against upstream.
static bool shorter_filter(const Value *LHS, const Value *RHS) {
/// Simplify the clause list of a landingpad instruction.
/// The function rebuilds the clause list in NewClauses, tracking in
/// MakeNewInstruction whether anything changed and in CleanupFlag whether the
/// result still needs the cleanup flag. It then looks at runs of filters
/// (ordering them by length) and at pairs of filters (dropping a later filter
/// that an earlier one is a subset of).
/// \returns a newly created landingpad if clauses changed, the address of
/// \p LI if only the cleanup flag changed, and nullptr otherwise.
// NOTE(review): this copy has lost many lines (closing braces, `continue;` /
// `break;` statements, the statements that append to NewClauses /
// NewFilterElts and that erase discarded filters). The comments describe only
// what is visible here -- confirm against upstream.
Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
// The logic here should be correct for any real-world personality function.
// However if that turns out not to be true, the offending logic can always
// be conditioned on the personality function, like the catch-all logic is.
EHPersonality Personality =
// Simplify the list of clauses, eg by removing repeated catch clauses
// (these are often created by inlining).
bool MakeNewInstruction = false; // If true, recreate using the following:
SmallVector<Constant *, 16> NewClauses; // - Clauses for the new instruction;
bool CleanupFlag = LI.isCleanup(); // - The new instruction is a cleanup.
SmallPtrSet<Value *, 16> AlreadyCaught; // Typeinfos known caught already.
for (unsigned i = 0, e = LI.getNumClauses(); i != e; ++i) {
bool isLastClause = i + 1 == e;
if (LI.isCatch(i)) {
// A catch clause.
Constant *CatchClause = LI.getClause(i);
// Typeinfos are compared with pointer casts stripped.
Constant *TypeInfo = CatchClause->stripPointerCasts();
// If we already saw this clause, there is no point in having a second
// copy of it.
if (AlreadyCaught.insert(TypeInfo).second) {
// This catch clause was not already seen.
} else {
// Repeated catch clause - drop the redundant copy.
MakeNewInstruction = true;
// If this is a catch-all then there is no point in keeping any following
// clauses or marking the landingpad as having a cleanup.
if (isCatchAll(Personality, TypeInfo)) {
// Dropping trailing clauses only changes the instruction if there are any.
if (!isLastClause)
MakeNewInstruction = true;
CleanupFlag = false;
} else {
// A filter clause. If any of the filter elements were already caught
// then they can be dropped from the filter. It is tempting to try to
// exploit the filter further by saying that any typeinfo that does not
// occur in the filter can't be caught later (and thus can be dropped).
// However this would be wrong, since typeinfos can match without being
// equal (for example if one represents a C++ class, and the other some
// class derived from it).
assert(LI.isFilter(i) && "Unsupported landingpad clause!");
Constant *FilterClause = LI.getClause(i);
ArrayType *FilterType = cast<ArrayType>(FilterClause->getType());
unsigned NumTypeInfos = FilterType->getNumElements();
// An empty filter catches everything, so there is no point in keeping any
// following clauses or marking the landingpad as having a cleanup. By
// dealing with this case here the following code is made a bit simpler.
if (!NumTypeInfos) {
if (!isLastClause)
MakeNewInstruction = true;
CleanupFlag = false;
bool MakeNewFilter = false; // If true, make a new filter.
SmallVector<Constant *, 16> NewFilterElts; // New elements.
if (isa<ConstantAggregateZero>(FilterClause)) {
// Not an empty filter - it contains at least one null typeinfo.
assert(NumTypeInfos > 0 && "Should have handled empty filter already!");
Constant *TypeInfo =
// If this typeinfo is a catch-all then the filter can never match.
if (isCatchAll(Personality, TypeInfo)) {
// Throw the filter away.
MakeNewInstruction = true;
// There is no point in having multiple copies of this typeinfo, so
// discard all but the first copy if there is more than one.
if (NumTypeInfos > 1)
MakeNewFilter = true;
} else {
ConstantArray *Filter = cast<ConstantArray>(FilterClause);
SmallPtrSet<Value *, 16> SeenInFilter; // For uniquing the elements.
// Remove any filter elements that were already caught or that already
// occurred in the filter. While there, see if any of the elements are
// catch-alls. If so, the filter can be discarded.
bool SawCatchAll = false;
for (unsigned j = 0; j != NumTypeInfos; ++j) {
Constant *Elt = Filter->getOperand(j);
Constant *TypeInfo = Elt->stripPointerCasts();
if (isCatchAll(Personality, TypeInfo)) {
// This element is a catch-all. Bail out, noting this fact.
SawCatchAll = true;
// Even if we've seen a type in a catch clause, we don't want to
// remove it from the filter. An unexpected type handler may be
// set up for a call site which throws an exception of the same
// type caught. In order for the exception thrown by the unexpected
// handler to propagate correctly, the filter must be correctly
// described for the call site.
// Example:
// void unexpected() { throw 1;}
// void foo() throw (int) {
// std::set_unexpected(unexpected);
// try {
// throw 2.0;
// } catch (int i) {}
// }
// There is no point in having multiple copies of the same typeinfo in
// a filter, so only add it if we didn't already.
if (SeenInFilter.insert(TypeInfo).second)
// A filter containing a catch-all cannot match anything by definition.
if (SawCatchAll) {
// Throw the filter away.
MakeNewInstruction = true;
// If we dropped something from the filter, make a new one.
if (NewFilterElts.size() < NumTypeInfos)
MakeNewFilter = true;
if (MakeNewFilter) {
FilterType = ArrayType::get(FilterType->getElementType(),
FilterClause = ConstantArray::get(FilterType, NewFilterElts);
MakeNewInstruction = true;
// If the new filter is empty then it will catch everything so there is
// no point in keeping any following clauses or marking the landingpad
// as having a cleanup. The case of the original filter being empty was
// already handled above.
if (MakeNewFilter && !NewFilterElts.size()) {
assert(MakeNewInstruction && "New filter but not a new instruction!");
CleanupFlag = false;
// If several filters occur in a row then reorder them so that the shortest
// filters come first (those with the smallest number of elements). This is
// advantageous because shorter filters are more likely to match, speeding up
// unwinding, but mostly because it increases the effectiveness of the other
// filter optimizations below.
for (unsigned i = 0, e = NewClauses.size(); i + 1 < e; ) {
unsigned j;
// Find the maximal 'j' s.t. the range [i, j) consists entirely of filters.
// A clause is treated as a filter when its type is an array type.
for (j = i; j != e; ++j)
if (!isa<ArrayType>(NewClauses[j]->getType()))
// Check whether the filters are already sorted by length. We need to know
// if sorting them is actually going to do anything so that we only make a
// new landingpad instruction if it does.
for (unsigned k = i; k + 1 < j; ++k)
if (shorter_filter(NewClauses[k+1], NewClauses[k])) {
// Not sorted, so sort the filters now. Doing an unstable sort would be
// correct too but reordering filters pointlessly might confuse users.
std::stable_sort(NewClauses.begin() + i, NewClauses.begin() + j,
MakeNewInstruction = true;
// Look for the next batch of filters.
i = j + 1;
// If typeinfos matched if and only if equal, then the elements of a filter L
// that occurs later than a filter F could be replaced by the intersection of
// the elements of F and L. In reality two typeinfos can match without being
// equal (for example if one represents a C++ class, and the other some class
// derived from it) so it would be wrong to perform this transform in general.
// However the transform is correct and useful if F is a subset of L. In that
// case L can be replaced by F, and thus removed altogether since repeating a
// filter is pointless. So here we look at all pairs of filters F and L where
// L follows F in the list of clauses, and remove L if every element of F is
// an element of L. This can occur when inlining C++ functions with exception
// specifications.
for (unsigned i = 0; i + 1 < NewClauses.size(); ++i) {
// Examine each filter in turn.
Value *Filter = NewClauses[i];
ArrayType *FTy = dyn_cast<ArrayType>(Filter->getType());
if (!FTy)
// Not a filter - skip it.
unsigned FElts = FTy->getNumElements();
// Examine each filter following this one. Doing this backwards means that
// we don't have to worry about filters disappearing under us when removed.
for (unsigned j = NewClauses.size() - 1; j != i; --j) {
Value *LFilter = NewClauses[j];
ArrayType *LTy = dyn_cast<ArrayType>(LFilter->getType());
if (!LTy)
// Not a filter - skip it.
// If Filter is a subset of LFilter, i.e. every element of Filter is also
// an element of LFilter, then discard LFilter.
SmallVectorImpl<Constant *>::iterator J = NewClauses.begin() + j;
// If Filter is empty then it is a subset of LFilter.
if (!FElts) {
// Discard LFilter.
MakeNewInstruction = true;
// Move on to the next filter.
unsigned LElts = LTy->getNumElements();
// If Filter is longer than LFilter then it cannot be a subset of it.
if (FElts > LElts)
// Move on to the next filter.
// At this point we know that LFilter has at least one element.
if (isa<ConstantAggregateZero>(LFilter)) { // LFilter only contains zeros.
// Filter is a subset of LFilter iff Filter contains only zeros (as we
// already know that Filter is not longer than LFilter).
if (isa<ConstantAggregateZero>(Filter)) {
assert(FElts <= LElts && "Should have handled this case earlier!");
// Discard LFilter.
MakeNewInstruction = true;
// Move on to the next filter.
ConstantArray *LArray = cast<ConstantArray>(LFilter);
if (isa<ConstantAggregateZero>(Filter)) { // Filter only contains zeros.
// Since Filter is non-empty and contains only zeros, it is a subset of
// LFilter iff LFilter contains a zero.
assert(FElts > 0 && "Should have eliminated the empty filter earlier!");
for (unsigned l = 0; l != LElts; ++l)
if (LArray->getOperand(l)->isNullValue()) {
// LFilter contains a zero - discard it.
MakeNewInstruction = true;
// Move on to the next filter.
// At this point we know that both filters are ConstantArrays. Loop over
// operands to see whether every element of Filter is also an element of
// LFilter. Since filters tend to be short this is probably faster than
// using a method that scales nicely.
ConstantArray *FArray = cast<ConstantArray>(Filter);
bool AllFound = true;
for (unsigned f = 0; f != FElts; ++f) {
Value *FTypeInfo = FArray->getOperand(f)->stripPointerCasts();
// Reset for this element; set again below once a match is found in LFilter.
AllFound = false;
for (unsigned l = 0; l != LElts; ++l) {
Value *LTypeInfo = LArray->getOperand(l)->stripPointerCasts();
if (LTypeInfo == FTypeInfo) {
AllFound = true;
if (!AllFound)
if (AllFound) {
// Discard LFilter.
MakeNewInstruction = true;
// Move on to the next filter.
// If we changed any of the clauses, replace the old landingpad instruction
// with a new one.
if (MakeNewInstruction) {
LandingPadInst *NLI = LandingPadInst::Create(LI.getType(),
for (unsigned i = 0, e = NewClauses.size(); i != e; ++i)
// A landing pad with no clauses must have the cleanup flag set. It is
// theoretically possible, though highly unlikely, that we eliminated all
// clauses. If so, force the cleanup flag to true.
if (NewClauses.empty())
CleanupFlag = true;
return NLI;
// Even if none of the clauses changed, we may nonetheless have understood
// that the cleanup flag is pointless. Clear it if so.
if (LI.isCleanup() != CleanupFlag) {
assert(!CleanupFlag && "Adding a cleanup, not removing one?!");
return &LI;
return nullptr;
/// Simplify a freeze instruction.
///
/// The operand is handed to SimplifyFreezeInst with a query anchored at \p I;
/// if that yields a value, all uses of the freeze are replaced with it.
///
/// \returns the result of replaceInstUsesWith() on success, nullptr otherwise.
Instruction *InstCombiner::visitFreeze(FreezeInst &I) {
  Value *Op0 = I.getOperand(0);

  Value *Simplified = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I));
  if (!Simplified)
    return nullptr;

  return replaceInstUsesWith(I, Simplified);
}
/// Try to move the specified instruction from its current block into the
/// beginning of DestBlock, which can only happen if it's safe to move the
/// instruction past all of the instructions between it and the end of its
/// block.
/// \returns false when one of the visible legality checks rejects the move,
/// true at the end of the function.
// NOTE(review): this copy has lost several lines (closing braces, part of the
// first legality condition, the statement that performs the move, and the
// statements that clone / re-insert debug users). The comments describe only
// what is visible here -- confirm against upstream.
static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
assert(I->getSingleUndroppableUse() && "Invariants didn't hold!");
BasicBlock *SrcBlock = I->getParent();
// Cannot move control-flow-involving, volatile loads, vaarg, etc.
if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||
return false;
// Do not sink static or dynamic alloca instructions. Static allocas must
// remain in the entry block, and dynamic allocas must not be sunk in between
// a stacksave / stackrestore pair, which would incorrectly shorten its
// lifetime.
if (isa<AllocaInst>(I))
return false;
// Do not sink into catchswitch blocks.
if (isa<CatchSwitchInst>(DestBlock->getTerminator()))
return false;
// Do not sink convergent call instructions.
if (auto *CI = dyn_cast<CallInst>(I)) {
if (CI->isConvergent())
return false;
// We can only sink load instructions if there is nothing between the load and
// the end of block that could change the value.
if (I->mayReadFromMemory()) {
// We don't want to do any sophisticated alias analysis, so we only check
// the instructions after I in I's parent block if we try to sink to its
// successor block.
if (DestBlock->getUniquePredecessor() != I->getParent())
return false;
// Scan from I to the end of its block for anything that may write memory.
for (BasicBlock::iterator Scan = I->getIterator(),
E = I->getParent()->end();
Scan != E; ++Scan)
if (Scan->mayWriteToMemory())
return false;
// The predicate selects droppable uses whose user is not an instruction in
// DestBlock (non-instruction users are selected as well).
I->dropDroppableUses([DestBlock](const Use *U) {
if (auto *I = dyn_cast<Instruction>(U->getUser()))
return I->getParent() != DestBlock;
return true;
/// FIXME: We could remove droppable uses that are not dominated by
/// the new position.
BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
// Also sink all related debug uses from the source basic block. Otherwise we
// get debug use before the def. Attempt to salvage debug uses first, to
// maximise the range variables have location for. If we cannot salvage, then
// mark the location undef: we know it was supposed to receive a new location
// here, but that computation has been sunk.
SmallVector<DbgVariableIntrinsic *, 2> DbgUsers;
findDbgUsers(DbgUsers, I);
// Update the arguments of a dbg.declare instruction, so that it
// does not point into a sunk instruction.
// Returns false for anything that is not a dbg.declare, true otherwise.
auto updateDbgDeclare = [&I](DbgVariableIntrinsic *DII) {
if (!isa<DbgDeclareInst>(DII))
return false;
if (isa<CastInst>(I))
0, MetadataAsValue::get(I->getContext(),
return true;
SmallVector<DbgVariableIntrinsic *, 2> DIIClones;
for (auto User : DbgUsers) {
// A dbg.declare instruction should not be cloned, since there can only be
// one per variable fragment. It should be left in the original place
// because the sunk instruction is not an alloca (otherwise we could not be
// here).
// Debug users outside SrcBlock are filtered by the same condition.
if (User->getParent() != SrcBlock || updateDbgDeclare(User))
LLVM_DEBUG(dbgs() << "CLONE: " << *DIIClones.back() << '\n');
// Perform salvaging without the clones, then sink the clones.
if (!DIIClones.empty()) {
salvageDebugInfoForDbgValues(*I, DbgUsers);
for (auto &DIIClone : DIIClones) {
LLVM_DEBUG(dbgs() << "SINK: " << *DIIClone << '\n');
return true;
/// Main worklist loop of the combiner.
/// While the worklist is not empty: drain the deferred list, take one
/// instruction, try dead-code elimination and constant folding on it,
/// optionally try to sink it towards its single user (when EnableCodeSinking
/// is set), and finally dispatch it to visit().
/// \returns MadeIRChange.
// NOTE(review): this copy has lost many lines (closing braces, `continue;`
// statements, the erase / worklist-push calls, the `else` before the second
// UserParent assignment, and the `#endif` matching `#ifndef NDEBUG`). The
// comments describe only what is visible here -- confirm against upstream.
bool InstCombiner::run() {
while (!Worklist.isEmpty()) {
// Walk deferred instructions in reverse order, and push them to the
// worklist, which means they'll end up popped from the worklist in-order.
while (Instruction *I = Worklist.popDeferred()) {
// Check to see if we can DCE the instruction. We do this already here to
// reduce the number of uses and thus allow other folds to trigger.
// Note that eraseInstFromFunction() may push additional instructions on
// the deferred worklist, so this will DCE whole instruction chains.
if (isInstructionTriviallyDead(I, &TLI)) {
Instruction *I = Worklist.removeOne();
if (I == nullptr) continue; // skip null values.
// Check to see if we can DCE the instruction.
if (isInstructionTriviallyDead(I, &TLI)) {
// VisitCounter is a debug counter consulted before an instruction is visited.
if (!DebugCounter::shouldExecute(VisitCounter))
// Instruction isn't dead, see if we can constant propagate it.
// Folding is only attempted when the instruction has uses and either has no
// operands or a constant first operand.
if (!I->use_empty() &&
(I->getNumOperands() == 0 || isa<Constant>(I->getOperand(0)))) {
if (Constant *C = ConstantFoldInstruction(I, DL, &TLI)) {
LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I
<< '\n');
// Add operands to the worklist.
replaceInstUsesWith(*I, C);
if (isInstructionTriviallyDead(I, &TLI))
MadeIRChange = true;
// See if we can trivially sink this instruction to its user if we can
// prove that the successor is not executed more frequently than our block.
if (EnableCodeSinking)
if (Use *SingleUse = I->getSingleUndroppableUse()) {
BasicBlock *BB = I->getParent();
Instruction *UserInst = cast<Instruction>(SingleUse->getUser());
BasicBlock *UserParent;
// Get the block the use occurs in.
// For a PHI user that is the incoming block of the use, not the PHI's block.
if (PHINode *PN = dyn_cast<PHINode>(UserInst))
UserParent = PN->getIncomingBlock(*SingleUse);
UserParent = UserInst->getParent();
if (UserParent != BB) {
// See if the user is one of our successors that has only one
// predecessor, so that we don't have to split the critical edge.
bool ShouldSink = UserParent->getUniquePredecessor() == BB;
// Another option where we can sink is a block that ends with a
// terminator that does not pass control to other block (such as
// return or unreachable). In this case:
// - I dominates the User (by SSA form);
// - the User will be executed at most once.
// So sinking I down to User is always profitable or neutral.
if (!ShouldSink) {
auto *Term = UserParent->getTerminator();
ShouldSink = isa<ReturnInst>(Term) || isa<UnreachableInst>(Term);
if (ShouldSink) {
assert(DT.dominates(BB, UserParent) &&
"Dominance relation broken?");
// Okay, the CFG is simple enough, try to sink this instruction.
if (TryToSinkInstruction(I, UserParent)) {
LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
MadeIRChange = true;
// We'll add uses of the sunk instruction below, but since sinking
// can expose opportunities for its *operands* add them to the
// worklist
for (Use &U : I->operands())
if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
// Now that we have an instruction, try combining it to simplify it.
#ifndef NDEBUG
// OrigI keeps the printed form of I from before the visit for the debug
// output below.
std::string OrigI;
LLVM_DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
LLVM_DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
// A non-null Result means the visit changed something: either a different
// instruction was returned (Result != I) or I itself was modified.
if (Instruction *Result = visit(*I)) {
// Should we replace the old instruction with a new one?
if (Result != I) {
LLVM_DEBUG(dbgs() << "IC: Old = " << *I << '\n'
<< " New = " << *Result << '\n');
if (I->getDebugLoc())
// Everything uses the new instruction now.
// Move the name to the new instruction first.
// Insert the new instruction into the basic block...
BasicBlock *InstParent = I->getParent();
BasicBlock::iterator InsertPos = I->getIterator();
// If we replace a PHI with something that isn't a PHI, fix up the
// insertion point.
if (!isa<PHINode>(Result) && isa<PHINode>(InsertPos))
InsertPos = InstParent->getFirstInsertionPt();
InstParent->getInstList().insert(InsertPos, Result);
// Push the new instruction and any users onto the worklist.
} else {
LLVM_DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
<< " New = " << *I << '\n');
// If the instruction was modified, it's possible that it is now dead.
// if so, remove it.
if (isInstructionTriviallyDead(I, &TLI)) {
} else {
MadeIRChange = true;
return MadeIRChange;
/// Populate the IC worklist from a function, by walking it in depth-first
/// order and adding all reachable code to the worklist.
/// This has a couple of tricks to make the code faster and more powerful. In
/// particular, we constant fold and DCE instructions as we go, to avoid adding
/// them to the worklist (this significantly speeds up instcombine on code where
/// many instructions are dead or constant). Additionally, if we find a branch
/// whose condition is a known constant, we only visit the reachable successors.
///
/// \param F the function whose basic blocks are walked.
/// \param DL data layout passed to the constant folder.
/// \param TLI library info passed to the constant folder and to the
/// trivially-dead check.
/// \param ICWorklist the InstCombine worklist to populate.
/// \returns true if any IR was changed while building the worklist.
///
/// NOTE(review): this listing appears to have lost lines (closing braces and
/// short statements such as `continue;`). Confirm against the upstream file
/// before relying on the exact control flow shown here.
static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
const TargetLibraryInfo *TLI,
InstCombineWorklist &ICWorklist) {
bool MadeIRChange = false;
// Blocks already processed by the walk below.
SmallPtrSet<BasicBlock *, 32> Visited;
// Pending blocks for the walk (distinct from the ICWorklist parameter).
SmallVector<BasicBlock*, 256> Worklist;
// Instructions collected in visit order; handled in reverse at the end.
SmallVector<Instruction*, 128> InstrsForInstCombineWorklist;
// Cache mapping a constant operand to its folded form, so each distinct
// constant is handed to ConstantFoldConstant at most once per call.
DenseMap<Constant *, Constant *> FoldedConstants;
do {
BasicBlock *BB = Worklist.pop_back_val();
// We have now visited this block! If we've already been here, ignore it.
if (!Visited.insert(BB).second)
for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
// The iterator is advanced before Inst is examined.
Instruction *Inst = &*BBI++;
// ConstantProp instruction if trivially constant.
// Only attempted when Inst has uses and either has no operands or its
// first operand is already a Constant.
if (!Inst->use_empty() &&
(Inst->getNumOperands() == 0 || isa<Constant>(Inst->getOperand(0))))
if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) {
LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *Inst
<< '\n');
if (isInstructionTriviallyDead(Inst, TLI))
MadeIRChange = true;
// See if we can constant fold its operands.
for (Use &U : Inst->operands()) {
// Only ConstantVector and ConstantExpr operands are candidates.
// NOTE(review): the statement guarded by this `if` is not visible here.
if (!isa<ConstantVector>(U) && !isa<ConstantExpr>(U))
auto *C = cast<Constant>(U);
// A null cache entry means C has not been folded yet.
Constant *&FoldRes = FoldedConstants[C];
if (!FoldRes)
FoldRes = ConstantFoldConstant(C, DL, TLI);
// Rewrite the operand in place only when folding produced something new.
if (FoldRes != C) {
LLVM_DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
<< "\n Old = " << *C
<< "\n New = " << *FoldRes << '\n');
U = FoldRes;
MadeIRChange = true;
// Skip processing debug intrinsics in InstCombine. Processing these call instructions
// consumes non-trivial amount of time and provides no value for the optimization.
if (!isa<DbgInfoIntrinsic>(Inst))
// Recursively visit successors. If this is a branch or switch on a
// constant, only visit the reachable successor.
Instruction *TI = BB->getTerminator();
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
// Successor 0 is selected when the condition is true, successor 1 otherwise.
BasicBlock *ReachableBB = BI->getSuccessor(!CondVal);
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
for (BasicBlock *SuccBB : successors(TI))
} while (!Worklist.empty());
// Remove instructions inside unreachable blocks. This prevents the
// instcombine code from having to deal with some bad special cases, and
// reduces use counts of instructions.
for (BasicBlock &BB : F) {
if (Visited.count(&BB))
unsigned NumDeadInstInBB = removeAllNonTerminatorAndEHPadInstructions(&BB);
MadeIRChange |= NumDeadInstInBB > 0;
NumDeadInst += NumDeadInstInBB;
// Once we've found all of the instructions to add to instcombine's worklist,
// add them in reverse order. This way instcombine will visit from the top
// of the function down. This jibes well with the way that it adds all uses
// of instructions to the worklist after doing a transformation, thus avoiding
// some N^2 behavior in pathological cases.
for (Instruction *Inst : reverse(InstrsForInstCombineWorklist)) {
// DCE instruction if trivially dead. As we iterate in reverse program
// order here, we will clean up whole chains of dead instructions.
if (isInstructionTriviallyDead(Inst, TLI)) {
LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
MadeIRChange = true;
return MadeIRChange;
// Run the instruction combiner over F in rounds. Each round rebuilds the
// worklist with prepareICWorklistFromFunction and constructs an InstCombiner.
// MaxIterations is clamped to LimitMaxIterations before the loop starts.
// Returns true if the IR was changed, including by LowerDbgDeclare.
// NOTE(review): several statements in this listing are visibly truncated
// (for example the InstCombiner constructor call and the `if (!` below);
// confirm against the upstream file.
static bool combineInstructionsOverFunction(
Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA,
AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT,
OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, unsigned MaxIterations, LoopInfo *LI) {
auto &DL = F.getParent()->getDataLayout();
// Never run more rounds than the LimitMaxIterations option allows.
MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue());
/// Builder - This is an IRBuilder that automatically inserts new
/// instructions into the worklist when they are created.
IRBuilder<TargetFolder, IRBuilderCallbackInserter> Builder(
F.getContext(), TargetFolder(DL),
IRBuilderCallbackInserter([&Worklist, &AC](Instruction *I) {
// The callback special-cases llvm.assume calls; it captures the assumption
// cache by reference.
if (match(I, m_Intrinsic<Intrinsic::assume>()))
// Lower dbg.declare intrinsics otherwise their value may be clobbered
// by instcombiner.
bool MadeIRChange = false;
if (ShouldLowerDbgDeclare)
MadeIRChange = LowerDbgDeclare(F);
// Iterate while there is work to do.
unsigned Iteration = 0;
while (true) {
// Guard against a combiner that never reaches a fixpoint.
if (Iteration > InfiniteLoopDetectionThreshold) {
"Instruction Combining seems stuck in an infinite loop after " +
Twine(InfiniteLoopDetectionThreshold) + " iterations.");
// Give up once the configured iteration budget is exceeded.
if (Iteration > MaxIterations) {
LLVM_DEBUG(dbgs() << "\n\n[IC] Iteration limit #" << MaxIterations
<< " on " << F.getName()
<< " reached; stopping before reaching a fixpoint\n");
LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
<< F.getName() << "\n");
// Building the worklist can itself fold constants and delete dead code.
MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
InstCombiner IC(Worklist, Builder, F.hasMinSize(), AA,
IC.MaxArraySizeForCombine = MaxArraySize;
if (!
MadeIRChange = true;
return MadeIRChange;
// Default constructor: the iteration budget comes from LimitMaxIterations.
InstCombinePass::InstCombinePass() : MaxIterations(LimitMaxIterations) {}
// Construct the pass with a caller-supplied iteration budget.
InstCombinePass::InstCombinePass(unsigned MaxIterations)
: MaxIterations(MaxIterations) {}
// Entry point taking a FunctionAnalysisManager: gathers the analyses that
// combineInstructionsOverFunction needs and runs it on F.
// Returns PreservedAnalyses::all() when nothing changed, otherwise the set PA.
// NOTE(review): the statements that populate PA and initialise PSI are not
// visible in this listing.
PreservedAnalyses InstCombinePass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
// LoopAnalysis is only used if a result is already cached; LI may be null.
auto *LI = AM.getCachedResult<LoopAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
ProfileSummaryInfo *PSI =
// Block frequencies are only requested when a profile summary exists.
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI,
PSI, MaxIterations, LI))
// No changes, all analyses are preserved.
return PreservedAnalyses::all();
// Mark all the analyses that instcombine updates as preserved.
PreservedAnalyses PA;
return PA;
// Analysis-usage hook of the FunctionPass wrapper.
// NOTE(review): the body of this function is not visible in this listing.
void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
// FunctionPass entry point: fetches analyses from the wrapper passes and
// forwards to combineInstructionsOverFunction. Returns false without doing
// any work when skipFunction(F) is true; otherwise returns whether the IR
// changed.
bool InstructionCombiningPass::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
// Required analyses.
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
// Optional analyses.
// LI stays null when no LoopInfoWrapperPass is available.
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
// NOTE(review): the initialisers of PSI and the fallback arm of the BFI
// conditional are not visible in this listing.
ProfileSummaryInfo *PSI =
BlockFrequencyInfo *BFI =
(PSI && PSI->hasProfileSummary()) ?
&getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI,
PSI, MaxIterations, LI);
// Pass identification, constructors, registration and factory functions for
// the FunctionPass wrapper.
// NOTE(review): several signatures and bodies in this run are truncated in
// this listing; confirm against the upstream file.
char InstructionCombiningPass::ID = 0;
// Default budget: InstCombineDefaultMaxIterations.
: FunctionPass(ID), MaxIterations(InstCombineDefaultMaxIterations) {
// Caller-supplied iteration budget.
InstructionCombiningPass::InstructionCombiningPass(unsigned MaxIterations)
: FunctionPass(ID), MaxIterations(MaxIterations) {
// Registers the pass under the command-line name "instcombine".
INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine",
"Combine redundant instructions", false, false)
INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
"Combine redundant instructions", false, false)
// Initialization Routines
void llvm::initializeInstCombine(PassRegistry &Registry) {
void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
// Factory: heap-allocates a pass with the default iteration budget.
FunctionPass *llvm::createInstructionCombiningPass() {
return new InstructionCombiningPass();
// Factory: heap-allocates a pass with an explicit iteration budget.
FunctionPass *llvm::createInstructionCombiningPass(unsigned MaxIterations) {
return new InstructionCombiningPass(MaxIterations);
void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 9d0500419a7f..2f379b7f6160 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1,2968 +1,2976 @@
//===- JumpThreading.cpp - Thread control through conditional blocks ------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This file implements the Jump Threading pass.
#include "llvm/Transforms/Scalar/JumpThreading.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/GuardUtils.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <memory>
#include <utility>
using namespace llvm;
using namespace jumpthreading;
#define DEBUG_TYPE "jump-threading"
// Pass statistics.
STATISTIC(NumThreads, "Number of jumps threaded");
STATISTIC(NumFolds, "Number of terminators folded");
STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi");
// Command-line options.
// NOTE(review): the variable names and flag strings of these options are not
// visible in this listing; only their descriptions and defaults are.
// Hidden option, default 6.
static cl::opt<unsigned>
cl::desc("Max block size to duplicate for jump threading"),
cl::init(6), cl::Hidden);
// Hidden option, default 3.
static cl::opt<unsigned>
cl::desc("The number of predecessors to search for a stronger "
"condition to use to thread over a weaker condition"),
cl::init(3), cl::Hidden);
// Off by default; when set, the LazyValueInfo cache is printed after the pass.
static cl::opt<bool> PrintLVIAfterJumpThreading(
cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false),
// Hidden testing knob, off by default.
static cl::opt<bool> ThreadAcrossLoopHeaders(
cl::desc("Allow JumpThreading to thread across loop headers, for testing"),
cl::init(false), cl::Hidden);
namespace {
/// This pass performs 'jump threading', which looks at blocks that have
/// multiple predecessors and multiple successors. If one or more of the
/// predecessors of the block can be proven to always jump to one of the
/// successors, we forward the edge from the predecessor to the successor by
/// duplicating the contents of this block.
/// An example of when this can occur is code like this:
/// if () { ...
/// X = 4;
/// }
/// if (X < 3) {
/// In this case, the unconditional branch at the end of the first if can be
/// revectored to the false side of the second if.
///
/// This class is a FunctionPass wrapper that holds a JumpThreadingPass and
/// forwards to it.
class JumpThreading : public FunctionPass {
// The wrapped implementation object.
JumpThreadingPass Impl;
static char ID; // Pass identification
// T is forwarded unchanged to the JumpThreadingPass constructor; it
// defaults to -1.
JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) {
bool runOnFunction(Function &F) override;
// NOTE(review): the body of getAnalysisUsage is not visible in this listing.
void getAnalysisUsage(AnalysisUsage &AU) const override {
// Delegates to the wrapped implementation.
void releaseMemory() override { Impl.releaseMemory(); }
} // end anonymous namespace
char JumpThreading::ID = 0;
// Registers the wrapper pass under the command-line name "jump-threading".
INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
"Jump Threading", false, false)
INITIALIZE_PASS_END(JumpThreading, "jump-threading",
"Jump Threading", false, false)
// Public interface to the Jump Threading pass
// Heap-allocates a wrapper pass; Threshold is passed to its constructor.
FunctionPass *llvm::createJumpThreadingPass(int Threshold) {
return new JumpThreading(Threshold);
// T == -1 selects the BBDuplicateThreshold option value as the default
// duplication threshold; any other T is used directly (converted to unsigned).
JumpThreadingPass::JumpThreadingPass(int T) {
DefaultBBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
// Update branch probability information according to conditional
// branch probability. This is usually made possible for cloned branches
// in inline instances by the context specific profile in the caller.
// For instance,
// [Block PredBB]
// [Branch PredBr]
// if (t) {
// Block A;
// } else {
// Block B;
// }
// [Block BB]
// cond = PN([true, %A], [..., %B]); // PHI node
// [Branch CondBr]
// if (cond) {
// ... // P(cond == true) = 1%
// }
// Here we know that when block A is taken, cond must be true, which means
// P(cond == true | A) = 1
// Given that P(cond == true) = P(cond == true | A) * P(A) +
// P(cond == true | B) * P(B)
// we get:
// P(cond == true ) = P(A) + P(cond == true | B) * P(B)
// which gives us:
// P(A) is no greater than P(cond == true), i.e.
// P(t == true) <= P(cond == true)
// In other words, if we know P(cond == true) is unlikely, we know
// that P(t == true) is also unlikely.
//
// PN is the PHI node feeding the branch condition and BB is the block whose
// terminator is inspected for profile weights.
// NOTE(review): this listing appears to have lost short statements (early
// `return;` / `continue;` lines, closing braces and the code that fills and
// attaches Weights); confirm against the upstream file.
static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
// Only a BranchInst terminator carrying profile weights is of interest.
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
if (!CondBr)
uint64_t TrueWeight, FalseWeight;
if (!CondBr->extractProfMetadata(TrueWeight, FalseWeight))
if (TrueWeight + FalseWeight == 0)
// Zero branch_weights do not give a hint for getting branch probabilities.
// Technically it would result in division by zero denominator, which is
// TrueWeight + FalseWeight.
// Returns the outgoing edge of the dominating predecessor block
// that leads to the PhiNode's incoming block:
// The walk starts at IncomingBB and follows single predecessors upwards
// until it finds a block ending in a conditional branch. The result is
// {that block, the successor the walk came from}, or {nullptr, nullptr}
// when a block has no single predecessor or the walk revisits a block.
auto GetPredOutEdge =
[](BasicBlock *IncomingBB,
BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> {
auto *PredBB = IncomingBB;
auto *SuccBB = PhiBB;
SmallPtrSet<BasicBlock *, 16> Visited;
while (true) {
BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
if (PredBr && PredBr->isConditional())
return {PredBB, SuccBB};
auto *SinglePredBB = PredBB->getSinglePredecessor();
if (!SinglePredBB)
return {nullptr, nullptr};
// Stop searching when SinglePredBB has been visited. It means we see
// an unreachable loop.
if (Visited.count(SinglePredBB))
return {nullptr, nullptr};
// Step one block up the chain.
SuccBB = PredBB;
PredBB = SinglePredBB;
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
// Only incoming values that are i1 integer constants are considered.
Value *PhiOpnd = PN->getIncomingValue(i);
ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
if (!CI || !CI->getType()->isIntegerTy(1))
// BP is the share of CondBr's total weight on the side selected by the
// constant: the true side when CI is one, the false side otherwise.
BranchProbability BP =
(CI->isOne() ? BranchProbability::getBranchProbability(
TrueWeight, TrueWeight + FalseWeight)
: BranchProbability::getBranchProbability(
FalseWeight, TrueWeight + FalseWeight));
auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB);
if (!PredOutEdge.first)
BasicBlock *PredBB = PredOutEdge.first;
BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
if (!PredBr)
uint64_t PredTrueWeight, PredFalseWeight;
// FIXME: We currently only set the profile data when it is missing.
// With PGO, this can be used to refine even existing profile data with
// context information. This needs to be done after more performance
// testing.
if (PredBr->extractProfMetadata(PredTrueWeight, PredFalseWeight))
// We can not infer anything useful when BP >= 50%, because BP is the
// upper bound probability value.
if (BP >= BranchProbability(50, 100))
SmallVector<uint32_t, 2> Weights;
// The two arms distinguish whether the edge leaves PredBr through
// successor 0 or through the other successor.
if (PredBr->getSuccessor(0) == PredOutEdge.second) {
} else {
/// runOnFunction - Toplevel algorithm.
/// Collects analyses from the wrapper passes, builds BPI/BFI when F has
/// profile data, and forwards to Impl.runImpl. Returns false without doing
/// any work when skipFunction(F) is true; otherwise returns what runImpl
/// returns.
bool JumpThreading::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
// Dominator tree updates are queued using the Lazy strategy.
DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
if (F.hasProfileData()) {
// LoopInfo is built locally from a freshly constructed DominatorTree and
// is only used to construct BPI and BFI.
LoopInfo LI{DominatorTree(F)};
BPI.reset(new BranchProbabilityInfo(F, LI, TLI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
// Ownership of BFI and BPI (possibly null) is transferred to the
// implementation.
bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, F.hasProfileData(),
std::move(BFI), std::move(BPI));
if (PrintLVIAfterJumpThreading) {
dbgs() << "LVI for function '" << F.getName() << "':\n";
LVI->printLVI(F, DTU.getDomTree(), dbgs());
return Changed;
// Entry point taking a FunctionAnalysisManager: fetches analyses from AM,
// builds BPI/BFI when F has profile data, and runs runImpl.
// Returns PreservedAnalyses::all() when nothing changed, otherwise the set PA.
// NOTE(review): the statements that populate PA are not visible in this
// listing.
PreservedAnalyses JumpThreadingPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LVI = AM.getResult<LazyValueAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
// Dominator tree updates are queued using the Lazy strategy.
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
if (F.hasProfileData()) {
// LoopInfo is built locally and only used to construct BPI and BFI.
LoopInfo LI{DominatorTree(F)};
BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(),
std::move(BFI), std::move(BPI));
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
return PA;
// Core driver shared by both entry points. Stores the supplied analyses,
// picks the block-duplication threshold, records blocks unreachable from
// entry, then repeatedly sweeps every block of F until a whole sweep makes
// no change. Returns true if any sweep changed the function.
// NOTE(review): this listing appears to have lost lines (some member
// assignments, `else`, `continue;`, closing braces and a few calls);
// confirm against the upstream file before relying on the exact flow.
bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
LazyValueInfo *LVI_, AliasAnalysis *AA_,
DomTreeUpdater *DTU_, bool HasProfileData_,
std::unique_ptr<BlockFrequencyInfo> BFI_,
std::unique_ptr<BranchProbabilityInfo> BPI_) {
LLVM_DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
AA = AA_;
// When profile data is available, we need to update edge weights after
// successful jump threading, which requires both BPI and BFI being available.
HasProfileData = HasProfileData_;
// HasGuards is true only when the looked-up declaration exists and has at
// least one use.
auto *GuardDecl = F.getParent()->getFunction(
HasGuards = GuardDecl && !GuardDecl->use_empty();
if (HasProfileData) {
BPI = std::move(BPI_);
BFI = std::move(BFI_);
// Reduce the number of instructions duplicated when optimizing strictly for
// size.
// Precedence: an explicitly given BBDuplicateThreshold option, then 3 for
// MinSize functions.
// NOTE(review): as shown, the final assignment is unconditional and would
// override both branches above; an `else` line appears to be missing here.
if (BBDuplicateThreshold.getNumOccurrences())
BBDupThreshold = BBDuplicateThreshold;
else if (F.hasFnAttribute(Attribute::MinSize))
BBDupThreshold = 3;
BBDupThreshold = DefaultBBDupThreshold;
// JumpThreading must not process blocks unreachable from entry. It's a
// waste of compute time and can potentially lead to hangs.
SmallPtrSet<BasicBlock *, 16> Unreachable;
assert(DTU && "DTU isn't passed into JumpThreading before using it.");
assert(DTU->hasDomTree() && "JumpThreading relies on DomTree to proceed.");
DominatorTree &DT = DTU->getDomTree();
for (auto &BB : F)
if (!DT.isReachableFromEntry(&BB))
if (!ThreadAcrossLoopHeaders)
// EverChanged accumulates over all sweeps; Changed tracks the current sweep.
bool EverChanged = false;
bool Changed;
do {
Changed = false;
for (auto &BB : F) {
if (Unreachable.count(&BB))
while (ProcessBlock(&BB)) // Thread all of the branches we can over BB.
Changed = true;
// Jump threading may have introduced redundant debug values into BB
// which should be removed.
if (Changed)
// Stop processing BB if it's the entry or is now deleted. The following
// routines attempt to eliminate BB and locating a suitable replacement
// for the entry is non-trivial.
if (&BB == &F.getEntryBlock() || DTU->isBBPendingDeletion(&BB))
if (pred_empty(&BB)) {
// When ProcessBlock makes BB unreachable it doesn't bother to fix up
// the instructions in it. We must remove BB to prevent invalid IR.
LLVM_DEBUG(dbgs() << " JT: Deleting dead block '" << BB.getName()
<< "' with terminator: " << *BB.getTerminator()
<< '\n');
DeleteDeadBlock(&BB, DTU);
Changed = true;
// ProcessBlock doesn't thread BBs with unconditional TIs. However, if BB
// is "almost empty", we attempt to merge BB with its sole successor.
auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
if (BI && BI->isUnconditional()) {
BasicBlock *Succ = BI->getSuccessor(0);
if (
// The terminator must be the only non-phi instruction in BB.
BB.getFirstNonPHIOrDbg()->isTerminator() &&
// Don't alter Loop headers and latches to ensure another pass can
// detect and transform nested loops later.
!LoopHeaders.count(&BB) && !LoopHeaders.count(Succ) &&
TryToSimplifyUncondBranchFromEmptyBlock(&BB, DTU)) {
// BB is valid for cleanup here because we passed in DTU. F remains
// BB's parent until a DTU->getDomTree() event.
Changed = true;
EverChanged |= Changed;
} while (Changed);
return EverChanged;
// Replace uses of Cond with ToVal when safe to do so. If all uses are
// replaced, we can remove Cond. We cannot blindly replace all uses of Cond
// because we may incorrectly replace uses when guards/assumes are uses
// of `Cond` and we used the guards/assume to reason about the `Cond` value
// at the end of block. RAUW unconditionally replaces all uses
// including the guards/assumes themselves and the uses before the
// guard/assume.
//
// Cond and ToVal must have the same type (asserted below).
// NOTE(review): the statements guarded by the three `if`s inside and after
// the loop are not visible in this listing; confirm against upstream.
static void ReplaceFoldableUses(Instruction *Cond, Value *ToVal) {
assert(Cond->getType() == ToVal->getType());
auto *BB = Cond->getParent();
// We can unconditionally replace all uses in non-local blocks (i.e. uses
// strictly dominated by BB), since LVI information is true from the
// terminator of BB.
replaceNonLocalUsesWith(Cond, ToVal);
// Walk BB from its last instruction towards Cond.
for (Instruction &I : reverse(*BB)) {
// Reached the Cond whose uses we are trying to replace, so there are no
// more uses.
if (&I == Cond)
// We only replace uses in instructions that are guaranteed to reach the end
// of BB, where we know Cond is ToVal.
if (!isGuaranteedToTransferExecutionToSuccessor(&I))
I.replaceUsesOfWith(Cond, ToVal);
// Cond is only a removal candidate once it has no uses left and no side
// effects.
if (Cond->use_empty() && !Cond->mayHaveSideEffects())
/// Return the cost of duplicating a piece of this block from first non-phi
/// and before StopAt instruction to thread across it. Stop scanning the block
/// when exceeding the threshold. If duplication is impossible, returns ~0U.
///
/// \param BB the block being costed; StopAt must belong to it (asserted).
/// \param StopAt the first instruction that is not counted.
/// \param Threshold scanning stops early once the running size exceeds this
/// (after it has been raised by the terminator bonus).
/// \returns ~0U when the block cannot be duplicated, the running size when
/// the threshold is exceeded mid-scan, and otherwise the size minus the
/// bonus, clamped at zero.
///
/// NOTE(review): some statements (for example the ones guarded by the
/// bitcast check and the base per-instruction increment) are not visible in
/// this listing; confirm against the upstream file.
static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
Instruction *StopAt,
unsigned Threshold) {
assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
/// Ignore PHI nodes, these will be flattened when duplication happens.
BasicBlock::const_iterator I(BB->getFirstNonPHI());
// FIXME: THREADING will delete values that are just used to compute the
// branch, so they shouldn't count against the duplication cost.
// The bonus only applies when StopAt is BB's terminator.
unsigned Bonus = 0;
if (BB->getTerminator() == StopAt) {
// Threading through a switch statement is particularly profitable. If this
// block ends in a switch, decrease its cost to make it more likely to
// happen.
if (isa<SwitchInst>(StopAt))
Bonus = 6;
// The same holds for indirect branches, but slightly more so.
if (isa<IndirectBrInst>(StopAt))
Bonus = 8;
// Bump the threshold up so the early exit from the loop doesn't skip the
// terminator-based Size adjustment at the end.
Threshold += Bonus;
// Sum up the cost of each instruction until we get to the terminator. Don't
// include the terminator because the copy won't include it.
unsigned Size = 0;
for (; &*I != StopAt; ++I) {
// Stop scanning the block if we've reached the threshold.
if (Size > Threshold)
return Size;
// Debugger intrinsics don't incur code size.
if (isa<DbgInfoIntrinsic>(I)) continue;
// If this is a pointer->pointer bitcast, it is free.
if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
// Bail out if this instruction gives back a token type, it is not possible
// to duplicate it if it is used outside this BB.
if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB))
return ~0U;
// All other instructions count for at least one unit.
// Calls are more expensive. If they are non-intrinsic calls, we model them
// as having cost of 4. If they are a non-vector intrinsic, we model them
// as having cost of 2 total, and if they are a vector intrinsic, we model
// them as having cost 1.
if (const CallInst *CI = dyn_cast<CallInst>(I)) {
if (CI->cannotDuplicate() || CI->isConvergent())
// Blocks with NoDuplicate are modelled as having infinite cost, so they
// are never duplicated.
return ~0U;
else if (!isa<IntrinsicInst>(CI))
Size += 3;
else if (!CI->getType()->isVectorTy())
Size += 1;
// Apply the terminator bonus without letting the unsigned result wrap.
return Size > Bonus ? Size - Bonus : 0;
/// FindLoopHeaders - We do not want jump threading to turn proper loop
/// structures into irreducible loops. Doing this breaks up the loop nesting
/// hierarchy and pessimizes later transformations. To prevent this from
/// happening, we first have to find the loop headers. Here we approximate this
/// by finding targets of backedges in the CFG.
/// Note that there definitely are cases when we want to allow threading of
/// edges across a loop header. For example, threading a jump from outside the
/// loop (the preheader) to an exit block of the loop is definitely profitable.
/// It is also almost always profitable to thread backedges from within the loop
/// to exit blocks, and is often profitable to thread backedges to other blocks
/// within the loop (forming a nested loop). This simple analysis is not rich
/// enough to track all of these properties and keep it up-to-date as the CFG
/// mutates, so we don't allow any of these transformations.
///
/// NOTE(review): the body of the loop over Edges is not visible in this
/// listing; confirm against the upstream file.
void JumpThreadingPass::FindLoopHeaders(Function &F) {
// Each entry is one backedge reported by FindFunctionBackedges, as a pair
// of blocks.
SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
FindFunctionBackedges(F, Edges);
for (const auto &Edge : Edges)
/// getKnownConstant - Helper method to determine if we can thread over a
/// terminator with the given value as its condition, and if so what value to
/// use for that. What kind of value this is depends on whether we want an
/// integer or a block address, but an undef is always accepted.
/// Returns null if Val is null or not an appropriate constant.
///
/// \param Val the candidate value; may be null.
/// \param Preference WantBlockAddress asks for a BlockAddress (looked up
/// after stripping pointer casts); any other preference asks for a
/// ConstantInt.
static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
if (!Val)
return nullptr;
// Undef is "known" enough.
if (UndefValue *U = dyn_cast<UndefValue>(Val))
return U;
// dyn_cast yields null when Val is not of the requested kind, which is the
// documented "not an appropriate constant" result.
if (Preference == WantBlockAddress)
return dyn_cast<BlockAddress>(Val->stripPointerCasts());
return dyn_cast<ConstantInt>(Val);
/// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see
/// if we can infer that the value is a known ConstantInt/BlockAddress or undef
/// in any of our predecessors. If so, return the known list of value and pred
/// BB in the result vector.
/// This returns true if there were any known values.
bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
Value *V, BasicBlock *BB, PredValueInfo &Result,
ConstantPreference Preference, DenseSet<Value *> &RecursionSet,
Instruction *CxtI) {
// This method walks up use-def chains recursively. Because of this, we could
// get into an infinite loop going around loops in the use-def chain. To
// prevent this, keep track of which values we've already visited (in
// RecursionSet) and terminate the search if we loop back to one of them.
if (!RecursionSet.insert(V).second)
return false;
// If V is a constant, then it is known in all predecessors.
if (Constant *KC = getKnownConstant(V, Preference)) {
for (BasicBlock *Pred : predecessors(BB))
Result.emplace_back(KC, Pred);
return !Result.empty();
// If V is a non-instruction value, or an instruction in a different block,
// then it can't be derived from a PHI.
Instruction *I = dyn_cast<Instruction>(V);
if (!I || I->getParent() != BB) {
// Okay, if this is a live-in value, see if it has a known value at the end
// of any of our predecessors.
// FIXME: This should be an edge property, not a block end property.
/// TODO: Per PR2563, we could infer value range information about a
/// predecessor based on its terminator.
// FIXME: change this to use the more-rich 'getPredicateOnEdge' method if
// "I" is a non-local compare-with-a-constant instruction. This would be
// able to handle value inequalities better, for example if the compare is
// "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
// Perhaps getConstantOnEdge should be smart enough to do this?
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
if (Constant *KC = getKnownConstant(PredCst, Preference))
Result.emplace_back(KC, P);
return !Result.empty();
/// If I is a PHI node, then we know the incoming values for any constants.
if (PHINode *PN = dyn_cast<PHINode>(I)) {
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *InVal = PN->getIncomingValue(i);
if (Constant *KC = getKnownConstant(InVal, Preference)) {
Result.emplace_back(KC, PN->getIncomingBlock(i));
} else {
Constant *CI = LVI->getConstantOnEdge(InVal,
BB, CxtI);
if (Constant *KC = getKnownConstant(CI, Preference))
Result.emplace_back(KC, PN->getIncomingBlock(i));
return !Result.empty();
// Handle Cast instructions. Only see through Cast when the source operand is
// PHI or Cmp to save the compilation time.
if (CastInst *CI = dyn_cast<CastInst>(I)) {
Value *Source = CI->getOperand(0);
if (!isa<PHINode>(Source) && !isa<CmpInst>(Source))
return false;
ComputeValueKnownInPredecessorsImpl(Source, BB, Result, Preference,
RecursionSet, CxtI);
if (Result.empty())
return false;
// Convert the known values.
for (auto &R : Result)
R.first = ConstantExpr::getCast(CI->getOpcode(), R.first, CI->getType());
return true;
// Handle some boolean conditions.
if (I->getType()->getPrimitiveSizeInBits() == 1) {
assert(Preference == WantInteger && "One-bit non-integer type?");
// X | true -> true
// X & false -> false
if (I->getOpcode() == Instruction::Or ||
I->getOpcode() == Instruction::And) {
PredValueInfoTy LHSVals, RHSVals;
ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
WantInteger, RecursionSet, CxtI);
ComputeValueKnownInPredecessorsImpl(I->getOperand(1), BB, RHSVals,
WantInteger, RecursionSet, CxtI);
if (LHSVals.empty() && RHSVals.empty())
return false;
ConstantInt *InterestingVal;
if (I->getOpcode() == Instruction::Or)
InterestingVal = ConstantInt::getTrue(I->getContext());
InterestingVal = ConstantInt::getFalse(I->getContext());
SmallPtrSet<BasicBlock*, 4> LHSKnownBBs;
// Scan for the sentinel. If we find an undef, force it to the
// interesting value: x|undef -> true and x&undef -> false.
for (const auto &LHSVal : LHSVals)
if (LHSVal.first == InterestingVal || isa<UndefValue>(LHSVal.first)) {
Result.emplace_back(InterestingVal, LHSVal.second);
for (const auto &RHSVal : RHSVals)
if (RHSVal.first == InterestingVal || isa<UndefValue>(RHSVal.first)) {
// If we already inferred a value for this block on the LHS, don't
// re-add it.
if (!LHSKnownBBs.count(RHSVal.second))
Result.emplace_back(InterestingVal, RHSVal.second);
return !Result.empty();
// Handle the NOT form of XOR.
if (I->getOpcode() == Instruction::Xor &&
isa<ConstantInt>(I->getOperand(1)) &&
cast<ConstantInt>(I->getOperand(1))->isOne()) {
ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, Result,
WantInteger, RecursionSet, CxtI);
if (Result.empty())
return false;
// Invert the known values.
for (auto &R : Result)
R.first = ConstantExpr::getNot(R.first);
return true;
// Try to simplify some other binary operator values.
} else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
assert(Preference != WantBlockAddress
&& "A binary operator creating a block address?");
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
PredValueInfoTy LHSVals;
ComputeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals,
WantInteger, RecursionSet, CxtI);
// Try to use constant folding to simplify the binary operator.
for (const auto &LHSVal : LHSVals) {
Constant *V = LHSVal.first;
Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);
if (Constant *KC = getKnownConstant(Folded, WantInteger))
Result.emplace_back(KC, LHSVal.second);
return !Result.empty();
// Handle compare with phi operand, where the PHI is defined in this block.
if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
assert(Preference == WantInteger && "Compares only produce integers");
Type *CmpType = Cmp->getType();
Value *CmpLHS = Cmp->getOperand(0);
Value *CmpRHS = Cmp->getOperand(1);
CmpInst::Predicate Pred = Cmp->getPredicate();
PHINode *PN = dyn_cast<PHINode>(CmpLHS);
if (!PN)
PN = dyn_cast<PHINode>(CmpRHS);
if (PN && PN->getParent() == BB) {
const DataLayout &DL = PN->getModule()->getDataLayout();
// We can do this simplification if any comparisons fold to true or false.
// See if any do.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PredBB = PN->getIncomingBlock(i);
Value *LHS, *RHS;
if (PN == CmpLHS) {
LHS = PN->getIncomingValue(i);
RHS = CmpRHS->DoPHITranslation(BB, PredBB);
} else {
LHS = CmpLHS->DoPHITranslation(BB, PredBB);
RHS = PN->getIncomingValue(i);
Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
if (!Res) {
if (!isa<Constant>(RHS))
// getPredicateOnEdge call will make no sense if LHS is defined in BB.
auto LHSInst = dyn_cast<Instruction>(LHS);
if (LHSInst && LHSInst->getParent() == BB)
ResT = LVI->getPredicateOnEdge(Pred, LHS,
cast<Constant>(RHS), PredBB, BB,
CxtI ? CxtI : Cmp);
if (ResT == LazyValueInfo::Unknown)
Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
if (Constant *KC = getKnownConstant(Res, WantInteger))
Result.emplace_back(KC, PredBB);
return !Result.empty();
// If comparing a live-in value against a constant, see if we know the
// live-in value on any predecessors.
if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) {
Constant *CmpConst = cast<Constant>(CmpRHS);
if (!isa<Instruction>(CmpLHS) ||
cast<Instruction>(CmpLHS)->getParent() != BB) {
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
LazyValueInfo::Tristate Res =
LVI->getPredicateOnEdge(Pred, CmpLHS,
CmpConst, P, BB, CxtI ? CxtI : Cmp);
if (Res == LazyValueInfo::Unknown)
Constant *ResC = ConstantInt::get(CmpType, Res);
Result.emplace_back(ResC, P);
return !Result.empty();
// InstCombine can fold some forms of constant range checks into
// (icmp (add (x, C1)), C2). See if we have such a thing with
// x as a live-in.
using namespace PatternMatch;
Value *AddLHS;
ConstantInt *AddConst;
if (isa<ConstantInt>(CmpConst) &&
match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
if (!isa<Instruction>(AddLHS) ||
cast<Instruction>(AddLHS)->getParent() != BB) {
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a ConstantRange in
// a predecessor, use that information to try to thread this
// block.
ConstantRange CR = LVI->getConstantRangeOnEdge(
AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS));
// Propagate the range through the addition.
CR = CR.add(AddConst->getValue());
// Get the range where the compare returns true.
ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(
Pred, cast<ConstantInt>(CmpConst)->getValue());
Constant *ResC;
if (CmpRange.contains(CR))
ResC = ConstantInt::getTrue(CmpType);
else if (CmpRange.inverse().contains(CR))
ResC = ConstantInt::getFalse(CmpType);
Result.emplace_back(ResC, P);
return !Result.empty();
// Try to find a constant value for the LHS of a comparison,
// and evaluate it statically if we can.
PredValueInfoTy LHSVals;
ComputeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
WantInteger, RecursionSet, CxtI);
for (const auto &LHSVal : LHSVals) {
Constant *V = LHSVal.first;
Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);
if (Constant *KC = getKnownConstant(Folded, WantInteger))
Result.emplace_back(KC, LHSVal.second);
return !Result.empty();
if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
// Handle select instructions where at least one operand is a known constant
// and we can figure out the condition value for any predecessor block.
Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference);
Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
PredValueInfoTy Conds;
if ((TrueVal || FalseVal) &&
ComputeValueKnownInPredecessorsImpl(SI->getCondition(), BB, Conds,
WantInteger, RecursionSet, CxtI)) {
for (auto &C : Conds) {
Constant *Cond = C.first;
// Figure out what value to use for the condition.
bool KnownCond;
if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) {
// A known boolean.
KnownCond = CI->isOne();
} else {
assert(isa<UndefValue>(Cond) && "Unexpected condition value");
// Either operand will do, so be sure to pick the one that's a known
// constant.
// FIXME: Do this more cleverly if both values are known constants?
KnownCond = (TrueVal != nullptr);
// See if the select has a known constant value for this predecessor.
if (Constant *Val = KnownCond ? TrueVal : FalseVal)
Result.emplace_back(Val, C.second);
return !Result.empty();
// If all else fails, see if LVI can figure out a constant value for us.
Constant *CI = LVI->getConstant(V, BB, CxtI);
if (Constant *KC = getKnownConstant(CI, Preference)) {
for (BasicBlock *Pred : predecessors(BB))
Result.emplace_back(KC, Pred);
return !Result.empty();
/// GetBestDestForJumpOnUndef - If we determine that the specified block ends
/// in an undefined jump, decide which block is best to revector to.
///
/// Since we can pick an arbitrary destination, we pick the successor with the
/// fewest predecessors. This should reduce the in-degree of the others.
///
/// \param BB block whose terminator is inspected; successor 0 is read
/// unconditionally, so the terminator must have at least one successor.
/// \returns the index, within the terminator's successor list, of the
/// successor with the fewest predecessors. The lowest index wins ties.
static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
  Instruction *BBTerm = BB->getTerminator();
  unsigned MinSucc = 0;
  BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
  // Compute the successor with the minimum number of predecessors.
  unsigned MinNumPreds = pred_size(TestBB);
  for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
    TestBB = BBTerm->getSuccessor(i);
    unsigned NumPreds = pred_size(TestBB);
    // Strict '<' keeps the earliest successor when counts are equal.
    if (NumPreds < MinNumPreds) {
      MinSucc = i;
      MinNumPreds = NumPreds;
    }
  }

  return MinSucc;
}
/// Return true if the address of \p BB is taken and that address still has
/// users once dead constant users have been discarded.
///
/// \param BB the block to query.
/// \returns false when the block's address is not taken, or when the only
/// users of its BlockAddress were dead constants.
static bool hasAddressTakenAndUsed(BasicBlock *BB) {
  if (!BB->hasAddressTaken())
    return false;

  // If the block has its address taken, it may be a tree of dead constants
  // hanging off of it. These shouldn't keep the block alive, so prune them
  // before looking at the use list.
  BlockAddress *BA = BlockAddress::get(BB);
  BA->removeDeadConstantUsers();
  return !BA->use_empty();
}
/// ProcessBlock - If there are any predecessors whose control can be threaded
/// through to a successor, transform them now.
///
/// The function tries a sequence of transformations on \p BB and returns true
/// as soon as one of them reports success; it returns false when the block is
/// skipped or none of them applies.
///
/// NOTE(review): this listing appears to have lost lines (closing braces and,
/// in places, whole statements). Confirm against the upstream LLVM sources
/// before relying on the exact control flow shown here.
bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// If the block is trivially dead, just return and let the caller nuke it.
// This simplifies other transformations.
if (DTU->isBBPendingDeletion(BB) ||
(pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()))
return false;
// If this block has a single predecessor, and if that pred has a single
// successor, merge the blocks. This encourages recursive jump threading
// because now the condition in this block can be threaded through
// predecessors of our predecessor block.
if (MaybeMergeBasicBlockIntoOnlyPred(BB))
return true;
if (TryToUnfoldSelectInCurrBB(BB))
return true;
// Look if we can propagate guards to predecessors.
if (HasGuards && ProcessGuards(BB))
return true;
// What kind of constant we're looking for.
ConstantPreference Preference = WantInteger;
// Look to see if the terminator is a conditional branch, switch or indirect
// branch, if not we can't thread it.
Value *Condition;
Instruction *Terminator = BB->getTerminator();
if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
// Can't thread an unconditional jump.
if (BI->isUnconditional()) return false;
Condition = BI->getCondition();
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
Condition = SI->getCondition();
} else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
// Can't thread indirect branch with no successors.
if (IB->getNumSuccessors() == 0) return false;
Condition = IB->getAddress()->stripPointerCasts();
// An indirect branch is driven by a block address, not an integer.
Preference = WantBlockAddress;
} else {
return false; // Must be an invoke or callbr.
// Run constant folding to see if we can reduce the condition to a simple
// constant.
if (Instruction *I = dyn_cast<Instruction>(Condition)) {
Value *SimpleVal =
ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI);
if (SimpleVal) {
// NOTE(review): only the assignment below is visible; presumably the uses
// of I are also redirected to SimpleVal and I is erased when dead --
// confirm against upstream.
if (isInstructionTriviallyDead(I, TLI))
Condition = SimpleVal;
// If the terminator is branching on an undef, we can pick any of the
// successors to branch to. Let GetBestDestForJumpOnUndef decide.
if (isa<UndefValue>(Condition)) {
unsigned BestSucc = GetBestDestForJumpOnUndef(BB);
std::vector<DominatorTree::UpdateType> Updates;
// Fold the branch/switch.
Instruction *BBTerm = BB->getTerminator();
// Every successor other than the chosen one loses BB as a predecessor, and
// the corresponding edge deletion is queued in Updates.
for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
if (i == BestSucc) continue;
BasicBlock *Succ = BBTerm->getSuccessor(i);
Succ->removePredecessor(BB, true);
Updates.push_back({DominatorTree::Delete, BB, Succ});
LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
<< "' folding undef terminator: " << *BBTerm << '\n');
// NOTE(review): no erase of BBTerm and no consumer of Updates is visible
// after this call -- confirm against upstream.
BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
return true;
// If the terminator of this block is branching on a constant, simplify the
// terminator to an unconditional branch. This can occur due to threading in
// other blocks.
if (getKnownConstant(Condition, Preference)) {
LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
<< "' folding terminator: " << *BB->getTerminator()
<< '\n');
ConstantFoldTerminator(BB, true, nullptr, DTU);
return true;
Instruction *CondInst = dyn_cast<Instruction>(Condition);
// All the rest of our checks depend on the condition being an instruction.
if (!CondInst) {
// FIXME: Unify this with code below.
if (ProcessThreadableEdges(Condition, BB, Preference, Terminator))
return true;
return false;
if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
// If we're branching on a conditional, LVI might be able to determine
// its value at the branch instruction. We only handle comparisons
// against a constant at this time.
// TODO: This should be extended to handle switches as well.
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
if (CondBr && CondConst) {
// We should have returned as soon as we turn a conditional branch to
// unconditional, because it is no longer interesting as far as jump
// threading is concerned.
assert(CondBr->isConditional() && "Threading on unconditional terminator");
LazyValueInfo::Tristate Ret =
LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
CondConst, CondBr);
if (Ret != LazyValueInfo::Unknown) {
// When the predicate is known true, successor 1 is dropped and successor 0
// is kept; when it is known false, the reverse.
unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
BasicBlock *ToRemoveSucc = CondBr->getSuccessor(ToRemove);
ToRemoveSucc->removePredecessor(BB, true);
BranchInst *UncondBr =
BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
// NOTE(review): several lines appear to be missing in this region (the body
// of the `if` below, the false arm of the conditional expression, and the
// call that receives the update list) -- restore from upstream.
if (CondCmp->use_empty())
// We can safely replace *some* uses of the CondInst if it has
// exactly one value as returned by LVI. RAUW is incorrect in the
// presence of guards and assumes, that have the `Cond` as the use. This
// is because we use the guards/assume to reason about the `Cond` value
// at the end of block, but RAUW unconditionally replaces all uses
// including the guards/assumes themselves and the uses before the
// guard/assume.
else if (CondCmp->getParent() == BB) {
auto *CI = Ret == LazyValueInfo::True ?
ConstantInt::getTrue(CondCmp->getType()) :
ReplaceFoldableUses(CondCmp, CI);
{{DominatorTree::Delete, BB, ToRemoveSucc}});
return true;
// We did not manage to simplify this branch, try to see whether
// CondCmp depends on a known phi-select pattern.
if (TryToUnfoldSelect(CondCmp, BB))
return true;
if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator()))
if (TryToUnfoldSelect(SI, BB))
return true;
// Check for some cases that are worth simplifying. Right now we want to look
// for loads that are used by a switch or by the condition for the branch. If
// we see one, check to see if it's partially redundant. If so, insert a PHI
// which can then be used to thread the values.
Value *SimplifyValue = CondInst;
// For a compare against a constant, look through to the compared value.
if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
if (isa<Constant>(CondCmp->getOperand(1)))
SimplifyValue = CondCmp->getOperand(0);
// TODO: There are other places where load PRE would be profitable, such as
// more complex comparisons.
if (LoadInst *LoadI = dyn_cast<LoadInst>(SimplifyValue))
if (SimplifyPartiallyRedundantLoad(LoadI))
return true;
// Before threading, try to propagate profile data backwards:
if (PHINode *PN = dyn_cast<PHINode>(CondInst))
if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
updatePredecessorProfileMetadata(PN, BB);
// Handle a variety of cases where we are branching on something derived from
// a PHI node in the current block. If we can prove that any predecessors
// compute a predictable value based on a PHI node, thread those predecessors.
if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator))
return true;
// If this is an otherwise-unfoldable branch on a phi node in the current
// block, see if we can simplify.
if (PHINode *PN = dyn_cast<PHINode>(CondInst))
if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
return ProcessBranchOnPHI(PN);
// If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
if (CondInst->getOpcode() == Instruction::Xor &&
CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst));
// Search for a stronger dominating condition that can be used to simplify a
// conditional branch leaving BB.
if (ProcessImpliedCondition(BB))
return true;
return false;
/// Try to fold the conditional branch that ends \p BB by using a branch
/// condition already decided on the way into the block.
///
/// Walks up the chain of single predecessors, for at most
/// ImplicationSearchThreshold steps. At each step the predecessor must end in
/// a conditional branch that targets the block below it; otherwise the search
/// gives up. If that branch's condition (taken on the edge actually followed)
/// implies a value for BB's condition, BB's terminator is replaced with an
/// unconditional branch to the surviving successor.
///
/// \param BB block whose terminator is a candidate for folding.
/// \returns true if the terminator was folded, false otherwise.
bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
  auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
  if (!BI || !BI->isConditional())
    return false;

  Value *Cond = BI->getCondition();
  BasicBlock *CurrentBB = BB;
  BasicBlock *CurrentPred = BB->getSinglePredecessor();
  unsigned Iter = 0;

  auto &DL = BB->getModule()->getDataLayout();

  while (CurrentPred && Iter++ < ImplicationSearchThreshold) {
    auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator());
    if (!PBI || !PBI->isConditional())
      return false;
    if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB)
      return false;

    // Reaching CurrentBB through successor 0 means PBI's condition was true.
    bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB;
    Optional<bool> Implication =
        isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue);
    if (Implication) {
      BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1);
      BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 1 : 0);
      // The BB -> RemoveSucc edge is going away: drop BB from RemoveSucc's
      // PHI nodes so the IR stays consistent with the update reported below.
      RemoveSucc->removePredecessor(BB);
      BranchInst *UncondBI = BranchInst::Create(KeepSucc, BI);
      UncondBI->setDebugLoc(BI->getDebugLoc());
      // A block must have exactly one terminator; retire the old one.
      BI->eraseFromParent();
      DTU->applyUpdatesPermissive({{DominatorTree::Delete, BB, RemoveSucc}});
      return true;
    }
    CurrentBB = CurrentPred;
    CurrentPred = CurrentBB->getSinglePredecessor();
  }

  return false;
}
/// Return true if Op is an instruction defined in the given block.
///
/// \param Op value to classify; anything that is not an Instruction yields
/// false.
/// \param BB block that is compared against the instruction's parent.
static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
  const auto *OpInst = dyn_cast<Instruction>(Op);
  return OpInst && OpInst->getParent() == BB;
}
/// SimplifyPartiallyRedundantLoad - If LoadI is an obviously partially
/// redundant load instruction, eliminate it by replacing it with a PHI node.
/// This is an important optimization that encourages jump threading, and needs
/// to be run interlaced with other jump threading tasks.
///
/// The function bails out with false for loads that are not unordered, loads
/// in a block with a single predecessor or in an EH pad, and loads whose
/// pointer is a non-PHI instruction defined in the load's own block. It
/// returns true on the two paths that report a simplification: a value
/// available inside the load's block, or a PHI built from per-predecessor
/// values.
///
/// NOTE(review): this listing appears to have lost lines (closing braces,
/// `continue`/`else` lines, and the tails of several calls and conditions).
/// Confirm against the upstream LLVM sources before relying on the exact
/// control flow shown here.
bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) {
// Don't hack volatile and ordered loads.
if (!LoadI->isUnordered()) return false;
// If the load is defined in a block with exactly one predecessor, it can't be
// partially redundant.
BasicBlock *LoadBB = LoadI->getParent();
if (LoadBB->getSinglePredecessor())
return false;
// If the load is defined in an EH pad, it can't be partially redundant,
// because the edges between the invoke and the EH pad cannot have other
// instructions between them.
if (LoadBB->isEHPad())
return false;
Value *LoadedPtr = LoadI->getOperand(0);
// If the loaded operand is defined in the LoadBB and it's not a phi,
// it can't be available in predecessors.
if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr))
return false;
// Scan a few instructions up from the load, to see if it is obviously live at
// the entry to its block.
BasicBlock::iterator BBIt(LoadI);
bool IsLoadCSE;
if (Value *AvailableVal = FindAvailableLoadedValue(
LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
// If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
if (IsLoadCSE) {
LoadInst *NLoadI = cast<LoadInst>(AvailableVal);
combineMetadataForCSE(NLoadI, LoadI, false);
// If the returned value is the load itself, replace with an undef. This can
// only happen in dead loops.
if (AvailableVal == LoadI)
AvailableVal = UndefValue::get(LoadI->getType());
if (AvailableVal->getType() != LoadI->getType())
AvailableVal = CastInst::CreateBitOrPointerCast(
AvailableVal, LoadI->getType(), "", LoadI);
// NOTE(review): no statement that substitutes AvailableVal for LoadI is
// visible before this return -- confirm against upstream.
return true;
// Otherwise, if we scanned the whole block and got to the top of the block,
// we know the block is locally transparent to the load. If not, something
// might clobber its value.
if (BBIt != LoadBB->begin())
return false;
// If all of the loads and stores that feed the value have the same AA tags,
// then we can propagate them onto any newly inserted loads.
AAMDNodes AATags;
SmallPtrSet<BasicBlock*, 8> PredsScanned;
using AvailablePredsTy = SmallVector<std::pair<BasicBlock *, Value *>, 8>;
AvailablePredsTy AvailablePreds;
BasicBlock *OneUnavailablePred = nullptr;
SmallVector<LoadInst*, 8> CSELoads;
// If we got here, the loaded value is transparent through to the start of the
// block. Check to see if it is available in any of the predecessor blocks.
for (BasicBlock *PredBB : predecessors(LoadBB)) {
// If we already scanned this predecessor, skip it.
// NOTE(review): the statement that performs the skip is not visible here.
if (!PredsScanned.insert(PredBB).second)
BBIt = PredBB->end();
unsigned NumScanedInst = 0;
Value *PredAvailable = nullptr;
// NOTE: We don't CSE load that is volatile or anything stronger than
// unordered, that should have been checked when we entered the function.
assert(LoadI->isUnordered() &&
"Attempting to CSE volatile or atomic loads");
// If this is a load on a phi pointer, phi-translate it and search
// for available load/store to the pointer in predecessors.
Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
PredAvailable = FindAvailablePtrLoadStore(
Ptr, LoadI->getType(), LoadI->isAtomic(), PredBB, BBIt,
DefMaxInstsToScan, AA, &IsLoadCSE, &NumScanedInst);
// If PredBB has a single predecessor, continue scanning through the
// single predecessor.
BasicBlock *SinglePredBB = PredBB;
while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
NumScanedInst < DefMaxInstsToScan) {
SinglePredBB = SinglePredBB->getSinglePredecessor();
if (SinglePredBB) {
BBIt = SinglePredBB->end();
// The remaining scan budget is what is left of DefMaxInstsToScan.
// NOTE(review): the tail of this call is cut off in this listing.
PredAvailable = FindAvailablePtrLoadStore(
Ptr, LoadI->getType(), LoadI->isAtomic(), SinglePredBB, BBIt,
(DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
if (!PredAvailable) {
OneUnavailablePred = PredBB;
if (IsLoadCSE)
// If so, this load is partially redundant. Remember this info so that we
// can create a PHI node.
AvailablePreds.emplace_back(PredBB, PredAvailable);
// If the loaded value isn't available in any predecessor, it isn't partially
// redundant.
if (AvailablePreds.empty()) return false;
// Okay, the loaded value is available in at least one (and maybe all!)
// predecessors. If the value is unavailable in more than one unique
// predecessor, we want to insert a merge block for those common predecessors.
// This ensures that we only have to insert one reload, thus not increasing
// code size.
BasicBlock *UnavailablePred = nullptr;
// If the value is unavailable in one of predecessors, we will end up
// inserting a new instruction into them. It is only valid if all the
// instructions before LoadI are guaranteed to pass execution to its
// successor, or if LoadI is safe to speculate.
// TODO: If this logic becomes more complex, and we will perform PRE insertion
// farther than to a predecessor, we need to reuse the code from GVN's PRE.
// It requires domination tree analysis, so for this simple case it is an
// overkill.
// NOTE(review): the second operand of this condition is cut off in this
// listing.
if (PredsScanned.size() != AvailablePreds.size() &&
for (auto I = LoadBB->begin(); &*I != LoadI; ++I)
if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
return false;
// If there is exactly one predecessor where the value is unavailable, the
// already computed 'OneUnavailablePred' block is it. If it ends in an
// unconditional branch, we know that it isn't a critical edge.
if (PredsScanned.size() == AvailablePreds.size()+1 &&
OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
UnavailablePred = OneUnavailablePred;
} else if (PredsScanned.size() != AvailablePreds.size()) {
// Otherwise, we had multiple unavailable predecessors or we had a critical
// edge from the one.
SmallVector<BasicBlock*, 8> PredsToSplit;
SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
// NOTE(review): the bodies that fill AvailablePredSet and PredsToSplit, and
// the rest of the terminator check, are not visible in this listing.
for (const auto &AvailablePred : AvailablePreds)
// Add all the unavailable predecessors to the PredsToSplit list.
for (BasicBlock *P : predecessors(LoadBB)) {
// If the predecessor is an indirect goto, we can't split the edge.
// Same for CallBr.
if (isa<IndirectBrInst>(P->getTerminator()) ||
return false;
if (!AvailablePredSet.count(P))
// Split them out to their own block.
UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split");
// If the value isn't available in all predecessors, then there will be
// exactly one where it isn't available. Insert a load on that edge and add
// it to the AvailablePreds list.
if (UnavailablePred) {
assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
"Can't handle critical edge here!");
// The new load copies LoadI's type, alignment, ordering and sync scope, and
// reads the pointer phi-translated into UnavailablePred.
// NOTE(review): the tail of this constructor call and the statement guarded
// by `if (AATags)` are cut off in this listing.
LoadInst *NewVal = new LoadInst(
LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
LoadI->getName() + ".pr", false, LoadI->getAlign(),
LoadI->getOrdering(), LoadI->getSyncScopeID(),
if (AATags)
AvailablePreds.emplace_back(UnavailablePred, NewVal);
// Now we know that each predecessor of this block has a value in
// AvailablePreds, sort them for efficient access as we're walking the preds.
array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());
// Create a PHI node at the start of the block for the PRE'd load value.
pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
PHINode *PN = PHINode::Create(LoadI->getType(), std::distance(PB, PE), "",
// Insert new entries into the PHI for each predecessor. A single block may
// have multiple entries here.
for (pred_iterator PI = PB; PI != PE; ++PI) {
BasicBlock *P = *PI;
// Binary search in the sorted list; the null Value makes the probe compare
// no greater than any entry recorded for P.
AvailablePredsTy::iterator I =
llvm::lower_bound(AvailablePreds, std::make_pair(P, (Value *)nullptr));
assert(I != AvailablePreds.end() && I->first == P &&
"Didn't find entry for predecessor!");
// If we have an available predecessor but it requires casting, insert the
// cast in the predecessor and use the cast. Note that we have to update the
// AvailablePreds vector as we go so that all of the PHI entries for this
// predecessor use the same bitcast.
Value *&PredV = I->second;
if (PredV->getType() != LoadI->getType())
PredV = CastInst::CreateBitOrPointerCast(PredV, LoadI->getType(), "",
PN->addIncoming(PredV, I->first);
// Merge LoadI's metadata into every predecessor load recorded in CSELoads.
for (LoadInst *PredLoadI : CSELoads) {
combineMetadataForCSE(PredLoadI, LoadI, true);
return true;
/// FindMostPopularDest - The specified list contains multiple possible
/// threadable destinations. Pick the one that occurs the most frequently in
/// the list.
///
/// \param BB block whose successors seed the candidate set, in successor-list
/// order.
/// \param PredToDestList (predecessor, destination) pairs. A null destination
/// stands for an undef destination and is never counted.
/// \returns the destination with the highest count. On ties the candidate
/// inserted first wins (nullptr, then the successors of \p BB in order), so
/// nullptr is returned only when no real destination was counted.
static BasicBlock *
FindMostPopularDest(BasicBlock *BB,
                    const SmallVectorImpl<std::pair<BasicBlock *,
                                                    BasicBlock *>> &PredToDestList) {
  // Determine popularity. If there are multiple possible destinations, we
  // explicitly choose to ignore 'undef' destinations. We prefer to thread
  // blocks with known and real destinations to threading undef. We'll handle
  // them later if interesting.
  MapVector<BasicBlock *, unsigned> DestPopularity;

  // Populate DestPopularity with the successors in the order they appear in the
  // successor list. This way, we ensure determinism by iterating it in the
  // same order in std::max_element below. We map nullptr to 0 so that we can
  // return nullptr when PredToDestList contains nullptr only.
  DestPopularity[nullptr] = 0;
  for (auto *SuccBB : successors(BB))
    DestPopularity[SuccBB] = 0;

  // Tally one vote per pair that names a real (non-undef) destination.
  for (const auto &PredToDest : PredToDestList)
    if (PredToDest.second)
      ++DestPopularity[PredToDest.second];

  // Find the most popular dest.
  using VT = decltype(DestPopularity)::value_type;
  auto MostPopular = std::max_element(
      DestPopularity.begin(), DestPopularity.end(),
      [](const VT &L, const VT &R) { return L.second < R.second; });

  // Okay, we have finally picked the most popular destination.
  return MostPopular->first;
}
/// Try to evaluate the value of V when the control flows from PredPredBB to
/// BB->getSinglePredecessor() and then on to BB.
///
/// \param BB block with exactly one predecessor (asserted).
/// \param PredPredBB the predecessor of BB's single predecessor along which
/// the value is evaluated.
/// \param V value to evaluate.
/// \returns the constant V evaluates to on that path, or nullptr when none of
/// the handled cases yields one.
Constant *JumpThreadingPass::EvaluateOnPredecessorEdge(BasicBlock *BB,
                                                       BasicBlock *PredPredBB,
                                                       Value *V) {
  BasicBlock *PredBB = BB->getSinglePredecessor();
  assert(PredBB && "Expected a single predecessor");

  // A constant evaluates to itself on every path.
  if (Constant *Cst = dyn_cast<Constant>(V))
    return Cst;

  // Consult LVI if V is not an instruction in BB or PredBB.
  Instruction *I = dyn_cast<Instruction>(V);
  if (!I || (I->getParent() != BB && I->getParent() != PredBB))
    return LVI->getConstantOnEdge(V, PredPredBB, PredBB, nullptr);

  // Look into a PHI argument. Only a PHI in PredBB has an incoming value for
  // PredPredBB; a PHI in BB is not handled.
  if (PHINode *PHI = dyn_cast<PHINode>(V)) {
    if (PHI->getParent() == PredBB)
      return dyn_cast<Constant>(PHI->getIncomingValueForBlock(PredPredBB));
    return nullptr;
  }

  // If we have a CmpInst, try to fold it for each incoming edge into PredBB.
  if (CmpInst *CondCmp = dyn_cast<CmpInst>(V)) {
    if (CondCmp->getParent() == BB) {
      Constant *Op0 =
          EvaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0));
      Constant *Op1 =
          EvaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1));
      // Fold only when both operands are known on this path.
      if (Op0 && Op1)
        return ConstantExpr::getCompare(CondCmp->getPredicate(), Op0, Op1);
    }
    return nullptr;
  }

  return nullptr;
}
/// Try to thread predecessors of \p BB for which the value of \p Cond is
/// known.
///
/// The known (value, predecessor) pairs come from
/// ComputeValueKnownInPredecessors; when it finds none, the work is handed to
/// MaybeThreadThroughTwoBasicBlocks. Each known value is mapped to the
/// successor BB's terminator would take. If every predecessor agrees on a
/// single real destination, the terminator is folded instead of threading;
/// otherwise the predecessors that reach the most popular destination are
/// passed to TryThreadEdge.
///
/// \param Cond value BB's terminator dispatches on.
/// \param BB block to thread through; loop headers are rejected up front.
/// \param Preference kind of constant looked for in the predecessors.
/// \param CxtI context instruction forwarded to
/// ComputeValueKnownInPredecessors.
/// \returns false when BB is a loop header or no threadable edge remains;
/// otherwise true after folding, or the result of the helper that was tried.
///
/// NOTE(review): this listing appears to have lost lines (closing braces,
/// `continue` lines, and parts of several statements). Confirm against the
/// upstream LLVM sources before relying on the exact control flow shown here.
bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
ConstantPreference Preference,
Instruction *CxtI) {
// If threading this would thread across a loop header, don't even try to
// thread the edge.
if (LoopHeaders.count(BB))
return false;
PredValueInfoTy PredValues;
if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference,
CxtI)) {
// We don't have known values in predecessors. See if we can thread through
// BB and its sole predecessor.
return MaybeThreadThroughTwoBasicBlocks(BB, Cond);
assert(!PredValues.empty() &&
"ComputeValueKnownInPredecessors returned true with no values");
LLVM_DEBUG(dbgs() << "IN BB: " << *BB;
for (const auto &PredValue : PredValues) {
dbgs() << " BB '" << BB->getName()
<< "': FOUND condition = " << *PredValue.first
<< " for pred '" << PredValue.second->getName() << "'.\n";
// Decide what we want to thread through. Convert our list of known values to
// a list of known destinations for each pred. This also discards duplicate
// predecessors and keeps track of the undefined inputs (which are represented
// as a null dest in the PredToDestList).
SmallPtrSet<BasicBlock*, 16> SeenPreds;
SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;
BasicBlock *OnlyDest = nullptr;
// All-ones pointer values used as "more than one seen" markers; they are
// only compared against, never dereferenced, in the code visible here.
BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
Constant *OnlyVal = nullptr;
Constant *MultipleVal = (Constant *)(intptr_t)~0ULL;
for (const auto &PredValue : PredValues) {
BasicBlock *Pred = PredValue.second;
if (!SeenPreds.insert(Pred).second)
continue; // Duplicate predecessor entry.
Constant *Val = PredValue.first;
BasicBlock *DestBB;
if (isa<UndefValue>(Val))
DestBB = nullptr;
else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
// isZero() yields index 1 for a zero value and index 0 for any other value.
DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
} else {
// NOTE(review): the start of this assertion is cut off in this listing.
&& "Unexpected terminator");
assert(isa<BlockAddress>(Val) && "Expecting a constant blockaddress");
DestBB = cast<BlockAddress>(Val)->getBasicBlock();
// If we have exactly one destination, remember it for efficiency below.
if (PredToDestList.empty()) {
OnlyDest = DestBB;
OnlyVal = Val;
} else {
if (OnlyDest != DestBB)
OnlyDest = MultipleDestSentinel;
// It is possible we have the same destination but a different value, e.g.
// the default case in a switchinst.
if (Val != OnlyVal)
OnlyVal = MultipleVal;
// If the predecessor ends with an indirect goto, we can't change its
// destination. Same for CallBr.
// NOTE(review): the rest of this condition and its body are cut off in this
// listing.
if (isa<IndirectBrInst>(Pred->getTerminator()) ||
PredToDestList.emplace_back(Pred, DestBB);
// If all edges were unthreadable, we fail.
if (PredToDestList.empty())
return false;
// If all the predecessors go to a single known successor, we want to fold,
// not thread. By doing so, we do not need to duplicate the current block and
// also miss potential opportunities in case we don't/can't duplicate.
if (OnlyDest && OnlyDest != MultipleDestSentinel) {
if (BB->hasNPredecessors(PredToDestList.size())) {
bool SeenFirstBranchToOnlyDest = false;
std::vector <DominatorTree::UpdateType> Updates;
Updates.reserve(BB->getTerminator()->getNumSuccessors() - 1);
for (BasicBlock *SuccBB : successors(BB)) {
if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) {
SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
} else {
SuccBB->removePredecessor(BB, true); // This is unreachable successor.
Updates.push_back({DominatorTree::Delete, BB, SuccBB});
// Finally update the terminator.
// NOTE(review): no erase of Term and no consumer of Updates is visible after
// the branch is created -- confirm against upstream.
Instruction *Term = BB->getTerminator();
BranchInst::Create(OnlyDest, Term);
// If the condition is now dead due to the removal of the old terminator,
// erase it.
if (auto *CondInst = dyn_cast<Instruction>(Cond)) {
if (CondInst->use_empty() && !CondInst->mayHaveSideEffects())
// We can safely replace *some* uses of the CondInst if it has
// exactly one value as returned by LVI. RAUW is incorrect in the
// presence of guards and assumes, that have the `Cond` as the use. This
// is because we use the guards/assume to reason about the `Cond` value
// at the end of block, but RAUW unconditionally replaces all uses
// including the guards/assumes themselves and the uses before the
// guard/assume.
else if (OnlyVal && OnlyVal != MultipleVal &&
CondInst->getParent() == BB)
ReplaceFoldableUses(CondInst, OnlyVal);
return true;
// Determine which is the most common successor. If we have many inputs and
// this block is a switch, we want to start by threading the batch that goes
// to the most popular destination first. If we only know about one
// threadable destination (the common case) we can avoid this.
BasicBlock *MostPopularDest = OnlyDest;
if (MostPopularDest == MultipleDestSentinel) {
// Remove any loop headers from the Dest list, ThreadEdge conservatively
// won't process them, but we might have other destination that are eligible
// and we still want to process.
// NOTE(review): the call that applies this predicate is cut off in this
// listing.
[&](const std::pair<BasicBlock *, BasicBlock *> &PredToDest) {
return LoopHeaders.count(PredToDest.second) != 0;
if (PredToDestList.empty())
return false;
MostPopularDest = FindMostPopularDest(BB, PredToDestList);
// Now that we know what the most popular destination is, factor all
// predecessors that will jump to it into a single predecessor.
SmallVector<BasicBlock*, 16> PredsToFactor;
for (const auto &PredToDest : PredToDestList)
if (PredToDest.second == MostPopularDest) {
BasicBlock *Pred = PredToDest.first;
// This predecessor may be a switch or something else that has multiple
// edges to the block. Factor each of these edges by listing them
// according to # occurrences in PredsToFactor.
// NOTE(review): the statement that records Pred is not visible here.
for (BasicBlock *Succ : successors(Pred))
if (Succ == BB)
// If the threadable edges are branching on an undefined value, we get to pick
// the destination that these predecessors should get to.
// NOTE(review): the right-hand side of this assignment is cut off in this
// listing.
if (!MostPopularDest)
MostPopularDest = BB->getTerminator()->
// Ok, try to thread it!
return TryThreadEdge(BB, PredsToFactor, MostPopularDest);
/// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on
/// a PHI node in the current block. See if there are any simplifications we
/// can do based on inputs to the phi node.
///
/// \param PN the PHI node being branched on; its parent block is the block
///        that is offered for duplication.
/// \returns true as soon as DuplicateCondBranchOnPHIIntoPred() succeeds for
///          one incoming block, false if it never does.
bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) {
BasicBlock *BB = PN->getParent();
// TODO: We could make use of this to do it once for blocks with common PHI
// values.
// NOTE(review): PredBBs is indexed at [0] below, but no resize/push_back is
// visible in this excerpt -- presumably a line sizing it to one element was
// lost; confirm against upstream.
SmallVector<BasicBlock*, 1> PredBBs;
// If any of the predecessor blocks end in an unconditional branch, we can
// *duplicate* the conditional branch into that block in order to further
// encourage jump threading and to eliminate cases where we have branch on a
// phi of an icmp (branch on icmp is much better).
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PredBB = PN->getIncomingBlock(i);
// Only predecessors whose terminator is an unconditional BranchInst are
// candidates.
if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
if (PredBr->isUnconditional()) {
// Candidates are tried one at a time, each as a single-element list.
PredBBs[0] = PredBB;
// Try to duplicate BB into PredBB.
if (DuplicateCondBranchOnPHIIntoPred(BB, PredBBs))
return true;
// No incoming block could take a copy of BB.
return false;
/// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on
/// a xor instruction in the current block. See if there are any
/// simplifications we can do based on inputs to the xor.
///
/// \param BO the xor feeding the branch; its parent block is the block that
///        may be duplicated into predecessors.
/// \returns true if the xor was simplified in place or the block was
///          duplicated via DuplicateCondBranchOnPHIIntoPred(); false when one
///          of the bail-out checks below fires.
///
/// NOTE(review): in this excerpt several statements dangle (an `if` condition
/// ending in `||`, loop bodies without their increments, missing closing
/// braces). Lines appear to have been lost in extraction -- verify against
/// upstream before relying on the exact control flow.
bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) {
BasicBlock *BB = BO->getParent();
// If either the LHS or RHS of the xor is a constant, don't do this
// optimization.
if (isa<ConstantInt>(BO->getOperand(0)) ||
return false;
// If the first instruction in BB isn't a phi, we won't be able to infer
// anything special about any particular predecessor.
if (!isa<PHINode>(BB->front()))
return false;
// If this BB is a landing pad, we won't be able to split the edge into it.
if (BB->isEHPad())
return false;
// If we have a xor as the branch input to this block, and we know that the
// LHS or RHS of the xor in any predecessor is true/false, then we can clone
// the condition into the predecessor and fix that value to true, saving some
// logical ops on that path and encouraging other paths to simplify.
// This copies something like this:
// BB:
// %X = phi i1 [1], [%X']
// %Y = icmp eq i32 %A, %B
// %Z = xor i1 %X, %Y
// br i1 %Z, ...
// Into:
// BB':
// %Y = icmp ne i32 %A, %B
// br i1 %Y, ...
PredValueInfoTy XorOpValues;
// isLHS records which operand XorOpValues describes: operand 0 while true,
// operand 1 once it is cleared below.
bool isLHS = true;
// Try operand 0 first; fall back to operand 1 and give up if neither has
// values known in the predecessors.
if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
WantInteger, BO)) {
if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
WantInteger, BO))
return false;
isLHS = false;
assert(!XorOpValues.empty() &&
"ComputeValueKnownInPredecessors returned true with no values");
// Scan the information to see which is most popular: true or false. The
// predecessors can be of the set true, false, or undef.
unsigned NumTrue = 0, NumFalse = 0;
for (const auto &XorOpValue : XorOpValues) {
if (isa<UndefValue>(XorOpValue.first))
// Ignore undefs for the count.
if (cast<ConstantInt>(XorOpValue.first)->isZero())
// Determine which value to split on, true, false, or undef if neither.
// SplitVal stays null when both counters are zero.
ConstantInt *SplitVal = nullptr;
if (NumTrue > NumFalse)
SplitVal = ConstantInt::getTrue(BB->getContext());
else if (NumTrue != 0 || NumFalse != 0)
SplitVal = ConstantInt::getFalse(BB->getContext());
// Collect all of the blocks that this can be folded into so that we can
// factor this once and clone it once.
SmallVector<BasicBlock*, 8> BlocksToFoldInto;
for (const auto &XorOpValue : XorOpValues) {
// Entries whose value is neither SplitVal nor undef are singled out here.
if (XorOpValue.first != SplitVal && !isa<UndefValue>(XorOpValue.first))
// If we inferred a value for all of the predecessors, then duplication won't
// help us. However, we can just replace the LHS or RHS with the constant.
// "All predecessors" is measured against the incoming-value count of the
// first PHI in BB.
if (BlocksToFoldInto.size() ==
cast<PHINode>(BB->front()).getNumIncomingValues()) {
if (!SplitVal) {
// If all preds provide undef, just nuke the xor, because it is undef too.
} else if (SplitVal->isZero()) {
// If all preds provide 0, replace the xor with the other input.
} else {
// If all preds provide 1, set the computed value to 1.
// The operand index is !isLHS: 0 when the known values describe operand 0,
// 1 otherwise.
BO->setOperand(!isLHS, SplitVal);
return true;
// NOTE(review): the leading '+' on the following lines looks like a diff
// marker carried over from a patch rather than part of the code -- confirm.
+ // If any of predecessors end with an indirect goto, we can't change its
+ // destination. Same for CallBr.
+ if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) {
+ return isa<IndirectBrInst>(Pred->getTerminator()) ||
+ isa<CallBrInst>(Pred->getTerminator());
+ }))
+ return false;
// Try to duplicate BB into PredBB.
return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
/// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
/// predecessor to the PHIBB block. If it has PHI nodes, add entries for
/// NewPred using the entries from OldPred (suitably mapped).
///
/// \param PHIBB    block whose PHI nodes receive a new incoming entry.
/// \param OldPred  existing predecessor whose incoming values are reused.
/// \param NewPred  predecessor the new incoming entries are attributed to.
/// \param ValueMap maps instructions to their replacement values; an incoming
///                 value found in the map is substituted before being added.
static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
BasicBlock *OldPred,
BasicBlock *NewPred,
DenseMap<Instruction*, Value*> &ValueMap) {
for (PHINode &PN : PHIBB->phis()) {
// Ok, we have a PHI node. Figure out what the incoming value was for the
// DestBlock.
Value *IV = PN.getIncomingValueForBlock(OldPred);
// Remap the value if necessary.
// Only Instruction values are looked up; anything else is used unchanged.
if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst);
if (I != ValueMap.end())
IV = I->second;
// Record the (possibly remapped) value as coming from NewPred.
PN.addIncoming(IV, NewPred);
/// Merge basic block BB into its sole predecessor if possible.
///
/// \param BB the block to merge with its single predecessor.
/// \returns false if BB has no single predecessor or one of the legality
///          checks below fails; true after the merge has been performed.
bool JumpThreadingPass::MaybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) {
BasicBlock *SinglePred = BB->getSinglePredecessor();
if (!SinglePred)
return false;
// Refuse when the predecessor's terminator is exceptional or has more than
// one successor, when BB is its own predecessor, or when BB's address is
// taken and used.
const Instruction *TI = SinglePred->getTerminator();
if (TI->isExceptionalTerminator() || TI->getNumSuccessors() != 1 ||
SinglePred == BB || hasAddressTakenAndUsed(BB))
return false;
// If SinglePred was a loop header, BB becomes one.
// NOTE(review): only the erase of SinglePred is visible here; the statement
// that records BB as a header appears to be missing from this excerpt --
// confirm against upstream.
if (LoopHeaders.erase(SinglePred))
MergeBasicBlockIntoOnlyPred(BB, DTU);
// Now that BB is merged into SinglePred (i.e. SinglePred code followed by
// BB code within one basic block `BB`), we need to invalidate the LVI
// information associated with BB, because the LVI information need not be
// true for all of BB after the merge. For example,
// Before the merge, LVI info and code is as follows:
// SinglePred: <LVI info1 for %p val>
// %y = use of %p
// call @exit() // need not transfer execution to successor.
// assume(%p) // from this point on %p is true
// br label %BB
// BB: <LVI info2 for %p val, i.e. %p is true>
// %x = use of %p
// br label exit
// Note that this LVI info for blocks BB and SinglePred is correct for %p
// (info2 and info1 respectively). After the merge and the deletion of the
// LVI info1 for SinglePred. We have the following code:
// BB: <LVI info2 for %p val>
// %y = use of %p
// call @exit()
// assume(%p)
// %x = use of %p <-- LVI info2 is correct from here onwards.
// br label exit
// LVI info2 for BB is incorrect at the beginning of BB.
// Invalidate LVI information for BB if the LVI is not provably true for
// all of BB.
// NOTE(review): the invalidation statement guarded by this `if` is not
// visible in this excerpt -- confirm against upstream.
if (!isGuaranteedToTransferExecutionToSuccessor(BB))
return true;
/// Update the SSA form. NewBB contains instructions that are copied from BB.
/// ValueMapping maps old values in BB to new ones in NewBB.
///
/// \param BB           the original block whose instructions were copied.
/// \param NewBB        the block holding the copies.
/// \param ValueMapping for each instruction in BB, the value that stands for
///                     it in NewBB; read with operator[] below.
void JumpThreadingPass::UpdateSSA(
BasicBlock *BB, BasicBlock *NewBB,
DenseMap<Instruction *, Value *> &ValueMapping) {
// If there were values defined in BB that are used outside the block, then we
// now have to update all uses of the value to use either the original value,
// the cloned value, or some PHI derived value. This can require arbitrary
// PHI insertion, which we are prepared to do; clean these up now.
SSAUpdater SSAUpdate;
SmallVector<Use *, 16> UsesToRename;
for (Instruction &I : *BB) {
// Scan all uses of this instruction to see if it is used outside of its
// block, and if so, record them in UsesToRename.
for (Use &U : I.uses()) {
Instruction *User = cast<Instruction>(U.getUser());
// For a PHI user the comparison is on the incoming block of this use, not
// on the PHI's own parent block.
if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
if (UserPN->getIncomingBlock(U) == BB)
} else if (User->getParent() == BB)
// NOTE(review): the statements taken when a use is local to BB, and the one
// that records a non-local use, are not visible in this excerpt -- confirm
// against upstream.
// If there are no uses outside the block, we're done with this instruction.
if (UsesToRename.empty())
LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
// We found a use of I outside of BB. Rename all uses of I that are outside
// its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
// with the two values we know.
// The two known definitions are I itself (available in BB) and its mapped
// counterpart (available in NewBB).
SSAUpdate.Initialize(I.getType(), I.getName());
SSAUpdate.AddAvailableValue(BB, &I);
SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]);
// Drain the recorded uses.
// NOTE(review): the loop body is not visible in this excerpt.
while (!UsesToRename.empty())
LLVM_DEBUG(dbgs() << "\n");
/// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone
/// arguments that come from PredBB. Return the map from the variables in the
/// source basic block to the variables in the newly created basic block.
///
/// \param BI     first instruction to clone; leading PHI nodes are handled
///               separately from the remaining instructions.
/// \param BE     one past the last instruction to clone.
/// \param NewBB  block in which the cloned PHI nodes are created.
/// \param PredBB predecessor whose incoming value each cloned PHI keeps.
/// \returns the mapping from each source instruction to its clone.
DenseMap<Instruction *, Value *>
JumpThreadingPass::CloneInstructions(BasicBlock::iterator BI,
BasicBlock::iterator BE, BasicBlock *NewBB,
BasicBlock *PredBB) {
// We are going to have to map operands from the source basic block to the new
// copy of the block 'NewBB'. If there are PHI nodes in the source basic
// block, evaluate them to account for entry from PredBB.
DenseMap<Instruction *, Value *> ValueMapping;
// Clone the phi nodes of the source basic block into NewBB. The resulting
// phi nodes are trivial since NewBB only has one predecessor, but SSAUpdater
// might need to rewrite the operand of the cloned phi.
// The loop stops at the first non-PHI instruction, leaving BI pointing at it.
for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB);
NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB);
ValueMapping[PN] = NewPN;
// Clone the non-phi instructions of the source basic block into NewBB,
// keeping track of the mapping and using it to remap operands in the cloned
// instructions.
// NOTE(review): the statement that places each clone into NewBB is not
// visible in this excerpt -- confirm against upstream.
for (; BI != BE; ++BI) {
Instruction *New = BI->clone();
ValueMapping[&*BI] = New;
// Remap operands to patch up intra-block references.
// Operands that are instructions already present in the map are replaced by
// their mapped value; all others are left alone.
for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
DenseMap<Instruction *, Value *>::iterator I = ValueMapping.find(Inst);
if (I != ValueMapping.end())
New->setOperand(i, I->second);
return ValueMapping;
/// Attempt to thread through two successive basic blocks.
///
/// \param BB   the second block of the pair; it must end in a BranchInst and
///             have exactly one predecessor (called PredBB below).
/// \param Cond the condition evaluated on each incoming edge of PredBB via
///             EvaluateOnPredecessorEdge().
/// \returns true if ThreadThroughTwoBasicBlocks() was invoked, false if any
///          of the legality or cost checks below rejected the transform.
///
/// NOTE(review): in this excerpt some statements appear to be missing (the
/// counter increments in the predecessor loop and closing braces); verify
/// against upstream before relying on the exact control flow.
bool JumpThreadingPass::MaybeThreadThroughTwoBasicBlocks(BasicBlock *BB,
Value *Cond) {
// Consider:
// PredBB:
// %var = phi i32* [ null, %bb1 ], [ @a, %bb2 ]
// %tobool = icmp eq i32 %cond, 0
// br i1 %tobool, label %BB, label ...
// BB:
// %cmp = icmp eq i32* %var, null
// br i1 %cmp, label ..., label ...
// We don't know the value of %var at BB even if we know which incoming edge
// we take to BB. However, once we duplicate PredBB for each of its incoming
// edges (say, PredBB1 and PredBB2), we know the value of %var in each copy of
// PredBB. Then we can thread edges PredBB1->BB and PredBB2->BB through BB.
// Require that BB end with a Branch for simplicity.
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
if (!CondBr)
return false;
// BB must have exactly one predecessor.
BasicBlock *PredBB = BB->getSinglePredecessor();
if (!PredBB)
return false;
// Require that PredBB end with a conditional Branch. If PredBB ends with an
// unconditional branch, we should be merging PredBB and BB instead. For
// simplicity, we don't deal with a switch.
BranchInst *PredBBBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
if (!PredBBBranch || PredBBBranch->isUnconditional())
return false;
// If PredBB has exactly one incoming edge, we don't gain anything by copying
// PredBB.
if (PredBB->getSinglePredecessor())
return false;
// Don't thread through PredBB if it contains a successor edge to itself, in
// which case we would infinite loop. Suppose we are threading an edge from
// PredPredBB through PredBB and BB to SuccBB with PredBB containing a
// successor edge to itself. If we allowed jump threading in this case, we
// could duplicate PredBB and BB as, say, PredBB.thread and BB.thread. Since
// PredBB.thread has a successor edge to PredBB, we would immediately come up
// with another jump threading opportunity from PredBB.thread through PredBB
// and BB to SuccBB. This jump threading would repeatedly occur. That is, we
// would keep peeling one iteration from PredBB.
if (llvm::is_contained(successors(PredBB), PredBB))
return false;
// Don't thread across a loop header.
if (LoopHeaders.count(PredBB))
return false;
// Avoid complication with duplicating EH pads.
if (PredBB->isEHPad())
return false;
// Find a predecessor that we can thread. For simplicity, we only consider a
// successor edge out of BB to which we thread exactly one incoming edge into
// PredBB.
unsigned ZeroCount = 0;
unsigned OneCount = 0;
BasicBlock *ZeroPred = nullptr;
BasicBlock *OnePred = nullptr;
// Classify each predecessor of PredBB by the constant Cond evaluates to on
// that edge; predecessors with a non-constant (or null) result are ignored.
for (BasicBlock *P : predecessors(PredBB)) {
if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(
EvaluateOnPredecessorEdge(BB, P, Cond))) {
if (CI->isZero()) {
ZeroPred = P;
} else if (CI->isOne()) {
OnePred = P;
// Disregard complicated cases where we have to thread multiple edges.
BasicBlock *PredPredBB;
if (ZeroCount == 1) {
PredPredBB = ZeroPred;
} else if (OneCount == 1) {
PredPredBB = OnePred;
} else {
return false;
// Successor index 1 is chosen when the condition is zero on the selected
// edge, index 0 otherwise.
BasicBlock *SuccBB = CondBr->getSuccessor(PredPredBB == ZeroPred);
// If threading to the same block as we come from, we would infinite loop.
if (SuccBB == BB) {
LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
<< "' - would thread to self!\n");
return false;
// If threading this would thread across a loop header, don't thread the edge.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
bool BBIsHeader = LoopHeaders.count(BB);
bool SuccIsHeader = LoopHeaders.count(SuccBB);
dbgs() << " Not threading across "
<< (BBIsHeader ? "loop header BB '" : "block BB '")
<< BB->getName() << "' to dest "
<< (SuccIsHeader ? "loop header BB '" : "block BB '")
<< SuccBB->getName()
<< "' - it might create an irreducible loop!\n";
return false;
// Compute the cost of duplicating BB and PredBB.
unsigned BBCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
unsigned PredBBCost = getJumpThreadDuplicationCost(
PredBB, PredBB->getTerminator(), BBDupThreshold);
// Give up if costs are too high. We need to check BBCost and PredBBCost
// individually before checking their sum because getJumpThreadDuplicationCost
// returns (unsigned)~0 for those basic blocks that cannot be duplicated.
if (BBCost > BBDupThreshold || PredBBCost > BBDupThreshold ||
BBCost + PredBBCost > BBDupThreshold) {
// The " for BB" literal carries a leading space so the cost value and the
// following word do not run together in the debug output.
LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
<< "' - Cost is too high: " << PredBBCost
<< " for PredBB, " << BBCost << " for BB\n");
return false;
// Now we are ready to duplicate PredBB.
ThreadThroughTwoBasicBlocks(PredPredBB, PredBB, BB, SuccBB);
return true;
/// Duplicate PredBB for the edge coming from PredPredBB, then thread the edge
/// from that copy across BB to SuccBB.
///
/// \param PredPredBB predecessor of PredBB whose edge is redirected to the
///                   copy.
/// \param PredBB     block that is cloned; must end in a BranchInst.
/// \param BB         block threaded across; must end in a BranchInst.
/// \param SuccBB     destination passed on to ThreadEdge().
///
/// NOTE(review): several call statements below are truncated in this excerpt
/// (missing trailing arguments and the call that receives the dominator-tree
/// update list); verify against upstream.
void JumpThreadingPass::ThreadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
BasicBlock *PredBB,
BasicBlock *BB,
BasicBlock *SuccBB) {
LLVM_DEBUG(dbgs() << " Threading through '" << PredBB->getName() << "' and '"
<< BB->getName() << "'\n");
BranchInst *CondBr = cast<BranchInst>(BB->getTerminator());
BranchInst *PredBBBranch = cast<BranchInst>(PredBB->getTerminator());
// The copy is named after PredBB with a ".thread" suffix and is created in
// PredBB's function, positioned before PredBB.
BasicBlock *NewBB =
BasicBlock::Create(PredBB->getContext(), PredBB->getName() + ".thread",
PredBB->getParent(), PredBB);
// Set the block frequency of NewBB.
// It is the frequency of PredPredBB scaled by the probability of the
// PredPredBB -> PredBB edge.
if (HasProfileData) {
auto NewBBFreq = BFI->getBlockFreq(PredPredBB) *
BPI->getEdgeProbability(PredPredBB, PredBB);
BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
// We are going to have to map operands from the original BB block to the new
// copy of the block 'NewBB'. If there are PHI nodes in PredBB, evaluate them
// to account for entry from PredPredBB.
DenseMap<Instruction *, Value *> ValueMapping =
CloneInstructions(PredBB->begin(), PredBB->end(), NewBB, PredPredBB);
// Update the terminator of PredPredBB to jump to NewBB instead of PredBB.
// This eliminates predecessors from PredPredBB, which requires us to simplify
// any PHI nodes in PredBB.
Instruction *PredPredTerm = PredPredBB->getTerminator();
// Every successor slot that pointed at PredBB is redirected, so a terminator
// with multiple edges to PredBB is fully rewritten.
for (unsigned i = 0, e = PredPredTerm->getNumSuccessors(); i != e; ++i)
if (PredPredTerm->getSuccessor(i) == PredBB) {
PredBB->removePredecessor(PredPredBB, true);
PredPredTerm->setSuccessor(i, NewBB);
// Both successors of PredBB's branch gain NewBB as a predecessor, so their
// PHI nodes need entries for it.
AddPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(0), PredBB, NewBB,
AddPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(1), PredBB, NewBB,
// Dominator-tree edge updates for the new block and the redirected edge.
{{DominatorTree::Insert, NewBB, CondBr->getSuccessor(0)},
{DominatorTree::Insert, NewBB, CondBr->getSuccessor(1)},
{DominatorTree::Insert, PredPredBB, NewBB},
{DominatorTree::Delete, PredPredBB, PredBB}});
UpdateSSA(PredBB, NewBB, ValueMapping);
// Clean up things like PHI nodes with single operands, dead instructions,
// etc.
SimplifyInstructionsInBlock(NewBB, TLI);
SimplifyInstructionsInBlock(PredBB, TLI);
// NOTE(review): PredsToFactor is passed to ThreadEdge() without a visible
// element being added -- presumably a push_back of NewBB was lost; confirm.
SmallVector<BasicBlock *, 1> PredsToFactor;
ThreadEdge(BB, PredsToFactor, SuccBB);
/// TryThreadEdge - Thread an edge if it's safe and profitable to do so.
///
/// \param BB      the block to thread across.
/// \param PredBBs the predecessors of BB whose edges are to be threaded.
/// \param SuccBB  the destination of the threaded edge.
/// \returns true if ThreadEdge() was invoked; false if SuccBB is BB itself,
///          if BB or SuccBB is a recorded loop header, or if the duplication
///          cost of BB exceeds BBDupThreshold.
bool JumpThreadingPass::TryThreadEdge(
BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs,
BasicBlock *SuccBB) {
// If threading to the same block as we come from, we would infinite loop.
if (SuccBB == BB) {
LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
<< "' - would thread to self!\n");
return false;
// If threading this would thread across a loop header, don't thread the edge.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
// The two flags only select the wording of the diagnostic below.
bool BBIsHeader = LoopHeaders.count(BB);
bool SuccIsHeader = LoopHeaders.count(SuccBB);
dbgs() << " Not threading across "
<< (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName()
<< "' to dest " << (SuccIsHeader ? "loop header BB '" : "block BB '")
<< SuccBB->getName() << "' - it might create an irreducible loop!\n";
return false;
// Profitability: the cost of duplicating BB must not exceed the threshold.
unsigned JumpThreadCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (JumpThreadCost > BBDupThreshold) {
LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
<< "' - Cost is too high: " << JumpThreadCost << "\n");
return false;
// All checks passed; perform the transformation.
ThreadEdge(BB, PredBBs, SuccBB);
return true;
/// ThreadEdge - We have decided that it is safe and profitable to factor the
/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
/// across BB. Transform the IR to reflect this change.
///
/// \param BB      the block being threaded across; its non-terminator
///                instructions are cloned into a new block.
/// \param PredBBs predecessors of BB to redirect; when there is more than
///                one they are first factored through SplitBlockPreds().
/// \param SuccBB  the block the new copy branches to unconditionally.
///
/// Preconditions (asserted): SuccBB != BB, and neither BB nor SuccBB is a
/// recorded loop header.
///
/// NOTE(review): in this excerpt the BasicBlock::Create call below lacks its
/// name argument and closing braces are missing; verify against upstream.
void JumpThreadingPass::ThreadEdge(BasicBlock *BB,
const SmallVectorImpl<BasicBlock *> &PredBBs,
BasicBlock *SuccBB) {
assert(SuccBB != BB && "Don't create an infinite loop");
assert(!LoopHeaders.count(BB) && !LoopHeaders.count(SuccBB) &&
"Don't thread across loop headers");
// And finally, do it! Start by factoring the predecessors if needed.
BasicBlock *PredBB;
if (PredBBs.size() == 1)
PredBB = PredBBs[0];
else {
LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
<< " common predecessors.\n");
PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
// And finally, do it!
// The quote opened before the SuccBB name is closed right after it so that
// both block names are delimited the same way in the debug output.
LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName()
<< "' to '" << SuccBB->getName()
<< "', across block:\n " << *BB << "\n");
// Let LazyValueInfo know about the edge being threaded.
LVI->threadEdge(PredBB, BB, SuccBB);
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(),
BB->getParent(), BB);
// Set the block frequency of NewBB.
// It is the frequency of PredBB scaled by the probability of PredBB -> BB.
if (HasProfileData) {
auto NewBBFreq =
BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB);
BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
// Copy all the instructions from BB to NewBB except the terminator.
DenseMap<Instruction *, Value *> ValueMapping =
CloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB);
// We didn't copy the terminator from BB over to NewBB, because there is now
// an unconditional jump to SuccBB. Insert the unconditional jump.
BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);
// Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
// PHI nodes for NewBB now.
AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
// Update the terminator of PredBB to jump to NewBB instead of BB. This
// eliminates predecessors from BB, which requires us to simplify any PHI
// nodes in BB.
Instruction *PredTerm = PredBB->getTerminator();
for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
if (PredTerm->getSuccessor(i) == BB) {
BB->removePredecessor(PredBB, true);
PredTerm->setSuccessor(i, NewBB);
// Enqueue required DT updates.
DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, SuccBB},
{DominatorTree::Insert, PredBB, NewBB},
{DominatorTree::Delete, PredBB, BB}});
UpdateSSA(BB, NewBB, ValueMapping);
// At this point, the IR is fully up to date and consistent. Do a quick scan
// over the new instructions and zap any that are constants or dead. This
// frequently happens because of phi translation.
SimplifyInstructionsInBlock(NewBB, TLI);
// Update the edge weight from BB to SuccBB, which should be less than before.
UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB);
// Threaded an edge!
/// Create a new basic block that will be the predecessor of BB and successor of
/// all blocks in Preds. When profile data is available, update the frequency of
/// this new block.
///
/// \param BB     the block whose incoming edges from Preds are rerouted.
/// \param Preds  the predecessors of BB to route through the new block.
/// \param Suffix name suffix handed to the underlying split utility.
/// \returns the first block produced by the split (NewBBs[0]).
BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB,
ArrayRef<BasicBlock *> Preds,
const char *Suffix) {
SmallVector<BasicBlock *, 2> NewBBs;
// Collect the frequencies of all predecessors of BB, which will be used to
// update the edge weight of the result of splitting predecessors.
// Each entry is the predecessor's block frequency scaled by the probability
// of its edge to BB.
// NOTE(review): the statement that inserts into FreqMap is truncated in this
// excerpt; verify against upstream.
DenseMap<BasicBlock *, BlockFrequency> FreqMap;
if (HasProfileData)
for (auto Pred : Preds)
Pred, BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB)));
// In the case when BB is a LandingPad block we create 2 new predecessors
// instead of just one.
if (BB->isLandingPad()) {
std::string NewName = std::string(Suffix) + ".split-lp";
SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs);
} else {
NewBBs.push_back(SplitBlockPredecessors(BB, Preds, Suffix));
// Build the dominator-tree updates: per new block one NewBB -> BB insertion,
// and per predecessor of that new block a Pred -> BB deletion and a
// Pred -> NewBB insertion.
std::vector<DominatorTree::UpdateType> Updates;
Updates.reserve((2 * Preds.size()) + NewBBs.size());
for (auto NewBB : NewBBs) {
BlockFrequency NewBBFreq(0);
Updates.push_back({DominatorTree::Insert, NewBB, BB});
for (auto Pred : predecessors(NewBB)) {
Updates.push_back({DominatorTree::Delete, Pred, BB});
Updates.push_back({DominatorTree::Insert, Pred, NewBB});
if (HasProfileData) // Update frequencies between Pred -> NewBB.
NewBBFreq += FreqMap.lookup(Pred);
if (HasProfileData) // Apply the summed frequency to NewBB.
BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
// NOTE(review): no statement applying `Updates` is visible in this excerpt;
// confirm against upstream.
return NewBBs[0];
/// Report whether BB's terminator carries "branch_weights" profile metadata
/// with one weight per successor.
///
/// \param BB block whose terminator is inspected; the terminator must have
///        more than one successor (asserted).
/// \returns false if there is no MD_prof metadata or its name is not
///          "branch_weights"; otherwise true exactly when the metadata has
///          one operand per successor plus the leading name operand.
bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
const Instruction *TI = BB->getTerminator();
assert(TI->getNumSuccessors() > 1 && "not a split");
MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
if (!WeightsNode)
return false;
// Operand 0 is cast unconditionally, so it is expected to be an MDString
// holding the metadata kind name.
MDString *MDName = cast<MDString>(WeightsNode->getOperand(0));
if (MDName->getString() != "branch_weights")
return false;
// Ensure there are weights for all of the successors. Note that the first
// operand to the metadata node is a name, not a weight.
return WeightsNode->getNumOperands() == TI->getNumSuccessors() + 1;
/// Update the block frequency of BB and branch weight and the metadata on the
/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
/// Freq(PredBB->BB) / Freq(BB->SuccBB).
///
/// \param PredBB not referenced in the portion of the body visible here.
/// \param BB     the block whose frequency and outgoing probabilities are
///               recomputed.
/// \param NewBB  the block that took over part of BB's frequency; its
///               frequency is subtracted from BB's.
/// \param SuccBB the successor of BB whose edge frequency is reduced by
///               NewBB's frequency.
///
/// NOTE(review): several statements are truncated or missing in this excerpt
/// (the early return when there is no profile data, the push_back calls that
/// fill BBSuccFreq/BBSuccProbs/Weights, the normalization call and the final
/// metadata update); verify against upstream.
void JumpThreadingPass::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
BasicBlock *BB,
BasicBlock *NewBB,
BasicBlock *SuccBB) {
if (!HasProfileData)
assert(BFI && BPI && "BFI & BPI should have been created here");
// As the edge from PredBB to BB is deleted, we have to update the block
// frequency of BB.
auto BBOrigFreq = BFI->getBlockFreq(BB);
auto NewBBFreq = BFI->getBlockFreq(NewBB);
auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB);
auto BBNewFreq = BBOrigFreq - NewBBFreq;
BFI->setBlockFreq(BB, BBNewFreq.getFrequency());
// Collect updated outgoing edges' frequencies from BB and use them to update
// edge probabilities.
SmallVector<uint64_t, 4> BBSuccFreq;
for (BasicBlock *Succ : successors(BB)) {
// The edge to SuccBB loses NewBB's frequency; every other edge keeps the
// frequency implied by BB's original frequency and edge probability.
auto SuccFreq = (Succ == SuccBB)
? BB2SuccBBFreq - NewBBFreq
: BBOrigFreq * BPI->getEdgeProbability(BB, Succ);
// The largest successor frequency is the denominator for the probabilities
// computed below.
uint64_t MaxBBSuccFreq =
*std::max_element(BBSuccFreq.begin(), BBSuccFreq.end());
SmallVector<BranchProbability, 4> BBSuccProbs;
// When every edge frequency is zero, a 1/size probability is used instead of
// dividing by zero.
if (MaxBBSuccFreq == 0)
{1, static_cast<unsigned>(BBSuccFreq.size())});
else {
for (uint64_t Freq : BBSuccFreq)
BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq));
// Normalize edge probabilities so that they sum up to one.
// Update edge probabilities in BPI.
BPI->setEdgeProbability(BB, BBSuccProbs);
// Update the profile metadata as well.
// Don't do this if the profile of the transformed blocks was statically
// estimated. (This could occur despite the function having an entry
// frequency in completely cold parts of the CFG.)
// In this case we don't want to suggest to subsequent passes that the
// calculated weights are fully consistent. Consider this graph:
// check_1
// 50% / |
// eq_1 | 50%
// \ |
// check_2
// 50% / |
// eq_2 | 50%
// \ |
// check_3
// 50% / |
// eq_3 | 50%
// \ |
// Assuming the blocks check_* all compare the same value against 1, 2 and 3,
// the overall probabilities are inconsistent; the total probability that the
// value is either 1, 2 or 3 is 150%.
// As a consequence if we thread eq_1 -> check_2 to check_3, check_2->check_3
// becomes 0%. This is even worse if the edge whose probability becomes 0% is
// the loop exit edge. Then based solely on static estimation we would assume
// the loop was extremely hot.
// FIXME this locally as well so that BPI and BFI are consistent as well. We
// shouldn't make edges extremely likely or unlikely based solely on static
// estimation.
// Metadata is only rewritten for terminators with at least two successor
// probabilities that already carry branch-weight profile data.
if (BBSuccProbs.size() >= 2 && doesBlockHaveProfileData(BB)) {
SmallVector<uint32_t, 4> Weights;
for (auto Prob : BBSuccProbs)
auto TI = BB->getTerminator();
/// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
/// If we can duplicate the contents of BB up into PredBB do so now, this
/// improves the odds that the branch will be on an analyzable instruction like
/// a compare.
///
/// \param BB      the block whose contents are copied; its terminator is cast
///                to BranchInst below.
/// \param PredBBs non-empty list of predecessors (asserted); more than one
///                entry is first factored through SplitBlockPreds().
/// \returns false if BB is a recorded loop header or its duplication cost
///          exceeds BBDupThreshold; true after the duplication.
///
/// NOTE(review): several statements are truncated or missing in this excerpt
/// (trailing arguments of the AddPHINodeEntriesForMappedBlock calls, the
/// removal of the old branch, application of `Updates`, closing braces);
/// verify against upstream.
bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs) {
assert(!PredBBs.empty() && "Can't handle an empty set");
// If BB is a loop header, then duplicating this block outside the loop would
// cause us to transform this into an irreducible loop, don't do this.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB)) {
LLVM_DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
<< "' into predecessor block '" << PredBBs[0]->getName()
<< "' - it might create an irreducible loop!\n");
return false;
unsigned DuplicationCost =
getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (DuplicationCost > BBDupThreshold) {
LLVM_DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
<< "' - Cost is too high: " << DuplicationCost << "\n");
return false;
// And finally, do it! Start by factoring the predecessors if needed.
std::vector<DominatorTree::UpdateType> Updates;
BasicBlock *PredBB;
if (PredBBs.size() == 1)
PredBB = PredBBs[0];
else {
LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
<< " common predecessors.\n");
PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
// The PredBB -> BB edge goes away once BB's contents live in PredBB.
Updates.push_back({DominatorTree::Delete, PredBB, BB});
// Okay, we decided to do this! Clone all the instructions in BB onto the end
// of PredBB.
LLVM_DEBUG(dbgs() << " Duplicating block '" << BB->getName()
<< "' into end of '" << PredBB->getName()
<< "' to eliminate branch on phi. Cost: "
<< DuplicationCost << " block is:" << *BB << "\n");
// Unless PredBB ends with an unconditional branch, split the edge so that we
// can just clone the bits from BB into the end of the new PredBB.
BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
BasicBlock *OldPredBB = PredBB;
// From here on PredBB names the block created on the split edge.
PredBB = SplitEdge(OldPredBB, BB);
Updates.push_back({DominatorTree::Insert, OldPredBB, PredBB});
Updates.push_back({DominatorTree::Insert, PredBB, BB});
Updates.push_back({DominatorTree::Delete, OldPredBB, BB});
OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
// We are going to have to map operands from the original BB block into the
// PredBB block. Evaluate PHI nodes in BB.
// Each PHI maps directly to its incoming value for PredBB; no PHI is cloned.
DenseMap<Instruction*, Value*> ValueMapping;
BasicBlock::iterator BI = BB->begin();
for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
// Clone the non-phi instructions of BB into PredBB, keeping track of the
// mapping and using it to remap operands in the cloned instructions.
for (; BI != BB->end(); ++BI) {
Instruction *New = BI->clone();
// Remap operands to patch up intra-block references.
for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
if (I != ValueMapping.end())
New->setOperand(i, I->second);
// If this instruction can be simplified after the operands are updated,
// just use the simplified value instead. This frequently happens due to
// phi translation.
if (Value *IV = SimplifyInstruction(
{BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) {
ValueMapping[&*BI] = IV;
// A simplified clone without side effects is not kept: New is nulled so the
// insertion step below is skipped.
if (!New->mayHaveSideEffects()) {
New = nullptr;
} else {
ValueMapping[&*BI] = New;
if (New) {
// Otherwise, insert the new instruction into the block.
// The clone goes immediately before PredBB's branch.
PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
// Update Dominance from simplified New instruction operands.
// Any basic-block operand of the clone becomes a new PredBB successor edge.
for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
if (BasicBlock *SuccBB = dyn_cast<BasicBlock>(New->getOperand(i)))
Updates.push_back({DominatorTree::Insert, PredBB, SuccBB});
// Check to see if the targets of the branch had PHI nodes. If so, we need to
// add entries to the PHI nodes for branch from PredBB now.
BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());
AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB,
AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
UpdateSSA(BB, PredBB, ValueMapping);
// PredBB no longer jumps to BB, remove entries in the PHI node for the edge
// that we nuked.
BB->removePredecessor(PredBB, true);
// Remove the unconditional branch at the end of the PredBB block.
return true;
// Pred is a predecessor of BB with an unconditional branch to BB. SI is
// a Select instruction in Pred. BB has other predecessors and SI is used in
// a PHI node in BB. SI has no other use.
// A new basic block, NewBB, is created and SI is converted to compare and
// conditional branch. SI is erased from parent.
//
// Parameters:
//   Pred  - block holding SI; its terminator is cast to BranchInst.
//   BB    - successor of Pred containing the PHI node SIUse.
//   SI    - the select being unfolded; its condition drives the new branch.
//   SIUse - the PHI node in BB that uses SI.
//   Idx   - index of the incoming entry of SIUse that is rewritten to SI's
//           false value.
//
// NOTE(review): the statement that erases SI is not visible in this excerpt;
// verify against upstream.
void JumpThreadingPass::UnfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB,
SelectInst *SI, PHINode *SIUse,
unsigned Idx) {
// Expand the select.
// Pred --
// | v
// | NewBB
// | |
// |-----
// v
// BB
BranchInst *PredTerm = cast<BranchInst>(Pred->getTerminator());
// NewBB is created in BB's function and placed before BB.
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold",
BB->getParent(), BB);
// Move the unconditional branch to NewBB.
NewBB->getInstList().insert(NewBB->end(), PredTerm);
// Create a conditional branch and update PHI nodes.
// The true edge goes to NewBB and the false edge straight to BB, so the PHI
// receives SI's false value from Pred and SI's true value from NewBB.
BranchInst::Create(NewBB, BB, SI->getCondition(), Pred);
SIUse->setIncomingValue(Idx, SI->getFalseValue());
SIUse->addIncoming(SI->getTrueValue(), NewBB);
// The select is now dead.
DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, BB},
{DominatorTree::Insert, Pred, NewBB}});
// Update any other PHI nodes in BB.
// NewBB is a new predecessor of BB; each other PHI gets the same value for
// NewBB that it already has for Pred.
for (BasicBlock::iterator BI = BB->begin();
PHINode *Phi = dyn_cast<PHINode>(BI); ++BI)
if (Phi != SIUse)
Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB);
/// Try to unfold a select that feeds the condition PHI of the switch \p SI.
///
/// The switch condition must be a PHI node located in \p BB. The first
/// incoming value that is a single-use select defined in its incoming block,
/// where that block ends in an unconditional branch, is unfolded with
/// UnfoldSelectInstr().
///
/// \returns true if a select was unfolded (the CFG changed), false otherwise.
bool JumpThreadingPass::TryToUnfoldSelect(SwitchInst *SI, BasicBlock *BB) {
  PHINode *CondPHI = dyn_cast<PHINode>(SI->getCondition());

  if (!CondPHI || CondPHI->getParent() != BB)
    return false;

  for (unsigned I = 0, E = CondPHI->getNumIncomingValues(); I != E; ++I) {
    BasicBlock *Pred = CondPHI->getIncomingBlock(I);
    SelectInst *PredSI = dyn_cast<SelectInst>(CondPHI->getIncomingValue(I));

    // The second and third condition can be potentially relaxed. Currently
    // the conditions help to simplify the code and allow us to reuse existing
    // code, developed for TryToUnfoldSelect(CmpInst *, BasicBlock *)
    if (!PredSI || PredSI->getParent() != Pred || !PredSI->hasOneUse())
      continue;

    // UnfoldSelectInstr() requires Pred to end in an unconditional branch.
    BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
    if (!PredTerm || !PredTerm->isUnconditional())
      continue;

    UnfoldSelectInstr(Pred, BB, PredSI, CondPHI, I);
    return true;
  }
  return false;
}
/// TryToUnfoldSelect - Look for blocks of the form
/// bb1:
///   %a = select
///   br bb2
///
/// bb2:
///   %p = phi [%a, %bb1] ...
///   %c = icmp %p
///   br i1 %c
///
/// And expand the select into a branch structure if one of its arms allows %c
/// to be folded. This later enables threading from bb1 over bb2.
///
/// \param CondCmp the compare feeding the terminator of \p BB; its second
///                operand is cast to a Constant without a check.
/// \param BB      the block that ends in the conditional branch.
/// \returns true if a select was unfolded, false otherwise.
bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
  BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
  PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0));
  Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1));

  if (!CondBr || !CondBr->isConditional() || !CondLHS ||
      CondLHS->getParent() != BB)
    return false;

  for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) {
    BasicBlock *Pred = CondLHS->getIncomingBlock(I);
    SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I));

    // Look if one of the incoming values is a select in the corresponding
    // predecessor.
    if (!SI || SI->getParent() != Pred || !SI->hasOneUse())
      continue;

    BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
    if (!PredTerm || !PredTerm->isUnconditional())
      continue;

    // Now check if one of the select values would allow us to constant fold the
    // terminator in BB. We don't do the transform if both sides fold, those
    // cases will be threaded in any case.
    LazyValueInfo::Tristate LHSFolds =
        LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
                                CondRHS, Pred, BB, CondCmp);
    LazyValueInfo::Tristate RHSFolds =
        LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2),
                                CondRHS, Pred, BB, CondCmp);
    if ((LHSFolds != LazyValueInfo::Unknown ||
         RHSFolds != LazyValueInfo::Unknown) &&
        LHSFolds != RHSFolds) {
      UnfoldSelectInstr(Pred, BB, SI, CondLHS, I);
      return true;
    }
  }
  return false;
}
/// TryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the
/// same BB in the form
/// bb:
///   %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
///   %s = select %p, trueval, falseval
///
/// or
///
/// bb:
///   %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ...
///   %c = cmp %p, 0
///   %s = select %c, trueval, falseval
///
/// And expand the select into a branch structure. This later enables
/// jump-threading over bb in this pass.
///
/// Using the similar approach of SimplifyCFG::FoldCondBranchOnPHI(), unfold
/// select if the associated PHI has at least one constant. If the unfolded
/// select is not jump-threaded, it will be folded again in the later
/// optimizations.
///
/// \returns true if a select in \p BB was unfolded, false otherwise.
bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
  // This transform can introduce a UB (a conditional branch that depends on a
  // poison value) that was not present in the original program. See
  // @TryToUnfoldSelectInCurrBB test in test/Transforms/JumpThreading/select.ll.
  // Disable this transform under MemorySanitizer.
  // FIXME: either delete it or replace with a valid transform. This issue is
  // not limited to MemorySanitizer (but has only been observed as an MSan false
  // positive in practice so far).
  if (BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory))
    return false;

  // If threading this would thread across a loop header, don't thread the edge.
  // See the comments above FindLoopHeaders for justifications and caveats.
  if (LoopHeaders.count(BB))
    return false;

  for (BasicBlock::iterator BI = BB->begin();
       PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
    // Look for a Phi having at least one constant incoming value.
    if (llvm::all_of(PN->incoming_values(),
                     [](Value *V) { return !isa<ConstantInt>(V); }))
      continue;

    auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) {
      // Check if SI is in BB and use V as condition.
      if (SI->getParent() != BB)
        return false;
      Value *Cond = SI->getCondition();
      return (Cond && Cond == V && Cond->getType()->isIntegerTy(1));
    };

    // Find the first select that is driven by PN, directly or via an icmp.
    SelectInst *SI = nullptr;
    for (Use &U : PN->uses()) {
      if (ICmpInst *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
        // Look for a ICmp in BB that compares PN with a constant and is the
        // condition of a Select.
        if (Cmp->getParent() == BB && Cmp->hasOneUse() &&
            isa<ConstantInt>(Cmp->getOperand(1 - U.getOperandNo())))
          if (SelectInst *SelectI = dyn_cast<SelectInst>(Cmp->user_back()))
            if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) {
              SI = SelectI;
              break;
            }
      } else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
        // Look for a Select in BB that uses PN as condition.
        if (isUnfoldCandidate(SelectI, U.get())) {
          SI = SelectI;
          break;
        }
      }
    }

    if (!SI)
      continue;

    // Expand the select.
    Instruction *Term =
        SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
    BasicBlock *SplitBB = SI->getParent();
    BasicBlock *NewBB = Term->getParent();
    PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
    NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
    NewPN->addIncoming(SI->getFalseValue(), BB);
    // The new PHI takes over from the select, which is then dead.
    SI->replaceAllUsesWith(NewPN);
    SI->eraseFromParent();

    // NewBB and SplitBB are newly created blocks which require insertion.
    std::vector<DominatorTree::UpdateType> Updates;
    Updates.reserve((2 * SplitBB->getTerminator()->getNumSuccessors()) + 3);
    Updates.push_back({DominatorTree::Insert, BB, SplitBB});
    Updates.push_back({DominatorTree::Insert, BB, NewBB});
    Updates.push_back({DominatorTree::Insert, NewBB, SplitBB});
    // BB's successors were moved to SplitBB, update DTU accordingly.
    for (auto *Succ : successors(SplitBB)) {
      Updates.push_back({DominatorTree::Delete, BB, Succ});
      Updates.push_back({DominatorTree::Insert, SplitBB, Succ});
    }
    DTU->applyUpdatesPermissive(Updates);
    return true;
  }
  return false;
}
/// Try to propagate a guard from the current BB into one of its predecessors
/// in case if another branch of execution implies that the condition of this
/// guard is always true. Currently we only process the simplest case that
/// looks like:
///
/// Start:
///   %cond = ...
///   br i1 %cond, label %T1, label %F1
/// T1:
///   br label %Merge
/// F1:
///   br label %Merge
/// Merge:
///   %condGuard = ...
///   call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
///
/// And cond either implies condGuard or !condGuard. In this case all the
/// instructions before the guard can be duplicated in both branches, and the
/// guard is then threaded to one of them.
///
/// \returns true if a guard in \p BB was threaded, false otherwise.
bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
  using namespace PatternMatch;

  // We only want to deal with two predecessors.
  BasicBlock *Pred1, *Pred2;
  auto PI = pred_begin(BB), PE = pred_end(BB);
  if (PI == PE)
    return false;
  Pred1 = *PI++;
  if (PI == PE)
    return false;
  Pred2 = *PI++;
  if (PI != PE)
    return false;
  // Both incoming edges must come from different blocks.
  if (Pred1 == Pred2)
    return false;

  // Try to thread one of the guards of the block.
  // TODO: Look up deeper than to immediate predecessor?
  auto *Parent = Pred1->getSinglePredecessor();
  if (!Parent || Parent != Pred2->getSinglePredecessor())
    return false;

  if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
    for (auto &I : *BB)
      if (isGuard(&I) && ThreadGuard(BB, cast<IntrinsicInst>(&I), BI))
        return true;

  return false;
}
/// Try to propagate the guard from BB which is the lower block of a diamond
/// to one of its branches, in case if diamond's condition implies guard's
/// condition.
///
/// \param BB    the merge block that contains \p Guard.
/// \param Guard the guard intrinsic call to thread.
/// \param BI    the conditional branch at the top of the diamond.
/// \returns true if the guard was threaded, false if neither branch implies
///          the guard condition or the duplication cost exceeds
///          BBDupThreshold.
bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
                                    BranchInst *BI) {
  assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
  assert(BI->isConditional() && "Unconditional branch has 2 successors?");
  Value *GuardCond = Guard->getArgOperand(0);
  Value *BranchCond = BI->getCondition();
  BasicBlock *TrueDest = BI->getSuccessor(0);
  BasicBlock *FalseDest = BI->getSuccessor(1);

  auto &DL = BB->getModule()->getDataLayout();
  bool TrueDestIsSafe = false;
  bool FalseDestIsSafe = false;

  // True dest is safe if BranchCond => GuardCond.
  auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
  if (Impl && *Impl)
    TrueDestIsSafe = true;
  else {
    // False dest is safe if !BranchCond => GuardCond.
    Impl = isImpliedCondition(BranchCond, GuardCond, DL, /* LHSIsTrue */ false);
    if (Impl && *Impl)
      FalseDestIsSafe = true;
  }

  if (!TrueDestIsSafe && !FalseDestIsSafe)
    return false;

  BasicBlock *PredUnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
  BasicBlock *PredGuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;

  ValueToValueMapTy UnguardedMapping, GuardedMapping;
  Instruction *AfterGuard = Guard->getNextNode();
  unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
  if (Cost > BBDupThreshold)
    return false;
  // Duplicate all instructions before the guard and the guard itself to the
  // branch where implication is not proved.
  BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween(
      BB, PredGuardedBlock, AfterGuard, GuardedMapping, *DTU);
  assert(GuardedBlock && "Could not create the guarded block?");
  // Duplicate all instructions before the guard in the unguarded branch.
  // Since we have successfully duplicated the guarded block and this block
  // has fewer instructions, we expect it to succeed.
  BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween(
      BB, PredUnguardedBlock, Guard, UnguardedMapping, *DTU);
  assert(UnguardedBlock && "Could not create the unguarded block?");
  LLVM_DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
                    << GuardedBlock->getName() << "\n");
  // Some instructions before the guard may still have uses. For them, we need
  // to create Phi nodes merging their copies in both guarded and unguarded
  // branches. Those instructions that have no uses can be just removed.
  SmallVector<Instruction *, 4> ToRemove;
  for (auto It = BB->begin(); &*It != AfterGuard; ++It)
    if (!isa<PHINode>(&*It))
      ToRemove.push_back(&*It);

  Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
  assert(InsertionPoint && "Empty block?");
  // Substitute with Phis & remove. Walk in reverse so that users are handled
  // before the instructions they use.
  for (auto *Inst : reverse(ToRemove)) {
    if (!Inst->use_empty()) {
      PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
      NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
      NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
      NewPN->insertBefore(InsertionPoint);
      Inst->replaceAllUsesWith(NewPN);
    }
    Inst->eraseFromParent();
  }
  return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5bc35aa4695f..f950d0d4eb2b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1,7636 +1,7645 @@
//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
// Names used for this pass: SV_NAME is the "slp-vectorizer" string and
// DEBUG_TYPE is the "SLP" tag.
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
// Statistic counter for the number of vector instructions generated.
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
// -vectorize-slp: whether to run the SLP vectorization passes (default true).
// Unlike the options below this one is not declared static.
cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
cl::desc("Run the SLP vectorization passes"));
// -slp-threshold: only vectorize if the gain is more than this number
// (default 0).
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
cl::desc("Only vectorize if you gain more than this "
"number "));
// -slp-vectorize-hor: attempt to vectorize horizontal reductions
// (default true).
static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
cl::desc("Attempt to vectorize horizontal reductions"));
// -slp-vectorize-hor-store: attempt to vectorize horizontal reductions that
// feed into a store (default false).
static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));
// -slp-max-reg-size: register size in bits to vectorize for (default 128).
// When given on the command line it overrides the width reported by the
// target (see the BoUpSLP constructor).
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
// -slp-max-store-lookup: maximum depth of the lookup for consecutive stores
// (default 32).
static cl::opt<int>
MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
cl::desc("Maximum depth of the lookup for consecutive stores."));
/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
cl::desc("Limit the size of the SLP scheduling region per block"));
// -slp-min-reg-size: minimum register size in bits (default 128). When given
// on the command line it overrides the minimum width reported by the target
// (see the BoUpSLP constructor).
static cl::opt<int> MinVectorRegSizeOption(
"slp-min-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
// -slp-recursion-max-depth: limit on the recursion depth when building a
// vectorizable tree (default 12).
static cl::opt<unsigned> RecursionMaxDepth(
"slp-recursion-max-depth", cl::init(12), cl::Hidden,
cl::desc("Limit the recursion depth when building a vectorizable tree"));
// -slp-min-tree-size: trees smaller than this are only vectorized if they are
// fully vectorizable (default 3).
static cl::opt<unsigned> MinTreeSize(
"slp-min-tree-size", cl::init(3), cl::Hidden,
cl::desc("Only vectorize small trees if they are fully vectorizable"));
// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
"slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
cl::desc("The maximum look-ahead depth for operand reordering scores"));
// The Look-ahead heuristic goes through the users of the bundle to calculate
// the users cost in getExternalUsesCost(). To avoid compilation time increase
// we limit the number of users visited to this value.
static cl::opt<unsigned> LookAheadUsersBudget(
"slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
cl::desc("The maximum number of users to visit while visiting the "
"predecessors. This prevents compilation time increase."));
// -view-slp-tree: display the SLP trees with Graphviz. No initial value is
// given here.
static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));
// Upper bound on the number of alias checks. The value was picked so that
// the llvm benchmarks are not negatively affected.
static constexpr unsigned AliasedCheckLimit = 10;

// A second bound for the alias checks: the largest distance between
// load/store instructions for which alias checks are still done.
// Useful for very large basic blocks.
static constexpr unsigned MaxMemDepDistance = 160;

/// Once the ScheduleRegionSizeBudget is exhausted, small scheduling regions
/// are still allowed to be handled.
static constexpr int MinScheduleRegionSize = 16;
/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
///
/// \returns true if \p Ty may be used as a vector element type by this pass.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise. Also returns false if any element of \p VL is not an
/// Instruction. \p VL must not be empty, since its first element is read
/// unconditionally.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  BasicBlock *BB = I0->getParent();
  for (unsigned Idx = 1, Size = VL.size(); Idx < Size; ++Idx) {
    auto *I = dyn_cast<Instruction>(VL[Idx]);
    if (!I || I->getParent() != BB)
      return false;
  }
  return true;
}
/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions). An empty list yields true.
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal integer/FP
  // constants.
  for (Value *V : VL)
    if (!isa<Constant>(V) || isa<ConstantExpr>(V) || isa<GlobalValue>(V))
      return false;
  return true;
}
/// \returns True if all of the values in \p VL are identical. Lists with
/// fewer than two elements trivially yield true.
static bool isSplat(ArrayRef<Value *> VL) {
  for (unsigned i = 1, e = VL.size(); i < e; ++i)
    if (VL[i] != VL[0])
      return false;
  return true;
}
/// \returns True if \p I is commutative, handles CmpInst as well as Instruction.
static bool isCommutative(Instruction *I) {
  // Compares are asked through CmpInst::isCommutative().
  if (auto *IC = dyn_cast<CmpInst>(I))
    return IC->isCommutative();
  return I->isCommutative();
}
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
/// i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// We convert this initially to something like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
/// %5 = mul <4 x i8> %4, %4
/// %6 = extractelement <4 x i8> %5, i32 0
/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
/// %7 = extractelement <4 x i8> %5, i32 1
/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
/// %8 = extractelement <4 x i8> %5, i32 2
/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
/// %9 = extractelement <4 x i8> %5, i32 3
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul
/// TODO: Can we split off and reuse the shuffle mask detection from
/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
  // Every element of VL is cast<> to ExtractElementInst without a check.
  auto *EI0 = cast<ExtractElementInst>(VL[0]);
  unsigned Size = EI0->getVectorOperandType()->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = cast<ExtractElementInst>(VL[I]);
    auto *Vec = EI->getVectorOperand();
    // All vector operands must have the same number of vector elements.
    if (cast<VectorType>(Vec->getType())->getNumElements() != Size)
      return None;
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    if (!Idx)
      return None;
    // Undefined behavior if Idx is negative or >= Size.
    if (Idx->getValue().uge(Size))
      continue;
    unsigned IntIdx = Idx->getValue().getZExtValue();
    // We can extractelement from undef vector.
    if (isa<UndefValue>(Vec))
      continue;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec)
      Vec1 = Vec;
    else if (!Vec2 || Vec2 == Vec)
      Vec2 = Vec;
    else
      return None;
    // Once a permutation has been seen the mode cannot change any more.
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (IntIdx != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector, otherwise
  // we have permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
namespace {
/// Main data required for vectorization of instructions.
struct InstructionsState {
/// The very first instruction in the list with the main opcode.
Value *OpValue = nullptr;
/// The main/alternate instruction.
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
/// The main/alternate opcodes for the list of instructions.
unsigned getOpcode() const {
return MainOp ? MainOp->getOpcode() : 0;
unsigned getAltOpcode() const {
return AltOp ? AltOp->getOpcode() : 0;
/// Some of the instructions in the list have alternate opcodes.
bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
bool isOpcodeOrAlt(Instruction *I) const {
unsigned CheckedOpcode = I->getOpcode();
return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
InstructionsState() = delete;
InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
: OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
} // end anonymous namespace
/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
/// OpValue.
///
/// \param S  state that carries OpValue and the main/alternate opcodes.
/// \param Op candidate key; it need not be an Instruction.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
    return Op;
  return S.OpValue;
}
/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// Example of unsupported opcode is SDIV that can potentially cause UB if the
/// "shuffled out" lane would result in division by zero.
static bool isValidForAlternation(unsigned Opcode) {
  if (Instruction::isIntDivRem(Opcode))
    return false;

  return true;
}
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
///
/// \param VL        the values to analyse.
/// \param BaseIndex index into \p VL of the element that supplies the main
///                  opcode.
///
/// The returned state has null MainOp/AltOp when \p VL contains a
/// non-instruction or an opcode that fits neither the main nor the single
/// permitted alternate opcode.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       unsigned BaseIndex = 0) {
  // Make sure these are all Instructions.
  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      // The first differing opcode becomes the alternate one, if allowed.
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
      // Casts may only alternate when they convert from the same source type.
      Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
      Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
      continue;
    // Anything that reaches this point does not fit the main/alternate pair.
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise. \p VL must not be empty, since its first element is read
/// unconditionally.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return false;

  return true;
}
/// \returns the index extracted by the Extract{Value,Element} instruction
/// \p E, or None if it cannot be determined: the extractelement index is not
/// a ConstantInt, or the extractvalue does not have exactly one index.
static Optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return None;
    return CI->getZExtValue();
  }
  ExtractValueInst *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return None;
  return *EI->idx_begin();
}
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
///
/// \param Scalar   the scalar value whose use is examined.
/// \param UserInst the in-tree user of \p Scalar.
/// \param TLI      passed to getVectorIntrinsicIDForCall() for call users.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    // True when Scalar is the load's pointer operand.
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    // True when Scalar is the store's pointer operand.
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    // The first argument reported as a scalar operand by
    // hasVectorInstrinsicScalarOpd() decides the result.
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
      if (hasVectorInstrinsicScalarOpd(ID, i))
        return (CI->getArgOperand(i) == Scalar);
    }
    LLVM_FALLTHROUGH;
  }
  default:
    return false;
  }
}
/// \returns the AA location that is being accessed by the instruction.
/// A default-constructed MemoryLocation is returned for anything that is
/// neither a store nor a load. \p AA is not used by the body.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}
/// \returns True if the instruction is not a volatile or atomic load/store.
/// Memory intrinsics count as simple when they are not volatile; every other
/// instruction is considered simple.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}
namespace llvm {
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
struct TreeEntry;
struct ScheduleData;
using ValueList = SmallVector<Value *, 8>;
using InstrList = SmallVector<Instruction *, 16>;
using ValueSet = SmallPtrSet<Value *, 16>;
using StoreList = SmallVector<StoreInst *, 8>;
using ExtraValueToDebugLocsMap =
MapVector<Value *, SmallVector<Instruction *, 2>>;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
const DataLayout *DL, OptimizationRemarkEmitter *ORE)
: F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
// TODO: It would be better to limit the vectorization factor based on
// data type rather than just register size. For example, x86 AVX has
// 256-bit registers, but it does not support integer operations
// at that width (that requires AVX2).
if (MaxVectorRegSizeOption.getNumOccurrences())
MaxVecRegSize = MaxVectorRegSizeOption;
MaxVecRegSize = TTI->getRegisterBitWidth(true);
if (MinVectorRegSizeOption.getNumOccurrences())
MinVecRegSize = MinVectorRegSizeOption;
MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
/// Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
/// Vectorize the tree but with the list of externally used values \p
/// ExternallyUsedValues. Values in this MapVector can be replaced but the
/// generated extractvalue instructions.
Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
int getSpillCost() const;
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
int getTreeCost();
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
/// into account (and updating it, if required) list of externally used
/// values stored in \p ExternallyUsedValues.
void buildTree(ArrayRef<Value *> Roots,
ExtraValueToDebugLocsMap &ExternallyUsedValues,
ArrayRef<Value *> UserIgnoreLst = None);
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
NumOpsWantToKeepOriginalOrder = 0;
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
unsigned getTreeSize() const { return VectorizableTree.size(); }
/// Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
/// \returns The best order of instructions for vectorization.
Optional<ArrayRef<unsigned>> bestOrder() const {
// Find the entry of NumOpsWantToKeepOrder with the largest count (the
// `second` member of each key/count pair).
auto I = std::max_element(
NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
[](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
return D1.second < D2.second;
// No candidate at all, or the best candidate's count does not exceed
// NumOpsWantToKeepOriginalOrder: report that there is no better order.
if (I == NumOpsWantToKeepOrder.end() ||
I->getSecond() <= NumOpsWantToKeepOriginalOrder)
return None;
// Otherwise hand back a view of the winning order (the map key).
return makeArrayRef(I->getFirst());
/// \return The vector element size in bits to use when vectorizing the
/// expression tree ending at \p V. If V is a store, the size is the width of
/// the stored value. Otherwise, the size is the width of the largest loaded
/// value reaching V. This method is used by the vectorizer to calculate
/// vectorization factors.
unsigned getVectorElementSize(Value *V);
/// Compute the minimum type sizes required to represent the entries in a
/// vectorizable tree.
void computeMinimumValueSizes();
// \returns maximum vector register size as set by TTI or overridden by cl::opt.
unsigned getMaxVecRegSize() const {
return MaxVecRegSize;
// \returns minimum vector register size as set by TTI or overridden by cl::opt.
unsigned getMinVecRegSize() const {
return MinVecRegSize;
/// Check if homogeneous aggregate is isomorphic to some VectorType.
/// Accepts homogeneous multidimensional aggregate of scalars/vectors like
/// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
/// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
/// \returns True if the VectorizableTree is both tiny and not fully
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable() const;
/// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
/// can be load combined in the backend. Load combining may not be allowed in
/// the IR optimizer, so we do not want to alter the pattern. For example,
/// partially transforming a scalar bswap() pattern into vector code is
/// effectively impossible for the backend to undo.
/// TODO: If load combining is allowed in the IR optimizer, this analysis
/// may not be necessary.
bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
/// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
/// can be load combined in the backend. Load combining may not be allowed in
/// the IR optimizer, so we do not want to alter the pattern. For example,
/// partially transforming a scalar bswap() pattern into vector code is
/// effectively impossible for the backend to undo.
/// TODO: If load combining is allowed in the IR optimizer, this analysis
/// may not be necessary.
bool isLoadCombineCandidate() const;
OptimizationRemarkEmitter *getORE() { return ORE; }
/// This structure holds any data we need about the edges being traversed
/// during buildTree_rec(). We keep track of:
/// (i) the user TreeEntry index, and
/// (ii) the index of the edge.
struct EdgeInfo {
/// Default state: no user entry (nullptr) and EdgeIdx set to UINT_MAX.
EdgeInfo() = default;
/// Creates an edge that refers to operand \p EdgeIdx of the user entry
/// \p UserTE.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
: UserTE(UserTE), EdgeIdx(EdgeIdx) {}
/// The user TreeEntry.
TreeEntry *UserTE = nullptr;
/// The operand index of the use.
unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
/// Stream-insertion operator, only compiled when NDEBUG is not defined.
/// \returns \p OS so that insertions can be chained.
friend inline raw_ostream &operator<<(raw_ostream &OS,
const BoUpSLP::EdgeInfo &EI) {
return OS;
/// Debug print: writes "{User:<index or null> EdgeIdx:<index>}" to \p OS.
void dump(raw_ostream &OS) const {
OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
<< " EdgeIdx:" << EdgeIdx << "}";
/// Debug print to dbgs().
LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
/// A helper data structure to hold the operands of a vector of instructions.
/// This supports a fixed vector length for all operand vectors.
class VLOperands {
/// For each operand we need (i) the value, and (ii) the opcode that it
/// would be attached to if the expression was in a left-linearized form.
/// This is required to avoid illegal operand reordering.
/// For example:
/// \verbatim
/// 0 Op1
/// |/
/// Op1 Op2 Linearized + Op2
/// \ / ----------> |/
/// - -
/// Op1 - Op2 (0 + Op1) - Op2
/// \endverbatim
/// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
/// Another way to think of this is to track all the operations across the
/// path from the operand all the way to the root of the tree and to
/// calculate the operation that corresponds to this path. For example, the
/// path from Op2 to the root crosses the RHS of the '-', therefore the
/// corresponding operation is a '-' (which matches the one in the
/// linearized tree, as shown above).
/// For lack of a better term, we refer to this operation as Accumulated
/// Path Operation (APO).
struct OperandData {
OperandData() = default;
OperandData(Value *V, bool APO, bool IsUsed)
: V(V), APO(APO), IsUsed(IsUsed) {}
/// The operand value.
Value *V = nullptr;
/// TreeEntries only allow a single opcode, or an alternate sequence of
/// them (e.g, +, -). Therefore, we can safely use a boolean value for the
/// APO. It is set to 'true' if 'V' is attached to an inverse operation
/// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
/// (e.g., Add/Mul)
bool APO = false;
/// Helper data for the reordering function.
bool IsUsed = false;
/// During operand reordering, we are trying to select the operand at lane
/// that matches best with the operand at the neighboring lane. Our
/// selection is based on the type of value we are looking for. For example,
/// if the neighboring lane has a load, we need to look for a load that is
/// accessing a consecutive address. These strategies are summarized in the
/// 'ReorderingMode' enumerator.
enum class ReorderingMode {
Load, ///< Matching loads to consecutive memory addresses
Opcode, ///< Matching instructions based on opcode (same or alternate)
Constant, ///< Matching constants
Splat, ///< Matching the same instruction multiple times (broadcast)
Failed, ///< We failed to create a vectorizable group
using OperandDataVec = SmallVector<OperandData, 2>;
/// A vector of operand vectors.
SmallVector<OperandDataVec, 4> OpsVec;
const DataLayout &DL;
ScalarEvolution &SE;
const BoUpSLP &R;
/// \returns the operand data at \p OpIdx and \p Lane.
OperandData &getData(unsigned OpIdx, unsigned Lane) {
return OpsVec[OpIdx][Lane];
/// \returns the operand data at \p OpIdx and \p Lane. Const version.
const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
return OpsVec[OpIdx][Lane];
/// Clears the used flag for all entries.
void clearUsed() {
// Visit every (operand index, lane) slot of OpsVec and reset its IsUsed
// marker so the slot can be selected again.
for (unsigned OpIdx = 0, NumOperands = getNumOperands();
OpIdx != NumOperands; ++OpIdx)
for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
OpsVec[OpIdx][Lane].IsUsed = false;
/// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
// The hard-coded scores listed here are not very important. When computing
// the scores of matching one sub-tree with another, we are basically
// counting the number of values that are matching. So even if all scores
// are set to 1, we would still get a decent matching result.
// However, sometimes we have to break ties. For example we may have to
// choose between matching loads vs matching opcodes. This is what these
// scores are helping us with: they provide the order of preference.
/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreConsecutiveLoads = 3;
/// ExtractElementInst from same vector and consecutive indexes.
static const int ScoreConsecutiveExtracts = 3;
/// Constants.
static const int ScoreConstants = 2;
/// Instructions with the same opcode.
static const int ScoreSameOpcode = 2;
/// Instructions with alt opcodes (e.g, add + sub).
static const int ScoreAltOpcodes = 1;
/// Identical instructions (a.k.a. splat or broadcast).
static const int ScoreSplat = 1;
/// Matching with an undef is preferable to failing.
static const int ScoreUndef = 1;
/// Score for failing to find a decent match.
static const int ScoreFail = 0;
/// User external to the vectorized code.
static const int ExternalUseCost = 1;
/// The user is internal but in a different lane.
static const int UserInDiffLaneCost = ExternalUseCost;
/// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
ScalarEvolution &SE) {
// Two loads: they only score if they access consecutive addresses.
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
if (LI1 && LI2)
return isConsecutiveAccess(LI1, LI2, DL, SE)
? VLOperands::ScoreConsecutiveLoads
: VLOperands::ScoreFail;
// Two constants always match, regardless of their values.
auto *C1 = dyn_cast<Constant>(V1);
auto *C2 = dyn_cast<Constant>(V2);
if (C1 && C2)
return VLOperands::ScoreConstants;
// Extracts from consecutive indexes of the same vector better score as
// the extracts could be optimized away.
Value *EV;
ConstantInt *Ex1Idx, *Ex2Idx;
// m_Deferred(EV) requires V2 to extract from the same vector that V1 bound
// to EV; the index of V2 must be exactly one past the index of V1.
if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
return VLOperands::ScoreConsecutiveExtracts;
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
if (I1 && I2) {
// The very same instruction in both lanes is a splat.
if (I1 == I2)
return VLOperands::ScoreSplat;
InstructionsState S = getSameOpcode({I1, I2});
// Note: Only consider instructions with <= 2 operands to avoid
// complexity explosion.
if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
: VLOperands::ScoreSameOpcode;
// Only the second value is tested for undef, so this case is not symmetric
// in V1 and V2.
if (isa<UndefValue>(V2))
return VLOperands::ScoreUndef;
// Nothing matched.
return VLOperands::ScoreFail;
/// Holds the values and their lane that are taking part in the look-ahead
/// score calculation. This is used in the external uses cost calculation.
SmallDenseMap<Value *, int> InLookAheadValues;
/// \Returns the additional cost due to uses of \p LHS and \p RHS that are
/// either external to the vectorized code, or require shuffling.
int getExternalUsesCost(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS) {
// Each argument is a (value, lane) pair. The cost is accumulated over the
// users of both values.
int Cost = 0;
std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
Value *V = Values[Idx].first;
// Calculate the absolute lane, using the minimum relative lane of LHS
// and RHS as base and Idx as the offset.
int Ln = std::min(LHS.second, RHS.second) + Idx;
assert(Ln >= 0 && "Bad lane calculation");
// The budget is reset for each of the two values.
unsigned UsersBudget = LookAheadUsersBudget;
for (User *U : V->users()) {
if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
// The user is in the VectorizableTree. Check if we need to insert.
auto It = llvm::find(UserTE->Scalars, U);
assert(It != UserTE->Scalars.end() && "U is in UserTE");
// The position of U inside its tree entry is the lane it occupies.
int UserLn = std::distance(UserTE->Scalars.begin(), It);
assert(UserLn >= 0 && "Bad lane");
if (UserLn != Ln)
Cost += UserInDiffLaneCost;
} else {
// Check if the user is in the look-ahead code.
auto It2 = InLookAheadValues.find(U);
if (It2 != InLookAheadValues.end()) {
// The user is in the look-ahead code. Check the lane.
if (It2->second != Ln)
Cost += UserInDiffLaneCost;
} else {
// The user is neither in SLP tree nor in the look-ahead code.
Cost += ExternalUseCost;
// Limit the number of visited uses to cap compilation time.
// When the budget is exhausted, the cost accumulated so far is returned
// immediately.
if (--UsersBudget == 0)
return Cost;
/// Go through the operands of \p LHS and \p RHS recursively until \p
/// MaxLevel, and return the cumulative score. For example:
/// \verbatim
/// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
/// \ / \ / \ / \ /
/// + + + +
/// G1 G2 G3 G4
/// \endverbatim
/// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
/// each level recursively, accumulating the score. It starts from matching
/// the additions at level 0, then moves on to the loads (level 1). The
/// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
/// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
/// {A[0],C[0]} has a score of VLOperands::ScoreFail.
/// Please note that the order of the operands does not matter, as we
/// evaluate the score of all profitable combinations of operands. In
/// other words the score of G1 and G4 is the same as G1 and G2. This
/// heuristic is based on ideas described in:
/// Look-ahead SLP: Auto-vectorization in the presence of commutative
/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
/// Luís F. W. Góes
int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS, int CurrLevel,
int MaxLevel) {
// LHS and RHS are (value, lane) pairs.
Value *V1 = LHS.first;
Value *V2 = RHS.first;
// Get the shallow score of V1 and V2.
// The external-use cost is subtracted from it and the result is clamped so
// that it never drops below ScoreFail.
int ShallowScoreAtThisLevel =
std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
getExternalUsesCost(LHS, RHS));
int Lane1 = LHS.second;
int Lane2 = RHS.second;
// If reached MaxLevel,
// or if V1 and V2 are not instructions,
// or if they are SPLAT,
// or if they are not consecutive, early return the current cost.
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
// The last clause also stops at a pair of loads that already scored, so the
// recursion does not descend into the operands of matched loads.
if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
(isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
return ShallowScoreAtThisLevel;
assert(I1 && I2 && "Should have early exited.");
// Keep track of in-tree values for determining the external-use cost.
InLookAheadValues[V1] = Lane1;
InLookAheadValues[V2] = Lane2;
// Contains the I2 operand indexes that got matched with I1 operands.
SmallSet<unsigned, 4> Op2Used;
// Recursion towards the operands of I1 and I2. We are trying all possible
// operand pairs, and keeping track of the best score.
for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
OpIdx1 != NumOperands1; ++OpIdx1) {
// Try to pair operand OpIdx1 of I1 with the best operand of I2.
int MaxTmpScore = 0;
unsigned MaxOpIdx2 = 0;
bool FoundBest = false;
// If I2 is commutative try all combinations.
// Otherwise only the operand at the same index (if I2 has one) is tried.
unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
unsigned ToIdx = isCommutative(I2)
? I2->getNumOperands()
: std::min(I2->getNumOperands(), OpIdx1 + 1);
assert(FromIdx <= ToIdx && "Bad index");
for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
// Skip operands already paired with OpIdx1.
if (Op2Used.count(OpIdx2))
// Recursively calculate the cost at each level
int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
{I2->getOperand(OpIdx2), Lane2},
CurrLevel + 1, MaxLevel);
// Look for the best score.
if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
MaxTmpScore = TmpScore;
MaxOpIdx2 = OpIdx2;
FoundBest = true;
if (FoundBest) {
// Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
// NOTE(review): no statement recording MaxOpIdx2 in Op2Used is visible in
// this excerpt -- confirm against upstream.
ShallowScoreAtThisLevel += MaxTmpScore;
// The score of this level plus the best scores of the matched operand pairs.
return ShallowScoreAtThisLevel;
/// \Returns the look-ahead score, which tells us how much the sub-trees
/// rooted at \p LHS and \p RHS match, the more they match the higher the
/// score. This helps break ties in an informed way when we cannot decide on
/// the order of the operands by just considering the immediate
/// predecessors.
int getLookAheadScore(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS) {
// Start the recursive scoring at level 1; the recursion depth is bounded by
// LookAheadMaxDepth.
return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
// Search all operands in Ops[*][Lane] for the one that matches best
// Ops[OpIdx][LastLane] and return its operand index.
// If no good match can be found, return None.
// NOTE(review): the return-type line and several `continue`/`break`
// statements are not visible in this excerpt; the comments below describe the
// intent stated by the surrounding comments -- confirm against upstream.
getBestOperand(unsigned OpIdx, int Lane, int LastLane,
ArrayRef<ReorderingMode> ReorderingModes) {
unsigned NumOperands = getNumOperands();
// The operand of the previous lane at OpIdx.
Value *OpLastLane = getData(OpIdx, LastLane).V;
// Our strategy mode for OpIdx.
ReorderingMode RMode = ReorderingModes[OpIdx];
// The linearized opcode of the operand at OpIdx, Lane.
bool OpIdxAPO = getData(OpIdx, Lane).APO;
// The best operand index and its score.
// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
// are using the score to differentiate between the two.
struct BestOpData {
Optional<unsigned> Idx = None;
unsigned Score = 0;
} BestOp;
// Iterate through all unused operands and look for the best.
for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
// Get the operand at Idx and Lane.
OperandData &OpData = getData(Idx, Lane);
Value *Op = OpData.V;
bool OpAPO = OpData.APO;
// Skip already selected operands.
if (OpData.IsUsed)
// Skip if we are trying to move the operand to a position with a
// different opcode in the linearized tree form. This would break the
// semantics.
if (OpAPO != OpIdxAPO)
// Look for an operand that matches the current mode.
switch (RMode) {
case ReorderingMode::Load:
case ReorderingMode::Constant:
case ReorderingMode::Opcode: {
// These three modes share one implementation: score the candidate against
// the operand of the previously visited lane, passing the pair to the
// look-ahead scorer in left-to-right lane order.
bool LeftToRight = Lane > LastLane;
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
unsigned Score =
getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
// Keep the candidate only if it strictly improves on the best score so far.
if (Score > BestOp.Score) {
BestOp.Idx = Idx;
BestOp.Score = Score;
case ReorderingMode::Splat:
// A splat needs the very same value as in the previous lane.
if (Op == OpLastLane)
BestOp.Idx = Idx;
case ReorderingMode::Failed:
return None;
// Mark the chosen operand as used so that it is not selected again for
// another operand index of this lane.
if (BestOp.Idx) {
getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
return BestOp.Idx;
// If we could not find a good match return None.
return None;
/// Helper for reorderOperandVecs. \Returns the lane that we should start
/// reordering from. This is the one which has the least number of operands
/// that can freely move about.
unsigned getBestLaneToStartReordering() const {
// Linear scan for the lane with the smallest value of
// getMaxNumOperandsThatCanBeReordered(). The comparison is strict, so the
// first such lane wins on ties; lane 0 is the initial choice.
unsigned BestLane = 0;
unsigned Min = UINT_MAX;
for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
++Lane) {
unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
if (NumFreeOps < Min) {
Min = NumFreeOps;
BestLane = Lane;
return BestLane;
/// \Returns the maximum number of operands that are allowed to be reordered
/// for \p Lane. This is used as a heuristic for selecting the first lane to
/// start operand reordering.
unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
unsigned CntTrue = 0;
unsigned NumOperands = getNumOperands();
// Operands with the same APO can be reordered. We therefore need to count
// how many of them we have for each APO, like this: Cnt[APO] = x.
// Since we only have two APOs, namely true and false, we can avoid using
// a map. Instead we can simply count the number of operands that
// correspond to one of them (in this case the 'true' APO), and calculate
// the other by subtracting it from the total number of operands.
// NOTE(review): the statement that increments CntTrue inside the `if` below
// is not visible in this excerpt -- confirm against upstream.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
if (getData(OpIdx, Lane).APO)
unsigned CntFalse = NumOperands - CntTrue;
// The larger of the two groups is the number of operands that may be
// reordered among themselves.
return std::max(CntTrue, CntFalse);
/// Go through the instructions in VL and append their operands.
void appendOperandsOfVL(ArrayRef<Value *> VL) {
// Preconditions: VL is non-empty, has as many lanes as any data already
// stored, and starts with an instruction.
assert(!VL.empty() && "Bad VL");
assert((empty() || VL.size() == getNumLanes()) &&
"Expected same number of lanes");
assert(isa<Instruction>(VL[0]) && "Expected instruction");
// The operand count is taken from the first instruction of VL.
unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
unsigned NumLanes = VL.size();
// NOTE(review): no statements sizing OpsVec or its per-operand vectors are
// visible in this excerpt before the indexed writes below -- confirm against
// upstream.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
// Our tree has just 3 nodes: the root and two operands.
// It is therefore trivial to get the APO. We only need to check the
// opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
// RHS operand. The LHS operand of both add and sub is never attached
// to an inverse operation in the linearized form, therefore its APO
// is false. The RHS is true only if VL[Lane] is an inverse operation.
// Since operand reordering is performed on groups of commutative
// operations or alternating sequences (e.g., +, -), we can safely
// tell the inverse operations by checking commutativity.
bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
bool APO = (OpIdx == 0) ? false : IsInverseOperation;
// Store the operand with its APO; the IsUsed flag starts out false.
OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
APO, false};
/// \returns the number of operands.
unsigned getNumOperands() const { return OpsVec.size(); }
/// \returns the number of lanes.
unsigned getNumLanes() const { return OpsVec[0].size(); }
/// \returns the operand value at \p OpIdx and \p Lane.
Value *getValue(unsigned OpIdx, unsigned Lane) const {
return getData(OpIdx, Lane).V;
/// \returns true if the data structure is empty.
bool empty() const { return OpsVec.empty(); }
/// Clears the data.
void clear() { OpsVec.clear(); }
/// \Returns true if there are enough operands identical to \p Op to fill
/// the whole vector.
/// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
// Candidates in other lanes must have the same APO as the operand at
// (OpIdx, Lane).
bool OpAPO = getData(OpIdx, Lane).APO;
for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
// The starting lane itself does not need a candidate.
// NOTE(review): the `continue`/`break` statements of this loop nest are not
// visible in this excerpt -- confirm against upstream.
if (Ln == Lane)
// This is set to true if we found a candidate for broadcast at Lane.
bool FoundCandidate = false;
for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
OperandData &Data = getData(OpI, Ln);
// Ignore operands with a different APO and operands already claimed.
if (Data.APO != OpAPO || Data.IsUsed)
if (Data.V == Op) {
// Claim the matching operand by marking it as used.
FoundCandidate = true;
Data.IsUsed = true;
// A single lane without a matching operand rules out the broadcast.
if (!FoundCandidate)
return false;
return true;
/// Initialize with all the operands of the instruction vector \p RootVL.
VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
ScalarEvolution &SE, const BoUpSLP &R)
: DL(DL), SE(SE), R(R) {
// Append all the operands of RootVL.
/// \Returns a value vector with the operands across all lanes for the
/// operand at \p OpIdx.
ValueList getVL(unsigned OpIdx) const {
// Allocate one slot per lane of the requested operand.
ValueList OpVL(OpsVec[OpIdx].size());
assert(OpsVec[OpIdx].size() == getNumLanes() &&
"Expected same num of lanes across all operands");
// Copy only the value of each lane; APO and IsUsed are not part of the result.
for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
OpVL[Lane] = OpsVec[OpIdx][Lane].V;
return OpVL;
// Performs operand reordering for 2 or more operands.
// The original operands are in OrigOps[OpIdx][Lane].
// The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
// NOTE(review): several `else`, `continue` and `break` lines, as well as the
// call that resets the used flags at the start of each pass, are not visible
// in this excerpt. The comments describe the intended flow -- confirm against
// upstream.
void reorder() {
unsigned NumOperands = getNumOperands();
unsigned NumLanes = getNumLanes();
// Each operand has its own mode. We are using this mode to help us select
// the instructions for each lane, so that they match best with the ones
// we have selected so far.
SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
// This is a greedy single-pass algorithm. We are going over each lane
// once and deciding on the best order right away with no back-tracking.
// However, in order to increase its effectiveness, we start with the lane
// that has operands that can move the least. For example, given the
// following lanes:
// Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
// Lane 1 : A[1] = C[1] - B[1] // Visited 1st
// Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
// Lane 3 : A[3] = C[3] - B[3] // Visited 4th
// we will start at Lane 1, since the operands of the subtraction cannot
// be reordered. Then we will visit the rest of the lanes in a circular
// fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
// Find the first lane that we will start our search from.
unsigned FirstLane = getBestLaneToStartReordering();
// Initialize the modes.
// The mode of each operand index is derived from the kind of value found at
// that index in FirstLane.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
Value *OpLane0 = getValue(OpIdx, FirstLane);
// Keep track if we have instructions with all the same opcode on one
// side.
if (isa<LoadInst>(OpLane0))
ReorderingModes[OpIdx] = ReorderingMode::Load;
else if (isa<Instruction>(OpLane0)) {
// Check if OpLane0 should be broadcast.
if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
ReorderingModes[OpIdx] = ReorderingMode::Splat;
ReorderingModes[OpIdx] = ReorderingMode::Opcode;
else if (isa<Constant>(OpLane0))
ReorderingModes[OpIdx] = ReorderingMode::Constant;
else if (isa<Argument>(OpLane0))
// Our best hope is a Splat. It may save some cost in some cases.
ReorderingModes[OpIdx] = ReorderingMode::Splat;
// NOTE: This should be unreachable.
ReorderingModes[OpIdx] = ReorderingMode::Failed;
// If the initial strategy fails for any of the operand indexes, then we
// perform reordering again in a second pass. This helps avoid assigning
// high priority to the failed strategy, and should improve reordering for
// the non-failed operand indexes.
for (int Pass = 0; Pass != 2; ++Pass) {
// Skip the second pass if the first pass did not fail.
bool StrategyFailed = false;
// Mark all operand data as free to use.
// We keep the original operand order for the FirstLane, so reorder the
// rest of the lanes. We are visiting the nodes in a circular fashion,
// using FirstLane as the center point and increasing the radius
// distance.
for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
// Visit the lane on the right and then the lane on the left.
for (int Direction : {+1, -1}) {
int Lane = FirstLane + Direction * Distance;
// Lanes that fall outside [0, NumLanes) do not exist on this side.
if (Lane < 0 || Lane >= (int)NumLanes)
// The neighbouring lane one step closer to FirstLane (or FirstLane itself
// when Distance is 1).
int LastLane = Lane - Direction;
assert(LastLane >= 0 && LastLane < (int)NumLanes &&
"Out of bounds");
// Look for a good match for each operand.
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
// Search for the operand that best matches the one already placed at
// (OpIdx, LastLane).
Optional<unsigned> BestIdx =
getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
// By not selecting a value, we allow the operands that follow to
// select a better matching value. We will get a non-null value in
// the next run of getBestOperand().
if (BestIdx) {
// Swap the current operand with the one returned by
// getBestOperand().
swap(OpIdx, BestIdx.getValue(), Lane);
} else {
// We failed to find a best operand, set mode to 'Failed'.
ReorderingModes[OpIdx] = ReorderingMode::Failed;
// Enable the second pass.
StrategyFailed = true;
// Skip second pass if the strategy did not fail.
if (!StrategyFailed)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \returns the name of \p RMode as a string; each enumerator maps to its
/// own identifier.
LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
switch (RMode) {
case ReorderingMode::Load:
return "Load";
case ReorderingMode::Opcode:
return "Opcode";
case ReorderingMode::Constant:
return "Constant";
case ReorderingMode::Splat:
return "Splat";
case ReorderingMode::Failed:
return "Failed";
// Every enumerator returns above, so reaching this point means RMode holds
// a value with no case.
llvm_unreachable("Unimplemented Reordering Type");
LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
raw_ostream &OS) {
return OS << getModeStr(RMode);
/// Debug print.
LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
printMode(RMode, dbgs());
friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
return printMode(RMode, OS);
/// Debug print: writes one "Operand <n>" section per operand index to \p OS,
/// followed by one "{<value or null>, APO:<flag>}" line per lane.
/// \returns \p OS.
// NOTE(review): the `else` that pairs the "null" output with the value check
// below is not visible in this excerpt -- confirm against upstream.
LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
const unsigned Indent = 2;
unsigned Cnt = 0;
for (const OperandDataVec &OpDataVec : OpsVec) {
OS << "Operand " << Cnt++ << "\n";
for (const OperandData &OpData : OpDataVec) {
// Each lane entry is indented by Indent columns.
OS.indent(Indent) << "{";
if (Value *V = OpData.V)
OS << *V;
OS << "null";
OS << ", APO:" << OpData.APO << "}\n";
OS << "\n";
return OS;
/// Debug print.
LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
/// Checks if the instruction is marked for deletion.
bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
/// Marks values operands for later deletion by replacing them with Undefs.
void eraseInstructions(ArrayRef<Value *> AV);
/// Checks if all users of \p I are the part of the vectorization tree.
bool areAllUsersVectorized(Instruction *I) const;
/// \returns the cost of the vectorizable entry.
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
const EdgeInfo &EI);
/// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a
/// vector) and sets \p CurrentOrder to the identity permutation; otherwise
/// returns false, setting \p CurrentOrder to either an empty vector or a
/// non-identity permutation that allows to reuse extract instructions.
bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
SmallVectorImpl<unsigned> &CurrentOrder) const;
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
/// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL);
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
int getGatherCost(VectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const;
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
int getGatherCost(ArrayRef<Value *> VL) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
void setInsertPointAfterBundle(TreeEntry *E);
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even if the tree height is tiny.
bool isFullyVectorizableTinyTree() const;
/// Reorder commutative or alt operands to get better probability of
/// generating vectorized code.
static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
const DataLayout &DL,
ScalarEvolution &SE,
const BoUpSLP &R);
struct TreeEntry {
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
TreeEntry(VecTreeTy &Container) : Container(Container) {}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
// Same length as Scalars: compare the two lists element by element.
if (VL.size() == Scalars.size())
return std::equal(VL.begin(), VL.end(), Scalars.begin());
// Otherwise VL must have one element per reuse-shuffle index, and each
// element must equal the scalar selected by the corresponding index.
return VL.size() == ReuseShuffleIndices.size() &&
VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
[this](Value *V, int Idx) { return V == Scalars[Idx]; });
/// A vector of scalars.
ValueList Scalars;
/// The Scalars are vectorized into this value. It is initialized to Null.
Value *VectorizedValue = nullptr;
/// Do we need to gather this sequence ?
enum EntryState { Vectorize, NeedToGather };
EntryState State;
/// Does this sequence require some shuffling?
SmallVector<int, 4> ReuseShuffleIndices;
/// Does this entry require reordering?
ArrayRef<unsigned> ReorderIndices;
/// Points back to the VectorizableTree.
/// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
/// to be a pointer and needs to be able to initialize the child iterator.
/// Thus we need a reference back to the container to translate the indices
/// to entries.
VecTreeTy &Container;
/// The TreeEntry index containing the user of this entry. We can actually
/// have multiple users so the data structure is not truly a tree.
SmallVector<EdgeInfo, 1> UserTreeIndices;
/// The index of this treeEntry in VectorizableTree.
int Idx = -1;
/// The operands of each instruction in each lane Operands[op_index][lane].
/// Note: This helps avoid the replication of the code that performs the
/// reordering of operands during buildTree_rec() and vectorizeTree().
SmallVector<ValueList, 2> Operands;
/// The main/alternate instruction.
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
/// Set this bundle's \p OpIdx'th operand to \p OpVL.
///
/// Grows Operands when \p OpIdx is past its end. One value per lane is copied,
/// i.e. the first Scalars.size() elements of \p OpVL.
void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
  if (Operands.size() < OpIdx + 1)
    Operands.resize(OpIdx + 1);
  assert(Operands[OpIdx].size() == 0 && "Already resized?");
  // The slot is empty at this point, so size it to one element per lane
  // before writing through operator[].
  Operands[OpIdx].resize(Scalars.size());
  for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
    Operands[OpIdx][Lane] = OpVL[Lane];
}
/// Set the operands of this bundle in their original order.
///
/// Operands[OpIdx][Lane] receives operand OpIdx of the instruction in lane
/// Lane. Every scalar must be an Instruction with the same operand count as
/// the first one.
void setOperandsInOrder() {
  assert(Operands.empty() && "Already initialized?");
  auto *I0 = cast<Instruction>(Scalars[0]);
  const unsigned NumOperands = I0->getNumOperands();
  const unsigned NumLanes = Scalars.size();
  // Operands is empty here: size both dimensions before indexing into them.
  Operands.resize(NumOperands);
  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
    Operands[OpIdx].resize(NumLanes);
    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
      auto *I = cast<Instruction>(Scalars[Lane]);
      assert(I->getNumOperands() == NumOperands &&
             "Expected same number of operands");
      Operands[OpIdx][Lane] = I->getOperand(OpIdx);
    }
  }
}
/// \returns the \p OpIdx operand of this TreeEntry (one value per lane).
/// \p OpIdx must be below getNumOperands().
ValueList &getOperand(unsigned OpIdx) {
  assert(OpIdx < Operands.size() && "Off bounds");
  return Operands[OpIdx];
}
/// \returns how many operand lists are currently stored in Operands.
unsigned getNumOperands() const {
  const unsigned Count = Operands.size();
  return Count;
}
/// \return the lane-0 value of operand \p OpIdx.
/// The operand must exist and must not be empty.
Value *getSingleOperand(unsigned OpIdx) const {
  assert(OpIdx < Operands.size() && "Off bounds");
  assert(!Operands[OpIdx].empty() && "No operand available");
  return Operands[OpIdx][0];
}
/// Some of the instructions in the list have alternate opcodes.
/// \returns true when the main and alternate opcodes differ.
bool isAltShuffle() const {
  return getOpcode() != getAltOpcode();
}
/// \returns true if the opcode of \p I equals this entry's main opcode or its
/// alternate opcode.
bool isOpcodeOrAlt(Instruction *I) const {
  const unsigned CheckedOpcode = I->getOpcode();
  return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
}
/// Chooses the correct key for scheduling data. If \p Op is an instruction
/// with the same (or alternate) opcode as this entry, the key is \p Op.
/// Otherwise the key is MainOp.
Value *isOneOf(Value *Op) const {
  auto *I = dyn_cast<Instruction>(Op);
  if (I && isOpcodeOrAlt(I))
    return Op;
  return MainOp;
}
/// Copies the main and alternate instruction from \p S into this entry.
void setOperations(const InstructionsState &S) {
  MainOp = S.MainOp;
  AltOp = S.AltOp;
}
/// \returns the main instruction (may be null).
Instruction *getMainOp() const {
  return MainOp;
}
/// \returns the alternate instruction (may be null).
Instruction *getAltOp() const {
  return AltOp;
}
/// The main/alternate opcodes for the list of instructions.
/// \returns the opcode of MainOp, or 0 when MainOp is null.
unsigned getOpcode() const {
  return MainOp ? MainOp->getOpcode() : 0;
}
/// \returns the opcode of AltOp, or 0 when AltOp is null.
unsigned getAltOpcode() const {
  return AltOp ? AltOp->getOpcode() : 0;
}
/// Update operations state of this entry if reorder occurred.
bool updateStateIfReorder() {
if (ReorderIndices.empty())
return false;
InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
return true;
#ifndef NDEBUG
/// Debug printer: writes the index, operands, scalars, state, main/alternate
/// instruction, vectorized value and the index lists of this entry to dbgs().
LLVM_DUMP_METHOD void dump() const {
  dbgs() << Idx << ".\n";
  for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
    dbgs() << "Operand " << OpI << ":\n";
    for (const Value *V : Operands[OpI])
      dbgs().indent(2) << *V << "\n";
  }
  dbgs() << "Scalars: \n";
  for (Value *V : Scalars)
    dbgs().indent(2) << *V << "\n";
  dbgs() << "State: ";
  switch (State) {
  case Vectorize:
    dbgs() << "Vectorize\n";
    break; // do not fall through and also print the other state
  case NeedToGather:
    dbgs() << "NeedToGather\n";
    break;
  }
  // For each optional pointer print either the value or "NULL", never both.
  dbgs() << "MainOp: ";
  if (MainOp)
    dbgs() << *MainOp << "\n";
  else
    dbgs() << "NULL\n";
  dbgs() << "AltOp: ";
  if (AltOp)
    dbgs() << *AltOp << "\n";
  else
    dbgs() << "NULL\n";
  dbgs() << "VectorizedValue: ";
  if (VectorizedValue)
    dbgs() << *VectorizedValue << "\n";
  else
    dbgs() << "NULL\n";
  dbgs() << "ReuseShuffleIndices: ";
  if (ReuseShuffleIndices.empty())
    dbgs() << "Emtpy";
  else
    for (unsigned ReuseIdx : ReuseShuffleIndices)
      dbgs() << ReuseIdx << ", ";
  dbgs() << "\n";
  dbgs() << "ReorderIndices: ";
  for (unsigned ReorderIdx : ReorderIndices)
    dbgs() << ReorderIdx << ", ";
  dbgs() << "\n";
  dbgs() << "UserTreeIndices: ";
  for (const auto &EInfo : UserTreeIndices)
    dbgs() << EInfo << ", ";
  dbgs() << "\n";
}
#endif
/// Create a new VectorizableTree entry.
///
/// \p VL becomes the entry's scalars. A present \p Bundle marks the entry as
/// vectorized; otherwise it is a gather entry.
/// \returns the entry that was filled in.
///
/// NOTE(review): several statements appear to be missing from this listing
/// (see the inline notes) — confirm against the upstream source before relying
/// on the behavior described here.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
ArrayRef<unsigned> ReuseShuffleIndices = None,
ArrayRef<unsigned> ReorderIndices = None) {
bool Vectorized = (bool)Bundle;
// NOTE(review): nothing visible appends a fresh entry to VectorizableTree
// before back() is taken — TODO confirm.
TreeEntry *Last = VectorizableTree.back().get();
Last->Idx = VectorizableTree.size() - 1;
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
// NOTE(review): the ReuseShuffleIndices and S parameters are not used in the
// visible body — TODO confirm.
Last->ReorderIndices = ReorderIndices;
if (Vectorized) {
// Register every scalar of the bundle as belonging to this entry.
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
ScalarToTreeEntry[VL[i]] = Last;
// Update the scheduler bundle to point to this TreeEntry.
unsigned Lane = 0;
for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
BundleMember = BundleMember->NextInBundle) {
BundleMember->TE = Last;
BundleMember->Lane = Lane;
// NOTE(review): Lane is not incremented in the visible loop although the
// assert below compares it with VL.size() — TODO confirm.
assert((!Bundle.getValue() || Lane == VL.size()) &&
"Bundle and VL out of sync");
} else {
// Gathered scalars are recorded so they stay scalar.
MustGather.insert(VL.begin(), VL.end());
// NOTE(review): presumably UserTreeIdx is recorded in Last->UserTreeIndices
// when it has a user entry; as listed the condition guards the return.
if (UserTreeIdx.UserTE)
return Last;
/// -- Vectorization State --
/// Holds all of the tree entries.
TreeEntry::VecTreeTy VectorizableTree;
#ifndef NDEBUG
/// Debug printer: dumps every entry of VectorizableTree in index order, each
/// followed by a blank line.
LLVM_DUMP_METHOD void dumpVectorizableTree() const {
  for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
    VectorizableTree[Id]->dump();
    dbgs() << "\n";
  }
}
#endif
/// \returns the tree entry registered for scalar \p V in ScalarToTreeEntry,
/// or nullptr when there is none.
TreeEntry *getTreeEntry(Value *V) {
  auto It = ScalarToTreeEntry.find(V);
  return It != ScalarToTreeEntry.end() ? It->second : nullptr;
}
/// Const overload: \returns the tree entry registered for scalar \p V, or
/// nullptr when there is none.
const TreeEntry *getTreeEntry(Value *V) const {
  auto It = ScalarToTreeEntry.find(V);
  return It != ScalarToTreeEntry.end() ? It->second : nullptr;
}
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
/// Maps a value to the proposed vectorizable size.
SmallDenseMap<Value *, unsigned> InstrElementSize;
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
/// This plain record describes one external user in the vectorized tree.
struct ExternalUser {
  ExternalUser(Value *S, llvm::User *U, int L)
      : Scalar(S), User(U), Lane(L) {}

  // Which scalar in our function.
  Value *Scalar;

  // Which user that uses the scalar.
  llvm::User *User;

  // Which lane does the scalar belong to.
  int Lane;
};
using UserList = SmallVector<ExternalUser, 16>;
/// Checks if two instructions may access the same memory.
///
/// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
/// is invariant in the calling loop. Results are memoized in AliasCache,
/// keyed on the (Inst1, Inst2) pair.
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
               Instruction *Inst2) {
  // First check if the result is already in the cache.
  AliasCacheKey key = std::make_pair(Inst1, Inst2);
  Optional<bool> &result = AliasCache[key];
  if (result.hasValue()) {
    return result.getValue();
  }
  MemoryLocation Loc2 = getLocation(Inst2, AA);
  // Stay conservative: report aliasing unless both locations are known and
  // both instructions are simple, in which case the alias analysis is asked.
  bool aliased = true;
  if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
    // Do the alias check.
    aliased = AA->alias(Loc1, Loc2);
  }
  // Store the result in the cache.
  result = aliased;
  return aliased;
}
using AliasCacheKey = std::pair<Instruction *, Instruction *>;
/// Cache for alias results.
/// TODO: consider moving this to the AliasAnalysis itself.
DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
/// Removes an instruction from its block and eventually deletes it.
/// It's like Instruction::eraseFromParent() except that the actual deletion
/// is delayed until BoUpSLP is destructed.
/// This is required to ensure that there are no incorrect collisions in the
/// AliasCache, which can happen if a new instruction is allocated at the
/// same address as a previously deleted instruction.
void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
  auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
  // Repeated requests AND their flags together: the stored flag stays true
  // only if every request for \p I asked for the replacement.
  It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
}
/// Temporary store for deleted instructions. Instructions will be deleted
/// eventually when the BoUpSLP is destructed.
DenseMap<Instruction *, bool> DeletedInstructions;
/// A list of values that need to be extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User). External User
/// can be nullptr, it means that this Internal Scalar will be used later,
/// after vectorization.
UserList ExternalUses;
/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
/// Contains all scheduling relevant data for an instruction.
/// A ScheduleData either represents a single instruction or a member of an
/// instruction bundle (= a group of instructions which is combined into a
/// vector instruction).
struct ScheduleData {
// The initial value for the dependency counters. It means that the
// dependencies are not calculated yet.
enum { InvalidDeps = -1 };
ScheduleData() = default;
/// (Re)initializes this ScheduleData as a stand-alone, unscheduled entry that
/// belongs to region \p BlockSchedulingRegionID and is keyed on \p OpVal.
/// NOTE(review): Dependencies/UnscheduledDeps are not reset in the visible
/// body — confirm whether a clearDependencies() call is missing here.
void init(int BlockSchedulingRegionID, Value *OpVal) {
// A fresh entry is its own bundle head and has no bundle successor.
FirstInBundle = this;
NextInBundle = nullptr;
NextLoadStore = nullptr;
IsScheduled = false;
SchedulingRegionID = BlockSchedulingRegionID;
UnscheduledDepsInBundle = UnscheduledDeps;
OpValue = OpVal;
// No tree entry / lane is associated yet.
TE = nullptr;
Lane = -1;
/// Returns true once Dependencies no longer holds the InvalidDeps marker,
/// i.e. the dependency information has been calculated.
bool hasValidDependencies() const {
  const bool NotCalculated = (Dependencies == InvalidDeps);
  return !NotCalculated;
}
/// Returns true for single instructions and for bundle representatives
/// (= the head of a bundle).
bool isSchedulingEntity() const { return FirstInBundle == this; }
/// Returns true if it represents an instruction bundle and not only a
/// single instruction, i.e. it has a successor in the bundle or is not the
/// bundle head itself.
bool isPartOfBundle() const {
  return NextInBundle != nullptr || FirstInBundle != this;
}
/// Returns true if it is ready for scheduling, i.e. it has no more
/// unscheduled depending instructions/bundles and is not scheduled yet.
/// Must only be called on a scheduling entity (bundle head).
bool isReady() const {
  assert(isSchedulingEntity() &&
         "can't consider non-scheduling entity for ready list");
  return UnscheduledDepsInBundle == 0 && !IsScheduled;
}
/// Modifies the number of unscheduled dependencies, also updating it for
/// the whole bundle.
/// \returns the bundle head's updated UnscheduledDepsInBundle.
int incrementUnscheduledDeps(int Incr) {
  UnscheduledDeps += Incr;
  return FirstInBundle->UnscheduledDepsInBundle += Incr;
}
/// Sets the number of unscheduled dependencies to the number of
/// dependencies. The difference is applied through
/// incrementUnscheduledDeps() so the bundle total is adjusted as well.
void resetUnscheduledDeps() {
  incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
}
/// Clears all dependency information.
/// NOTE(review): only Dependencies is reset in the visible body;
/// UnscheduledDeps and MemoryDependencies are left untouched — confirm
/// whether statements are missing from this listing.
void clearDependencies() {
// Mark the dependency count as not calculated.
Dependencies = InvalidDeps;
/// Prints this schedule data to \p os: "/ <inst>" for a non-head bundle
/// member, "[<inst>;<inst>...]" for the head of a bundle, and just the
/// instruction for a single instruction.
void dump(raw_ostream &os) const {
  if (!isSchedulingEntity()) {
    os << "/ " << *Inst;
  } else if (NextInBundle) {
    os << '[' << *Inst;
    // Append the remaining bundle members, separated by ';'.
    ScheduleData *SD = NextInBundle;
    while (SD) {
      os << ';' << *SD->Inst;
      SD = SD->NextInBundle;
    }
    os << ']';
  } else {
    os << *Inst;
  }
}
Instruction *Inst = nullptr;
/// Points to the head in an instruction bundle (and always to this for
/// single instructions).
ScheduleData *FirstInBundle = nullptr;
/// Single linked list of all instructions in a bundle. Null if it is a
/// single instruction.
ScheduleData *NextInBundle = nullptr;
/// Single linked list of all memory instructions (e.g. load, store, call)
/// in the block - until the end of the scheduling region.
ScheduleData *NextLoadStore = nullptr;
/// The dependent memory instructions.
/// This list is derived on demand in calculateDependencies().
SmallVector<ScheduleData *, 4> MemoryDependencies;
/// This ScheduleData is in the current scheduling region if this matches
/// the current SchedulingRegionID of BlockScheduling.
int SchedulingRegionID = 0;
/// Used for getting a "good" final ordering of instructions.
int SchedulingPriority = 0;
/// The number of dependencies. Consists of the number of users of the
/// instruction plus the number of dependent memory instructions (if any).
/// This value is calculated on demand.
/// If InvalidDeps, the number of dependencies is not calculated yet.
int Dependencies = InvalidDeps;
/// The number of dependencies minus the number of dependencies of scheduled
/// instructions. As soon as this is zero, the instruction/bundle gets ready
/// for scheduling.
/// Note that this is negative as long as Dependencies is not calculated.
int UnscheduledDeps = InvalidDeps;
/// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
/// single instructions.
int UnscheduledDepsInBundle = InvalidDeps;
/// True if this instruction is scheduled (or considered as scheduled in the
/// dry-run).
bool IsScheduled = false;
/// Opcode of the current instruction in the schedule data.
Value *OpValue = nullptr;
/// The TreeEntry that this instruction corresponds to.
TreeEntry *TE = nullptr;
/// The lane of this node in the TreeEntry.
int Lane = -1;
#ifndef NDEBUG
/// Streams \p SD to \p os via ScheduleData::dump (debug builds only).
friend inline raw_ostream &operator<<(raw_ostream &os,
                                      const BoUpSLP::ScheduleData &SD) {
  SD.dump(os);
  return os;
}
#endif
friend struct GraphTraits<BoUpSLP *>;
friend struct DOTGraphTraits<BoUpSLP *>;
/// Contains all scheduling data for a basic block.
struct BlockScheduling {
BlockScheduling(BasicBlock *BB)
: BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
void clear() {
ScheduleStart = nullptr;
ScheduleEnd = nullptr;
FirstLoadStoreInRegion = nullptr;
LastLoadStoreInRegion = nullptr;
// Reduce the maximum schedule region size by the size of the
// previous scheduling run.
ScheduleRegionSizeLimit -= ScheduleRegionSize;
if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
ScheduleRegionSizeLimit = MinScheduleRegionSize;
ScheduleRegionSize = 0;
// Make a new scheduling region, i.e. all existing ScheduleData is not
// in the new region yet.
/// \returns the ScheduleData attached to \p V if it belongs to the current
/// scheduling region, nullptr otherwise.
ScheduleData *getScheduleData(Value *V) {
  ScheduleData *SD = ScheduleDataMap[V];
  if (SD && SD->SchedulingRegionID == SchedulingRegionID)
    return SD;
  return nullptr;
}
/// \returns the ScheduleData attached to \p V under the leading key \p Key if
/// it belongs to the current scheduling region, nullptr otherwise. When
/// \p V is its own key the single-argument lookup is used.
ScheduleData *getScheduleData(Value *V, Value *Key) {
  if (V == Key)
    return getScheduleData(V);
  auto I = ExtraScheduleDataMap.find(V);
  if (I != ExtraScheduleDataMap.end()) {
    ScheduleData *SD = I->second[Key];
    if (SD && SD->SchedulingRegionID == SchedulingRegionID)
      return SD;
  }
  return nullptr;
}
/// \returns true if \p SD carries the ID of the current scheduling region.
bool isInSchedulingRegion(ScheduleData *SD) const {
  return SD->SchedulingRegionID == SchedulingRegionID;
}
/// Marks an instruction as scheduled and puts all dependent ready
/// instructions into the ready-list.
///
/// NOTE(review): this listing looks incomplete — closing braces, the
/// LLVM_DEBUG(dbgs() openers of the two "gets ready" messages, and the
/// statements that use DepBundle and DecrUnsched do not appear. Confirm
/// against the upstream source before changing anything here.
template <typename ReadyListType>
void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
SD->IsScheduled = true;
LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
// Walk the bundle through its NextInBundle links, starting at SD.
ScheduleData *BundleMember = SD;
while (BundleMember) {
// NOTE(review): presumably members whose Inst differs from OpValue are
// skipped here (a `continue` seems to be missing) — TODO confirm.
if (BundleMember->Inst != BundleMember->OpValue) {
BundleMember = BundleMember->NextInBundle;
// Handle the def-use chain dependencies.
// Decrement the unscheduled counter and insert to ready list if ready.
auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
if (OpDef && OpDef->hasValidDependencies() &&
OpDef->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after
// decrementing, so we can put the dependent instruction
// into the ready list.
ScheduleData *DepBundle = OpDef->FirstInBundle;
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
<< "SLP: gets ready (def): " << *DepBundle << "\n");
// If BundleMember is a vector bundle, its operands may have been
// reordered during buildTree(). We therefore need to get its operands
// through the TreeEntry.
if (TreeEntry *TE = BundleMember->TE) {
int Lane = BundleMember->Lane;
assert(Lane >= 0 && "Lane not set");
// Since vectorization tree is being built recursively this assertion
// ensures that the tree entry has all operands set before reaching
// this code. Couple of exceptions known at the moment are extracts
// where their second (immediate) operand is not added. Since
// immediates do not affect scheduler behavior this is considered
// okay.
auto *In = TE->getMainOp();
assert(In &&
(isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
In->getNumOperands() == TE->getNumOperands()) &&
"Missed TreeEntry operands?");
(void)In; // fake use to avoid build failure when assertions disabled
// Visit this member's lane of every operand stored in the tree entry.
for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
OpIdx != NumOperands; ++OpIdx)
if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
} else {
// If BundleMember is a stand-alone instruction, no operand reordering
// has taken place, so we directly access its operands.
for (Use &U : BundleMember->Inst->operands())
if (auto *I = dyn_cast<Instruction>(U.get()))
// Handle the memory dependencies.
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
<< "SLP: gets ready (mem): " << *DepBundle << "\n");
// Advance to the next member of the bundle.
BundleMember = BundleMember->NextInBundle;
/// Invokes \p Action on every ScheduleData attached to \p V that belongs to
/// the current scheduling region: first the primary one (if any), then each
/// one stored under an extra leading key.
void doForAllOpcodes(Value *V,
                     function_ref<void(ScheduleData *SD)> Action) {
  if (ScheduleData *SD = getScheduleData(V))
    Action(SD);
  auto I = ExtraScheduleDataMap.find(V);
  if (I != ExtraScheduleDataMap.end())
    for (auto &P : I->second)
      if (P.second->SchedulingRegionID == SchedulingRegionID)
        Action(P.second);
}
/// Put all instructions into the ReadyList which are ready for scheduling.
template <typename ReadyListType>
void initialFillReadyList(ReadyListType &ReadyList) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
doForAllOpcodes(I, [&](ScheduleData *SD) {
if (SD->isSchedulingEntity() && SD->isReady()) {
<< "SLP: initially in ready list: " << *I << "\n");
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
/// \returns the scheduling bundle. The returned Optional value is non-None
/// if \p VL is allowed to be scheduled.
Optional<ScheduleData *>
tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S);
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
/// Allocates schedule data chunk.
ScheduleData *allocateScheduleDataChunks();
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
bool extendSchedulingRegion(Value *V, const InstructionsState &S);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
void initScheduleData(Instruction *FromI, Instruction *ToI,
ScheduleData *PrevLoadStore,
ScheduleData *NextLoadStore);
/// Updates the dependency information of a bundle and of all instructions/
/// bundles which depend on the original bundle.
void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
/// Sets all instruction in the scheduling region to un-scheduled.
void resetSchedule();
BasicBlock *BB;
/// Simple memory allocation for ScheduleData.
std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
/// The size of a ScheduleData array in ScheduleDataChunks.
int ChunkSize;
/// The allocator position in the current chunk, which is the last entry
/// of ScheduleDataChunks.
int ChunkPos;
/// Attaches ScheduleData to Instruction.
/// Note that the mapping survives during all vectorization iterations, i.e.
/// ScheduleData structures are recycled.
DenseMap<Value *, ScheduleData *> ScheduleDataMap;
/// Attaches ScheduleData to Instruction with the leading key: the outer map
/// is indexed by the value, the inner map by the key.
DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
    ExtraScheduleDataMap;
/// Ready-list backed by a SmallVector; insert() appends at the end so the
/// type offers the insert() interface that schedule() and friends expect.
struct ReadyList : SmallVector<ScheduleData *, 8> {
  void insert(ScheduleData *SD) { push_back(SD); }
};
/// The ready-list for scheduling (only used for the dry-run).
ReadyList ReadyInsts;
/// The first instruction of the scheduling region.
Instruction *ScheduleStart = nullptr;
/// The first instruction _after_ the scheduling region.
Instruction *ScheduleEnd = nullptr;
/// The first memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *FirstLoadStoreInRegion = nullptr;
/// The last memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *LastLoadStoreInRegion = nullptr;
/// The current size of the scheduling region.
int ScheduleRegionSize = 0;
/// The maximum size allowed for the scheduling region.
int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
// Make sure that the initial SchedulingRegionID is greater than the
// initial SchedulingRegionID in ScheduleData (which is 0).
int SchedulingRegionID = 1;
/// Attaches the BlockScheduling structures to basic blocks.
MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
/// Performs the "real" scheduling. Done before vectorization is actually
/// performed in a basic block.
void scheduleBlock(BlockScheduling *BS);
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
using OrdersType = SmallVector<unsigned, 4>;
/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
/// sorted SmallVectors of unsigned.
struct OrdersTypeDenseMapInfo {
static OrdersType getEmptyKey() {
OrdersType V;
return V;
static OrdersType getTombstoneKey() {
OrdersType V;
return V;
static unsigned getHashValue(const OrdersType &V) {
return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
return LHS == RHS;
/// Contains orders of operations along with the number of bundles that have
/// operations in this order. It stores only those orders that require
/// reordering, if reordering is not required it is counted using \a
/// NumOpsWantToKeepOriginalOrder.
DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
/// Number of bundles that do not require reordering.
unsigned NumOpsWantToKeepOriginalOrder = 0;
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
LoopInfo *LI;
DominatorTree *DT;
AssumptionCache *AC;
DemandedBits *DB;
const DataLayout *DL;
OptimizationRemarkEmitter *ORE;
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;
/// A map of scalar integer values to the smallest bit width with which they
/// can legally be represented. The values map to (width, signed) pairs,
/// where "width" indicates the minimum bit width and "signed" is True if the
/// value must be signed-extended, rather than zero-extended, back to its
/// original width.
MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
} // end namespace slpvectorizer
template <> struct GraphTraits<BoUpSLP *> {
using TreeEntry = BoUpSLP::TreeEntry;
/// NodeRef has to be a pointer per the GraphWriter.
using NodeRef = TreeEntry *;
using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
/// Add the VectorizableTree to the index iterator to be able to return
/// TreeEntry pointers.
struct ChildIteratorType
: public iterator_adaptor_base<
ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
ContainerTy &VectorizableTree;
ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
ContainerTy &VT)
: ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
NodeRef operator*() { return I->UserTE; }
static NodeRef getEntryNode(BoUpSLP &R) {
return R.VectorizableTree[0].get();
static ChildIteratorType child_begin(NodeRef N) {
return {N->UserTreeIndices.begin(), N->Container};
static ChildIteratorType child_end(NodeRef N) {
return {N->UserTreeIndices.end(), N->Container};
/// For the node iterator we just need to turn the TreeEntry iterator into a
/// TreeEntry* iterator so that it dereferences to NodeRef.
class nodes_iterator {
using ItTy = ContainerTy::iterator;
ItTy It;
nodes_iterator(const ItTy &It2) : It(It2) {}
NodeRef operator*() { return It->get(); }
nodes_iterator operator++() {
return *this;
bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
static nodes_iterator nodes_begin(BoUpSLP *R) {
return nodes_iterator(R->VectorizableTree.begin());
static nodes_iterator nodes_end(BoUpSLP *R) {
return nodes_iterator(R->VectorizableTree.end());
static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
using TreeEntry = BoUpSLP::TreeEntry;
DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
std::string Str;
raw_string_ostream OS(Str);
if (isSplat(Entry->Scalars)) {
OS << "<splat> " << *Entry->Scalars[0];
return Str;
for (auto V : Entry->Scalars) {
OS << *V;
if (std::any_of(
R->ExternalUses.begin(), R->ExternalUses.end(),
[&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
OS << " <extract>";
OS << "\n";
return Str;
static std::string getNodeAttributes(const TreeEntry *Entry,
const BoUpSLP *) {
if (Entry->State == TreeEntry::NeedToGather)
return "color=red";
return "";
} // end namespace llvm
/// Destructor: processes the instructions collected in DeletedInstructions.
/// NOTE(review): this listing looks incomplete — Undef is computed but never
/// used and no statement actually erases an instruction, although the second
/// loop asserts that each one is unused. Confirm against the upstream source.
BoUpSLP::~BoUpSLP() {
// First pass over the (instruction, flag) pairs.
for (const auto &Pair : DeletedInstructions) {
// Replace operands of ignored instructions with Undefs in case if they were
// marked for deletion.
if (Pair.getSecond()) {
Value *Undef = UndefValue::get(Pair.getFirst()->getType());
// Second pass: every collected instruction must have no users left.
for (const auto &Pair : DeletedInstructions) {
assert(Pair.getFirst()->use_empty() &&
"trying to erase instruction with users.");
// Debug-build sanity check of the function after the cleanup.
assert(!verifyFunction(*F, &dbgs()));
/// Queues every Instruction in \p AV for delayed deletion via
/// eraseInstruction(), requesting the replace-with-undef handling. Values
/// that are not instructions are ignored.
void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
  for (auto *V : AV) {
    if (auto *I = dyn_cast<Instruction>(V))
      eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
  }
}
/// Convenience overload: builds the tree for \p Roots with an empty set of
/// externally used values.
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        ArrayRef<Value *> UserIgnoreLst) {
  ExtraValueToDebugLocsMap ExternallyUsedValues;
  buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
}
/// Builds the vectorizable tree for \p Roots and then collects, per lane of
/// every non-gather entry, the scalars that have to be extracted because they
/// are used outside the tree (recorded in ExternalUses).
///
/// NOTE(review): this listing looks incomplete — closing braces and the
/// `return`/`continue` statements after several `if`s do not appear. Confirm
/// against the upstream source before changing anything here.
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ExtraValueToDebugLocsMap &ExternallyUsedValues,
ArrayRef<Value *> UserIgnoreLst) {
UserIgnoreList = UserIgnoreLst;
// NOTE(review): presumably bails out when the roots have mixed types — the
// statement guarded by this check is not visible.
if (!allSameType(Roots))
buildTree_rec(Roots, 0, EdgeInfo());
// Collect the values that we need to extract from the tree.
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
if (Entry->State == TreeEntry::NeedToGather)
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
int FoundLane = Lane;
// NOTE(review): with reuse indices the lane is looked up in
// ReuseShuffleIndices; the start of this expression is not visible.
if (!Entry->ReuseShuffleIndices.empty()) {
FoundLane =
llvm::find(Entry->ReuseShuffleIndices, FoundLane));
// Check if the scalar is externally used as an extra arg.
auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
<< Lane << " from " << *Scalar << ".\n");
// A null user marks a scalar that is needed after vectorization.
ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
for (User *U : Scalar->users()) {
LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
// Skip in-tree scalars that become vectors
if (TreeEntry *UseEntry = getTreeEntry(U)) {
Value *UseScalar = UseEntry->Scalars[0];
// Some in-tree scalars will remain as scalar in vectorized
// instructions. If that is the case, the one in Lane 0 will
// be used.
if (UseScalar != U ||
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
// Ignore users in the user ignore list.
if (is_contained(UserIgnoreList, UserInst))
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
<< Lane << " from " << *Scalar << ".\n");
ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
const EdgeInfo &UserTreeIdx) {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
InstructionsState S = getSameOpcode(VL);
if (Depth == RecursionMaxDepth) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
// Don't handle vectors.
if (S.OpValue->getType()->isVectorTy()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
if (SI->getValueOperand()->getType()->isVectorTy()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
// We now know that this is a vector of instructions of the same type from
// the same block.
// Don't vectorize ephemeral values.
for (Value *V : VL) {
if (EphValues.count(V)) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
<< ") is ephemeral.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
// Check if this is a duplicate of another entry.
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
if (!E->isSame(VL)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
// Record the reuse of the tree node. FIXME, currently this is only used to
// properly draw the graph rather than for the actual vectorization.
LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
<< ".\n");
// Check that none of the instructions in the bundle are already in the tree.
for (Value *V : VL) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
if (getTreeEntry(I)) {
LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
<< ") is already in tree.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
// If any of the scalars is marked as a value that needs to stay scalar, then
// we need to gather the scalars.
// The reduction nodes (stored in UserIgnoreList) also should stay scalar.
for (Value *V : VL) {
if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
auto *VL0 = cast<Instruction>(S.OpValue);
BasicBlock *BB = VL0->getParent();
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
// Check that every instruction appears once in this bundle.
SmallVector<unsigned, 4> ReuseShuffleIndicies;
SmallVector<Value *, 4> UniqueValues;
DenseMap<Value *, unsigned> UniquePositions;
for (Value *V : VL) {
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
if (Res.second)
size_t NumUniqueScalarValues = UniqueValues.size();
if (NumUniqueScalarValues == VL.size()) {
} else {
LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
if (NumUniqueScalarValues <= 1 ||
!llvm::isPowerOf2_32(NumUniqueScalarValues)) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
VL = UniqueValues;
auto &BSRef = BlocksSchedules[BB];
if (!BSRef)
BSRef = std::make_unique<BlockScheduling>(BB);
BlockScheduling &BS = *BSRef.get();
Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
if (!Bundle) {
LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL0) ||
!BS.getScheduleData(VL0)->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
unsigned ShuffleOrOp = S.isAltShuffle() ?
(unsigned) Instruction::ShuffleVector : S.getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI: {
auto *PH = cast<PHINode>(VL0);
// Check for terminator values (e.g. invoke).
for (unsigned j = 0; j < VL.size(); ++j)
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
Instruction *Term = dyn_cast<Instruction>(
if (Term && Term->isTerminator()) {
<< "SLP: Need to swizzle PHINodes (terminator use).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
TreeEntry *TE =
newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
// Keeps the reordered operands to avoid code duplication.
SmallVector<ValueList, 2> OperandsVec;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
TE->setOperand(i, Operands);
for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
OrdersType CurrentOrder;
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
if (Reuse) {
LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
Op0.assign(VL.size(), VL0->getOperand(0));
VectorizableTree.back()->setOperand(0, Op0);
if (!CurrentOrder.empty()) {
dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
"with order";
for (unsigned Idx : CurrentOrder)
dbgs() << " " << Idx;
dbgs() << "\n";
// Insert new order with initial value 0, if it does not exist,
// otherwise return the iterator to the existing one.
auto StoredCurrentOrderAndNum =
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
Op0.assign(VL.size(), VL0->getOperand(0));
VectorizableTree.back()->setOperand(0, Op0);
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
BS.cancelScheduling(VL, VL0);
case Instruction::Load: {
// Check that a vectorized load would load the same memory as a scalar
// load. For example, we don't want to vectorize loads that are smaller
// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
// treats loading/storing it as an i8 struct. If we vectorize loads/stores
// from such a struct, we read/write packed bits disagreeing with the
// unvectorized version.
Type *ScalarTy = VL0->getType();
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
// Make sure all loads in the bundle are simple - we can't vectorize
// atomic or volatile loads.
SmallVector<Value *, 4> PointerOps(VL.size());
auto POIter = PointerOps.begin();
for (Value *V : VL) {
auto *L = cast<LoadInst>(V);
if (!L->isSimple()) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
*POIter = L->getPointerOperand();
OrdersType CurrentOrder;
// Check the order of pointer operands.
if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
Value *Ptr0;
Value *PtrN;
if (CurrentOrder.empty()) {
Ptr0 = PointerOps.front();
PtrN = PointerOps.back();
} else {
Ptr0 = PointerOps[CurrentOrder.front()];
PtrN = PointerOps[CurrentOrder.back()];
const SCEV *Scev0 = SE->getSCEV(Ptr0);
const SCEV *ScevN = SE->getSCEV(PtrN);
const auto *Diff =
dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
uint64_t Size = DL->getTypeAllocSize(ScalarTy);
// Check that the sorted loads are consecutive.
if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
if (CurrentOrder.empty()) {
// Original loads are consecutive and do not require reordering.
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
UserTreeIdx, ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
} else {
// Need to reorder.
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
TreeEntry *TE =
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, I->getFirst());
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
for (Value *V : VL) {
Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
<< "SLP: Gathering casts with different src types.\n");
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL)
buildTree_rec(Operands, Depth + 1, {TE, i});
case Instruction::ICmp:
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
Type *ComparedTy = VL0->getOperand(0)->getType();
for (Value *V : VL) {
CmpInst *Cmp = cast<CmpInst>(V);
if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
<< "SLP: Gathering cmp with different predicate.\n");
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
ValueList Left, Right;
if (cast<CmpInst>(VL0)->isCommutative()) {
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
assert(P0 == SwapP0 && "Commutative Predicate mismatch");
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
} else {
// Collect operands - commute if it uses the swapped predicate.
for (Value *V : VL) {
auto *Cmp = cast<CmpInst>(V);
Value *LHS = Cmp->getOperand(0);
Value *RHS = Cmp->getOperand(1);
if (Cmp->getPredicate() != P0)
std::swap(LHS, RHS);
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
case Instruction::Select:
case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
buildTree_rec(Operands, Depth + 1, {TE, i});
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
for (Value *V : VL) {
if (cast<Instruction>(V)->getNumOperands() != 2) {
LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
// We can't combine several GEPs into one vector if they operate on
// different types.
Type *Ty0 = VL0->getOperand(0)->getType();
for (Value *V : VL) {
Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
if (Ty0 != CurTy) {
<< "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
// We don't combine GEPs with non-constant indexes.
Type *Ty1 = VL0->getOperand(1)->getType();
for (Value *V : VL) {
auto Op = cast<Instruction>(V)->getOperand(1);
if (!isa<ConstantInt>(Op) ||
(Op->getType() != Ty1 &&
Op->getType()->getScalarSizeInBits() >
V->getType()->getPointerAddressSpace()))) {
<< "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL)
buildTree_rec(Operands, Depth + 1, {TE, i});
case Instruction::Store: {
// Check if the stores are consecutive or if we need to swizzle them.
llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
// Make sure all stores in the bundle are simple - we can't vectorize
// atomic or volatile stores.
SmallVector<Value *, 4> PointerOps(VL.size());
ValueList Operands(VL.size());
auto POIter = PointerOps.begin();
auto OIter = Operands.begin();
for (Value *V : VL) {
auto *SI = cast<StoreInst>(V);
if (!SI->isSimple()) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
*POIter = SI->getPointerOperand();
*OIter = SI->getValueOperand();
OrdersType CurrentOrder;
// Check the order of pointer operands.
if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
Value *Ptr0;
Value *PtrN;
if (CurrentOrder.empty()) {
Ptr0 = PointerOps.front();
PtrN = PointerOps.back();
} else {
Ptr0 = PointerOps[CurrentOrder.front()];
PtrN = PointerOps[CurrentOrder.back()];
const SCEV *Scev0 = SE->getSCEV(Ptr0);
const SCEV *ScevN = SE->getSCEV(PtrN);
const auto *Diff =
dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
uint64_t Size = DL->getTypeAllocSize(ScalarTy);
// Check that the sorted pointer operands are consecutive.
if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
if (CurrentOrder.empty()) {
// Original stores are consecutive and do not require reordering.
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
UserTreeIdx, ReuseShuffleIndicies);
buildTree_rec(Operands, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
} else {
// Need to reorder.
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
TreeEntry *TE =
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, I->getFirst());
buildTree_rec(Operands, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
case Instruction::Call: {
// Check if the calls are all to the same vectorizable intrinsic or
// library function.
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
VFShape Shape = VFShape::get(
*CI, {static_cast<unsigned int>(VL.size()), false /*Scalable*/},
false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
if (!VecFunc && !isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
Function *F = CI->getCalledFunction();
unsigned NumArgs = CI->getNumArgOperands();
SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
for (unsigned j = 0; j != NumArgs; ++j)
if (hasVectorInstrinsicScalarOpd(ID, j))
ScalarArgs[j] = CI->getArgOperand(j);
for (Value *V : VL) {
CallInst *CI2 = dyn_cast<CallInst>(V);
if (!CI2 || CI2->getCalledFunction() != F ||
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
(VecFunc &&
VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
<< "\n");
// Some intrinsics have scalar arguments and should be same in order for
// them to be vectorized.
for (unsigned j = 0; j != NumArgs; ++j) {
if (hasVectorInstrinsicScalarOpd(ID, j)) {
Value *A1J = CI2->getArgOperand(j);
if (ScalarArgs[j] != A1J) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument " << ScalarArgs[j] << "!=" << A1J
<< "\n");
// Verify that the bundle operands are identical between the two calls.
if (CI->hasOperandBundles() &&
!std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
<< *CI << "!=" << *V << '\n');
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL) {
auto *CI2 = cast<CallInst>(V);
buildTree_rec(Operands, Depth + 1, {TE, i});
case Instruction::ShuffleVector: {
// If this is not an alternate sequence of opcode like add-sub
// then do not vectorize this instruction.
if (!S.isAltShuffle()) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL)
buildTree_rec(Operands, Depth + 1, {TE, i});
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
// Computes how many scalar elements the (possibly nested) aggregate type T
// flattens to, or returns 0 when T cannot be treated as a single vector.
//
// The loop peels struct, array and vector layers off T, multiplying N by the
// element count of each layer. 0 is returned when a struct layer is not
// homogeneous, when the innermost element type fails isValidElementType(),
// or when the store size of the equivalent FixedVectorType is outside
// [MinVecRegSize, MaxVecRegSize] or differs from the store size of T.
//
// \param T  the type to flatten.
// \param DL data layout used for the store-size queries.
// \returns the flattened element count N, or 0 if T is not mappable.
unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
unsigned N = 1;
Type *EltTy = T;
while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
isa<VectorType>(EltTy)) {
if (auto *ST = dyn_cast<StructType>(EltTy)) {
// Check that struct is homogeneous.
// Every element type is compared against the first element type.
// NOTE(review): a struct with zero elements would still reach
// *ST->element_begin() below - confirm callers never pass one.
for (const auto *Ty : ST->elements())
if (Ty != *ST->element_begin())
return 0;
N *= ST->getNumElements();
EltTy = *ST->element_begin();
} else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
N *= AT->getNumElements();
EltTy = AT->getElementType();
} else {
// Only VectorType is left here (guaranteed by the loop condition).
auto *VT = cast<VectorType>(EltTy);
N *= VT->getNumElements();
EltTy = VT->getElementType();
if (!isValidElementType(EltTy))
return 0;
// The flattened vector must fit the register-size limits and occupy exactly
// as many bits in memory as the original aggregate.
uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
return 0;
return N;
// Checks whether the extractelement/extractvalue instructions in VL all read
// from the same source (operand 0 of OpValue) so that the source can be
// reused instead of gathering the scalars.
//
// \param VL           the bundle of extract instructions.
// \param OpValue      representative instruction of the bundle; must be an
//                     ExtractElement or ExtractValue (asserted).
// \param CurrentOrder out-parameter, reset to VL.size() entries and then
//                     filled with the lane each extract index maps to.
// \returns the final value of ShouldKeepOrder when the loop covers every
//          element of VL; false when the source does not have VL.size()
//          elements, when an ExtractValue source is not a simple load with
//          exactly VL.size() uses, or when the scan stops early (I < E).
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
SmallVectorImpl<unsigned> &CurrentOrder) const {
Instruction *E0 = cast<Instruction>(OpValue);
assert(E0->getOpcode() == Instruction::ExtractElement ||
E0->getOpcode() == Instruction::ExtractValue);
assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
// Check if all of the extracts come from the same vector and from the
// correct offset.
Value *Vec = E0->getOperand(0);
// We have to extract from a vector/aggregate with the same number of elements.
unsigned NElts;
if (E0->getOpcode() == Instruction::ExtractValue) {
// Aggregate source: it must flatten to a vector (canMapToVector != 0).
const DataLayout &DL = E0->getModule()->getDataLayout();
NElts = canMapToVector(Vec->getType(), DL);
if (!NElts)
return false;
// Check if load can be rewritten as load of vector.
LoadInst *LI = dyn_cast<LoadInst>(Vec);
if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
return false;
} else {
NElts = cast<VectorType>(Vec->getType())->getNumElements();
if (NElts != VL.size())
return false;
// Check that all of the indices extract from the correct offset.
bool ShouldKeepOrder = true;
unsigned E = VL.size();
// Assign to all items the initial value E + 1 so we can check if the extract
// instruction index was used already.
// Also, later we can check that all the indices are used and we have a
// consecutive access in the extract instructions, by checking that no
// element of CurrentOrder still has value E + 1.
CurrentOrder.assign(E, E + 1);
unsigned I = 0;
// I is declared outside the loop so that an early exit can be detected by
// the I < E test after the loop.
for (; I < E; ++I) {
auto *Inst = cast<Instruction>(VL[I]);
if (Inst->getOperand(0) != Vec)
Optional<unsigned> Idx = getExtractIndex(Inst);
if (!Idx)
const unsigned ExtIdx = *Idx;
if (ExtIdx != I) {
// Lane I extracts a different index: the slot must be in range and
// still hold the E + 1 "unused" marker.
if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
ShouldKeepOrder = false;
CurrentOrder[ExtIdx] = I;
} else {
if (CurrentOrder[I] != E + 1)
CurrentOrder[I] = I;
if (I < E) {
return false;
return ShouldKeepOrder;
// Returns true if instruction I has exactly one use, or if every user of I
// has an entry in ScalarToTreeEntry.
bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
// The single-use test short-circuits the scan over the user list.
return I->hasOneUse() ||
std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
return ScalarToTreeEntry.count(U) > 0;
// Computes the cost of vectorizing call CI to vector type VecTy in two ways.
//
// \param CI    the scalar call being vectorized.
// \param VecTy the vector type of the vectorized call; its element count is
//              used as the vectorization factor.
// \param TTI   target cost model queried for both costs.
// \param TLI   library info used to map CI to a vector intrinsic ID.
// \returns {IntrinsicCost, LibCost}: the cost as a vector intrinsic and the
//          cost as a vector library call. LibCost equals IntrinsicCost unless
//          CI is not marked nobuiltin and VFDatabase provides a vectorized
//          function for the requested shape.
static std::pair<unsigned, unsigned>
getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI,
TargetLibraryInfo *TLI) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getNumElements());
int IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
// Look up a vector variant of the callee for a non-scalable shape with
// VecTy's element count.
auto Shape =
VFShape::get(*CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
// Default the library cost to the intrinsic cost so callers can take the
// minimum of the pair.
int LibCost = IntrinsicCost;
if (!CI->isNoBuiltin() && VecFunc) {
// Calculate the cost of the vector library call.
SmallVector<Type *, 4> VecTys;
for (Use &Arg : CI->args())
FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
// If the corresponding vector call is cheaper, return its cost.
LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
return {IntrinsicCost, LibCost};
// Estimates the cost of a single tree entry E.
//
// For gather entries the result is 0 for all-constant bundles, a broadcast
// shuffle cost for splats, a shuffle-based cost for recognised extractelement
// shuffles, and getGatherCost() otherwise. For vectorized entries the switch
// below returns, per opcode, the vector cost minus the cost of the scalar
// instructions it replaces (plus any reuse-shuffle cost).
//
// \param E the tree entry to cost; its state must be NeedToGather or
//          Vectorize (asserted).
// \returns the estimated cost difference as an int.
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
// Pick the scalar type the vector is built from: stores use the stored
// value's type and compares use the type of their first operand.
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
ScalarTy = CI->getOperand(0)->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
if (MinBWs.count(VL[0]))
VecTy = FixedVectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
// Entries with reused scalars pay for one single-source permute up front;
// the individual cases below adjust ReuseShuffleCost further.
unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
int ReuseShuffleCost = 0;
if (NeedToShuffleReuses) {
ReuseShuffleCost =
TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
return 0;
if (isSplat(VL)) {
return ReuseShuffleCost +
TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
if (E->getOpcode() == Instruction::ExtractElement &&
allSameType(VL) && allSameBlock(VL)) {
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
if (ShuffleKind.hasValue()) {
int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
for (auto *V : VL) {
// If all users of instruction are going to be vectorized and this
// instruction itself is not going to be vectorized, consider this
// instruction as dead and remove its cost from the final cost of the
// vectorized tree.
if (areAllUsersVectorized(cast<Instruction>(V)) &&
!ScalarToTreeEntry.count(V)) {
auto *IO = cast<ConstantInt>(
Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
return ReuseShuffleCost + Cost;
return ReuseShuffleCost + getGatherCost(VL);
assert(E->State == TreeEntry::Vectorize && "Unhandled state");
assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = E->getMainOp();
// Alternate-opcode entries are costed through the ShuffleVector case.
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
switch (ShuffleOrOp) {
case Instruction::PHI:
return 0;
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
if (NeedToShuffleReuses) {
unsigned Idx = 0;
for (unsigned I : E->ReuseShuffleIndices) {
if (ShuffleOrOp == Instruction::ExtractElement) {
auto *IO = cast<ConstantInt>(
Idx = IO->getZExtValue();
ReuseShuffleCost -= TTI->getVectorInstrCost(
Instruction::ExtractElement, VecTy, Idx);
} else {
ReuseShuffleCost -= TTI->getVectorInstrCost(
Instruction::ExtractElement, VecTy, Idx);
Idx = ReuseShuffleNumbers;
for (Value *V : VL) {
if (ShuffleOrOp == Instruction::ExtractElement) {
auto *IO = cast<ConstantInt>(
Idx = IO->getZExtValue();
} else {
ReuseShuffleCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
int DeadCost = ReuseShuffleCost;
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
DeadCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
// Note: this local E shadows the TreeEntry parameter E inside the loop.
Instruction *E = cast<Instruction>(VL[i]);
// If all users are going to be vectorized, instruction can be
// considered as dead.
// The same, if have only one user, it will be vectorized for sure.
if (areAllUsersVectorized(E)) {
// Take credit for instruction that will become dead.
if (E->hasOneUse()) {
Instruction *Ext = E->user_back();
if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
[](User *U) { return isa<GetElementPtrInst>(U); })) {
// Use getExtractWithExtendCost() to calculate the cost of
// extractelement/ext pair.
DeadCost -= TTI->getExtractWithExtendCost(
Ext->getOpcode(), Ext->getType(), VecTy, i);
// Add back the cost of s|zext which is subtracted separately.
DeadCost += TTI->getCastInstrCost(
Ext->getOpcode(), Ext->getType(), E->getType(), CostKind,
DeadCost -=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
return DeadCost;
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
int ScalarEltCost =
TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, CostKind,
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
// Calculate the cost of this instruction.
int ScalarCost = VL.size() * ScalarEltCost;
auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
int VecCost = 0;
// Check if the values are candidates to demote.
// VecCost stays 0 when VL0 has a MinBWs entry and VecTy equals SrcVecTy.
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
VecCost = ReuseShuffleCost +
TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
CostKind, VL0);
return VecCost - ScalarCost;
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
// Calculate the cost of this instruction.
int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
// The vector form is costed against an i1 vector of the same length.
auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
CostKind, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
// Certain instructions can be cheaper to vectorize if they have a
// constant second vector operand.
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OperandValueProperties Op2VP =
// If all operands are exactly the same ConstantInt then set the
// operand kind to OK_UniformConstantValue.
// If instead not all operands are constants, then set the operand kind
// to OK_AnyValue. If all operands are constants but not the same,
// then set the operand kind to OK_NonUniformConstantValue.
ConstantInt *CInt0 = nullptr;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
const Instruction *I = cast<Instruction>(VL[i]);
// Binary operators are inspected on operand 1, everything else (the
// unary FNeg) on operand 0.
unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
if (!CInt) {
Op2VK = TargetTransformInfo::OK_AnyValue;
Op2VP = TargetTransformInfo::OP_None;
if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
Op2VP = TargetTransformInfo::OP_None;
if (i == 0) {
CInt0 = CInt;
if (CInt0 != CInt)
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
SmallVector<const Value *, 4> Operands(VL0->operand_values());
int ScalarEltCost = TTI->getArithmeticInstrCost(
E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
Operands, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost = TTI->getArithmeticInstrCost(
E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
Operands, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
case Instruction::GetElementPtr: {
// GEPs are costed as an Add with the operand kinds chosen below.
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OperandValueKind Op2VK =
int ScalarEltCost =
TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind,
Op1VK, Op2VK);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost =
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind,
Op1VK, Op2VK);
return ReuseShuffleCost + VecCost - ScalarCost;
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
// The alignment of VL0 is used for both the scalar and the vector query.
Align alignment = cast<LoadInst>(VL0)->getAlign();
int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0,
CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
int VecLdCost =
TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
CostKind, VL0);
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecLdCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
return ReuseShuffleCost + VecLdCost - ScalarLdCost;
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
// For reordered stores the alignment is taken from the store at the
// first reorder index rather than from VL0.
bool IsReorder = !E->ReorderIndices.empty();
auto *SI =
cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
Align Alignment = SI->getAlign();
int ScalarEltCost =
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0,
CostKind, VL0);
// NOTE(review): every other case adjusts ReuseShuffleCost with `-=`, but
// this one assigns `= -(...)`, discarding the permute cost computed at the
// top of the function - confirm this is intentional.
if (NeedToShuffleReuses)
ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
VecTy, Alignment, 0, CostKind, VL0);
if (IsReorder) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecStCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
return ReuseShuffleCost + VecStCost - ScalarStCost;
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
IntrinsicCostAttributes CostAttrs(ID, *CI, 1, 1);
int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
// Use the cheaper of the vector-intrinsic and vector-library costs.
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second);
LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
<< " for " << *CI << "\n");
return ReuseShuffleCost + VecCallCost - ScalarCallCost;
case Instruction::ShuffleVector: {
// Reached only for alternate-opcode entries whose main and alternate
// opcodes are both binary ops or both casts (asserted below).
assert(E->isAltShuffle() &&
((Instruction::isBinaryOp(E->getOpcode()) &&
Instruction::isBinaryOp(E->getAltOpcode())) ||
(Instruction::isCast(E->getOpcode()) &&
Instruction::isCast(E->getAltOpcode()))) &&
"Invalid Shuffle Vector Operand");
int ScalarCost = 0;
if (NeedToShuffleReuses) {
for (unsigned Idx : E->ReuseShuffleIndices) {
Instruction *I = cast<Instruction>(VL[Idx]);
ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
ScalarCost += TTI->getInstructionCost(I, CostKind);
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
int VecCost = 0;
if (Instruction::isBinaryOp(E->getOpcode())) {
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
} else {
Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
return ReuseShuffleCost + VecCost - ScalarCost;
llvm_unreachable("Unknown instruction");
// Decides whether a tree with one or two entries is fully vectorizable.
//
// \returns true for a single entry in the Vectorize state, and for a
//          two-entry tree whose entries are both not NeedToGather or whose
//          first entry is Vectorize and whose second entry passes the
//          constant/splat test below; false for any other tree size and for
//          two-entry trees that would need a gather.
bool BoUpSLP::isFullyVectorizableTinyTree() const {
LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
<< VectorizableTree.size() << " is fully vectorizable .\n");
// We only handle trees of heights 1 and 2.
if (VectorizableTree.size() == 1 &&
VectorizableTree[0]->State == TreeEntry::Vectorize)
return true;
if (VectorizableTree.size() != 2)
return false;
// Handle splat and all-constants stores.
if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
(allConstant(VectorizableTree[1]->Scalars) ||
return true;
// Gathering cost would be too much for tiny trees.
if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
VectorizableTree[1]->State == TreeEntry::NeedToGather)
return false;
return true;
/// Checks whether \p Root is the end of an or/shift-left chain whose source is
/// a zero-extended load, and whether \p NumElts such loads together form an
/// integer width that \p TTI reports as legal.
/// \returns true if the pattern matched and the combined width is legal.
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI) {
  // Starting from the root, repeatedly step into operand 0 while the current
  // value is a non-constant-expression 'or' or a shift-left by a constant.
  Value *Src = Root;
  for (;;) {
    if (isa<ConstantExpr>(Src))
      break;
    const bool IsOr = match(Src, m_Or(m_Value(), m_Value()));
    if (!IsOr && !match(Src, m_Shl(m_Value(), m_Constant())))
      break;
    Src = cast<BinaryOperator>(Src)->getOperand(0);
  }

  // At least one or/shl must have been peeled off, and what remains has to be
  // a zext of a load.
  if (Src == Root)
    return false;
  Value *Ptr;
  if (!match(Src, m_ZExt(m_Load(m_Value(Ptr)))))
    return false;

  // The combined load width must be a legal integer type on the target.
  // E.g. <8 x i8> --> i64 is legal on a 64-bit target, whereas
  // <16 x i8> --> i128 is not, so the backend likely cannot reduce it.
  Type *LoadedTy = Ptr->getType()->getPointerElementType();
  unsigned TotalBits = LoadedTy->getIntegerBitWidth() * NumElts;
  IntegerType *WideTy = IntegerType::get(Root->getContext(), TotalBits);
  if (!TTI->isTypeLegal(WideTy))
    return false;

  // All checks passed: assume the whole sequence will be folded by load
  // combining.
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");
  return true;
}
/// For an 'or' reduction, reports whether the first scalar of the root tree
/// entry is a load-combine candidate; any other reduction opcode yields false.
bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
  if (RdxOpcode == Instruction::Or) {
    const auto &RootScalars = VectorizableTree[0]->Scalars;
    return isLoadCombineCandidateImpl(RootScalars[0], RootScalars.size(), TTI);
  }
  return false;
}
bool BoUpSLP::isLoadCombineCandidate() const {
// Peek through a final sequence of stores and check if all operations are
// likely to be load-combined.
unsigned NumElts = VectorizableTree[0]->Scalars.size();
for (Value *Scalar : VectorizableTree[0]->Scalars) {
Value *X;
if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
!isLoadCombineCandidateImpl(X, NumElts, TTI))
return false;
return true;
/// \returns true when the tree is smaller than MinTreeSize and
/// isFullyVectorizableTinyTree() rejects it; false when the tree is large
/// enough or is a fully vectorizable tiny tree.
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
if (VectorizableTree.size() >= MinTreeSize)
return false;
// If we have a tiny tree (a tree whose size is less than MinTreeSize), we
// can vectorize it if we can prove it fully vectorizable.
if (isFullyVectorizableTinyTree())
return false;
// NOTE(review): the opening of the expression below (presumably an assert
// and the condition of the ternary) is not visible in this extract --
// confirm against the upstream source.
? ExternalUses.empty()
: true && "We shouldn't have any external users");
// Otherwise, we can't vectorize the tree. It is both tiny and not fully
// vectorizable.
return true;
/// Accumulates, over the tree entries, the TTI-reported cost of keeping the
/// currently live values (widened to BundleWidth lanes) alive across the call
/// instructions counted between consecutive entries.
/// \returns the accumulated cost.
int BoUpSLP::getSpillCost() const {
// Walk from the bottom of the tree to the top, tracking which values are
// live. When we see a call instruction that is not part of our tree,
// query TTI to see if there is a cost to keeping values live over it
// (for example, if spills and fills are required).
unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
int Cost = 0;
SmallPtrSet<Instruction*, 4> LiveValues;
Instruction *PrevInst = nullptr;
for (const auto &TEPtr : VectorizableTree) {
// Each entry is represented by its first scalar.
// NOTE(review): the statements guarded by the next two `if`s (and several
// other statements further down) are not visible in this extract -- confirm
// against the upstream source before relying on the control flow shown here.
Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
if (!Inst)
if (!PrevInst) {
PrevInst = Inst;
// Update LiveValues.
for (auto &J : PrevInst->operands()) {
if (isa<Instruction>(&*J) && getTreeEntry(&*J))
dbgs() << "SLP: #LV: " << LiveValues.size();
for (auto *X : LiveValues)
dbgs() << " " << X->getName();
dbgs() << ", Looking at ";
// Now find the sequence of instructions between PrevInst and Inst.
unsigned NumCalls = 0;
BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
PrevInstIt =
while (InstIt != PrevInstIt) {
// Reaching the reverse end of PrevInst's block restarts the scan at the
// last instruction of Inst's block.
if (PrevInstIt == PrevInst->getParent()->rend()) {
PrevInstIt = Inst->getParent()->rbegin();
// Debug information does not impact spill cost.
if ((isa<CallInst>(&*PrevInstIt) &&
!isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
&*PrevInstIt != PrevInst)
if (NumCalls) {
// Query the cost of keeping every live value, as a BundleWidth-wide vector,
// alive over one call, and charge it once per counted call.
SmallVector<Type*, 4> V;
for (auto *II : LiveValues)
V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
PrevInst = Inst;
return Cost;
/// Computes the total cost of the tree: the sum of per-entry costs from
/// getEntryCost(), plus the cost of extracting externally used scalars, plus
/// the spill cost from getSpillCost().
/// \returns the total cost; the breakdown is also emitted as debug output and,
/// when ViewSLPTree is set, passed to ViewGraph.
int BoUpSLP::getTreeCost() {
int Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I].get();
// We create duplicate tree entries for gather sequences that have multiple
// uses. However, we should not compute the cost of duplicate sequences.
// For example, if we have a build vector (i.e., insertelement sequence)
// that is used by more than one vector instruction, we only need to
// compute the cost of the insertelement instructions once. The redundant
// instructions will be eliminated by CSE.
// We should consider not creating duplicate tree entries for gather
// sequences, and instead add additional edges to the tree representing
// their uses. Since such an approach results in fewer total entries,
// existing heuristics based on tree size may yield different results.
// NOTE(review): parts of the condition below (the end iterator, the rest of
// the predicate and the guarded statement) are not visible in this extract.
// NOTE(review): `[TE]` captures by value although TE is a `TreeEntry &`, so
// the entry is copied into the closure -- consider capturing by reference.
if (TE.State == TreeEntry::NeedToGather &&
std::any_of(std::next(VectorizableTree.begin(), I + 1),
[TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
return EntryPtr->State == TreeEntry::NeedToGather &&
int C = getEntryCost(&TE);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for bundle that starts with " << *TE.Scalars[0]
<< ".\n");
Cost += C;
// Scalars whose extract cost has already been accounted for.
SmallPtrSet<Value *, 16> ExtractCostCalculated;
int ExtractCost = 0;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
// NOTE(review): the statements guarded by the next two `if`s are not
// visible in this extract.
if (!ExtractCostCalculated.insert(EU.Scalar).second)
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
// removed as well).
if (EphValues.count(EU.User))
// If we plan to rewrite the tree in a smaller type, we will need to sign
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
if (MinBWs.count(ScalarRoot)) {
// MinBWs[ScalarRoot].first is used as the narrowed bit width and .second
// selects sign- versus zero-extension.
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto Extend =
MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
VecTy = FixedVectorType::get(MinTy, BundleWidth);
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
} else {
ExtractCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
int SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
// Build the cost summary once; it feeds both the debug stream and the
// optional graph view.
std::string Str;
raw_string_ostream OS(Str);
OS << "SLP: Spill Cost = " << SpillCost << ".\n"
<< "SLP: Extract Cost = " << ExtractCost << ".\n"
<< "SLP: Total Cost = " << Cost << ".\n";
LLVM_DEBUG(dbgs() << Str);
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
return Cost;
/// Cost of building a vector of type \p Ty from scalars: the TTI
/// scalarization (insert) overhead for the demanded lanes, plus one
/// single-source permute if \p ShuffledIndices is non-empty.
/// \param Ty the vector type being built.
/// \param ShuffledIndices lanes treated as shuffle candidates.
/// \returns the combined cost.
int BoUpSLP::getGatherCost(VectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const {
unsigned NumElts = Ty->getNumElements();
APInt DemandedElts = APInt::getNullValue(NumElts);
// Visit every lane that is not listed in ShuffledIndices.
// NOTE(review): the statement guarded by the `if` below (presumably marking
// lane i in DemandedElts) is not visible in this extract -- confirm.
for (unsigned i = 0; i < NumElts; ++i)
if (!ShuffledIndices.count(i))
int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
/*Extract*/ false);
if (!ShuffledIndices.empty())
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
/// Cost of gathering the scalars in \p VL into a vector. Derives the vector
/// type from VL[0] (using the stored value's type when VL[0] is a store) and
/// delegates to the (VectorType, ShuffledIndices) overload.
/// \param VL the scalars to gather; VL[0] is read unconditionally.
int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Find the type of the operands in VL.
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
// Find the cost of inserting/extracting values from the vector.
// Check if the same elements are inserted several times and count them as
// shuffle candidates.
DenseSet<unsigned> ShuffledElements;
DenseSet<Value *> UniqueElements;
// Iterate in reverse order to consider insert elements with the high cost.
for (unsigned I = VL.size(); I > 0; --I) {
unsigned Idx = I - 1;
// A failed insert means VL[Idx] was already seen at a higher index.
// NOTE(review): the statement guarded by this `if` (presumably recording
// Idx in ShuffledElements) is not visible in this extract -- confirm.
if (!UniqueElements.insert(VL[Idx]).second)
return getGatherCost(VecTy, ShuffledElements);
// Perform operand reordering on the instructions in VL and return the reordered
// operands in Left and Right.
//
// VL, DL, SE and R are forwarded to the VLOperands helper; Left receives its
// operand list 0 and Right its operand list 1.
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
const DataLayout &DL,
ScalarEvolution &SE,
const BoUpSLP &R) {
// NOTE(review): the statement guarded by this `if` (presumably an early
// return for empty input) is not visible in this extract -- confirm.
if (VL.empty())
VLOperands Ops(VL, DL, SE, R);
// Reorder the operands in place.
// NOTE(review): the call that performs the reordering is not visible in this
// extract; only the read-back of the two operand lists is shown.
Left = Ops.getVL(0);
Right = Ops.getVL(1);
/// Positions Builder immediately after the last instruction (in program
/// order) of the bundle described by \p E, within the basic block of E's main
/// operation.
void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block.
auto *Front = E->getMainOp();
auto *BB = Front->getParent();
// Sanity check: every scalar matching the main/alternate opcode lives in BB.
// NOTE(review): the closing part of this assert is not visible in this
// extract.
assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()),
[=](Value *V) -> bool {
auto *I = cast<Instruction>(V);
return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
// The last instruction in the bundle in program order.
Instruction *LastInst = nullptr;
// Find the last instruction. The common case should be that BB has been
// scheduled, and the last instruction is VL.back(). So we start with
// VL.back() and iterate over schedule data until we reach the end of the
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB)) {
// NOTE(review): the initializer of Bundle is not visible in this extract.
auto *Bundle =
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
if (Bundle->OpValue == Bundle->Inst)
LastInst = Bundle->Inst;
// LastInst can still be null at this point if there's either not an entry
// for BB in BlocksSchedules or there's no ScheduleData available for
// VL.back(). This can be the case if buildTree_rec aborts for various
// reasons (e.g., the maximum recursion depth is reached, the maximum region
// size is reached, etc.). ScheduleData is initialized in the scheduling
// "dry-run".
// If this happens, we can still find the last instruction by brute force. We
// iterate forwards from Front (inclusive) until we either see all
// instructions in the bundle or reach the end of the block. If Front is the
// last instruction in program order, LastInst will be set to Front, and we
// will visit all the remaining instructions in the block.
// One of the reasons we exit early from buildTree_rec is to place an upper
// bound on compile-time. Thus, taking an additional compile-time hit here is
// not ideal. However, this should be exceedingly rare since it requires that
// we both exit early from buildTree_rec and that the bundle be out-of-order
// (causing us to iterate all the way to the end of the block).
if (!LastInst) {
// Remaining bundle members still to be encountered during the forward scan.
SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
LastInst = &I;
// NOTE(review): the statement guarded by this `if` (presumably leaving the
// loop once all members were seen) is not visible in this extract.
if (Bundle.empty())
assert(LastInst && "Failed to find last instruction in bundle");
// Set the insertion point after the last instruction in the bundle. Set the
// debug location to Front.
// NOTE(review): only the insert-point update is visible here; the debug
// location assignment mentioned above is not shown in this extract.
Builder.SetInsertPoint(BB, ++LastInst->getIterator());
/// Builds a vector of type \p Ty by inserting VL[i] into lane i, starting from
/// an undef vector. For each inserted scalar that belongs to a tree entry, an
/// ExternalUser record (scalar, insertelement, lane) is appended to
/// ExternalUses.
/// \returns the resulting vector value.
Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
Value *Vec = UndefValue::get(Ty);
// Generate the 'InsertElement' instruction.
for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
// The builder's result is only handled further when it is an actual
// InsertElementInst.
if (auto *Insrt = dyn_cast<InsertElementInst>(Vec)) {
// Add to our 'need-to-extract' list.
if (TreeEntry *E = getTreeEntry(VL[i])) {
// Find which lane we need to extract.
int FoundLane = -1;
for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
// Is this the lane of the scalar that we are looking for ?
if (E->Scalars[Lane] == VL[i]) {
FoundLane = Lane;
assert(FoundLane >= 0 && "Could not find the correct lane");
if (!E->ReuseShuffleIndices.empty()) {
// Translate the lane through the reuse shuffle mask.
// NOTE(review): the first part of the expression assigned to FoundLane is
// not visible in this extract.
FoundLane =
llvm::find(E->ReuseShuffleIndices, FoundLane));
ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
return Vec;
/// Produces a vector value for the scalar list \p VL. If the scalars share an
/// opcode and match an existing tree entry (E->isSame(VL)), that entry is
/// vectorized and reused; otherwise the scalars are gathered with Gather(),
/// optionally followed by a "shuffle" that re-expands deduplicated values.
/// \returns the vector value.
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
InstructionsState S = getSameOpcode(VL);
if (S.getOpcode()) {
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
if (E->isSame(VL)) {
Value *V = vectorizeTree(E);
if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
// We need to get the vectorized value but without shuffle.
if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
V = SV->getOperand(0);
} else {
// Reshuffle to get only unique values.
// NOTE(review): the statement guarded by the `if` in the loop below and the
// remaining arguments of CreateShuffleVector are not visible in this extract.
SmallVector<int, 4> UniqueIdxs;
SmallSet<int, 4> UsedIdxs;
for (int Idx : E->ReuseShuffleIndices)
if (UsedIdxs.insert(Idx).second)
V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
return V;
// No reusable tree entry: fall back to gathering the scalars.
Type *ScalarTy = S.OpValue->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
ScalarTy = SI->getValueOperand()->getType();
// Check that every instruction appears once in this bundle.
SmallVector<int, 4> ReuseShuffleIndicies;
SmallVector<Value *, 4> UniqueValues;
if (VL.size() > 2) {
// Maps each value to the position it would take in UniqueValues.
// NOTE(review): the statements inside this loop that fill
// ReuseShuffleIndicies and UniqueValues are not visible in this extract.
DenseMap<Value *, unsigned> UniquePositions;
for (Value *V : VL) {
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
if (Res.second || isa<Constant>(V))
// Do not shuffle single element or if number of unique values is not power
// of 2.
// NOTE(review): the remainder of this condition and its guarded statement
// are not visible in this extract.
if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
VL = UniqueValues;
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
Value *V = Gather(VL, VecTy);
if (!ReuseShuffleIndicies.empty()) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
ReuseShuffleIndicies, "shuffle");
if (auto *I = dyn_cast<Instruction>(V)) {
return V;
/// Builds in \p Mask the inverse of the permutation described by \p Indices:
/// after the call, Mask[Indices[I]] == I for every I in [0, Indices.size()).
///
/// \param Indices a permutation of 0..N-1, where N == Indices.size().
/// \param Mask output; previous contents are discarded and it is resized to
///        Indices.size() elements.
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  const unsigned E = Indices.size();
  // Callers pass a default-constructed (empty) vector, so the mask has to be
  // sized here before it is indexed; otherwise the writes below would be out
  // of bounds.
  Mask.clear();
  Mask.resize(E);
  for (unsigned I = 0; I < E; ++I) {
    assert(Indices[I] < E && "Indices is not a permutation of 0..N-1");
    Mask[Indices[I]] = I;
  }
}
/// Emits the vector code for tree entry \p E and returns the resulting value.
///
/// The result is cached in E->VectorizedValue; a second call for the same
/// entry returns the cached value. Gather entries are materialized through
/// Gather(); Vectorize entries are dispatched on their opcode (or on
/// ShuffleVector for alternate-opcode entries). In most cases the operands
/// are vectorized recursively first, and when E->ReuseShuffleIndices is
/// non-empty the result is passed through an extra "shuffle".
///
/// NOTE(review): this extract is missing a number of lines (closing braces,
/// continuation lines of multi-line calls, and some guarded statements);
/// comments below describe only what is visible.
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// Restores the builder's insert point when this function returns.
IRBuilder<>::InsertPointGuard Guard(Builder);
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
Instruction *VL0 = E->getMainOp();
// For stores the element type is that of the stored value, not of the
// store instruction itself.
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
ScalarTy = SI->getValueOperand()->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
// Entries that cannot be vectorized are built lane by lane.
if (E->State == TreeEntry::NeedToGather) {
auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
if (auto *I = dyn_cast<Instruction>(V)) {
E->VectorizedValue = V;
return V;
assert(E->State == TreeEntry::Vectorize && "Unhandled state");
// Alternate-opcode entries are all handled by the ShuffleVector case.
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
switch (ShuffleOrOp) {
// PHI: create the vector PHI first and record it in E->VectorizedValue
// before the incoming values are vectorized.
case Instruction::PHI: {
auto *PH = cast<PHINode>(VL0);
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
Value *V = NewPhi;
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = V;
// PHINodes may have multiple entries from the same block. We want to
// visit every block once.
SmallPtrSet<BasicBlock*, 4> VisitedBBs;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
BasicBlock *IBB = PH->getIncomingBlock(i);
// A block seen before reuses the incoming value already registered for it.
if (!VisitedBBs.insert(IBB).second) {
NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
Value *Vec = vectorizeTree(E->getOperand(i));
NewPhi->addIncoming(Vec, IBB);
assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
"Invalid number of incoming values");
return V;
// ExtractElement: start from the entry's single operand 0 and only apply
// reorder / reuse shuffles to it.
case Instruction::ExtractElement: {
Value *V = E->getSingleOperand(0);
if (!E->ReorderIndices.empty()) {
SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
if (NeedToShuffleReuses) {
// TODO: Merge this shuffle with the ReorderShuffleMask.
if (E->ReorderIndices.empty())
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = V;
return V;
// ExtractValue: the single operand 0 is a load; it is replaced by one
// vector load through a bitcast of the same pointer.
case Instruction::ExtractValue: {
LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
PointerType *PtrTy =
PointerType::get(VecTy, LI->getPointerAddressSpace());
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
Value *NewV = propagateMetadata(V, E->Scalars);
if (!E->ReorderIndices.empty()) {
SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
if (NeedToShuffleReuses) {
// TODO: Merge this shuffle with the ReorderShuffleMask.
NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = NewV;
return NewV;
// Casts: vectorize the single operand, then emit the same cast opcode with
// the vector destination type.
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
Value *InVec = vectorizeTree(E->getOperand(0));
// Re-check the cache after recursing (presumably the recursion can reach
// this same entry again and fill E->VectorizedValue).
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
auto *CI = cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = V;
return V;
// Compares: the predicate is taken from the main operation VL0.
case Instruction::FCmp:
case Instruction::ICmp: {
Value *L = vectorizeTree(E->getOperand(0));
Value *R = vectorizeTree(E->getOperand(1));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = V;
return V;
// Select: condition, true and false operands are vectorized in that order.
case Instruction::Select: {
Value *Cond = vectorizeTree(E->getOperand(0));
Value *True = vectorizeTree(E->getOperand(1));
Value *False = vectorizeTree(E->getOperand(2));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
Value *V = Builder.CreateSelect(Cond, True, False);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = V;
return V;
// FNeg: the only unary operator handled here.
case Instruction::FNeg: {
Value *Op = vectorizeTree(E->getOperand(0));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
Value *V = Builder.CreateUnOp(
static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
propagateIRFlags(V, E->Scalars, VL0);
// Metadata is only propagated when the builder produced an instruction.
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = V;
return V;
// Binary operators: vectorize both operands and emit the same opcode.
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
Value *LHS = vectorizeTree(E->getOperand(0));
Value *RHS = vectorizeTree(E->getOperand(1));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
Value *V = Builder.CreateBinOp(
static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
propagateIRFlags(V, E->Scalars, VL0);
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = V;
return V;
// Load: one vector load through a bitcast of the main load's pointer,
// followed by a "reorder_shuffle" when the entry was reordered.
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
bool IsReorder = E->updateStateIfReorder();
// The main operation is re-read when updateStateIfReorder() returns true.
if (IsReorder)
VL0 = E->getMainOp();
LoadInst *LI = cast<LoadInst>(VL0);
unsigned AS = LI->getPointerAddressSpace();
Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
// The pointer operand uses an in-tree scalar so we add the new BitCast to
// ExternalUses list to make sure that an extract will be generated in the
// future.
Value *PO = LI->getPointerOperand();
if (getTreeEntry(PO))
ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
Value *V = propagateMetadata(LI, E->Scalars);
if (IsReorder) {
SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
Mask, "reorder_shuffle");
if (NeedToShuffleReuses) {
// TODO: Merge this shuffle with the ReorderShuffleMask.
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = V;
return V;
// Store: vectorize the stored value (shuffled by ReorderIndices when
// present) and emit one vector store through a bitcast pointer.
case Instruction::Store: {
bool IsReorder = !E->ReorderIndices.empty();
// With reordering, the store at the first reorder index supplies the
// pointer and address space instead of VL0.
auto *SI = cast<StoreInst>(
IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
unsigned AS = SI->getPointerAddressSpace();
Value *VecValue = vectorizeTree(E->getOperand(0));
if (IsReorder) {
SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
VecValue = Builder.CreateShuffleVector(
VecValue, UndefValue::get(VecValue->getType()), Mask,
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(
ScalarPtr, VecValue->getType()->getPointerTo(AS));
StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
// The pointer operand uses an in-tree scalar, so add the new BitCast to
// ExternalUses to make sure that an extract will be generated in the
// future.
if (getTreeEntry(ScalarPtr))
ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
Value *V = propagateMetadata(ST, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = V;
return V;
// GetElementPtr: vectorize the base operand and every index operand, then
// emit one GEP with VL0's source element type.
case Instruction::GetElementPtr: {
Value *Op0 = vectorizeTree(E->getOperand(0));
std::vector<Value *> OpVecs;
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
++j) {
ValueList &VL = E->getOperand(j);
// Need to cast all elements to the same type before vectorization to
// avoid crash.
// Keep VL0's operand type if every element already has it; otherwise use
// the index type obtained from DL.
Type *VL0Ty = VL0->getOperand(j)->getType();
Type *Ty = llvm::all_of(
VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); })
? VL0Ty
: DL->getIndexType(cast<GetElementPtrInst>(VL0)
// Each index is cast as a ConstantInt, so non-constant indices would trip
// the cast<> here.
for (Value *&V : VL) {
auto *CI = cast<ConstantInt>(V);
V = ConstantExpr::getIntegerCast(CI, Ty,
Value *OpVec = vectorizeTree(VL);
Value *V = Builder.CreateGEP(
cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = V;
return V;
// Call: emitted either as a vector intrinsic or as a vectorized library
// function looked up through VFDatabase, depending on the cost comparison.
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID IID = Intrinsic::not_intrinsic;
if (Function *FI = CI->getCalledFunction())
IID = FI->getIntrinsicID();
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
// Use the intrinsic when one exists and the first cost of the pair does not
// exceed the second.
bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
VecCallCosts.first <= VecCallCosts.second;
Value *ScalarArg = nullptr;
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(j);
Value *OpVec = vectorizeTree(E->getOperand(j));
LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
Function *CF;
if (!UseIntrinsic) {
VFShape Shape = VFShape::get(
*CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
false /*HasGlobalPred*/);
CF = VFDatabase(*CI).getVectorizedFunction(Shape);
} else {
Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
SmallVector<OperandBundleDef, 1> OpBundles;
Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
// The scalar argument uses an in-tree scalar so we add the new vectorized
// call to ExternalUses list to make sure that an extract will be
// generated in the future.
if (ScalarArg && getTreeEntry(ScalarArg))
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
propagateIRFlags(V, E->Scalars, VL0);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = V;
return V;
// ShuffleVector: alternate-opcode entries. Both the main and the alternate
// operation are emitted on the full vectors and then blended lane by lane.
case Instruction::ShuffleVector: {
assert(E->isAltShuffle() &&
((Instruction::isBinaryOp(E->getOpcode()) &&
Instruction::isBinaryOp(E->getAltOpcode())) ||
(Instruction::isCast(E->getOpcode()) &&
Instruction::isCast(E->getAltOpcode()))) &&
"Invalid Shuffle Vector Operand");
Value *LHS = nullptr, *RHS = nullptr;
// Binary ops need both operands; casts only use operand 0 (RHS stays null).
if (Instruction::isBinaryOp(E->getOpcode())) {
LHS = vectorizeTree(E->getOperand(0));
RHS = vectorizeTree(E->getOperand(1));
} else {
LHS = vectorizeTree(E->getOperand(0));
if (E->VectorizedValue) {
LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return E->VectorizedValue;
// V0 is the main-opcode result, V1 the alternate-opcode result.
Value *V0, *V1;
if (Instruction::isBinaryOp(E->getOpcode())) {
V0 = Builder.CreateBinOp(
static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
V1 = Builder.CreateBinOp(
static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
} else {
V0 = Builder.CreateCast(
static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
V1 = Builder.CreateCast(
static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
// Create shuffle to take alternate operations from the vector.
// Also, gather up main and alt scalar ops to propagate IR flags to
// each vector operation.
ValueList OpScalars, AltScalars;
unsigned e = E->Scalars.size();
SmallVector<int, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
auto *OpInst = cast<Instruction>(E->Scalars[i]);
assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
// Mask index e + i selects lane i of the second shuffle input (V1); index i
// selects lane i of the first (V0).
// NOTE(review): the statements that append to AltScalars / OpScalars are
// not visible in this extract.
if (OpInst->getOpcode() == E->getAltOpcode()) {
Mask[i] = e + i;
} else {
Mask[i] = i;
propagateIRFlags(V0, OpScalars);
propagateIRFlags(V1, AltScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = V;
return V;
llvm_unreachable("unknown inst");
return nullptr;
/// Convenience overload: vectorizes the tree with an empty map of externally
/// used values and returns whatever the map-taking overload returns.
Value *BoUpSLP::vectorizeTree() {
  ExtraValueToDebugLocsMap NoExternalValues;
  Value *Root = vectorizeTree(NoExternalValues);
  return Root;
}
/// Vectorizes the whole tree starting at VectorizableTree[0], then emits an
/// extractelement (plus a sign/zero extension when the root was narrowed via
/// MinBWs) for every recorded external use and rewires the user to it.
/// \param ExternallyUsedValues scalars used as extra arguments (external
///        user == nullptr); an entry is inserted for each generated extract.
/// \returns VectorizableTree[0]->VectorizedValue.
///
/// NOTE(review): several statements (insert-point updates, `continue`s,
/// the erasing of scalars, closing braces) are not visible in this extract.
Value *
BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
// If the vectorized tree can be rewritten in a smaller type, we truncate the
// vectorized root. InstCombine will then rewrite the entire expression. We
// sign extend the extracted values below.
auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
if (MinBWs.count(ScalarRoot)) {
if (auto *I = dyn_cast<Instruction>(VectorRoot))
auto BundleWidth = VectorizableTree[0]->Scalars.size();
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
// From here on the truncated value stands in for the root's vector value.
VectorizableTree[0]->VectorizedValue = Trunc;
LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
<< " values .\n");
// If necessary, sign-extend or zero-extend ScalarRoot to the larger type
// specified by ScalarType.
// MinBWs[ScalarRoot].second selects sign extension; otherwise zero extension.
auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
if (!MinBWs.count(ScalarRoot))
return Ex;
if (MinBWs[ScalarRoot].second)
return Builder.CreateSExt(Ex, ScalarType);
return Builder.CreateZExt(Ex, ScalarType);
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
llvm::User *User = ExternalUse.User;
// Skip users that we already RAUW. This happens when one instruction
// has multiple uses of the same value.
if (User && !is_contained(Scalar->users(), User))
TreeEntry *E = getTreeEntry(Scalar);
assert(E && "Invalid scalar");
assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list");
Value *Vec = E->VectorizedValue;
assert(Vec && "Can't find vectorizable value");
Value *Lane = Builder.getInt32(ExternalUse.Lane);
// If User == nullptr, the Scalar is used as extra arg. Generate
// ExtractElement instruction and update the record for this scalar in
// ExternallyUsedValues.
if (!User) {
assert(ExternallyUsedValues.count(Scalar) &&
"Scalar with nullptr as an external user must be registered in "
"ExternallyUsedValues map");
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
} else {
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
// The extract inherits the debug locations recorded for the scalar.
auto &Locs = ExternallyUsedValues[Scalar];
ExternallyUsedValues.insert({Ex, Locs});
// Required to update internally referenced instructions.
// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
// PHI users are patched per incoming value: every operand equal to Scalar
// gets its own extract.
if (PHINode *PH = dyn_cast<PHINode>(User)) {
for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
if (PH->getIncomingValue(i) == Scalar) {
Instruction *IncomingTerminator =
if (isa<CatchSwitchInst>(IncomingTerminator)) {
} else {
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
PH->setOperand(i, Ex);
} else {
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
User->replaceUsesOfWith(Scalar, Ex);
} else {
// Vec is not an instruction: extract at the current insert point.
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
User->replaceUsesOfWith(Scalar, Ex);
LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
// For each vectorized value:
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
if (Entry->State == TreeEntry::NeedToGather)
assert(Entry->VectorizedValue && "Can't find vectorizable value");
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
// Debug-only validation: every remaining user of a non-void scalar must be
// in the tree or in UserIgnoreList.
#ifndef NDEBUG
Type *Ty = Scalar->getType();
if (!Ty->isVoidTy()) {
for (User *U : Scalar->users()) {
LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
// It is legal to delete users in the ignorelist.
assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
"Deleting out-of-tree value");
LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
return VectorizableTree[0]->VectorizedValue;
/// Post-processes the gather (insertelement) sequences recorded in GatherSeq:
/// first considers each for hoisting into its loop's preheader, then walks the
/// blocks in CSEBlocks in dominance order and looks for insertelement /
/// extractelement instructions identical to an already visited, dominating
/// one.
///
/// NOTE(review): the statements that actually move, replace or erase
/// instructions, as well as the `continue`s after the guards, are not visible
/// in this extract.
void BoUpSLP::optimizeGatherSequence() {
LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
<< " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
for (Instruction *I : GatherSeq) {
if (isDeleted(I))
// Check if this block is inside a loop.
Loop *L = LI->getLoopFor(I->getParent());
if (!L)
// Check if it has a preheader.
BasicBlock *PreHeader = L->getLoopPreheader();
if (!PreHeader)
// If the vector or the element that we insert into it are
// instructions that are defined in this basic block then we can't
// hoist this instruction.
// The visible checks test whether the operands are contained in loop L.
auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
if (Op0 && L->contains(Op0))
if (Op1 && L->contains(Op1))
// We can hoist this instruction. Move it to the pre-header.
// Make a list of all reachable blocks in our CSE queue.
// Blocks for which DT has no node are left out of the list.
SmallVector<const DomTreeNode *, 8> CSEWorkList;
for (BasicBlock *BB : CSEBlocks)
if (DomTreeNode *N = DT->getNode(BB)) {
// Sort blocks by domination. This ensures we visit a block after all blocks
// dominating it are visited.
[this](const DomTreeNode *A, const DomTreeNode *B) {
return DT->properlyDominates(A, B);
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
"Worklist not sorted properly!");
BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
// Advance the iterator before touching the instruction it referred to.
Instruction *In = &*it++;
if (isDeleted(In))
if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
// Check if we can replace this instruction with any of the
// visited instructions.
for (Instruction *v : Visited) {
if (In->isIdenticalTo(v) &&
DT->dominates(v->getParent(), In->getParent())) {
// Null marks the instruction as handled so the check below skips it.
In = nullptr;
if (In) {
assert(!is_contained(Visited, In));
// Groups the instructions to a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
//
// Return value:
//  - None when the scheduling region cannot be extended to cover every value
//    in VL, or when the bundle cannot become ready (the bundle is cancelled
//    via cancelScheduling first);
//  - an Optional holding nullptr when S.OpValue is a PHI node;
//  - the first ScheduleData of the new bundle otherwise.
Optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S) {
if (isa<PHINode>(S.OpValue))
// Engaged Optional with a null pointer; distinct from the None failure result.
return nullptr;
// Initialize the instruction bundle.
// Remember the old region end so that growth of the region can be detected.
Instruction *OldScheduleEnd = ScheduleEnd;
ScheduleData *PrevInBundle = nullptr;
ScheduleData *Bundle = nullptr;
bool ReSchedule = false;
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
if (!extendSchedulingRegion(V, S))
return None;
// Link the ScheduleData of all values in VL into one bundle.
for (Value *V : VL) {
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
"no ScheduleData for bundle member (maybe not in same basic block)");
if (BundleMember->IsScheduled) {
// A bundle member was scheduled as single instruction before and now
// needs to be scheduled as part of the bundle. We just get rid of the
// existing schedule.
LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
<< " was already scheduled\n");
ReSchedule = true;
assert(BundleMember->isSchedulingEntity() &&
"bundle member already part of other bundle");
if (PrevInBundle) {
PrevInBundle->NextInBundle = BundleMember;
} else {
// The first member becomes the head of the bundle.
Bundle = BundleMember;
// The head accumulates the unscheduled dependencies of all members.
BundleMember->UnscheduledDepsInBundle = 0;
Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
// Group the instructions to a bundle.
BundleMember->FirstInBundle = Bundle;
PrevInBundle = BundleMember;
if (ScheduleEnd != OldScheduleEnd) {
// The scheduling region got new instructions at the lower end (or it is a
// new region for the first bundle). This makes it necessary to
// recalculate all dependencies.
// It is seldom that this needs to be done a second time after adding the
// initial bundle to the region.
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
doForAllOpcodes(I, [](ScheduleData *SD) {
ReSchedule = true;
if (ReSchedule) {
assert(Bundle && "Failed to find schedule bundle");
LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
<< BB->getName() << "\n");
calculateDependencies(Bundle, true, SLP);
// Now try to schedule the new bundle. As soon as the bundle is "ready" it
// means that there are no cyclic dependencies and we can schedule it.
// Note that it's important that we don't "schedule" the bundle yet (see
// cancelScheduling).
while (!Bundle->isReady() && !ReadyInsts.empty()) {
ScheduleData *pickedSD = ReadyInsts.back();
if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
schedule(pickedSD, ReadyInsts);
// The ready list ran dry before the bundle became ready: undo the bundling.
if (!Bundle->isReady()) {
cancelScheduling(VL, S.OpValue);
return None;
return Bundle;
/// Dissolves the bundle whose head is the ScheduleData of \p OpValue, turning
/// every member back into a single-instruction scheduling entity. The bundle
/// must not have been scheduled yet.
/// NOTE(review): \p VL is not referenced in the lines shown here, and the
/// statement guarded by the PHI check is missing from this listing (presumably
/// an early return, since PHI bundles are never created) - confirm upstream.
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
Value *OpValue) {
if (isa<PHINode>(OpValue))
ScheduleData *Bundle = getScheduleData(OpValue);
LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
assert(!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled");
assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
"tried to unbundle something which is not a bundle");
// Un-bundle: make single instructions out of the bundle.
ScheduleData *BundleMember = Bundle;
while (BundleMember) {
assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
// Each member becomes its own bundle head again.
BundleMember->FirstInBundle = BundleMember;
// Save the successor before the link is cleared.
ScheduleData *Next = BundleMember->NextInBundle;
BundleMember->NextInBundle = nullptr;
BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
// A member without unscheduled dependencies gets special handling here; the
// guarded statement is missing from this listing.
if (BundleMember->UnscheduledDepsInBundle == 0) {
BundleMember = Next;
/// Returns a pointer to the next unused ScheduleData slot in the most recent
/// chunk of ScheduleDataChunks and advances ChunkPos.
/// NOTE(review): when the current chunk is exhausted ChunkPos is reset to 0;
/// the statement that appends a fresh chunk is presumably on a line missing
/// from this listing - confirm upstream.
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
// Allocate a new ScheduleData for the instruction.
if (ChunkPos >= ChunkSize) {
ChunkPos = 0;
return &(ScheduleDataChunks.back()[ChunkPos++]);
/// Makes sure that \p V (which must be a non-PHI instruction) has ScheduleData
/// inside the current scheduling region, growing the region upwards or
/// downwards when necessary.
/// \returns true if V is covered by the region afterwards (including the case
/// where it already was), false if the search exceeds
/// ScheduleRegionSizeLimit.
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
const InstructionsState &S) {
// Nothing to do if schedule data for V already exists.
if (getScheduleData(V, isOneOf(S, V)))
return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
// Helper: if I already has ScheduleData, allocate an additional ScheduleData
// for it, initialised with S.OpValue, and record it in ExtraScheduleDataMap.
// Returns false when I has no ScheduleData yet.
auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
ScheduleData *ISD = getScheduleData(I);
if (!ISD)
return false;
assert(isInSchedulingRegion(ISD) &&
"ScheduleData not in scheduling region");
ScheduleData *SD = allocateScheduleDataChunks();
SD->Inst = I;
SD->init(SchedulingRegionID, S.OpValue);
ExtraScheduleDataMap[I][S.OpValue] = SD;
return true;
if (CheckSheduleForI(I))
return true;
if (!ScheduleStart) {
// It's the first instruction in the new region.
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
if (isOneOf(S, I) != I)
// A null ScheduleEnd would mean I has no successor in its block.
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
// Search up and down at the same time, because we don't know if the new
// instruction is above or below the existing scheduling region.
BasicBlock::reverse_iterator UpIter =
BasicBlock::reverse_iterator UpperEnd = BB->rend();
BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
BasicBlock::iterator LowerEnd = BB->end();
while (true) {
// Bound the search so that very large blocks stay cheap to process.
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
return false;
if (UpIter != UpperEnd) {
if (&*UpIter == I) {
// Found I above the region: the region now starts at I.
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
if (isOneOf(S, I) != I)
LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
<< "\n");
return true;
if (DownIter != LowerEnd) {
if (&*DownIter == I) {
// Found I below the region: the region now ends right after I.
initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
ScheduleEnd = I->getNextNode();
if (isOneOf(S, I) != I)
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I
<< "\n");
return true;
// Hitting both block boundaries without finding I is a logic error.
assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
"instruction not found in block");
return true;
/// Creates (or re-initialises) the ScheduleData of every instruction in the
/// half-open range [\p FromI, \p ToI) for the current scheduling region and
/// splices the memory-accessing ones into the region's load/store list.
/// \param PrevLoadStore memory-accessing ScheduleData preceding the range, or
///        nullptr if there is none.
/// \param NextLoadStore memory-accessing ScheduleData following the range, or
///        nullptr if there is none.
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
Instruction *ToI,
ScheduleData *PrevLoadStore,
ScheduleData *NextLoadStore) {
ScheduleData *CurrentLoadStore = PrevLoadStore;
for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
// Reuse existing ScheduleData for I if there is any; allocate otherwise.
ScheduleData *SD = ScheduleDataMap[I];
if (!SD) {
SD = allocateScheduleDataChunks();
ScheduleDataMap[I] = SD;
SD->Inst = I;
assert(!isInSchedulingRegion(SD) &&
"new ScheduleData already in scheduling region");
SD->init(SchedulingRegionID, I);
// Instructions that touch memory take part in memory-dependency tracking,
// except for the llvm.sideeffect intrinsic.
if (I->mayReadOrWriteMemory() &&
(!isa<IntrinsicInst>(I) ||
cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {
// Update the linked list of memory accessing instructions.
if (CurrentLoadStore) {
CurrentLoadStore->NextLoadStore = SD;
} else {
FirstLoadStoreInRegion = SD;
CurrentLoadStore = SD;
// Connect the tail of the new sub-list to what follows the range, or record
// it as the last memory access of the region.
if (NextLoadStore) {
if (CurrentLoadStore)
CurrentLoadStore->NextLoadStore = NextLoadStore;
} else {
LastLoadStoreInRegion = CurrentLoadStore;
/// Computes the dependencies of the bundle headed by \p SD, covering both
/// def-use chains and memory dependencies. Work is driven by a worklist, so
/// bundles reached from \p SD whose dependencies are not valid yet are
/// processed as well.
/// \param InsertInReadyList when true, bundles found to be ready after the
///        update are reported (see the "gets ready on update" debug output);
///        presumably they are also added to the ready list, but that
///        statement is missing from this listing.
/// NOTE(review): the line declaring the third parameter (used below as `SLP`)
/// and the statements guarded by several `if`s are missing from this listing.
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
SmallVector<ScheduleData *, 10> WorkList;
while (!WorkList.empty()) {
ScheduleData *SD = WorkList.back();
// Visit every member of the bundle.
ScheduleData *BundleMember = SD;
while (BundleMember) {
if (!BundleMember->hasValidDependencies()) {
LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
<< "\n");
BundleMember->Dependencies = 0;
// Handle def-use chain dependencies.
// If OpValue differs from Inst, only the ScheduleData of Inst itself is
// examined; otherwise all users of Inst are.
if (BundleMember->OpValue != BundleMember->Inst) {
ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
ScheduleData *DestBundle = UseSD->FirstInBundle;
if (!DestBundle->IsScheduled)
if (!DestBundle->hasValidDependencies())
} else {
for (User *U : BundleMember->Inst->users()) {
if (isa<Instruction>(U)) {
ScheduleData *UseSD = getScheduleData(U);
if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
ScheduleData *DestBundle = UseSD->FirstInBundle;
if (!DestBundle->IsScheduled)
if (!DestBundle->hasValidDependencies())
} else {
// I'm not sure if this can ever happen. But we need to be safe.
// This lets the instruction/bundle never be scheduled and
// eventually disable vectorization.
// Handle the memory dependencies.
ScheduleData *DepDest = BundleMember->NextLoadStore;
if (DepDest) {
Instruction *SrcInst = BundleMember->Inst;
MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
unsigned numAliased = 0;
unsigned DistToSrc = 1;
while (DepDest) {
// We have two limits to reduce the complexity:
// 1) AliasedCheckLimit: It's a small limit to reduce calls to
// SLP->isAliased (which is the expensive part in this loop).
// 2) MaxMemDepDistance: It's for very large blocks and it aborts
// the whole loop (even if the loop is fast, it's quadratic).
// It's important for the loop break condition (see below) to
// check this limit even between two read-only instructions.
if (DistToSrc >= MaxMemDepDistance ||
((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
(numAliased >= AliasedCheckLimit ||
SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
// We increment the counter only if the locations are aliased
// (instead of counting all alias checks). This gives a better
// balance between reduced runtime and accurate dependencies.
ScheduleData *DestBundle = DepDest->FirstInBundle;
if (!DestBundle->IsScheduled) {
if (!DestBundle->hasValidDependencies()) {
DepDest = DepDest->NextLoadStore;
// Example, explaining the loop break condition: Let's assume our
// starting instruction is i0 and MaxMemDepDistance = 3.
// +--------v--v--v
// i0,i1,i2,i3,i4,i5,i6,i7,i8
// +--------^--^--^
// MaxMemDepDistance lets us stop alias-checking at i3 and we add
// dependencies from i0 to i3,i4,.. (even if they are not aliased).
// Previously we already added dependencies from i3 to i6,i7,i8
// (because of MaxMemDepDistance). As we added a dependency from
// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
// and we can abort this loop at i6.
if (DistToSrc >= 2 * MaxMemDepDistance)
BundleMember = BundleMember->NextInBundle;
if (InsertInReadyList && SD->isReady()) {
LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
<< "\n");
/// Clears the scheduled state of every ScheduleData in the current scheduling
/// region [ScheduleStart, ScheduleEnd). Requires that a region exists.
/// NOTE(review): only the reset of IsScheduled is visible here; further
/// resets may be on lines missing from this listing.
void BoUpSLP::BlockScheduling::resetSchedule() {
assert(ScheduleStart &&
"tried to reset schedule on block which has not been scheduled");
for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
doForAllOpcodes(I, [&](ScheduleData *SD) {
assert(isInSchedulingRegion(SD) &&
"ScheduleData not in scheduling region");
SD->IsScheduled = false;
/// Performs the final scheduling of the region tracked by \p BS: assigns
/// priorities that follow the original instruction order, computes the
/// dependencies of every scheduling entity, and then repeatedly picks a ready
/// bundle and places its instructions. Clears BS->ScheduleStart at the end so
/// that the block is not scheduled twice.
/// NOTE(review): the statement guarded by the initial `!BS->ScheduleStart`
/// check is missing from this listing (presumably an early return).
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
if (!BS->ScheduleStart)
LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
// For the real scheduling we use a more sophisticated ready-list: it is
// sorted by the original instruction location. This lets the final schedule
// be as close as possible to the original instruction order.
struct ScheduleDataCompare {
// Orders entries by descending SchedulingPriority.
bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
return SD2->SchedulingPriority < SD1->SchedulingPriority;
std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
// Ensure that all dependency data is updated and fill the ready-list with
// initial instructions.
int Idx = 0;
int NumToSchedule = 0;
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
// The scheduler's bundles must agree with the vectorizer's tree entries.
assert(SD->isPartOfBundle() ==
(getTreeEntry(SD->Inst) != nullptr) &&
"scheduler and vectorizer bundle mismatch");
SD->FirstInBundle->SchedulingPriority = Idx++;
if (SD->isSchedulingEntity()) {
BS->calculateDependencies(SD, false, this);
Instruction *LastScheduledInst = BS->ScheduleEnd;
// Do the "real" scheduling.
while (!ReadyInsts.empty()) {
ScheduleData *picked = *ReadyInsts.begin();
// Move the scheduled instruction(s) to their dedicated places, if not
// there yet.
ScheduleData *BundleMember = picked;
while (BundleMember) {
Instruction *pickedInst = BundleMember->Inst;
if (LastScheduledInst->getNextNode() != pickedInst) {
LastScheduledInst = pickedInst;
BundleMember = BundleMember->NextInBundle;
BS->schedule(picked, ReadyInsts);
assert(NumToSchedule == 0 && "could not schedule all instructions");
// Avoid duplicate scheduling of the block.
BS->ScheduleStart = nullptr;
/// Returns the element width, in bits, to assume when vectorizing a tree
/// rooted at \p V.
///  - For a store this is the width of the stored value.
///  - Otherwise the expression feeding V is searched for loads and the widest
///    loaded type is used; if no load is found, or an unhandled or
///    vector-typed instruction is met, the width of V's own type is used.
/// Results are cached in InstrElementSize for every instruction visited.
unsigned BoUpSLP::getVectorElementSize(Value *V) {
// If V is a store, just return the width of the stored value without
// traversing the expression tree. This is the common case.
if (auto *Store = dyn_cast<StoreInst>(V))
return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
// Reuse a previously computed width for V if there is one.
auto E = InstrElementSize.find(V);
if (E != InstrElementSize.end())
return E->second;
// If V is not a store, we can traverse the expression tree to find loads
// that feed it. The type of the loaded value may indicate a more suitable
// width than V's type. We want to base the vector element size on the width
// of memory operations where possible.
SmallVector<Instruction *, 16> Worklist;
SmallPtrSet<Instruction *, 16> Visited;
if (auto *I = dyn_cast<Instruction>(V)) {
// Traverse the expression tree in bottom-up order looking for loads. If we
// encounter an instruction we don't yet handle, we give up.
auto MaxWidth = 0u;
auto FoundUnknownInst = false;
while (!Worklist.empty() && !FoundUnknownInst) {
auto *I = Worklist.pop_back_val();
// We should only be looking at scalar instructions here. If the current
// instruction has a vector type, give up.
auto *Ty = I->getType();
if (isa<VectorType>(Ty))
FoundUnknownInst = true;
// If the current instruction is a load, update MaxWidth to reflect the
// width of the loaded value.
else if (isa<LoadInst>(I))
MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
// Otherwise, we need to visit the operands of the instruction. We only
// handle the interesting cases from buildTree here. If an operand is an
// instruction we haven't yet visited, we add it to the worklist.
else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
if (Visited.insert(J).second)
// If we don't yet handle the instruction, give up.
FoundUnknownInst = true;
int Width = MaxWidth;
// If we didn't encounter a memory access in the expression tree, or if we
// gave up for some reason, just return the width of V. Otherwise, return the
// maximum width we found.
if (!MaxWidth || FoundUnknownInst)
Width = DL->getTypeSizeInBits(V->getType());
// Cache the result for every instruction seen during the traversal.
for (Instruction *I : Visited)
InstrElementSize[I] = Width;
return Width;
// Determine if a value V in a vectorizable expression Expr can be demoted to a
// smaller type with a truncation. We collect the values that will be demoted
// in ToDemote and additional roots that require investigating in Roots.
//
// Returns true if V can be demoted and false otherwise. The function recurses
// into the operands of binary operators, selects and phis.
// NOTE(review): the statements that append to ToDemote/Roots and the `break`
// / `default:` lines of the switch are missing from this listing; confirm the
// exact control flow upstream.
static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
SmallVectorImpl<Value *> &ToDemote,
SmallVectorImpl<Value *> &Roots) {
// We can always demote constants.
if (isa<Constant>(V)) {
return true;
// If the value is not an instruction in the expression with only one use, it
// cannot be demoted.
auto *I = dyn_cast<Instruction>(V);
if (!I || !I->hasOneUse() || !Expr.count(I))
return false;
switch (I->getOpcode()) {
// We can always demote truncations and extensions. Since truncations can
// seed additional demotion, we save the truncated value.
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
// We can demote certain binary operations if we can demote both of their
// operands.
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
!collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
return false;
// We can demote selects if we can demote their true and false values.
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
!collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
return false;
// We can demote phis if we can demote all their incoming operands. Note that
// we don't need to worry about cycles since we ensure single use above.
case Instruction::PHI: {
PHINode *PN = cast<PHINode>(I);
for (Value *IncValue : PN->incoming_values())
if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
return false;
// Otherwise, conservatively give up.
return false;
// Record the value that we can demote.
return true;
/// Tries to find a narrower integer type in which the vectorizable tree can be
/// computed. On success, every demotable scalar is mapped in MinBWs to a pair
/// of (bit width, needs-sign-extension flag).
/// NOTE(review): the statements guarded by the early-exit checks below
/// (presumably plain returns) are missing from this listing.
void BoUpSLP::computeMinimumValueSizes() {
// If there are no external uses, the expression tree must be rooted by a
// store. We can't demote in-memory values, so there is nothing to do here.
if (ExternalUses.empty())
// We only attempt to truncate integer expressions.
auto &TreeRoot = VectorizableTree[0]->Scalars;
auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
if (!TreeRootIT)
// If the expression is not rooted by a store, these roots should have
// external uses. We will rely on InstCombine to rewrite the expression in
// the narrower type. However, InstCombine only rewrites single-use values.
// This means that if a tree entry other than a root is used externally, it
// must have multiple uses and InstCombine will not rewrite it. The code
// below ensures that only the roots are used externally.
SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
for (auto &EU : ExternalUses)
if (!Expr.erase(EU.Scalar))
if (!Expr.empty())
// Collect the scalar values of the vectorizable expression. We will use this
// context to determine which values can be demoted. If we see a truncation,
// we mark it as seeding another demotion.
for (auto &EntryPtr : VectorizableTree)
Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
// Ensure the roots of the vectorizable tree don't form a cycle. They must
// have a single external user that is not in the vectorizable tree.
for (auto *Root : TreeRoot)
if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
// Conservatively determine if we can actually truncate the roots of the
// expression. Collect the values that can be demoted in ToDemote and
// additional roots that require investigating in Roots.
SmallVector<Value *, 32> ToDemote;
SmallVector<Value *, 4> Roots;
for (auto *Root : TreeRoot)
if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
// The maximum bit width required to represent all the values that can be
// demoted without loss of precision. It would be safe to truncate the roots
// of the expression to this width.
auto MaxBitWidth = 8u;
// We first check if all the bits of the roots are demanded. If they're not,
// we can truncate the roots to this narrower type.
for (auto *Root : TreeRoot) {
auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
MaxBitWidth = std::max<unsigned>(
Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
// True if the roots can be zero-extended back to their original type, rather
// than sign-extended. We know that if the leading bits are not demanded, we
// can safely zero-extend. So we initialize IsKnownPositive to True.
bool IsKnownPositive = true;
// If all the bits of the roots are demanded, we can try a little harder to
// compute a narrower type. This can happen, for example, if the roots are
// getelementptr indices. InstCombine promotes these indices to the pointer
// width. Thus, all their bits are technically demanded even though the
// address computation might be vectorized in a smaller type.
// We start by looking at each entry that can be demoted. We compute the
// maximum bit width required to store the scalar by using ValueTracking to
// compute the number of high-order bits we can truncate.
if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
llvm::all_of(TreeRoot, [](Value *R) {
assert(R->hasOneUse() && "Root should have only one use!");
return isa<GetElementPtrInst>(R->user_back());
})) {
// Start over from the 8-bit minimum and grow as the scalars require.
MaxBitWidth = 8u;
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
KnownBits Known = computeKnownBits(R, *DL);
return Known.isNonNegative();
// Determine the maximum number of bits required to store the scalar
// values.
for (auto *Scalar : ToDemote) {
auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
// If we can't prove that the sign bit is zero, we must add one to the
// maximum bit width to account for the unknown sign bit. This preserves
// the existing sign bit so we can safely sign-extend the root back to the
// original type. Otherwise, if we know the sign bit is zero, we will
// zero-extend the root instead.
// FIXME: This is somewhat suboptimal, as there will be cases where adding
// one to the maximum bit width will yield a larger-than-necessary
// type. In general, we need to add an extra bit only if we can't
// prove that the upper bit of the original type is equal to the
// upper bit of the proposed smaller type. If these two bits are the
// same (either zero or one) we know that sign-extending from the
// smaller type will result in the same value. Here, since we can't
// yet prove this, we are just making the proposed smaller type
// larger to ensure correctness.
if (!IsKnownPositive)
// Round MaxBitWidth up to the next power-of-two.
if (!isPowerOf2_64(MaxBitWidth))
MaxBitWidth = NextPowerOf2(MaxBitWidth);
// If the maximum bit width we compute is less than the width of the roots'
// type, we can proceed with the narrowing. Otherwise, do nothing.
if (MaxBitWidth >= TreeRootIT->getBitWidth())
// If we can truncate the root, we must collect additional values that might
// be demoted as a result. That is, those seeded by truncations we will
// modify.
while (!Roots.empty())
collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
// Finally, map the values we can demote to the maximum bit width we computed.
// The second member of the pair is true when the value has to be
// sign-extended (i.e. it is not known to be non-negative).
for (auto *Scalar : ToDemote)
MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
namespace {
/// The SLPVectorizer Pass.
///
/// A FunctionPass wrapper around SLPVectorizerPass: runOnFunction() gathers
/// the required analyses through getAnalysis<>() and forwards them to
/// SLPVectorizerPass::runImpl().
/// NOTE(review): the bodies of the constructor, doInitialization() and
/// getAnalysisUsage() are incomplete in this listing.
struct SLPVectorizer : public FunctionPass {
/// The implementation that does the actual work.
SLPVectorizerPass Impl;
/// Pass identification, replacement for typeid
static char ID;
explicit SLPVectorizer() : FunctionPass(ID) {
bool doInitialization(Module &M) override {
return false;
/// Runs the vectorizer on \p F unless skipFunction(F) says to skip it.
/// \returns the result of Impl.runImpl(), or false if F was skipped.
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
// TargetLibraryInfo is optional: TLI stays null if the pass is unavailable.
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
void getAnalysisUsage(AnalysisUsage &AU) const override {
} // end anonymous namespace
/// Entry point used with a FunctionAnalysisManager: fetches the analyses the
/// vectorizer needs from \p AM and delegates to runImpl().
/// \returns PreservedAnalyses::all() if the function was not changed,
/// otherwise the PreservedAnalyses set \c PA built below.
/// NOTE(review): the statements that mark analyses as preserved in \c PA are
/// missing from this listing.
PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
// Only a cached TargetLibraryInfo result is requested here, so TLI may be null.
auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
auto *LI = &AM.getResult<LoopAnalysis>(F);
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
return PA;
/// Shared implementation of the SLP vectorizer: stores the supplied analyses
/// in the pass object and then, block by block in post order, tries to
/// vectorize store chains, reductions and getelementptr index computations.
/// \returns true if the function was modified. Returns false without doing
/// any work when RunSLPVectorization is off, when the target reports no
/// vector registers, or when \p F carries the NoImplicitFloat attribute.
/// NOTE(review): the assignments of TTI/TLI and the construction of the
/// BoUpSLP object `R` used below are missing from this listing.
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
LoopInfo *LI_, DominatorTree *DT_,
AssumptionCache *AC_, DemandedBits *DB_,
OptimizationRemarkEmitter *ORE_) {
if (!RunSLPVectorization)
return false;
SE = SE_;
AA = AA_;
LI = LI_;
DT = DT_;
AC = AC_;
DB = DB_;
DL = &F.getParent()->getDataLayout();
bool Changed = false;
// If the target claims to have no vector registers don't attempt
// vectorization.
if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
return false;
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
// Scan the blocks in the function in post order.
for (auto BB : post_order(&F.getEntryBlock())) {
// Vectorize trees that end at stores.
if (!Stores.empty()) {
LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
<< " underlying objects.\n");
Changed |= vectorizeStoreChains(R);
// Vectorize trees that end at reductions.
Changed |= vectorizeChainsInBlock(BB, R);
// Vectorize the index computations of getelementptr instructions. This
// is primarily intended to catch gather-like idioms ending at
// non-consecutive loads.
if (!GEPs.empty()) {
LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
<< " underlying objects.\n");
Changed |= vectorizeGEPIndices(BB, R);
if (Changed) {
LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
return Changed;
/// Tries to vectorize the stores in \p Chain as a single vector whose element
/// count equals Chain.size().
/// \param Idx offset of the chain within its parent list; in the lines shown
///        here it is only used for debug output.
/// \returns true if the chain was vectorized. Returns false when the element
/// size or chain length is unsuitable (not a power of two, fewer than two
/// elements, or below the minimum vectorization factor), when the tree is
/// rejected, or when its cost is not below -SLPCostThreshold.
/// NOTE(review): the calls that build the tree and emit the vector code are
/// missing from this listing.
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned Idx) {
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
<< "\n");
const unsigned Sz = R.getVectorElementSize(Chain[0]);
const unsigned MinVF = R.getMinVecRegSize() / Sz;
unsigned VF = Chain.size();
if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
return false;
LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
<< "\n");
Optional<ArrayRef<unsigned>> Order = R.bestOrder();
// TODO: Handle orders of size less than number of elements in the vector.
if (Order && Order->size() == Chain.size()) {
// TODO: reorder tree nodes without tree rebuilding.
// The initial contents only size the vector; the transform below overwrites
// every element with the chain value selected by Order.
SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend());
llvm::transform(*Order, ReorderedOps.begin(),
[Chain](const unsigned Idx) { return Chain[Idx]; });
if (R.isTreeTinyAndNotFullyVectorizable())
return false;
if (R.isLoadCombineCandidate())
return false;
int Cost = R.getTreeCost();
LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
// Vectorize only if the cost is below the negated threshold.
if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
using namespace ore;
R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
<< "Stores SLP vectorized with cost " << NV("Cost", Cost)
<< " and with tree size "
<< NV("TreeSize", R.getTreeSize()));
return true;
return false;
/// Finds chains of consecutive stores among \p Stores and tries to vectorize
/// them. For every chain, power-of-two sized slices are attempted from the
/// largest size that fits a vector register down to 2 elements.
/// \returns true if at least one slice was vectorized.
/// NOTE(review): several short statements (e.g. `break`/`continue`, the
/// counter increment in the lambda, and the appends to Operands) are missing
/// from this listing.
bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP &R) {
// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.
BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;
int E = Stores.size();
SmallBitVector Tails(E, false);
// ConsecutiveChain[K] is the index of the store that follows store K; the
// value E + 1 means "no successor".
SmallVector<int, 16> ConsecutiveChain(E, E + 1);
int MaxIter = MaxStoreLookup.getValue();
int IterCnt;
// Records Stores[Idx] as the successor of Stores[K] if the two accesses are
// consecutive. Also returns true once the lookup budget MaxIter is used up.
auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
&ConsecutiveChain](int K, int Idx) {
if (IterCnt >= MaxIter)
return true;
if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
return false;
ConsecutiveChain[K] = Idx;
return true;
// Do a quadratic search on all of the given stores in reverse order and find
// all of the pairs of stores that follow each other.
for (int Idx = E - 1; Idx >= 0; --Idx) {
// If a store has multiple consecutive store candidates, search according
// to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
// This is because usually pairing with the immediately succeeding or
// preceding candidate creates the best chance to find an SLP vectorization
// opportunity.
const int MaxLookDepth = std::max(E - Idx, Idx + 1);
IterCnt = 0;
for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
(Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
// For stores that start but don't end a link in the chain:
for (int Cnt = E; Cnt > 0; --Cnt) {
int I = Cnt - 1;
if (ConsecutiveChain[I] == E + 1 || Tails.test(I))
// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.
BoUpSLP::ValueList Operands;
// Collect the chain into a list.
while (I != E + 1 && !VectorizedStores.count(Stores[I])) {
// Move to the next value in the chain.
I = ConsecutiveChain[I];
// If a vector register can't hold 1 element, we are done.
unsigned MaxVecRegSize = R.getMaxVecRegSize();
unsigned EltSize = R.getVectorElementSize(Stores[0]);
if (MaxVecRegSize % EltSize != 0)
unsigned MaxElts = MaxVecRegSize / EltSize;
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
unsigned StartIdx = 0;
for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) {
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
// Skip slices whose first or last store has already been vectorized.
if (!VectorizedStores.count(Slice.front()) &&
!VectorizedStores.count(Slice.back()) &&
vectorizeStoreChain(Slice, R, Cnt)) {
// Mark the vectorized stores so that we don't vectorize them again.
VectorizedStores.insert(Slice.begin(), Slice.end());
Changed = true;
// If we vectorized initial block, no need to try to vectorize it
// again.
if (Cnt == StartIdx)
StartIdx += Size;
Cnt += Size;
// Check if the whole array was vectorized already - exit.
if (StartIdx >= Operands.size())
return Changed;
// Scan every instruction of \p BB once and record vectorization seeds.
// Stores are appended to the Stores map, keyed by the underlying object of
// their pointer operand (GetUnderlyingObject with the data layout *DL).
// Getelementptr instructions are filtered by index count, index constness,
// index type and result type.
// NOTE(review): this listing appears to have lost lines (the statements that
// initialize the collections, the statements guarded by each filter `if`, the
// statement that records an accepted GEP, and the closing braces). Confirm
// against the upstream file before relying on the control flow as shown.
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
// Initialize the collections. We will make a single pass over the block.
// Visit the store and getelementptr instructions in BB and organize them in
// Stores and GEPs according to the underlying objects of their pointer
// operands.
for (Instruction &I : *BB) {
// Ignore store instructions that are volatile or have a pointer operand
// that doesn't point to a scalar type.
if (auto *SI = dyn_cast<StoreInst>(&I)) {
if (!SI->isSimple())
if (!isValidElementType(SI->getValueOperand()->getType()))
Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
// Ignore getelementptr instructions that have more than one index, a
// constant index, or a pointer operand that doesn't point to a scalar
// type.
else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
// Only the first index is inspected; GEPs with more indices are filtered
// by the getNumIndices() check below.
auto Idx = GEP->idx_begin()->get();
if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
if (!isValidElementType(Idx->getType()))
if (GEP->getType()->isVectorTy())
/// Try to vectorize the two scalars \p A and \p B as a single two-element
/// list.
///
/// \param A first scalar; may be null.
/// \param B second scalar; may be null.
/// \param R the SLP tree builder passed through to tryToVectorizeList().
/// \returns false if either value is null, otherwise the result of
///          tryToVectorizeList() with operand reordering allowed.
bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  // Callers pass the results of dyn_cast<> directly, so null is expected here.
  if (!A || !B)
    return false;
  Value *VL[] = {A, B};
  return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
}
// Try to vectorize the scalars in \p VL as one or more bundles.
//
// Returns false early when fewer than two values are given, when
// getSameOpcode() reports no common opcode, or when an element has a type
// rejected by isValidElementType(). Candidate widths start at MaxVF and are
// halved down to MinVF. A bundle is treated as profitable when its tree cost
// (optionally reduced by the cost of its insertelement users) is below
// -SLPCostThreshold.
//
// \param VL           scalars to vectorize.
// \param R            SLP tree builder and cost model.
// \param AllowReorder when true and R.bestOrder() yields an order, the two
//                     operands are swapped and the tree is rebuilt.
// \param InsertUses   optional per-scalar insertelement users; when all of
//                     them are InsertElementInst their insertion cost is
//                     subtracted from the tree cost.
// \returns the Changed flag, i.e. whether any bundle was vectorized.
//
// NOTE(review): this listing appears to have lost lines (closing braces and
// `});` terminators, `continue`/`break`/`else` statements, and some calls).
// Confirm against the upstream file before relying on the control flow shown.
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
bool AllowReorder,
ArrayRef<Value *> InsertUses) {
if (VL.size() < 2)
return false;
LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
<< VL.size() << ".\n");
// Check that all of the parts are instructions of the same type,
// we permit an alternate opcode via InstructionsState.
InstructionsState S = getSameOpcode(VL);
if (!S.getOpcode())
return false;
// I0 is the instruction used as the location for optimization remarks.
Instruction *I0 = cast<Instruction>(S.OpValue);
// Make sure invalid types (including vector type) are rejected before
// determining vectorization factor for scalar instructions.
for (Value *V : VL) {
Type *Ty = V->getType();
if (!isValidElementType(Ty)) {
// NOTE: the following will give user internal llvm type name, which may
// not be useful.
R.getORE()->emit([&]() {
std::string type_str;
llvm::raw_string_ostream rso(type_str);
return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
<< "Cannot SLP vectorize list: type "
<< rso.str() + " is unsupported by vectorizer";
return false;
// Sz is the element size reported for I0; MinVF is at least 2, MaxVF is the
// largest power of two not exceeding VL.size(), but never below MinVF.
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
if (MaxVF < 2) {
R.getORE()->emit([&]() {
return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
<< "Cannot SLP vectorize list: vectorization factor "
<< "less than 2 is not supported";
return false;
bool Changed = false;
bool CandidateFound = false;
// Smallest cost seen so far; only used for the "NotBeneficial" remark.
int MinCost = SLPCostThreshold;
// Cost compensation applies only if every provided user is an
// InsertElementInst.
bool CompensateUseCost =
!InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
return V && isa<InsertElementInst>(V);
assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
"Each scalar expected to have an associated InsertElement user.");
unsigned NextInst = 0, MaxInst = VL.size();
for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
// No actual vectorization should happen, if number of parts is the same as
// provided vectorization factor (i.e. the scalar type is used for vector
// code during codegen).
auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF);
if (TTI->getNumberOfParts(VecTy) == VF)
for (unsigned I = NextInst; I < MaxInst; ++I) {
// Width of the bundle starting at I: VF, or the remaining tail if shorter.
unsigned OpsWidth = 0;
if (I + VF > MaxInst)
OpsWidth = MaxInst - I;
OpsWidth = VF;
if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
// Check that a previous iteration of this loop did not delete the Value.
if (llvm::any_of(Ops, [&R](Value *V) {
auto *I = dyn_cast<Instruction>(V);
return I && R.isDeleted(I);
LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
<< "\n");
Optional<ArrayRef<unsigned>> Order = R.bestOrder();
// TODO: check if we can allow reordering for more cases.
if (AllowReorder && Order) {
// TODO: reorder tree nodes without tree rebuilding.
// Conceptually, there is nothing actually preventing us from trying to
// reorder a larger list. In fact, we do exactly this when vectorizing
// reductions. However, at this point, we only expect to get here when
// there are exactly two operations.
assert(Ops.size() == 2);
Value *ReorderedOps[] = {Ops[1], Ops[0]};
R.buildTree(ReorderedOps, None);
if (R.isTreeTinyAndNotFullyVectorizable())
int Cost = R.getTreeCost();
CandidateFound = true;
if (CompensateUseCost) {
// TODO: Use TTI's getScalarizationOverhead for sequence of inserts
// rather than sum of single inserts as the latter may overestimate
// cost. This work should imply improving cost estimation for extracts
// that are added in for external (for vectorization tree) users, i.e.
// that part should also switch to same interface.
// For example, the following case is projected code after SLP:
// %4 = extractelement <4 x i64> %3, i32 0
// %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
// %5 = extractelement <4 x i64> %3, i32 1
// %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
// %6 = extractelement <4 x i64> %3, i32 2
// %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
// %7 = extractelement <4 x i64> %3, i32 3
// %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
// Extracts here added by SLP in order to feed users (the inserts) of
// original scalars and contribute to "ExtractCost" at cost evaluation.
// The inserts in turn form sequence to build an aggregate that is
// detected by findBuildAggregate routine.
// SLP makes an assumption that such sequence will be optimized away
// later (instcombine) so it tries to compensate ExtractCost with
// cost of insert sequence.
// Current per element cost calculation approach is not quite accurate
// and tends to create bias toward favoring vectorization.
// Switching to the TTI interface might help a bit.
// Alternative solution could be pattern-match to detect a no-op or
// shuffle.
unsigned UserCost = 0;
for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
// Only inserts with a constant lane index contribute to the compensation.
if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
UserCost += TTI->getVectorInstrCost(
Instruction::InsertElement, IE->getType(), CI->getZExtValue());
LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
<< ".\n");
Cost -= UserCost;
MinCost = std::min(MinCost, Cost);
if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
<< "SLP vectorized with cost " << ore::NV("Cost", Cost)
<< " and with tree size "
<< ore::NV("TreeSize", R.getTreeSize()));
// Move to the next bundle.
I += VF - 1;
NextInst = I + 1;
Changed = true;
// Report why nothing was vectorized: either every candidate was too
// expensive, or no candidate could be built at all.
if (!Changed && CandidateFound) {
R.getORE()->emit([&]() {
return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
<< "List vectorization was possible but not beneficial with cost "
<< ore::NV("Cost", MinCost) << " >= "
<< ore::NV("Treshold", -SLPCostThreshold);
} else if (!Changed) {
R.getORE()->emit([&]() {
return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
<< "Cannot SLP vectorize list: vectorization was impossible"
<< " with available vectorization factors";
return Changed;
/// Try to vectorize the two operands of \p I as a pair.
///
/// Only binary operators and compare instructions are considered, and both
/// operands must be instructions in the same basic block as \p I. If the
/// direct pair (Op0, Op1) cannot be vectorized, one operand is "skipped":
/// when it is a single-use binary operator, each of its own binary-operator
/// operands (in the same block) is paired with the other side instead.
///
/// \param I the candidate root instruction; may be null.
/// \param R the SLP tree builder passed through to tryToVectorizePair().
/// \returns true as soon as any of the attempted pairs is vectorized.
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
    return false;

  // Try to vectorize V.
  if (tryToVectorizePair(Op0, Op1, R))
    return true;

  // Either cast may yield null; tryToVectorizePair() rejects null operands.
  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);

  // Try to skip B.
  if (B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
      return true;
    if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
      return true;
  }

  // Try to skip A. This is independent of the "skip B" attempt above and must
  // not be nested inside it.
  if (A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
      return true;
    if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
      return true;
  }
  return false;
}
/// Generate a shuffle mask to be used in a reduction tree.
///
/// The mask has \p VecLen entries. The first \p NumEltsToRdx entries are
/// filled in and every remaining entry is -1 (undef).
///
/// \param VecLen The length of the vector to be reduced.
/// \param NumEltsToRdx The number of elements that should be reduced in the
///        vector.
/// \param IsPairwise Whether the reduction is a pairwise or splitting
///        reduction. A pairwise reduction will generate a mask of
///        <0,2,...> or <1,3,..> while a splitting reduction will generate
///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
/// \param IsLeft True will generate a mask of even elements, odd otherwise.
///        Must be false for a splitting reduction.
/// \returns the shuffle mask.
static SmallVector<int, 32> createRdxShuffleMask(unsigned VecLen,
                                                 unsigned NumEltsToRdx,
                                                 bool IsPairwise, bool IsLeft) {
  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");

  SmallVector<int, 32> ShuffleMask(VecLen, -1);

  if (IsPairwise) {
    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = 2 * i + !IsLeft;
  } else {
    // Move the upper half of the vector to the lower half. This must only run
    // for the splitting form; otherwise it would overwrite the pairwise mask.
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = NumEltsToRdx + i;
  }

  return ShuffleMask;
}
namespace {
/// Model horizontal reductions.
/// A horizontal reduction is a tree of reduction operations (currently add and
/// fadd) that has operations that can be put into a vector as its leaf.
/// For example, this tree:
/// mul mul mul mul
/// \ / \ /
/// + +
/// \ /
/// +
/// This tree has "mul" as its reduced values and "+" as its reduction
/// operations. A reduction might be feeding into a store or a binary operation
/// feeding a phi.
/// ...
/// \ /
/// +
/// |
/// phi +=
/// Or:
/// ...
/// \ /
/// +
/// |
/// *p =
class HorizontalReduction {
using ReductionOpsType = SmallVector<Value *, 16>;
using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
ReductionOpsListType ReductionOps;
SmallVector<Value *, 32> ReducedVals;
// Use map vector to make stable output.
MapVector<Instruction *, Value *> ExtraArgs;
/// Kind of the reduction data.
// NOTE(review): the closing `};` of this enum is not present in this listing.
enum ReductionKind {
RK_None, ///< Not a reduction.
RK_Arithmetic, ///< Binary reduction data.
RK_Min, ///< Minimum reduction data.
RK_UMin, ///< Unsigned minimum reduction data.
RK_Max, ///< Maximum reduction data.
RK_UMax, ///< Unsigned maximum reduction data.
/// Contains info about operation, like its opcode, left and right operands.
/// An instance describes either a reduction operation (Kind != RK_None with
/// both LHS and RHS set) or a reduced value (opcode only, Kind == RK_None).
// NOTE(review): this listing appears to have lost lines throughout the class
// (access specifiers, closing braces, `break`/`else` statements and the tails
// of several expressions). Confirm against the upstream file before relying
// on the control flow as shown.
class OperationData {
/// Opcode of the instruction.
unsigned Opcode = 0;
/// Left operand of the reduction operation.
Value *LHS = nullptr;
/// Right operand of the reduction operation.
Value *RHS = nullptr;
/// Kind of the reduction operation.
ReductionKind Kind = RK_None;
/// True if floating point min/max reduction has no NaNs.
bool NoNaN = false;
/// Checks if the reduction operation can be vectorized.
/// Requires both operands to be set and the (Kind, Opcode) pair to be one of
/// the supported combinations listed below.
bool isVectorizable() const {
return LHS && RHS &&
// We currently only support add/mul/logical && min/max reductions.
((Kind == RK_Arithmetic &&
(Opcode == Instruction::Add || Opcode == Instruction::FAdd ||
Opcode == Instruction::Mul || Opcode == Instruction::FMul ||
Opcode == Instruction::And || Opcode == Instruction::Or ||
Opcode == Instruction::Xor)) ||
((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
(Kind == RK_Min || Kind == RK_Max)) ||
(Opcode == Instruction::ICmp &&
(Kind == RK_UMin || Kind == RK_UMax)));
/// Creates reduction operation with the current opcode.
/// Arithmetic kinds produce a binary operator; min/max kinds produce a
/// compare followed by a select of LHS/RHS.
Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
assert(isVectorizable() &&
"Expected add|fadd or min/max reduction operation.");
Value *Cmp = nullptr;
switch (Kind) {
case RK_Arithmetic:
return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
case RK_Min:
Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
: Builder.CreateFCmpOLT(LHS, RHS);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_Max:
Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
: Builder.CreateFCmpOGT(LHS, RHS);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_UMin:
assert(Opcode == Instruction::ICmp && "Expected integer types.");
Cmp = Builder.CreateICmpULT(LHS, RHS);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_UMax:
assert(Opcode == Instruction::ICmp && "Expected integer types.");
Cmp = Builder.CreateICmpUGT(LHS, RHS);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
case RK_None:
llvm_unreachable("Unknown reduction operation.");
explicit OperationData() = default;
/// Construction for reduced values. They are identified by opcode only and
/// don't have associated LHS/RHS values.
/// If \p V is not an instruction the opcode stays 0.
explicit OperationData(Value *V) {
if (auto *I = dyn_cast<Instruction>(V))
Opcode = I->getOpcode();
/// Constructor for reduction operations with opcode and its left and
/// right operands.
OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind,
bool NoNaN = false)
: Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
assert(Kind != RK_None && "One of the reduction operations is expected.");
/// True when an opcode has been set (Opcode != 0).
explicit operator bool() const { return Opcode; }
/// Return true if this operation is any kind of minimum or maximum.
bool isMinMax() const {
switch (Kind) {
case RK_Arithmetic:
return false;
case RK_Min:
case RK_Max:
case RK_UMin:
case RK_UMax:
return true;
case RK_None:
llvm_unreachable("Reduction kind is not set");
/// Get the index of the first operand.
/// Min/max operations start at operand 1; all other kinds start at 0.
unsigned getFirstOperandIndex() const {
assert(!!*this && "The opcode is not set.");
// We allow calling this before 'Kind' is set, so handle that specially.
if (Kind == RK_None)
return 0;
return isMinMax() ? 1 : 0;
/// Total number of operands in the reduction operation.
/// Returns 3 for min/max operations and 2 otherwise.
unsigned getNumberOfOperands() const {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
return isMinMax() ? 3 : 2;
/// Checks if the operation has the same parent as \p P.
/// For a min/max reduction operation the select's condition instruction must
/// be in \p P as well.
bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
if (!IsRedOp)
return I->getParent() == P;
if (isMinMax()) {
// SelectInst must be used twice while the condition op must have single
// use only.
auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
return I->getParent() == P && Cmp && Cmp->getParent() == P;
// Arithmetic reduction operation must be used once only.
return I->getParent() == P;
/// Expected number of uses for reduction operations/reduced values.
bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
if (isMinMax())
return I->hasNUses(2) &&
(!IsReductionOp ||
return I->hasOneUse();
/// Initializes the list of reduction operations.
/// The list is sized to 2 entries for min/max and 1 entry in the other
/// assignment below.
void initReductionOps(ReductionOpsListType &ReductionOps) {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
if (isMinMax())
ReductionOps.assign(2, ReductionOpsType());
ReductionOps.assign(1, ReductionOpsType());
/// Add all reduction operations for the reduction instruction \p I.
// NOTE(review): both branch bodies are empty in this listing.
void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
assert(Kind != RK_None && !!*this && LHS && RHS &&
"Expected reduction operation.");
if (isMinMax()) {
} else {
/// Checks if instruction is associative and can be vectorized.
bool isAssociative(Instruction *I) const {
assert(Kind != RK_None && *this && LHS && RHS &&
"Expected reduction operation.");
switch (Kind) {
case RK_Arithmetic:
return I->isAssociative();
case RK_Min:
case RK_Max:
return Opcode == Instruction::ICmp ||
case RK_UMin:
case RK_UMax:
assert(Opcode == Instruction::ICmp &&
"Only integer compare operation is expected.");
return true;
case RK_None:
llvm_unreachable("Reduction kind is not set");
/// Checks if the reduction operation can be vectorized.
/// Combines the opcode/kind check with the associativity check for \p I.
bool isVectorizable(Instruction *I) const {
return isVectorizable() && isAssociative(I);
/// Checks if two operation data are both a reduction op or both a reduced
/// value.
/// Equality compares Kind and Opcode only; the operands are not compared.
bool operator==(const OperationData &OD) const {
assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
"One of the comparing operations is incorrect.");
return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
bool operator!=(const OperationData &OD) const { return !(*this == OD); }
/// Resets every field to its default-constructed value.
void clear() {
Opcode = 0;
LHS = nullptr;
RHS = nullptr;
Kind = RK_None;
NoNaN = false;
/// Get the opcode of the reduction operation.
unsigned getOpcode() const {
assert(isVectorizable() && "Expected vectorizable operation.");
return Opcode;
/// Get kind of reduction data.
ReductionKind getKind() const { return Kind; }
/// Get the left operand (may be null for reduced values).
Value *getLHS() const { return LHS; }
/// Get the right operand (may be null for reduced values).
Value *getRHS() const { return RHS; }
/// Compare-result type for min/max operations, nullptr otherwise.
Type *getConditionType() const {
return isMinMax() ? CmpInst::makeCmpResultType(LHS->getType()) : nullptr;
/// Creates reduction operation with the current opcode with the IR flags
/// from \p ReductionOps.
/// For min/max, flags for the compare come from ReductionOps[0] and flags
/// for the select from ReductionOps[1].
Value *createOp(IRBuilder<> &Builder, const Twine &Name,
const ReductionOpsListType &ReductionOps) const {
assert(isVectorizable() &&
"Expected add|fadd or min/max reduction operation.");
auto *Op = createOp(Builder, Name);
switch (Kind) {
case RK_Arithmetic:
propagateIRFlags(Op, ReductionOps[0]);
return Op;
case RK_Min:
case RK_Max:
case RK_UMin:
case RK_UMax:
if (auto *SI = dyn_cast<SelectInst>(Op))
propagateIRFlags(SI->getCondition(), ReductionOps[0]);
propagateIRFlags(Op, ReductionOps[1]);
return Op;
case RK_None:
llvm_unreachable("Unknown reduction operation.");
/// Creates reduction operation with the current opcode with the IR flags
/// from \p I.
Value *createOp(IRBuilder<> &Builder, const Twine &Name,
Instruction *I) const {
assert(isVectorizable() &&
"Expected add|fadd or min/max reduction operation.");
auto *Op = createOp(Builder, Name);
switch (Kind) {
case RK_Arithmetic:
propagateIRFlags(Op, I);
return Op;
case RK_Min:
case RK_Max:
case RK_UMin:
case RK_UMax:
if (auto *SI = dyn_cast<SelectInst>(Op)) {
propagateIRFlags(Op, I);
return Op;
case RK_None:
llvm_unreachable("Unknown reduction operation.");
/// Builds the TargetTransformInfo reduction flags for this operation:
/// NoNaN is copied from the member, IsSigned/IsMaxOp are derived from Kind
/// and Opcode.
TargetTransformInfo::ReductionFlags getFlags() const {
TargetTransformInfo::ReductionFlags Flags;
Flags.NoNaN = NoNaN;
switch (Kind) {
case RK_Arithmetic:
case RK_Min:
Flags.IsSigned = Opcode == Instruction::ICmp;
Flags.IsMaxOp = false;
case RK_Max:
Flags.IsSigned = Opcode == Instruction::ICmp;
Flags.IsMaxOp = true;
case RK_UMin:
Flags.IsSigned = false;
Flags.IsMaxOp = false;
case RK_UMax:
Flags.IsSigned = false;
Flags.IsMaxOp = true;
case RK_None:
llvm_unreachable("Reduction kind is not set");
return Flags;
/// Root instruction of the matched reduction tree; assigned in
/// matchAssociativeReduction() and used as the IRBuilder insertion point in
/// tryToReduce().
WeakTrackingVH ReductionRoot;
/// The operation data of the reduction operation.
OperationData ReductionData;
/// The operation data of the values we perform a reduction on.
OperationData ReducedValueData;
/// Should we model this reduction as a pairwise reduction tree or a tree that
/// splits the vector in halves and adds those halves.
/// Updated by getReductionCost() based on which form is cheaper.
bool IsPairwiseReduction = false;
/// Checks if the ParentStackElem.first should be marked as a reduction
/// operation with an extra argument or as extra argument itself.
/// \param ParentStackElem traversal stack entry (instruction, next operand
///        index) of the parent operation.
/// \param ExtraArg the operand that is not part of the reduction tree.
// NOTE(review): the closing braces of this function are not present in this
// listing.
void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
Value *ExtraArg) {
if (ExtraArgs.count(ParentStackElem.first)) {
// A null entry marks the whole parent instruction as an extra argument.
ExtraArgs[ParentStackElem.first] = nullptr;
// We ran into something like:
// ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
// The whole ParentStackElem.first should be considered as an extra value
// in this case.
// Do not perform analysis of remaining operands of ParentStackElem.first
// instruction, this whole instruction is an extra argument.
ParentStackElem.second = ParentStackElem.first->getNumOperands();
} else {
// We ran into something like:
// ParentStackElem.first += ... + ExtraArg + ...
ExtraArgs[ParentStackElem.first] = ExtraArg;
// Classify \p V for reduction matching.
// Returns a default-constructed OperationData for a null value, a reduction
// operation description (opcode, operands, kind) for a binary operator or a
// recognized min/max select, and an opcode-only OperationData(V) otherwise.
// NOTE(review): this listing appears to have lost lines (the tails of several
// `return OperationData(...)` calls and `if` conditions, the `default:` label
// of the switch, and closing braces). Confirm against the upstream file.
static OperationData getOperationData(Value *V) {
if (!V)
return OperationData();
Value *LHS;
Value *RHS;
if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
if (auto *Select = dyn_cast<SelectInst>(V)) {
// Look for a min/max pattern.
if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
} else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
} else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(
Instruction::FCmp, LHS, RHS, RK_Min,
} else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
} else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
} else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
return OperationData(
Instruction::FCmp, LHS, RHS, RK_Max,
} else {
// Try harder: look for min/max pattern based on instructions producing
// same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
// During the intermediate stages of SLP, it's very common to have
// pattern like this (since optimizeGatherSequence is run only once
// at the end):
// %1 = extractelement <2 x i32> %a, i32 0
// %2 = extractelement <2 x i32> %a, i32 1
// %cond = icmp sgt i32 %1, %2
// %3 = extractelement <2 x i32> %a, i32 0
// %4 = extractelement <2 x i32> %a, i32 1
// %select = select i1 %cond, i32 %3, i32 %4
CmpInst::Predicate Pred;
Instruction *L1;
Instruction *L2;
LHS = Select->getTrueValue();
RHS = Select->getFalseValue();
Value *Cond = Select->getCondition();
// TODO: Support inverse predicates.
// Three shapes are tried: the compare's first operand is the select's true
// value, its second operand is the select's false value, or neither.
if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
if (!isa<ExtractElementInst>(RHS) ||
return OperationData(V);
} else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
if (!isa<ExtractElementInst>(LHS) ||
return OperationData(V);
} else {
if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
return OperationData(V);
if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
!L1->isIdenticalTo(cast<Instruction>(LHS)) ||
return OperationData(V);
// Map the compare predicate onto the reduction kind.
switch (Pred) {
return OperationData(V);
case CmpInst::ICMP_ULT:
case CmpInst::ICMP_ULE:
return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
case CmpInst::ICMP_SLT:
case CmpInst::ICMP_SLE:
return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
case CmpInst::FCMP_OLT:
case CmpInst::FCMP_OLE:
case CmpInst::FCMP_ULT:
case CmpInst::FCMP_ULE:
return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
case CmpInst::ICMP_UGT:
case CmpInst::ICMP_UGE:
return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
case CmpInst::ICMP_SGT:
case CmpInst::ICMP_SGE:
return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
case CmpInst::FCMP_OGT:
case CmpInst::FCMP_OGE:
case CmpInst::FCMP_UGT:
case CmpInst::FCMP_UGE:
return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
return OperationData(V);
HorizontalReduction() = default;
/// Try to find a reduction tree.
/// \param Phi optional PHI node that uses \p B; when it is one operand of
///        \p B, matching restarts at the other operand.
/// \param B candidate root of the reduction.
/// \returns false when the root is not a vectorizable reduction operation,
///          has an unsupported type, or would itself be an extra argument;
///          true once the traversal completes.
// NOTE(review): this listing appears to have lost lines (closing braces,
// `continue`/`else` statements and several calls, e.g. after
// `if (IsReducedValue)` and after "Retract."). Confirm against the upstream
// file before relying on the control flow as shown.
bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&
"Thi phi needs to use the binary operator");
ReductionData = getOperationData(B);
// We could have an initial reduction that is not an add.
// r *= v1 + v2 + v3 + v4
// In such a case start looking for a tree rooted in the first '+'.
if (Phi) {
if (ReductionData.getLHS() == Phi) {
Phi = nullptr;
B = dyn_cast<Instruction>(ReductionData.getRHS());
ReductionData = getOperationData(B);
} else if (ReductionData.getRHS() == Phi) {
Phi = nullptr;
B = dyn_cast<Instruction>(ReductionData.getLHS());
ReductionData = getOperationData(B);
if (!ReductionData.isVectorizable(B))
return false;
Type *Ty = B->getType();
if (!isValidElementType(Ty))
return false;
if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy())
return false;
ReductionRoot = B;
// Post order traverse the reduction tree starting at B. We only handle true
// trees containing only binary operators.
// Each stack entry is (instruction, index of the next operand to visit).
SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex()));
while (!Stack.empty()) {
Instruction *TreeN = Stack.back().first;
unsigned EdgeToVist = Stack.back().second++;
OperationData OpData = getOperationData(TreeN);
bool IsReducedValue = OpData != ReductionData;
// Postorder visit.
if (IsReducedValue || EdgeToVist == OpData.getNumberOfOperands()) {
if (IsReducedValue)
else {
auto I = ExtraArgs.find(TreeN);
if (I != ExtraArgs.end() && !I->second) {
// Check if TreeN is an extra argument of its parent operation.
if (Stack.size() <= 1) {
// TreeN can't be an extra argument as it is a root reduction
// operation.
return false;
// Yes, TreeN is an extra argument, do not add it to a list of
// reduction operations.
// Stack[Stack.size() - 2] always points to the parent operation.
markExtraArg(Stack[Stack.size() - 2], TreeN);
} else
ReductionData.addReductionOps(TreeN, ReductionOps);
// Retract.
// Visit left or right.
Value *NextV = TreeN->getOperand(EdgeToVist);
if (NextV != Phi) {
auto *I = dyn_cast<Instruction>(NextV);
OpData = getOperationData(I);
// Continue analysis if the next operand is a reduction operation or
// (possibly) a reduced value. If the reduced value opcode is not set,
// the first met operation != reduction operation is considered as the
// reduced value class.
if (I && (!ReducedValueData || OpData == ReducedValueData ||
OpData == ReductionData)) {
const bool IsReductionOperation = OpData == ReductionData;
// Only handle trees in the current basic block.
if (!ReductionData.hasSameParent(I, B->getParent(),
IsReductionOperation)) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
// Each tree node needs to have minimal number of users except for the
// ultimate reduction.
if (!ReductionData.hasRequiredNumberOfUses(I,
OpData == ReductionData) &&
I != B) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
if (IsReductionOperation) {
// We need to be able to reassociate the reduction operations.
if (!OpData.isAssociative(I)) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
} else if (ReducedValueData &&
ReducedValueData != OpData) {
// Make sure that the opcodes of the operations that we are going to
// reduce match.
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
} else if (!ReducedValueData)
ReducedValueData = OpData;
Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex()));
// NextV is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), NextV);
return true;
/// Attempt to vectorize the tree found by
/// matchAssociativeReduction.
/// Reduced values are consumed in power-of-two sized chunks (ReduxWidth);
/// each profitable chunk is vectorized and reduced, and the partial results
/// are combined with the reduction operation. Leftover scalars and extra
/// arguments are folded into the final value.
/// \param V the SLP tree builder used to build, cost and vectorize each chunk.
/// \param TTI target cost model.
/// \returns true if a vectorized reduction was produced; false when there
///          are fewer than 4 reduced values or nothing was vectorized.
// NOTE(review): this listing appears to have lost lines (closing braces and
// `});` terminators, `break`/`continue` statements, loop bodies, and the
// tails of several expressions). Confirm against the upstream file before
// relying on the control flow as shown.
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
if (ReducedVals.empty())
return false;
// If there is a sufficient number of reduction values, reduce
// to a nearby power-of-2. Can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
unsigned NumReducedVals = ReducedVals.size();
if (NumReducedVals < 4)
return false;
unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
// Accumulates the combined result of all vectorized chunks; stays null if no
// chunk was vectorized.
Value *VectorizedTree = nullptr;
// FIXME: Fast-math-flags should be set based on the instructions in the
// reduction (not all of 'fast' are required).
IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
FastMathFlags Unsafe;
// Index of the first reduced value that has not been consumed yet.
unsigned i = 0;
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
// The same extra argument may be used several times, so log each attempt
// to use it.
for (auto &Pair : ExtraArgs) {
assert(Pair.first && "DebugLoc must be set.");
// The compare instruction of a min/max is the insertion point for new
// instructions and may be replaced with a new compare instruction.
auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
assert(isa<SelectInst>(RdxRootInst) &&
"Expected min/max reduction to have select root instruction");
Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
assert(isa<Instruction>(ScalarCond) &&
"Expected min/max reduction to have compare condition");
return cast<Instruction>(ScalarCond);
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
// All scalar reduction operations are passed to buildTree() as values to
// ignore.
SmallVector<Value *, 16> IgnoreList;
for (auto &V : ReductionOps)
IgnoreList.append(V.begin(), V.end());
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
V.buildTree(VL, ExternallyUsedValues, IgnoreList);
Optional<ArrayRef<unsigned>> Order = V.bestOrder();
// TODO: Handle orders of size less than number of elements in the vector.
if (Order && Order->size() == VL.size()) {
// TODO: reorder tree nodes without tree rebuilding.
SmallVector<Value *, 4> ReorderedOps(VL.size());
llvm::transform(*Order, ReorderedOps.begin(),
[VL](const unsigned Idx) { return VL[Idx]; });
V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
if (V.isTreeTinyAndNotFullyVectorizable())
if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode()))
// Estimate cost.
int TreeCost = V.getTreeCost();
int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth);
int Cost = TreeCost + ReductionCost;
if (Cost >= -SLPCostThreshold) {
// NOTE(review): the two adjacent string pieces below are emitted without a
// separating space ("...is possiblebut not beneficial...").
V.getORE()->emit([&]() {
return OptimizationRemarkMissed(
SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
<< "Vectorizing horizontal reduction is possible"
<< "but not beneficial with cost "
<< ore::NV("Cost", Cost) << " and threshold "
<< ore::NV("Threshold", -SLPCostThreshold);
LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
<< Cost << ". (HorRdx)\n");
V.getORE()->emit([&]() {
return OptimizationRemark(
SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
<< "Vectorized horizontal reduction with cost "
<< ore::NV("Cost", Cost) << " and with tree size "
<< ore::NV("TreeSize", V.getTreeSize());
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
// Emit a reduction. For min/max, the root is a select, but the insertion
// point is the compare condition of that select.
Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
if (ReductionData.isMinMax())
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
// Combine this chunk's result with the results of earlier chunks.
if (VectorizedTree) {
OperationData VectReductionData(ReductionData.getOpcode(),
VectorizedTree, ReducedSubTree,
VectorizedTree =
VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
} else
VectorizedTree = ReducedSubTree;
// Advance past the consumed chunk and pick the next power-of-two width.
i += ReduxWidth;
ReduxWidth = PowerOf2Floor(NumReducedVals - i);
if (VectorizedTree) {
// Finish the reduction.
for (; i < NumReducedVals; ++i) {
auto *I = cast<Instruction>(ReducedVals[i]);
OperationData VectReductionData(ReductionData.getOpcode(),
VectorizedTree, I,
VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
for (auto &Pair : ExternallyUsedValues) {
// Add each externally used value to the final reduction.
for (auto *I : Pair.second) {
OperationData VectReductionData(ReductionData.getOpcode(),
VectorizedTree, Pair.first,
VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I);
// Update users. For a min/max reduction that ends with a compare and
// select, we also have to RAUW for the compare instruction feeding the
// reduction root. That's because the original compare may have extra uses
// besides the final select of the reduction.
if (ReductionData.isMinMax()) {
if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) {
Instruction *ScalarCmp =
// Mark all scalar reduction ops for deletion, they are replaced by the
// vector reductions.
return VectorizedTree != nullptr;
/// Returns the number of entries currently stored in ReducedVals.
unsigned numReductionValues() const {
return ReducedVals.size();
/// Calculate the cost of a reduction.
/// Builds a fixed vector type of \p ReduxWidth elements of the type of
/// \p FirstReducedVal, obtains a pairwise and a splitting vector reduction
/// cost from \p TTI, records in IsPairwiseReduction whether the pairwise cost
/// is strictly lower, and returns the chosen vector cost minus the scalar
/// cost. The result is therefore negative exactly when the vector estimate is
/// lower than the scalar estimate.
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, ReduxWidth);
int PairwiseRdxCost;
int SplittingRdxCost;
// First switch: vector-side cost, computed once per reduction form.
switch (ReductionData.getKind()) {
case RK_Arithmetic:
PairwiseRdxCost =
TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
SplittingRdxCost =
TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
case RK_Min:
case RK_Max:
case RK_UMin:
case RK_UMax: {
// Min/max cost queries also need the vector type of the compare result.
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy));
bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
ReductionData.getKind() == RK_UMax;
PairwiseRdxCost =
TTI->getMinMaxReductionCost(VecTy, VecCondTy,
/*IsPairwiseForm=*/true, IsUnsigned);
SplittingRdxCost =
TTI->getMinMaxReductionCost(VecTy, VecCondTy,
/*IsPairwiseForm=*/false, IsUnsigned);
case RK_None:
llvm_unreachable("Expected arithmetic or min/max reduction operation");
// Ties go to the splitting form: pairwise is chosen only when strictly
// cheaper. The choice is stored in the IsPairwiseReduction member, which
// emitReduction() reads later.
IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
// Second switch: cost of a single scalar reduction step.
int ScalarReduxCost = 0;
switch (ReductionData.getKind()) {
case RK_Arithmetic:
ScalarReduxCost =
TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
case RK_Min:
case RK_Max:
case RK_UMin:
case RK_UMax:
// A scalar min/max step is costed as a compare plus a select.
ScalarReduxCost =
TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
case RK_None:
llvm_unreachable("Expected arithmetic or min/max reduction operation");
// Scale the single-step cost by ReduxWidth - 1 (presumably one scalar
// operation per combined pair of values).
ScalarReduxCost *= (ReduxWidth - 1);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
<< " for reduction that starts with " << *FirstReducedVal
<< " (It is a "
<< (IsPairwiseReduction ? "pairwise" : "splitting")
<< " reduction)\n");
return VecReduxCost - ScalarReduxCost;
/// Emit a horizontal reduction of the vectorized value.
/// \p VectorizedValue must be non-null and \p ReduxWidth must be a power of
/// two (both are asserted). When IsPairwiseReduction is false the work is
/// delegated to createSimpleTargetReduction; otherwise a sequence of shuffles
/// and reduction operations is emitted through \p Builder and element 0 of
/// the final vector is extracted and returned.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
unsigned ReduxWidth, const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
if (!IsPairwiseReduction) {
// FIXME: The builder should use an FMF guard. It should not be hard-coded
// to 'fast'.
assert(Builder.getFastMathFlags().isFast() && "Expected 'fast' FMF");
return createSimpleTargetReduction(
Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
ReductionData.getFlags(), ReductionOps.back());
// Pairwise path: i starts at ReduxWidth / 2 and is halved every iteration
// until it reaches 0; each round combines a "left" and a "right" shuffle of
// the running vector.
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
auto LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true);
auto RightMask = createRdxShuffleMask(ReduxWidth, i, true, false);
Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf,
RightShuf, ReductionData.getKind());
TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
// The result is in the first element of the vector.
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
} // end anonymous namespace
/// Recognize construction of vectors like
/// %ra = insertelement <4 x float> undef, float %s0, i32 0
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
/// starting from the last insertelement or insertvalue instruction.
/// Also recognize aggregates like {<2 x float>, <2 x float>},
/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
/// The insert chain is walked from the last insert backwards, so both
/// \p BuildVectorOpds and \p InsertElts are reversed right before a
/// successful return.
/// \return true if it matches.
static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
SmallVectorImpl<Value *> &BuildVectorOpds,
SmallVectorImpl<Value *> &InsertElts) {
assert((isa<InsertElementInst>(LastInsertInst) ||
isa<InsertValueInst>(LastInsertInst)) &&
"Expected insertelement or insertvalue instruction!");
// Each iteration peels one insert: InsertedOperand is the value being
// inserted and LastInsertInst is advanced to the aggregate/vector operand
// the insert was applied to.
do {
Value *InsertedOperand;
auto *IE = dyn_cast<InsertElementInst>(LastInsertInst);
if (IE) {
InsertedOperand = IE->getOperand(1);
LastInsertInst = IE->getOperand(0);
} else {
auto *IV = cast<InsertValueInst>(LastInsertInst);
InsertedOperand = IV->getInsertedValueOperand();
LastInsertInst = IV->getAggregateOperand();
// The inserted value is itself produced by an insert: recurse into it with
// temporary output vectors so the nested aggregate is flattened.
if (isa<InsertElementInst>(InsertedOperand) ||
isa<InsertValueInst>(InsertedOperand)) {
SmallVector<Value *, 8> TmpBuildVectorOpds;
SmallVector<Value *, 8> TmpInsertElts;
if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
return false;
// The nested results are appended in reverse order; the final
// std::reverse below puts them back in forward order.
InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend());
} else {
if (isa<UndefValue>(LastInsertInst))
// Give up when the next link of the chain is not an insert instruction.
if ((!isa<InsertValueInst>(LastInsertInst) &&
!isa<InsertElementInst>(LastInsertInst)) ||
return false;
} while (true);
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
std::reverse(InsertElts.begin(), InsertElts.end());
return true;
/// Ordering predicate that compares the Type pointers of \p V and \p V2 with
/// '<'. It orders by pointer value only, which is enough to make values of
/// the same type adjacent after sorting.
static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
/// Try and get a reduction value from a phi node.
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
/// Only incoming slots 0 and 1 of \p P are inspected.
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
BasicBlock *ParentBB, LoopInfo *LI) {
// There are situations where the reduction value is not dominated by the
// reduction phi. Vectorizing such cases has been reported to cause
// miscompiles. See PR25787.
// The predicate holds only for Instructions whose parent block is dominated
// by the block containing P.
auto DominatedReduxValue = [&](Value *R) {
return isa<Instruction>(R) &&
DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
Value *Rdx = nullptr;
// Return the incoming value if it comes from the same BB as the phi node.
if (P->getIncomingBlock(0) == ParentBB) {
Rdx = P->getIncomingValue(0);
} else if (P->getIncomingBlock(1) == ParentBB) {
Rdx = P->getIncomingValue(1);
if (Rdx && DominatedReduxValue(Rdx))
return Rdx;
// Otherwise, check whether we have a loop latch to look at.
Loop *BBL = LI->getLoopFor(ParentBB);
if (!BBL)
return nullptr;
BasicBlock *BBLatch = BBL->getLoopLatch();
if (!BBLatch)
return nullptr;
// There is a loop latch, return the incoming value if it comes from
// that. This reduction pattern occasionally turns up.
if (P->getIncomingBlock(0) == BBLatch) {
Rdx = P->getIncomingValue(0);
} else if (P->getIncomingBlock(1) == BBLatch) {
Rdx = P->getIncomingValue(1);
// The same dominance requirement applies to the latch candidate.
if (Rdx && DominatedReduxValue(Rdx))
return Rdx;
return nullptr;
/// Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding the phi node \a P
/// with reduction operators \a Root (or one of its operands) in a basic block
/// \a BB, then check if it can be done. If horizontal reduction is not found
/// and root instruction is a binary operation, vectorization of the operands is
/// attempted.
/// \returns true if a horizontal reduction was matched and reduced or operands
/// of one of the binary instruction were vectorized.
/// \returns false if a horizontal reduction was not matched (or not possible)
/// or no vectorization of any binary operation feeding \a Root instruction was
/// performed.
static bool tryToVectorizeHorReductionOrInstOperands(
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI,
const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
// Early exits: feature disabled, no root, root outside BB, or root is a PHI.
if (!ShouldVectorizeHor)
return false;
if (!Root)
return false;
if (Root->getParent() != BB || isa<PHINode>(Root))
return false;
// Start analysis starting from Root instruction. If horizontal reduction is
// found, try to vectorize it. If it is not a horizontal reduction or
// vectorization is not possible or not effective, and currently analyzed
// instruction is a binary operation, try to vectorize the operands, using
// pre-order DFS traversal order. If the operands were not vectorized, repeat
// the same procedure considering each operand as a possible root of the
// horizontal reduction.
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
// Explicit worklist of (instruction, depth) pairs, seeded with Root at
// depth 0; VisitedInstrs prevents queueing the same operand twice.
SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
while (!Stack.empty()) {
Instruction *Inst;
unsigned Level;
std::tie(Inst, Level) = Stack.pop_back_val();
// Only binary operators and selects are tried as reduction roots.
auto *BI = dyn_cast<BinaryOperator>(Inst);
auto *SI = dyn_cast<SelectInst>(Inst);
if (BI || SI) {
HorizontalReduction HorRdx;
if (HorRdx.matchAssociativeReduction(P, Inst)) {
if (HorRdx.tryToReduce(R, TTI)) {
Res = true;
// Set P to nullptr to avoid re-analysis of phi node in
// matchAssociativeReduction function unless this is the root node.
P = nullptr;
// With a phi and a binary operator, continue from the operand that is not
// the phi itself.
if (P && BI) {
Inst = dyn_cast<Instruction>(BI->getOperand(0));
if (Inst == P)
Inst = dyn_cast<Instruction>(BI->getOperand(1));
if (!Inst) {
// Set P to nullptr to avoid re-analysis of phi node in
// matchAssociativeReduction function unless this is the root node.
P = nullptr;
// Set P to nullptr to avoid re-analysis of phi node in
// matchAssociativeReduction function unless this is the root node.
P = nullptr;
// Fall back to the caller-supplied vectorization callback.
if (Vectorize(Inst, R)) {
Res = true;
// Try to vectorize operands.
// Continue analysis for the instruction from the same basic block only to
// save compile time.
// Operands are queued only while the incremented depth stays below
// RecursionMaxDepth, and only if they are non-PHI, non-deleted
// instructions located in BB.
if (++Level < RecursionMaxDepth)
for (auto *Op : Inst->operand_values())
if (VisitedInstrs.insert(Op).second)
if (auto *I = dyn_cast<Instruction>(Op))
if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB)
Stack.emplace_back(I, Level);
return Res;
/// Treats \p V as a potential root of a horizontal reduction (optionally
/// feeding phi \p P) and forwards to
/// tryToVectorizeHorReductionOrInstOperands. Returns false immediately when
/// \p V is null or is not an Instruction.
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI) {
if (!V)
return false;
auto *I = dyn_cast<Instruction>(V);
if (!I)
return false;
// The phi is only passed along when the root is a BinaryOperator; for any
// other instruction kind it is cleared first.
if (!isa<BinaryOperator>(I))
P = nullptr;
// Try to match and vectorize a horizontal reduction.
// Callback used for instructions that did not form a reduction; it simply
// calls tryToVectorize on them.
auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
return tryToVectorize(I, R);
return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
/// Tries to vectorize the scalar operands that build the aggregate ending at
/// insertvalue \p IVI. Returns false when the aggregate type cannot be mapped
/// to a vector, when findBuildAggregate does not match, or when fewer than
/// two operands were collected; otherwise returns the result of
/// tryToVectorizeList.
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
BasicBlock *BB, BoUpSLP &R) {
const DataLayout &DL = BB->getModule()->getDataLayout();
if (!R.canMapToVector(IVI->getType(), DL))
return false;
SmallVector<Value *, 16> BuildVectorOpds;
SmallVector<Value *, 16> BuildVectorInsts;
if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) ||
BuildVectorOpds.size() < 2)
return false;
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// Aggregate value is unlikely to be processed in vector register, we need to
// extract scalars into scalar registers, so NeedExtraction is set true.
return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
/// Tries to vectorize the scalar operands of the build-vector sequence ending
/// at insertelement \p IEI. Bails out when findBuildAggregate does not match
/// or fewer than two operands were collected; the visible condition also
/// involves a predicate testing operands for ExtractElementInst.
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
BasicBlock *BB, BoUpSLP &R) {
SmallVector<Value *, 16> BuildVectorInsts;
SmallVector<Value *, 16> BuildVectorOpds;
if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
BuildVectorOpds.size() < 2 ||
[](Value *V) { return isa<ExtractElementInst>(V); }) &&
return false;
// Vectorize starting with the build vector operands ignoring the BuildVector
// instructions for the purpose of scheduling and user extraction.
return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
/// Tries to vectorize starting from compare \p CI. The two operands are first
/// tried together as a pair; if that fails, each operand is tried on its own
/// as a root via vectorizeRootInstruction (with no phi). Returns true if any
/// attempt succeeded.
bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
BoUpSLP &R) {
if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
return true;
bool OpsChanged = false;
// Only operand indices 0 and 1 are visited.
for (int Idx = 0; Idx < 2; ++Idx) {
OpsChanged |=
vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
return OpsChanged;
/// Walks \p Instructions in reverse order and dispatches each entry by kind
/// to vectorizeInsertValueInst, vectorizeInsertElementInst or
/// vectorizeCmpInst. Returns true if any of those calls reported a change.
bool SLPVectorizerPass::vectorizeSimpleInstructions(
SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) {
bool OpsChanged = false;
for (auto *I : reverse(Instructions)) {
// Instructions already marked deleted by R are checked for first.
if (R.isDeleted(I))
if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
else if (auto *CI = dyn_cast<CmpInst>(I))
OpsChanged |= vectorizeCmpInst(CI, BB, R);
return OpsChanged;
/// Drives vectorization inside \p BB. As far as the visible code shows, it
/// first repeatedly sorts the collected PHI-related values by type and hands
/// same-typed runs to tryToVectorizeList, then walks the block's instructions
/// looking for two-input reduction PHIs and for unused/void instructions
/// whose operands may root a horizontal reduction. After a successful
/// vectorization the instruction iterator is reset to BB->begin().
/// \returns the Changed flag, which is set whenever an attempt succeeded.
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
SmallPtrSet<Value *, 16> VisitedInstrs;
unsigned MaxVecRegSize = R.getMaxVecRegSize();
// Repeat the PHI phase for as long as the previous round vectorized
// something.
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
HaveVectorizedPhiNodes = false;
// Collect the incoming values from the PHIs.
for (Instruction &I : *BB) {
PHINode *P = dyn_cast<PHINode>(&I);
if (!P)
if (!VisitedInstrs.count(P) && !R.isDeleted(P))
// Sort by type.
llvm::stable_sort(Incoming, PhiTypeSorterFunc);
// Try to vectorize elements based on their type.
for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
E = Incoming.end();
IncIt != E;) {
// Look for the next elements with the same type.
SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
Type *EltTy = (*IncIt)->getType();
- unsigned EltSize = EltTy->isSized() ? DL->getTypeSizeInBits(EltTy)
- : MaxVecRegSize;
+ assert(EltTy->isSized() &&
+ "Instructions should all be sized at this point");
+ TypeSize EltTS = DL->getTypeSizeInBits(EltTy);
+ if (EltTS.isScalable()) {
+ // For now, just ignore vectorizing scalable types.
+ ++IncIt;
+ continue;
+ }
+ unsigned EltSize = EltTS.getFixedSize();
// Upper bound on how many elements of this type fit in one vector register.
unsigned MaxNumElts = MaxVecRegSize / EltSize;
if (MaxNumElts < 2) {
while (SameTypeIt != E &&
(*SameTypeIt)->getType() == EltTy &&
(SameTypeIt - IncIt) < MaxNumElts) {
// Try to vectorize them.
unsigned NumElts = (SameTypeIt - IncIt);
LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
<< NumElts << ")\n");
// The order in which the phi nodes appear in the program does not matter.
// So allow tryToVectorizeList to reorder them if it is beneficial. This
// is done when there are exactly two elements since tryToVectorizeList
// asserts that there are only two values when AllowReorder is true.
bool AllowReorder = NumElts == 2;
if (NumElts > 1 &&
tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
// Success start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
// Start over at the next instruction of a different type (or the end).
IncIt = SameTypeIt;
// Second phase: scan every instruction of the block.
SmallVector<Instruction *, 8> PostProcessInstructions;
SmallDenseSet<Instruction *, 4> KeyNodes;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
// Skip instructions marked for the deletion.
if (R.isDeleted(&*it))
// We may go through BB multiple times so skip the one we have checked.
if (!VisitedInstrs.insert(&*it).second) {
if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
Changed = true;
it = BB->begin();
e = BB->end();
if (isa<DbgInfoIntrinsic>(it))
// Try to vectorize reductions that use PHINodes.
if (PHINode *P = dyn_cast<PHINode>(it)) {
// Check that the PHI is a reduction PHI.
// NOTE(review): a PHI without exactly two incoming values ends the whole
// function here, not just this iteration.
if (P->getNumIncomingValues() != 2)
return Changed;
// Try to match and vectorize a horizontal reduction.
if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
TTI)) {
Changed = true;
it = BB->begin();
e = BB->end();
// Ran into an instruction without users, like terminator, or function call
// with ignored return value, store. Ignore unused instructions (basing on
// instruction type, except for CallInst and InvokeInst).
if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
isa<InvokeInst>(it))) {
bool OpsChanged = false;
// Stores are only used as starting points when
// ShouldStartVectorizeHorAtStore is set.
if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
for (auto *V : it->operand_values()) {
// Try to match and vectorize a horizontal reduction.
OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
// Start vectorization of post-process list of instructions from the
// top-tree instructions to try to vectorize as many instructions as
// possible.
OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
if (OpsChanged) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
Changed = true;
it = BB->begin();
e = BB->end();
if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
return Changed;
/// For every entry in GEPs, processes the collected getelementptrs in chunks
/// sized by the vector register width, filters the candidates, and tries to
/// vectorize the index operands of the survivors with tryToVectorizeList.
/// \p BB is not referenced by the visible code. Returns true if any bundle
/// was vectorized.
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
auto Changed = false;
for (auto &Entry : GEPs) {
// If the getelementptr list has fewer than two elements, there's nothing
// to do.
if (Entry.second.size() < 2)
LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
<< Entry.second.size() << ".\n");
// Process the GEP list in chunks suitable for the target's supported
// vector size. If a vector register can't hold 1 element, we are done. We
// are trying to vectorize the index computations, so the maximum number of
// elements is based on the size of the index expression, rather than the
// size of the GEP itself (the target's pointer size).
unsigned MaxVecRegSize = R.getMaxVecRegSize();
// The element size is taken from the first index of the first GEP in the
// list.
unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
if (MaxVecRegSize < EltSize)
unsigned MaxElts = MaxVecRegSize / EltSize;
for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
// The last chunk may be shorter than MaxElts.
auto Len = std::min<unsigned>(BE - BI, MaxElts);
auto GEPList = makeArrayRef(&Entry.second[BI], Len);
// Initialize a set of candidate getelementptrs. Note that we use a
// SetVector here to preserve program order. If the index computations
// are vectorizable and begin with loads, we want to minimize the chance
// of having to reorder them later.
SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
// Some of the candidates may have already been vectorized after we
// initially collected them. If so, they are marked as deleted, so remove
// them from the set of candidates.
[&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });
// Remove from the set of candidates all pairs of getelementptrs with
// constant differences. Such getelementptrs are likely not good
// candidates for vectorization in a bottom-up phase since one can be
// computed from the other. We also ensure all candidate getelementptr
// indices are unique.
// Pairwise comparison over the chunk; both loops stop early once fewer than
// two candidates remain.
for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
auto *GEPI = GEPList[I];
if (!Candidates.count(GEPI))
auto *SCEVI = SE->getSCEV(GEPList[I]);
for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
auto *GEPJ = GEPList[J];
auto *SCEVJ = SE->getSCEV(GEPList[J]);
if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
} else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
// We break out of the above computation as soon as we know there are
// fewer than two candidates remaining.
if (Candidates.size() < 2)
// Add the single, non-constant index of each candidate to the bundle. We
// ensured the indices met these constraints when we originally collected
// the getelementptrs.
SmallVector<Value *, 16> Bundle(Candidates.size());
auto BundleIndex = 0u;
for (auto *V : Candidates) {
auto *GEP = cast<GetElementPtrInst>(V);
auto *GEPIdx = GEP->idx_begin()->get();
assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
Bundle[BundleIndex++] = GEPIdx;
// Try and vectorize the indices. We are currently only interested in
// gather-like cases of the form:
// ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
// where the loads of "a", the loads of "b", and the subtractions can be
// performed in parallel. It's likely that detecting this pattern in a
// bottom-up phase will be simpler and less costly than building a
// full-blown top-down phase beginning at the consecutive loads.
Changed |= tryToVectorizeList(Bundle, R);
return Changed;
/// Iterates over every entry of the Stores map and passes each store group to
/// vectorizeStores. Returns true if any call reported a change.
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
bool Changed = false;
// Attempt to sort and vectorize each of the store-groups.
for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
++it) {
// Groups with fewer than two stores are checked for first.
if (it->second.size() < 2)
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
<< it->second.size() << ".\n");
Changed |= vectorizeStores(it->second, R);
return Changed;
// Definition of the SLPVectorizer pass's static ID member.
char SLPVectorizer::ID = 0;
// Human-readable pass name passed to the INITIALIZE_PASS_* macros below.
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
// Factory: returns a newly allocated SLPVectorizer as a raw Pass pointer.
Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h
index ab57907e088e..b4b0dea0d1af 100644
--- a/openmp/runtime/src/kmp_ftn_entry.h
+++ b/openmp/runtime/src/kmp_ftn_entry.h
@@ -1,1488 +1,1488 @@
* kmp_ftn_entry.h -- Fortran entry linkage support for OpenMP.
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#error The support file kmp_ftn_entry.h should not be compiled by itself.
#ifdef KMP_STUB
#include "kmp_stub.h"
#include "kmp_i18n.h"
// For affinity format functions
#include "kmp_io.h"
#include "kmp_str.h"
#include "ompt-specific.h"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/* For compatibility with the Gnu/MS Open MP codegen, omp_set_num_threads(),
* omp_set_nested(), and omp_set_dynamic() [in lowercase on MS, and w/o
* a trailing underscore on Linux* OS] take call by value integer arguments.
* + omp_set_max_active_levels()
* + omp_set_schedule()
* For backward compatibility with 9.1 and previous Intel compiler, these
* entry points take call by reference integer arguments. */
// This macro helps to reduce code duplication.
#define KMP_DEREF
#define KMP_DEREF *
#ifdef KMP_STUB
__kmps_set_stacksize(KMP_DEREF arg);
// __kmp_aux_set_stacksize initializes the library if needed
__kmp_aux_set_stacksize((size_t)KMP_DEREF arg);
#ifdef KMP_STUB
__kmps_set_stacksize(KMP_DEREF arg);
// __kmp_aux_set_stacksize initializes the library if needed
__kmp_aux_set_stacksize(KMP_DEREF arg);
#ifdef KMP_STUB
return __kmps_get_stacksize();
if (!__kmp_init_serial) {
return (int)__kmp_stksize;
#ifdef KMP_STUB
return __kmps_get_stacksize();
if (!__kmp_init_serial) {
return __kmp_stksize;
#ifdef KMP_STUB
__kmps_set_blocktime(KMP_DEREF arg);
int gtid, tid;
kmp_info_t *thread;
gtid = __kmp_entry_gtid();
tid = __kmp_tid_from_gtid(gtid);
thread = __kmp_thread_from_gtid(gtid);
__kmp_aux_set_blocktime(KMP_DEREF arg, thread, tid);
#ifdef KMP_STUB
return __kmps_get_blocktime();
int gtid, tid;
kmp_info_t *thread;
kmp_team_p *team;
gtid = __kmp_entry_gtid();
tid = __kmp_tid_from_gtid(gtid);
thread = __kmp_thread_from_gtid(gtid);
team = __kmp_threads[gtid]->th.th_team;
/* These must match the settings used in __kmp_wait_sleep() */
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid,
team->t.t_id, tid, KMP_MAX_BLOCKTIME));
else if (__kmp_zero_bt && !get__bt_set(team, tid)) {
KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid,
team->t.t_id, tid, 0));
return 0;
else {
KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid,
team->t.t_id, tid, get__blocktime(team, tid)));
return get__blocktime(team, tid);
#ifdef KMP_STUB
// __kmp_user_set_library initializes the library if needed
#ifdef KMP_STUB
// __kmp_user_set_library initializes the library if needed
#ifdef KMP_STUB
// __kmp_user_set_library initializes the library if needed
#ifdef KMP_STUB
__kmps_set_library(KMP_DEREF arg);
enum library_type lib;
lib = (enum library_type)KMP_DEREF arg;
// __kmp_user_set_library initializes the library if needed
#ifdef KMP_STUB
return __kmps_get_library();
if (!__kmp_init_serial) {
return ((int)__kmp_library);
#ifdef KMP_STUB
; // empty routine
// ignore after initialization because some teams have already
// allocated dispatch buffers
if (__kmp_init_serial == 0 && (KMP_DEREF arg) > 0)
__kmp_dispatch_num_buffers = KMP_DEREF arg;
return -1;
if (!TCR_4(__kmp_init_middle)) {
return __kmp_aux_set_affinity(mask);
return -1;
if (!TCR_4(__kmp_init_middle)) {
return __kmp_aux_get_affinity(mask);
return 0;
// We really only NEED serial initialization here.
if (!TCR_4(__kmp_init_middle)) {
return __kmp_aux_get_affinity_max_proc();
*mask = NULL;
// We really only NEED serial initialization here.
kmp_affin_mask_t *mask_internals;
if (!TCR_4(__kmp_init_middle)) {
mask_internals = __kmp_affinity_dispatch->allocate_mask();
*mask = mask_internals;
// Nothing
// We really only NEED serial initialization here.
kmp_affin_mask_t *mask_internals;
if (!TCR_4(__kmp_init_middle)) {
if (__kmp_env_consistency_check) {
if (*mask == NULL) {
KMP_FATAL(AffinityInvalidMask, "kmp_destroy_affinity_mask");
mask_internals = (kmp_affin_mask_t *)(*mask);
*mask = NULL;
return -1;
if (!TCR_4(__kmp_init_middle)) {
return __kmp_aux_set_affinity_mask_proc(KMP_DEREF proc, mask);
return -1;
if (!TCR_4(__kmp_init_middle)) {
return __kmp_aux_unset_affinity_mask_proc(KMP_DEREF proc, mask);
return -1;
if (!TCR_4(__kmp_init_middle)) {
return __kmp_aux_get_affinity_mask_proc(KMP_DEREF proc, mask);
/* ------------------------------------------------------------------------ */
/* sets the requested number of threads for the next parallel region */
#ifdef KMP_STUB
// Nothing.
__kmp_set_num_threads(KMP_DEREF arg, __kmp_entry_gtid());
/* returns the number of threads in current team */
#ifdef KMP_STUB
return 1;
// __kmpc_bound_num_threads initializes the library if needed
return __kmpc_bound_num_threads(NULL);
#ifdef KMP_STUB
return 1;
int gtid;
kmp_info_t *thread;
if (!TCR_4(__kmp_init_middle)) {
gtid = __kmp_entry_gtid();
thread = __kmp_threads[gtid];
// return thread -> th.th_team -> t.t_current_task[
// thread->th.th_info.ds.ds_tid ] -> icvs.nproc;
return thread->th.th_current_task->td_icvs.nproc;
// Entry point that forwards a tool-control request (command, modifier, arg)
// to __kmp_control_tool and returns its result. The visible code returns -2
// in the KMP_STUB / non-OMPT configuration and on the __kmp_init_middle
// check.
int FTN_STDCALL FTN_CONTROL_TOOL(int command, int modifier, void *arg) {
#if defined(KMP_STUB) || !OMPT_SUPPORT
return -2;
if (!TCR_4(__kmp_init_middle)) {
return -2;
kmp_info_t *this_thr = __kmp_threads[__kmp_entry_gtid()];
ompt_task_info_t *parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
// Record the caller's frame address in the current task's enter_frame for
// the duration of the call, then reset it to 0 afterwards.
parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
int ret = __kmp_control_tool(command, modifier, arg);
parent_task_info->frame.enter_frame.ptr = 0;
return ret;
/* OpenMP 5.0 Memory Management support */
// Creates an allocator for memory space m with ntraits traits taken from tr.
// KMP_DEREF lets the same source serve by-value and by-reference entry
// points. The stub build returns NULL; otherwise the call is forwarded to
// __kmpc_init_allocator with the calling thread's gtid.
omp_allocator_handle_t FTN_STDCALL
FTN_INIT_ALLOCATOR(omp_memspace_handle_t KMP_DEREF m, int KMP_DEREF ntraits,
omp_alloctrait_t tr[]) {
#ifdef KMP_STUB
return NULL;
return __kmpc_init_allocator(__kmp_entry_gtid(), KMP_DEREF m,
KMP_DEREF ntraits, tr);
// Destroys allocator al by forwarding to __kmpc_destroy_allocator with the
// calling thread's gtid; compiled out of the stub build.
void FTN_STDCALL FTN_DESTROY_ALLOCATOR(omp_allocator_handle_t al) {
#ifndef KMP_STUB
__kmpc_destroy_allocator(__kmp_entry_gtid(), al);
// Sets the default allocator to al by forwarding to
// __kmpc_set_default_allocator with the calling thread's gtid; compiled out
// of the stub build.
void FTN_STDCALL FTN_SET_DEFAULT_ALLOCATOR(omp_allocator_handle_t al) {
#ifndef KMP_STUB
__kmpc_set_default_allocator(__kmp_entry_gtid(), al);
// Returns the default allocator reported by __kmpc_get_default_allocator for
// the calling thread's gtid; the stub build returns NULL.
omp_allocator_handle_t FTN_STDCALL FTN_GET_DEFAULT_ALLOCATOR(void) {
#ifdef KMP_STUB
return NULL;
return __kmpc_get_default_allocator(__kmp_entry_gtid());
/* OpenMP 5.0 affinity format support */
#ifndef KMP_STUB
// Copies csrc (csrc_size characters) into buffer (buf_size bytes) so that
// the whole buffer is filled and no terminating null byte is left in it:
//  - source shorter than the buffer: the remainder is padded with spaces;
//  - source at least as long as the buffer: the copy is truncated and the
//    last buffer byte is overwritten with the corresponding source character.
// NOTE(review): buf_size == 0 would underflow in "buf_size - 1"; the visible
// callers check the size before calling.
static void __kmp_fortran_strncpy_truncate(char *buffer, size_t buf_size,
char const *csrc, size_t csrc_size) {
size_t capped_src_size = csrc_size;
if (csrc_size >= buf_size) {
capped_src_size = buf_size - 1;
KMP_STRNCPY_S(buffer, buf_size, csrc, capped_src_size);
if (csrc_size >= buf_size) {
// Replace the byte asserted to be '\0' with the source character at that
// position.
KMP_DEBUG_ASSERT(buffer[buf_size - 1] == '\0');
buffer[buf_size - 1] = csrc[buf_size - 1];
} else {
// Blank-pad from the end of the copied text to the end of the buffer.
for (size_t i = csrc_size; i < buf_size; ++i)
buffer[i] = ' ';
// Convert a Fortran string to a C string by adding null byte
// The constructor allocates size + 1 bytes with __kmp_thread_malloc on the
// current thread, copies size characters and writes the terminator; the
// destructor releases the buffer with __kmp_thread_free on that same thread
// handle. get() exposes the null-terminated copy.
class ConvertedString {
char *buf;
kmp_info_t *th;
ConvertedString(char const *fortran_str, size_t size) {
th = __kmp_get_thread();
buf = (char *)__kmp_thread_malloc(th, size + 1);
KMP_STRNCPY_S(buf, size + 1, fortran_str, size);
buf[size] = '\0';
~ConvertedString() { __kmp_thread_free(th, buf); }
const char *get() const { return buf; }
#endif // KMP_STUB
* Set the value of the affinity-format-var ICV on the current device to the
* format specified in the argument.
void FTN_STDCALL FTN_SET_AFFINITY_FORMAT(char const *format, size_t size) {
// format/size describe a Fortran string (pointer plus length); it is first
// converted to a null-terminated copy via ConvertedString.
#ifdef KMP_STUB
if (!__kmp_init_serial) {
ConvertedString cformat(format, size);
// Since the __kmp_affinity_format variable is a C string, do not
// use the fortran strncpy function
__kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
cformat.get(), KMP_STRLEN(cformat.get()));
* Returns the number of characters required to hold the entire affinity format
* specification (not including null byte character) and writes the value of the
* affinity-format-var ICV on the current device to buffer. If the return value
* is larger than size, the affinity format specification is truncated.
size_t FTN_STDCALL FTN_GET_AFFINITY_FORMAT(char *buffer, size_t size) {
// The stub build returns 0. Otherwise the length of __kmp_affinity_format is
// returned, and the text is copied out only when buffer is non-null and size
// is non-zero.
#ifdef KMP_STUB
return 0;
size_t format_size;
if (!__kmp_init_serial) {
format_size = KMP_STRLEN(__kmp_affinity_format);
if (buffer && size) {
__kmp_fortran_strncpy_truncate(buffer, size, __kmp_affinity_format,
return format_size;
* Prints the thread affinity information of the current thread in the format
* specified by the format argument. If the format is NULL or a zero-length
* string, the value of the affinity-format-var ICV is used.
void FTN_STDCALL FTN_DISPLAY_AFFINITY(char const *format, size_t size) {
// The Fortran format string is converted to a null-terminated copy and
// passed, with the calling thread's gtid, to __kmp_aux_display_affinity.
#ifdef KMP_STUB
int gtid;
if (!TCR_4(__kmp_init_middle)) {
gtid = __kmp_get_gtid();
ConvertedString cformat(format, size);
__kmp_aux_display_affinity(gtid, cformat.get());
* Returns the number of characters required to hold the entire affinity format
* specification (not including null byte) and prints the thread affinity
* information of the current thread into the character string buffer with the
* size of size in the format specified by the format argument. If the format is
* NULL or a zero-length string, the value of the affinity-format-var ICV is
* used. The buffer must be allocated prior to calling the routine. If the
* return value is larger than size, the affinity format specification is
* truncated.
size_t FTN_STDCALL FTN_CAPTURE_AFFINITY(char *buffer, char const *format,
size_t buf_size, size_t for_size) {
// buf_size is the capacity of buffer; for_size is the length of the Fortran
// format string. The stub build returns 0.
#if defined(KMP_STUB)
return 0;
int gtid;
size_t num_required;
kmp_str_buf_t capture_buf;
if (!TCR_4(__kmp_init_middle)) {
gtid = __kmp_get_gtid();
ConvertedString cformat(format, for_size);
// The affinity text is first captured into capture_buf, then copied into the
// caller's buffer only when buffer is non-null and buf_size is non-zero.
num_required = __kmp_aux_capture_affinity(gtid, cformat.get(), &capture_buf);
if (buffer && buf_size) {
__kmp_fortran_strncpy_truncate(buffer, buf_size, capture_buf.str,
return num_required;
#ifdef KMP_STUB
return 0;
int gtid;
gtid = __kmp_entry_gtid();
if (!__kmp_init_parallel ||
(gtid = (int)((kmp_intptr_t)TlsGetValue(__kmp_gtid_threadprivate_key))) ==
0) {
// Either library isn't initialized or thread is not registered
// 0 is the correct TID in this case
return 0;
--gtid; // We keep (gtid+1) in TLS
if (__kmp_gtid_mode >= 3) {
if ((gtid = __kmp_gtid) == KMP_GTID_DNE) {
return 0;
} else {
if (!__kmp_init_parallel ||
(gtid = (kmp_intptr_t)(
pthread_getspecific(__kmp_gtid_threadprivate_key))) == 0) {
return 0;
#error Unknown or unsupported OS
return __kmp_tid_from_gtid(gtid);
#ifdef KMP_STUB
return 1;
if (!__kmp_init_serial) {
/* NOTE: this is not synchronized, so it can change at any moment */
/* NOTE: this number also includes threads preallocated in hot-teams */
return TCR_4(__kmp_nth);
#ifdef KMP_STUB
return 1;
if (!TCR_4(__kmp_init_middle)) {
return __kmp_avail_proc;
KMP_INFORM(APIDeprecated, "omp_set_nested", "omp_set_max_active_levels");
#ifdef KMP_STUB
__kmps_set_nested(KMP_DEREF flag);
kmp_info_t *thread;
/* For the thread-private internal controls implementation */
thread = __kmp_entry_thread();
// Somewhat arbitrarily decide where to get a value for max_active_levels
int max_active_levels = get__max_active_levels(thread);
if (max_active_levels == 1)
max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
set__max_active_levels(thread, (KMP_DEREF flag) ? max_active_levels : 1);
KMP_INFORM(APIDeprecated, "omp_get_nested", "omp_get_max_active_levels");
#ifdef KMP_STUB
return __kmps_get_nested();
kmp_info_t *thread;
thread = __kmp_entry_thread();
return get__max_active_levels(thread) > 1;
#ifdef KMP_STUB
__kmps_set_dynamic(KMP_DEREF flag ? TRUE : FALSE);
kmp_info_t *thread;
/* For the thread-private implementation of the internal controls */
thread = __kmp_entry_thread();
// !!! What if foreign thread calls it?
set__dynamic(thread, KMP_DEREF flag ? TRUE : FALSE);
#ifdef KMP_STUB
return __kmps_get_dynamic();
kmp_info_t *thread;
thread = __kmp_entry_thread();
return get__dynamic(thread);
#ifdef KMP_STUB
return 0;
kmp_info_t *th = __kmp_entry_thread();
if (th->th.th_teams_microtask) {
// AC: r_in_parallel does not work inside teams construct where real
// parallel is inactive, but all threads have same root, so setting it in
// one team affects other teams.
// The solution is to use per-team nesting level
return (th->th.th_team->t.t_active_level ? 1 : 0);
} else
return (th->th.th_root->r.r_in_parallel ? FTN_TRUE : FTN_FALSE);
int KMP_DEREF modifier) {
#ifdef KMP_STUB
__kmps_set_schedule(KMP_DEREF kind, KMP_DEREF modifier);
/* TO DO: For the per-task implementation of the internal controls */
__kmp_set_schedule(__kmp_entry_gtid(), KMP_DEREF kind, KMP_DEREF modifier);
int *modifier) {
#ifdef KMP_STUB
__kmps_get_schedule(kind, modifier);
/* TO DO: For the per-task implementation of the internal controls */
__kmp_get_schedule(__kmp_entry_gtid(), kind, modifier);
#ifdef KMP_STUB
// Nothing.
/* TO DO: We want per-task implementation of this internal control */
__kmp_set_max_active_levels(__kmp_entry_gtid(), KMP_DEREF arg);
#ifdef KMP_STUB
return 0;
/* TO DO: We want per-task implementation of this internal control */
return __kmp_get_max_active_levels(__kmp_entry_gtid());
#ifdef KMP_STUB
return 0; // returns 0 if it is called from the sequential part of the program
/* TO DO: For the per-task implementation of the internal controls */
return __kmp_entry_thread()->th.th_team->t.t_active_level;
#ifdef KMP_STUB
return 0; // returns 0 if it is called from the sequential part of the program
/* TO DO: For the per-task implementation of the internal controls */
return __kmp_entry_thread()->th.th_team->t.t_level;
#ifdef KMP_STUB
return (KMP_DEREF level) ? (-1) : (0);
return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), KMP_DEREF level);
#ifdef KMP_STUB
return (KMP_DEREF level) ? (-1) : (1);
return __kmp_get_team_size(__kmp_entry_gtid(), KMP_DEREF level);
#ifdef KMP_STUB
return 1; // TO DO: clarify whether it returns 1 or 0?
int gtid;
kmp_info_t *thread;
if (!__kmp_init_serial) {
gtid = __kmp_entry_gtid();
thread = __kmp_threads[gtid];
return thread->th.th_current_task->td_icvs.thread_limit;
#ifdef KMP_STUB
return 0; // TO DO: clarify whether it returns 1 or 0?
if (!TCR_4(__kmp_init_parallel)) {
return 0;
return __kmp_entry_thread()->th.th_current_task->;
#ifdef KMP_STUB
return __kmps_get_proc_bind();
return get__proc_bind(__kmp_entry_thread());
return 0;
if (!TCR_4(__kmp_init_middle)) {
return 0;
return __kmp_affinity_num_masks;
return 0;
int i;
int retval = 0;
if (!TCR_4(__kmp_init_middle)) {
return 0;
if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
return 0;
kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num);
if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) ||
(!KMP_CPU_ISSET(i, mask))) {
return retval;
int *ids) {
// Nothing.
int i, j;
if (!TCR_4(__kmp_init_middle)) {
if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num);
j = 0;
if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) ||
(!KMP_CPU_ISSET(i, mask))) {
ids[j++] = i;
return -1;
int gtid;
kmp_info_t *thread;
if (!TCR_4(__kmp_init_middle)) {
return -1;
gtid = __kmp_entry_gtid();
thread = __kmp_thread_from_gtid(gtid);
if (thread->th.th_current_place < 0)
return -1;
return thread->th.th_current_place;
return 0;
int gtid, num_places, first_place, last_place;
kmp_info_t *thread;
if (!TCR_4(__kmp_init_middle)) {
return 0;
gtid = __kmp_entry_gtid();
thread = __kmp_thread_from_gtid(gtid);
first_place = thread->th.th_first_place;
last_place = thread->th.th_last_place;
if (first_place < 0 || last_place < 0)
return 0;
if (first_place <= last_place)
num_places = last_place - first_place + 1;
num_places = __kmp_affinity_num_masks - first_place + last_place + 1;
return num_places;
// Nothing.
int i, gtid, place_num, first_place, last_place, start, end;
kmp_info_t *thread;
if (!TCR_4(__kmp_init_middle)) {
gtid = __kmp_entry_gtid();
thread = __kmp_thread_from_gtid(gtid);
first_place = thread->th.th_first_place;
last_place = thread->th.th_last_place;
if (first_place < 0 || last_place < 0)
if (first_place <= last_place) {
start = first_place;
end = last_place;
} else {
start = last_place;
end = first_place;
for (i = 0, place_num = start; place_num <= end; ++place_num, ++i) {
place_nums[i] = place_num;
#ifdef KMP_STUB
return 1;
return __kmp_aux_get_num_teams();
#ifdef KMP_STUB
return 0;
return __kmp_aux_get_team_num();
#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB)
return 0;
return __kmp_entry_thread()->th.th_current_task->td_icvs.default_device;
#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB)
// Nothing.
__kmp_entry_thread()->th.th_current_task->td_icvs.default_device =
// Get number of NON-HOST devices.
// libomptarget, if loaded, provides this function in api.cpp.
return 0;
int (*fptr)();
if ((*(void **)(&fptr) = dlsym(RTLD_DEFAULT, "_Offload_number_of_devices"))) {
return (*fptr)();
} else if ((*(void **)(&fptr) = dlsym(RTLD_NEXT, "omp_get_num_devices"))) {
return (*fptr)();
} else { // liboffload & libomptarget don't exist
return 0;
#endif // KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB)
// This function always returns true when called on host device.
// Compiler/libomptarget should handle when it is called inside target region.
return 1; // This is the host
// libomptarget, if loaded, provides this function
int (*fptr)();
if ((*(void **)(&fptr) = dlsym(RTLD_NEXT, "omp_get_initial_device"))) {
return (*fptr)();
} else { // liboffload & libomptarget don't exist
#if defined(KMP_STUB)
// Entries for stubs library
// As all *target* functions are C-only parameters always passed by value
// Stub entry: ignores both arguments and always hands back a null pointer.
void *FTN_STDCALL FTN_TARGET_ALLOC(size_t size, int device_num) {
  return NULL;
}
// Stub entry: ignores both arguments and does nothing.
void FTN_STDCALL FTN_TARGET_FREE(void *device_ptr, int device_num) {
}
// Stub entry: ignores both arguments and always reports 0.
int FTN_STDCALL FTN_TARGET_IS_PRESENT(void *ptr, int device_num) {
  return 0;
}
int FTN_STDCALL FTN_TARGET_MEMCPY(void *dst, void *src, size_t length,
size_t dst_offset, size_t src_offset,
int dst_device, int src_device) {
return -1;
void *dst, void *src, size_t element_size, int num_dims,
const size_t *volume, const size_t *dst_offsets, const size_t *src_offsets,
const size_t *dst_dimensions, const size_t *src_dimensions, int dst_device,
int src_device) {
return -1;
int FTN_STDCALL FTN_TARGET_ASSOCIATE_PTR(void *host_ptr, void *device_ptr,
size_t size, size_t device_offset,
int device_num) {
return -1;
int FTN_STDCALL FTN_TARGET_DISASSOCIATE_PTR(void *host_ptr, int device_num) {
return -1;
#endif // defined(KMP_STUB)
#ifdef KMP_STUB
// Lock state stored directly in the user's lock variable by the stub lock
// routines: the init routines write UNLOCKED, the destroy routines write
// UNINIT, and set/test write LOCKED.
typedef enum {
  UNINIT = -1,
  UNLOCKED = 0,
  LOCKED = 1
} kmp_stub_lock_t;
#endif /* KMP_STUB */
void FTN_STDCALL FTN_INIT_LOCK_WITH_HINT(void **user_lock,
uintptr_t KMP_DEREF hint) {
#ifdef KMP_STUB
*((kmp_stub_lock_t *)user_lock) = UNLOCKED;
int gtid = __kmp_entry_gtid();
__kmpc_init_lock_with_hint(NULL, gtid, user_lock, KMP_DEREF hint);
uintptr_t KMP_DEREF hint) {
#ifdef KMP_STUB
*((kmp_stub_lock_t *)user_lock) = UNLOCKED;
int gtid = __kmp_entry_gtid();
__kmpc_init_nest_lock_with_hint(NULL, gtid, user_lock, KMP_DEREF hint);
/* initialize the lock */
#ifdef KMP_STUB
*((kmp_stub_lock_t *)user_lock) = UNLOCKED;
int gtid = __kmp_entry_gtid();
__kmpc_init_lock(NULL, gtid, user_lock);
/* initialize the lock */
#ifdef KMP_STUB
*((kmp_stub_lock_t *)user_lock) = UNLOCKED;
int gtid = __kmp_entry_gtid();
__kmpc_init_nest_lock(NULL, gtid, user_lock);
#ifdef KMP_STUB
*((kmp_stub_lock_t *)user_lock) = UNINIT;
int gtid = __kmp_entry_gtid();
__kmpc_destroy_lock(NULL, gtid, user_lock);
#ifdef KMP_STUB
*((kmp_stub_lock_t *)user_lock) = UNINIT;
int gtid = __kmp_entry_gtid();
__kmpc_destroy_nest_lock(NULL, gtid, user_lock);
void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_LOCK)(void **user_lock) {
#ifdef KMP_STUB
if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
// TODO: Issue an error.
if (*((kmp_stub_lock_t *)user_lock) != UNLOCKED) {
// TODO: Issue an error.
*((kmp_stub_lock_t *)user_lock) = LOCKED;
int gtid = __kmp_entry_gtid();
__kmpc_set_lock(NULL, gtid, user_lock);
#ifdef KMP_STUB
if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
// TODO: Issue an error.
(*((int *)user_lock))++;
int gtid = __kmp_entry_gtid();
__kmpc_set_nest_lock(NULL, gtid, user_lock);
#ifdef KMP_STUB
if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
// TODO: Issue an error.
if (*((kmp_stub_lock_t *)user_lock) == UNLOCKED) {
// TODO: Issue an error.
*((kmp_stub_lock_t *)user_lock) = UNLOCKED;
int gtid = __kmp_entry_gtid();
__kmpc_unset_lock(NULL, gtid, user_lock);
#ifdef KMP_STUB
if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
// TODO: Issue an error.
if (*((kmp_stub_lock_t *)user_lock) == UNLOCKED) {
// TODO: Issue an error.
(*((int *)user_lock))--;
int gtid = __kmp_entry_gtid();
__kmpc_unset_nest_lock(NULL, gtid, user_lock);
#ifdef KMP_STUB
if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
// TODO: Issue an error.
if (*((kmp_stub_lock_t *)user_lock) == LOCKED) {
return 0;
*((kmp_stub_lock_t *)user_lock) = LOCKED;
return 1;
int gtid = __kmp_entry_gtid();
return __kmpc_test_lock(NULL, gtid, user_lock);
#ifdef KMP_STUB
if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
// TODO: Issue an error.
return ++(*((int *)user_lock));
int gtid = __kmp_entry_gtid();
return __kmpc_test_nest_lock(NULL, gtid, user_lock);
#ifdef KMP_STUB
return __kmps_get_wtime();
double data;
// We don't need library initialization to get the time on Linux* OS. The
// routine can be used to measure library initialization time on Linux* OS now
if (!__kmp_init_serial) {
return data;
#ifdef KMP_STUB
return __kmps_get_wtick();
double data;
if (!__kmp_init_serial) {
return data;
/* ------------------------------------------------------------------------ */
// kmpc_malloc initializes the library if needed
return kmpc_malloc(KMP_DEREF size);
size_t KMP_DEREF alignment) {
// kmpc_aligned_malloc initializes the library if needed
return kmpc_aligned_malloc(KMP_DEREF size, KMP_DEREF alignment);
void *FTN_STDCALL FTN_CALLOC(size_t KMP_DEREF nelem, size_t KMP_DEREF elsize) {
// kmpc_calloc initializes the library if needed
return kmpc_calloc(KMP_DEREF nelem, KMP_DEREF elsize);
void *FTN_STDCALL FTN_REALLOC(void *KMP_DEREF ptr, size_t KMP_DEREF size) {
// kmpc_realloc initializes the library if needed
return kmpc_realloc(KMP_DEREF ptr, KMP_DEREF size);
// does nothing if the library is not initialized
kmpc_free(KMP_DEREF ptr);
#ifndef KMP_STUB
__kmp_generate_warnings = kmp_warnings_explicit;
#ifndef KMP_STUB
__kmp_generate_warnings = FALSE;
void FTN_STDCALL FTN_SET_DEFAULTS(char const *str
int len
) {
#ifndef KMP_STUB
int len = (int)KMP_STRLEN(str);
__kmp_aux_set_defaults(str, len);
/* ------------------------------------------------------------------------ */
/* returns the status of cancellation */
#ifdef KMP_STUB
return 0 /* false */;
// initialize the library if needed
if (!__kmp_init_serial) {
return __kmp_omp_cancellation;
#ifdef KMP_STUB
return 0 /* false */;
return __kmp_get_cancellation_status(cancel_kind);
/* returns the maximum allowed task priority */
#ifdef KMP_STUB
return 0;
if (!__kmp_init_serial) {
return __kmp_max_task_priority;
// This function will be defined in libomptarget. When libomptarget is not
// loaded, we assume we are on the host and return KMP_HOST_DEVICE.
// Compiler/libomptarget will handle this if called inside target.
// Compiler will ensure that this is only called from host in sequential region
int FTN_STDCALL FTN_PAUSE_RESOURCE(kmp_pause_status_t kind, int device_num) {
#ifdef KMP_STUB
return 1; // just fail
if (device_num == KMP_HOST_DEVICE)
return __kmpc_pause_resource(kind);
else {
int (*fptr)(kmp_pause_status_t, int);
if ((*(void **)(&fptr) = dlsym(RTLD_DEFAULT, "tgt_pause_resource")))
return (*fptr)(kind, device_num);
return 1; // just fail if there is no libomptarget
// Compiler will ensure that this is only called from host in sequential region
int FTN_STDCALL FTN_PAUSE_RESOURCE_ALL(kmp_pause_status_t kind) {
#ifdef KMP_STUB
return 1; // just fail
int fails = 0;
int (*fptr)(kmp_pause_status_t, int);
if ((*(void **)(&fptr) = dlsym(RTLD_DEFAULT, "tgt_pause_resource")))
fails = (*fptr)(kind, KMP_DEVICE_ALL); // pause devices
fails += __kmpc_pause_resource(kind); // pause host
return fails;
// Returns the maximum number of nesting levels supported by implementation
#ifdef KMP_STUB
return 1;
void FTN_STDCALL FTN_FULFILL_EVENT(kmp_event_t *event) {
#ifndef KMP_STUB
// display environment variables when requested
void FTN_STDCALL FTN_DISPLAY_ENV(int verbose) {
#ifndef KMP_STUB
// GCC compatibility (versioned symbols)
/* These following sections create versioned symbols for the
omp_* routines. The KMP_VERSION_SYMBOL macro expands the API name and
then maps it to a versioned symbol.
libgomp ``versions'' its symbols (OMP_1.0, OMP_2.0, OMP_3.0, ...) while also
retaining the default version which libomp uses: VERSION (defined in
exports_so.txt). If you want to see the versioned symbols for libgomp.so.1
then just type:
objdump -T /path/to/libgomp.so.1 | grep omp_
Step 1) Create __kmp_api_omp_set_num_threads_10_alias which is alias of
Step 2) Set __kmp_api_omp_set_num_threads_10_alias to version:
Step 2B) Set __kmp_api_omp_set_num_threads to default version:
// OMP_1.0 versioned symbols
// OMP_2.0 versioned symbols
// OMP_3.0 versioned symbols
// the lock routines have a 1.0 and 3.0 version
// OMP_3.1 versioned symbol
// OMP_4.0 versioned symbols
// OMP_4.5 versioned symbols
// OMP_5.0 versioned symbols
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
// end of file //
diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h
index bfe7765b2a96..d1511904e94b 100644
--- a/openmp/runtime/src/kmp_os.h
+++ b/openmp/runtime/src/kmp_os.h
@@ -1,1041 +1,1047 @@
* kmp_os.h -- KPTS runtime header file.
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef KMP_OS_H
#define KMP_OS_H
#include "kmp_config.h"
#include <stdlib.h>
#include <atomic>
#define KMP_FTN_PLAIN 1
#define KMP_FTN_APPEND 2
#define KMP_FTN_UPPER 3
#define KMP_PTR_SKIP (sizeof(void *))
/* -------------------------- Compiler variations ------------------------ */
#define KMP_OFF 0
#define KMP_ON 1
#ifndef __has_cpp_attribute
#define __has_cpp_attribute(x) 0
#ifndef __has_attribute
#define __has_attribute(x) 0
/* ------------------------- Compiler recognition ---------------------- */
#if defined(__INTEL_COMPILER)
#elif defined(__clang__)
#elif defined(__GNUC__)
#elif defined(_MSC_VER)
#error Unknown compiler
/* Check for quad-precision extension. */
#define KMP_HAVE_QUAD 0
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
/* _Quad is already defined for icc */
#define KMP_HAVE_QUAD 1
/* Clang doesn't support a software-implemented
128-bit extended precision type yet */
typedef long double _Quad;
/* GCC on NetBSD lacks __multc3/__divtc3 builtins needed for quad */
typedef __float128 _Quad;
#define KMP_HAVE_QUAD 1
typedef long double _Quad;
#if __LDBL_MAX_EXP__ >= 16384 && KMP_COMPILER_GCC
typedef long double _Quad;
#define KMP_HAVE_QUAD 1
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
#define KMP_USE_X87CONTROL 0
#define KMP_END_OF_LINE "\r\n"
typedef char kmp_int8;
typedef unsigned char kmp_uint8;
typedef short kmp_int16;
typedef unsigned short kmp_uint16;
typedef int kmp_int32;
typedef unsigned int kmp_uint32;
#define KMP_INT32_SPEC "d"
#define KMP_UINT32_SPEC "u"
#ifndef KMP_STRUCT64
typedef __int64 kmp_int64;
typedef unsigned __int64 kmp_uint64;
#define KMP_INT64_SPEC "I64d"
#define KMP_UINT64_SPEC "I64u"
struct kmp_struct64 {
kmp_int32 a, b;
typedef struct kmp_struct64 kmp_int64;
typedef struct kmp_struct64 kmp_uint64;
/* Not sure what to use for KMP_[U]INT64_SPEC here */
#define KMP_USE_X87CONTROL 1
#if KMP_ARCH_X86_64
#define KMP_INTPTR 1
typedef __int64 kmp_intptr_t;
typedef unsigned __int64 kmp_uintptr_t;
#define KMP_INTPTR_SPEC "I64d"
#define KMP_UINTPTR_SPEC "I64u"
#endif /* KMP_OS_WINDOWS */
#define KMP_END_OF_LINE "\n"
typedef char kmp_int8;
typedef unsigned char kmp_uint8;
typedef short kmp_int16;
typedef unsigned short kmp_uint16;
typedef int kmp_int32;
typedef unsigned int kmp_uint32;
typedef long long kmp_int64;
typedef unsigned long long kmp_uint64;
#define KMP_INT32_SPEC "d"
#define KMP_UINT32_SPEC "u"
#define KMP_INT64_SPEC "lld"
#define KMP_UINT64_SPEC "llu"
#endif /* KMP_OS_UNIX */
#elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
#error "Can't determine size_t printf format specifier."
#if KMP_ARCH_X86
typedef size_t kmp_size_t;
typedef float kmp_real32;
typedef double kmp_real64;
#ifndef KMP_INTPTR
#define KMP_INTPTR 1
typedef long kmp_intptr_t;
typedef unsigned long kmp_uintptr_t;
#define KMP_INTPTR_SPEC "ld"
#define KMP_UINTPTR_SPEC "lu"
#ifdef BUILD_I8
typedef kmp_int64 kmp_int;
typedef kmp_uint64 kmp_uint;
typedef kmp_int32 kmp_int;
typedef kmp_uint32 kmp_uint;
#endif /* BUILD_I8 */
#define KMP_INT_MAX ((kmp_int32)0x7FFFFFFF)
#define KMP_INT_MIN ((kmp_int32)0x80000000)
#ifdef __cplusplus
// Cast helpers (C++ flavor): CCAST changes cv-qualification through
// const_cast, RCAST re-interprets a value as another type through
// reinterpret_cast.
#define CCAST(to_type, expr) const_cast<to_type>(expr)
#define RCAST(to_type, expr) reinterpret_cast<to_type>(expr)
// template for debug prints specification ( d, u, lld, llu ), and to obtain
// signed/unsigned flavors of a type
// The primary template is intentionally empty; only the specializations
// below carry members. Each one exposes:
//   signed_t / unsigned_t - signed and unsigned counterparts of the type
//   floating_t            - floating-point type paired with the type
//   spec                  - printf-style specifier string (declared only;
//                           its definition is not visible in this excerpt)
//   max_value / min_value - numeric limits (absent for signed long)
//   type_size             - sizeof the specialized type
template <typename T> struct traits_t {};
// int
template <> struct traits_t<signed int> {
typedef signed int signed_t;
typedef unsigned int unsigned_t;
typedef double floating_t;
static char const *spec;
static const signed_t max_value = 0x7fffffff;
// 0x80000000 is written as an unsigned hex literal and converted to int.
static const signed_t min_value = 0x80000000;
static const int type_size = sizeof(signed_t);
// unsigned int
template <> struct traits_t<unsigned int> {
typedef signed int signed_t;
typedef unsigned int unsigned_t;
typedef double floating_t;
static char const *spec;
static const unsigned_t max_value = 0xffffffff;
static const unsigned_t min_value = 0x00000000;
static const int type_size = sizeof(unsigned_t);
// long
// NOTE: unlike the other specializations, this one declares no max_value or
// min_value members.
template <> struct traits_t<signed long> {
typedef signed long signed_t;
typedef unsigned long unsigned_t;
typedef long double floating_t;
static char const *spec;
static const int type_size = sizeof(signed_t);
// long long
template <> struct traits_t<signed long long> {
typedef signed long long signed_t;
typedef unsigned long long unsigned_t;
typedef long double floating_t;
static char const *spec;
static const signed_t max_value = 0x7fffffffffffffffLL;
static const signed_t min_value = 0x8000000000000000LL;
static const int type_size = sizeof(signed_t);
// unsigned long long
template <> struct traits_t<unsigned long long> {
typedef signed long long signed_t;
typedef unsigned long long unsigned_t;
typedef long double floating_t;
static char const *spec;
// The limits below use the LL suffix even though the member type is unsigned.
static const unsigned_t max_value = 0xffffffffffffffffLL;
static const unsigned_t min_value = 0x0000000000000000LL;
static const int type_size = sizeof(unsigned_t);
// Cast helpers (plain C flavor): both collapse to an ordinary C-style cast.
#define CCAST(to_type, expr) (to_type)(expr)
#define RCAST(to_type, expr) (to_type)(expr)
#endif // __cplusplus
#define KMP_EXPORT extern /* export declaration in guide libraries */
#if __GNUC__ >= 4 && !defined(__MINGW32__)
#define __forceinline __inline
#include <windows.h>
static inline int KMP_GET_PAGE_SIZE(void) {
return si.dwPageSize;
// Page size in bytes, as reported by getpagesize().
#define KMP_GET_PAGE_SIZE() getpagesize()
// Evaluates to nonzero when _addr lies exactly on a page boundary, i.e. when
// none of the bits below the page size are set. The argument is parenthesized
// before the cast so that compound expressions (for example `c ? a : b`) are
// converted as a whole rather than having only their first operand cast.
#define PAGE_ALIGNED(_addr) \
  (!((size_t)(_addr) & (size_t)(KMP_GET_PAGE_SIZE() - 1)))
// Rounds x down to the first address of the page that contains it and yields
// the result as a void pointer.
#define ALIGN_TO_PAGE(x) \
  (void *)(((size_t)(x)) & ~((size_t)(KMP_GET_PAGE_SIZE() - 1)))
/* ---------- Support for cache alignment, padding, etc. ----------------*/
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
#define INTERNODE_CACHE_LINE 4096 /* for multi-node systems */
/* Define the default size of the cache line */
#ifndef CACHE_LINE
#define CACHE_LINE 128 /* cache line size in bytes */
#if (CACHE_LINE < 64) && !defined(KMP_OS_DARWIN)
// 2006-02-13: This produces too many warnings on OS X*. Disable for now
#warning CACHE_LINE is too small.
#endif /* CACHE_LINE */
#define KMP_CACHE_PREFETCH(ADDR) /* nothing */
// Define attribute that indicates that the fall through from the previous
// case label is intentional and should not be diagnosed by a compiler
// Code from libcxx/include/__config
// Use a function like macro to imply that it must be followed by a semicolon
#if __cplusplus > 201402L && __has_cpp_attribute(fallthrough)
# define KMP_FALLTHROUGH() [[fallthrough]]
#elif __has_cpp_attribute(clang::fallthrough)
# define KMP_FALLTHROUGH() [[clang::fallthrough]]
#elif __has_attribute(fallthrough) || __GNUC__ >= 7
# define KMP_FALLTHROUGH() __attribute__((__fallthrough__))
# define KMP_FALLTHROUGH() ((void)0)
// Define attribute that indicates a function does not return
#if __cplusplus >= 201103L
#define KMP_NORETURN [[noreturn]]
#define KMP_NORETURN __declspec(noreturn)
#define KMP_NORETURN __attribute__((noreturn))
#define KMP_ALIGN(bytes) __declspec(align(bytes))
#define KMP_THREAD_LOCAL __declspec(thread)
#define KMP_ALIAS /* Nothing */
#define KMP_ALIGN(bytes) __attribute__((aligned(bytes)))
#define KMP_THREAD_LOCAL __thread
#define KMP_ALIAS(alias_of) __attribute__((alias(alias_of)))
+#define KMP_WEAK_ATTRIBUTE_EXTERNAL __attribute__((weak))
+#define KMP_WEAK_ATTRIBUTE_EXTERNAL /* Nothing */
-#define KMP_WEAK_ATTRIBUTE __attribute__((weak))
+#define KMP_WEAK_ATTRIBUTE_INTERNAL __attribute__((weak))
-#define KMP_WEAK_ATTRIBUTE /* Nothing */
+#define KMP_WEAK_ATTRIBUTE_INTERNAL /* Nothing */
#ifndef KMP_STR
#define KMP_STR(x) _KMP_STR(x)
#define _KMP_STR(x) #x
// If using versioned symbols, KMP_EXPAND_NAME prepends
// __kmp_api_ to the real API name
#define KMP_EXPAND_NAME(api_name) _KMP_EXPAND_NAME(api_name)
#define _KMP_EXPAND_NAME(api_name) __kmp_api_##api_name
// KMP_VERSION_SYMBOL(api_name, ver_num, ver_str) publishes one API routine
// under an additional symbol version. It forwards to _KMP_VERSION_SYMBOL with
// "VERSION" as the default version string, which then:
//   1) declares __kmp_api_<api_name>_<ver_num>_alias, with the same type as
//      __kmp_api_<api_name>, as an alias of __kmp_api_<api_name>;
//   2) emits a .symver directive binding that alias to <api_name>@<ver_str>;
//   3) emits a .symver directive binding __kmp_api_<api_name> itself to
//      <api_name>@@<default_ver>.
// The expansion ends without a semicolon, so the use site supplies it.
#define KMP_VERSION_SYMBOL(api_name, ver_num, ver_str) \
_KMP_VERSION_SYMBOL(api_name, ver_num, ver_str, "VERSION")
#define _KMP_VERSION_SYMBOL(api_name, ver_num, ver_str, default_ver) \
__typeof__(__kmp_api_##api_name) __kmp_api_##api_name##_##ver_num##_alias \
__attribute__((alias(KMP_STR(__kmp_api_##api_name)))); \
__asm__( \
".symver " KMP_STR(__kmp_api_##api_name##_##ver_num##_alias) "," KMP_STR( \
api_name) "@" ver_str "\n\t"); \
__asm__(".symver " KMP_STR(__kmp_api_##api_name) "," KMP_STR( \
api_name) "@@" default_ver "\n\t")
#define KMP_EXPAND_NAME(api_name) api_name
#define KMP_VERSION_SYMBOL(api_name, ver_num, ver_str) /* Nothing */
/* Temporary note: if performance testing of this passes, we can remove
all references to KMP_DO_ALIGN and replace with KMP_ALIGN. */
#define KMP_DO_ALIGN(bytes) KMP_ALIGN(bytes)
/* General purpose fence types for memory operations */
enum kmp_mem_fence_type {
kmp_no_fence, /* No memory fence */
kmp_acquire_fence, /* Acquire (read) memory fence */
kmp_release_fence, /* Release (write) memory fence */
kmp_full_fence /* Full (read+write) memory fence */
// Synchronization primitives
#pragma intrinsic(InterlockedExchangeAdd)
#pragma intrinsic(InterlockedCompareExchange)
#pragma intrinsic(InterlockedExchange)
#pragma intrinsic(InterlockedExchange64)
// Using InterlockedIncrement / InterlockedDecrement causes a library loading
// ordering problem, so we use InterlockedExchangeAdd instead.
// Every 32-bit macro below casts its target to `volatile long *` before
// handing it to the intrinsic. The *_ACQ32 spellings expand to exactly the
// same call as their plain counterparts.
// Add 1 to *p.
#define KMP_TEST_THEN_INC32(p) InterlockedExchangeAdd((volatile long *)(p), 1)
#define KMP_TEST_THEN_INC_ACQ32(p) \
InterlockedExchangeAdd((volatile long *)(p), 1)
// Add 4 to *p.
#define KMP_TEST_THEN_ADD4_32(p) InterlockedExchangeAdd((volatile long *)(p), 4)
#define KMP_TEST_THEN_ADD4_ACQ32(p) \
InterlockedExchangeAdd((volatile long *)(p), 4)
// Decrement is expressed as adding -1.
#define KMP_TEST_THEN_DEC32(p) InterlockedExchangeAdd((volatile long *)(p), -1)
#define KMP_TEST_THEN_DEC_ACQ32(p) \
InterlockedExchangeAdd((volatile long *)(p), -1)
// Add an arbitrary value v to *p.
#define KMP_TEST_THEN_ADD32(p, v) \
InterlockedExchangeAdd((volatile long *)(p), (v))
// Note the argument order: the macro takes (p, cv, sv) but passes sv before
// cv, because InterlockedCompareExchange expects (destination, exchange,
// comparand).
#define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) \
InterlockedCompareExchange((volatile long *)(p), (long)(sv), (long)(cv))
// Unconditional exchange of a 32-bit value.
#define KMP_XCHG_FIXED32(p, v) \
InterlockedExchange((volatile long *)(p), (long)(v))
// Unconditional exchange of a 64-bit value.
#define KMP_XCHG_FIXED64(p, v) \
InterlockedExchange64((volatile kmp_int64 *)(p), (kmp_int64)(v))
inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) {
kmp_int32 tmp = InterlockedExchange((volatile long *)p, *(long *)&v);
return *(kmp_real32 *)&tmp;
// Routines that we still need to implement in assembly.
// Only the declarations are visible here; the notes below describe what the
// signatures show and mark anything inferred from the names as an assumption.

// __kmp_test_then_<op><bits>: take a target pointer and an operand, return a
// value of the operand's type (presumably the value held before the update
// -- TODO confirm against the assembly sources).
extern kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int32 __kmp_test_then_add32(volatile kmp_int32 *p, kmp_int32 v);
extern kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 v);
extern kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 v);
extern kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 v);
extern kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 v);
extern kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 v);
// __kmp_compare_and_store<bits>: take a target, a compare value (cv) and a
// store value (sv). Note that the 64-bit variant returns kmp_int32 rather
// than kmp_int64, unlike the _ret64 variant further down (presumably a
// success flag rather than a data value -- TODO confirm).
extern kmp_int8 __kmp_compare_and_store8(volatile kmp_int8 *p, kmp_int8 cv,
kmp_int8 sv);
extern kmp_int16 __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv,
kmp_int16 sv);
extern kmp_int32 __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv,
kmp_int32 sv);
extern kmp_int32 __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv,
kmp_int64 sv);
// __kmp_compare_and_store_ret<bits>: same parameters, but the return type
// always matches the operand width.
extern kmp_int8 __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv,
kmp_int8 sv);
extern kmp_int16 __kmp_compare_and_store_ret16(volatile kmp_int16 *p,
kmp_int16 cv, kmp_int16 sv);
extern kmp_int32 __kmp_compare_and_store_ret32(volatile kmp_int32 *p,
kmp_int32 cv, kmp_int32 sv);
extern kmp_int64 __kmp_compare_and_store_ret64(volatile kmp_int64 *p,
kmp_int64 cv, kmp_int64 sv);
// __kmp_xchg_<kind><bits>: take a target and a new value, return a value of
// the same type, for fixed-width integers and for 32/64-bit reals.
extern kmp_int8 __kmp_xchg_fixed8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int16 __kmp_xchg_fixed16(volatile kmp_int16 *p, kmp_int16 v);
extern kmp_int32 __kmp_xchg_fixed32(volatile kmp_int32 *p, kmp_int32 v);
extern kmp_int64 __kmp_xchg_fixed64(volatile kmp_int64 *p, kmp_int64 v);
extern kmp_real32 __kmp_xchg_real32(volatile kmp_real32 *p, kmp_real32 v);
extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
//#define KMP_TEST_THEN_INC32(p) __kmp_test_then_add32((p), 1)
//#define KMP_TEST_THEN_INC_ACQ32(p) __kmp_test_then_add32((p), 1)
#define KMP_TEST_THEN_INC64(p) __kmp_test_then_add64((p), 1LL)
#define KMP_TEST_THEN_INC_ACQ64(p) __kmp_test_then_add64((p), 1LL)
//#define KMP_TEST_THEN_ADD4_32(p) __kmp_test_then_add32((p), 4)
//#define KMP_TEST_THEN_ADD4_ACQ32(p) __kmp_test_then_add32((p), 4)
#define KMP_TEST_THEN_ADD4_64(p) __kmp_test_then_add64((p), 4LL)
#define KMP_TEST_THEN_ADD4_ACQ64(p) __kmp_test_then_add64((p), 4LL)
//#define KMP_TEST_THEN_DEC32(p) __kmp_test_then_add32((p), -1)
//#define KMP_TEST_THEN_DEC_ACQ32(p) __kmp_test_then_add32((p), -1)
#define KMP_TEST_THEN_DEC64(p) __kmp_test_then_add64((p), -1LL)
#define KMP_TEST_THEN_DEC_ACQ64(p) __kmp_test_then_add64((p), -1LL)
//#define KMP_TEST_THEN_ADD32(p, v) __kmp_test_then_add32((p), (v))
#define KMP_TEST_THEN_ADD8(p, v) __kmp_test_then_add8((p), (v))
#define KMP_TEST_THEN_ADD64(p, v) __kmp_test_then_add64((p), (v))
#define KMP_TEST_THEN_OR8(p, v) __kmp_test_then_or8((p), (v))
#define KMP_TEST_THEN_AND8(p, v) __kmp_test_then_and8((p), (v))
#define KMP_TEST_THEN_OR32(p, v) __kmp_test_then_or32((p), (v))
#define KMP_TEST_THEN_AND32(p, v) __kmp_test_then_and32((p), (v))
#define KMP_TEST_THEN_OR64(p, v) __kmp_test_then_or64((p), (v))
#define KMP_TEST_THEN_AND64(p, v) __kmp_test_then_and64((p), (v))
#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \
__kmp_compare_and_store8((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \
__kmp_compare_and_store8((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \
__kmp_compare_and_store16((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \
__kmp_compare_and_store16((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \
__kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \
__kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \
__kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \
__kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
#if KMP_ARCH_X86
#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \
__kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
#else /* 64 bit pointers */
#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \
__kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
#endif /* KMP_ARCH_X86 */
#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \
__kmp_compare_and_store_ret8((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \
__kmp_compare_and_store_ret16((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \
__kmp_compare_and_store_ret64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
// Exchange wrappers that forward to the __kmp_xchg_* routines.
// The expansions carry no trailing semicolon, so each macro can be used as an
// expression (on the right-hand side of an assignment, inside a condition,
// as a function argument); the use site supplies the statement terminator.
// This matches the KMP_XCHG_FIXED32 / KMP_XCHG_FIXED64 definitions that are
// built on InterlockedExchange.
#define KMP_XCHG_FIXED8(p, v) \
  __kmp_xchg_fixed8((volatile kmp_int8 *)(p), (kmp_int8)(v))
#define KMP_XCHG_FIXED16(p, v) __kmp_xchg_fixed16((p), (v))
// The three variants below are kept disabled, as in the original.
//#define KMP_XCHG_FIXED32(p, v) __kmp_xchg_fixed32((p), (v));
//#define KMP_XCHG_FIXED64(p, v) __kmp_xchg_fixed64((p), (v));
//#define KMP_XCHG_REAL32(p, v) __kmp_xchg_real32((p), (v));
#define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v))
#elif (KMP_ASM_INTRINS && KMP_OS_UNIX) || !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
// In this branch the KMP_TEST_THEN_* macros expand directly to compiler
// builtins (__sync_* and __atomic_*) rather than to calls of runtime helpers.
/* cast p to correct type so that proper intrinsic will be used */
// 32-bit fetch-and-increment; the plain and _ACQ spellings expand identically.
#define KMP_TEST_THEN_INC32(p) \
__sync_fetch_and_add((volatile kmp_int32 *)(p), 1)
#define KMP_TEST_THEN_INC_ACQ32(p) \
__sync_fetch_and_add((volatile kmp_int32 *)(p), 1)
// 64-bit fetch-and-increment expressed with __atomic_fetch_add and
// __ATOMIC_SEQ_CST ordering.
#define KMP_TEST_THEN_INC64(p) \
__atomic_fetch_add((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST)
#define KMP_TEST_THEN_INC_ACQ64(p) \
__atomic_fetch_add((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST)
// NOTE(review): the two 64-bit increment macros are defined a second time
// below, now via __sync_fetch_and_add. The two variants were presumably
// selected by a preprocessor conditional that is not visible in this copy --
// confirm against the full header.
#define KMP_TEST_THEN_INC64(p) \
__sync_fetch_and_add((volatile kmp_int64 *)(p), 1LL)
#define KMP_TEST_THEN_INC_ACQ64(p) \
__sync_fetch_and_add((volatile kmp_int64 *)(p), 1LL)
// 32-bit fetch-and-add of the constant 4.
#define KMP_TEST_THEN_ADD4_32(p) \
__sync_fetch_and_add((volatile kmp_int32 *)(p), 4)
#define KMP_TEST_THEN_ADD4_ACQ32(p) \
__sync_fetch_and_add((volatile kmp_int32 *)(p), 4)
// 64-bit add-4 and decrement, __atomic_* flavour (sequentially consistent).
#define KMP_TEST_THEN_ADD4_64(p) \
__atomic_fetch_add((volatile kmp_int64 *)(p), 4LL, __ATOMIC_SEQ_CST)
#define KMP_TEST_THEN_ADD4_ACQ64(p) \
__atomic_fetch_add((volatile kmp_int64 *)(p), 4LL, __ATOMIC_SEQ_CST)
#define KMP_TEST_THEN_DEC64(p) \
__atomic_fetch_sub((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST)
#define KMP_TEST_THEN_DEC_ACQ64(p) \
__atomic_fetch_sub((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST)
// 64-bit add-4 and decrement, __sync_* flavour (second definition of the same
// names; see the note on the duplicated increment macros above).
#define KMP_TEST_THEN_ADD4_64(p) \
__sync_fetch_and_add((volatile kmp_int64 *)(p), 4LL)
#define KMP_TEST_THEN_ADD4_ACQ64(p) \
__sync_fetch_and_add((volatile kmp_int64 *)(p), 4LL)
#define KMP_TEST_THEN_DEC64(p) \
__sync_fetch_and_sub((volatile kmp_int64 *)(p), 1LL)
#define KMP_TEST_THEN_DEC_ACQ64(p) \
__sync_fetch_and_sub((volatile kmp_int64 *)(p), 1LL)
// 32-bit fetch-and-decrement.
#define KMP_TEST_THEN_DEC32(p) \
__sync_fetch_and_sub((volatile kmp_int32 *)(p), 1)
#define KMP_TEST_THEN_DEC_ACQ32(p) \
__sync_fetch_and_sub((volatile kmp_int32 *)(p), 1)
// Fetch-and-add / fetch-and-or / fetch-and-and with a caller-supplied operand.
// Both the pointer and the operand are cast to the width named by the macro.
// NOTE(review): the __atomic_* forms below end in a line continuation whose
// final argument line is not present in this copy, and each 64-bit name is
// then defined again via __sync_*; compare with the full header before
// editing.
#define KMP_TEST_THEN_ADD8(p, v) \
__sync_fetch_and_add((volatile kmp_int8 *)(p), (kmp_int8)(v))
#define KMP_TEST_THEN_ADD32(p, v) \
__sync_fetch_and_add((volatile kmp_int32 *)(p), (kmp_int32)(v))
#define KMP_TEST_THEN_ADD64(p, v) \
__atomic_fetch_add((volatile kmp_uint64 *)(p), (kmp_uint64)(v), \
#define KMP_TEST_THEN_ADD64(p, v) \
__sync_fetch_and_add((volatile kmp_int64 *)(p), (kmp_int64)(v))
// 8-bit bitwise OR / AND.
#define KMP_TEST_THEN_OR8(p, v) \
__sync_fetch_and_or((volatile kmp_int8 *)(p), (kmp_int8)(v))
#define KMP_TEST_THEN_AND8(p, v) \
__sync_fetch_and_and((volatile kmp_int8 *)(p), (kmp_int8)(v))
// 32-bit bitwise OR / AND (operands treated as unsigned).
#define KMP_TEST_THEN_OR32(p, v) \
__sync_fetch_and_or((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
#define KMP_TEST_THEN_AND32(p, v) \
__sync_fetch_and_and((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
// 64-bit bitwise OR / AND: __atomic_* definitions followed by __sync_* ones.
#define KMP_TEST_THEN_OR64(p, v) \
__atomic_fetch_or((volatile kmp_uint64 *)(p), (kmp_uint64)(v), \
#define KMP_TEST_THEN_AND64(p, v) \
__atomic_fetch_and((volatile kmp_uint64 *)(p), (kmp_uint64)(v), \
#define KMP_TEST_THEN_OR64(p, v) \
__sync_fetch_and_or((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
#define KMP_TEST_THEN_AND64(p, v) \
__sync_fetch_and_and((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
// Compare-and-store macros built on __sync_bool_compare_and_swap.
// p is the target address, cv the value compared against, sv the value to
// store. The ACQ and REL spellings of each width start with the same builtin
// call.
// NOTE(review): for the 8/16/32-bit macros the continuation line carrying the
// (sv) argument is missing from this copy; only KMP_COMPARE_AND_STORE_PTR is
// complete here.
#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv), \
#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv), \
#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv), \
#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv), \
#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv), \
#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv), \
#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \
__sync_bool_compare_and_swap((void *volatile *)(p), (void *)(cv), \
(void *)(sv))
// Value-returning compare-and-store macros built on
// __sync_val_compare_and_swap, followed by two 64-bit helper functions built
// on __atomic_compare_exchange with __ATOMIC_SEQ_CST.
// NOTE(review): every definition in this span is truncated in this copy: the
// macros lack their (sv) continuation line, and the helper functions lack the
// last argument line of the __atomic_compare_exchange call and their closing
// braces. The "mips_" prefix suggests the helpers were guarded by a
// MIPS-specific conditional -- confirm against the full header.
#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \
__sync_val_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv), \
#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \
__sync_val_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv), \
#define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) \
__sync_val_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv), \
static inline bool mips_sync_bool_compare_and_swap(
volatile kmp_uint64 *p, kmp_uint64 cv, kmp_uint64 sv) {
return __atomic_compare_exchange(p, &cv, &sv, false, __ATOMIC_SEQ_CST,
static inline bool mips_sync_val_compare_and_swap(
volatile kmp_uint64 *p, kmp_uint64 cv, kmp_uint64 sv) {
__atomic_compare_exchange(p, &cv, &sv, false, __ATOMIC_SEQ_CST,
return cv;
// 64-bit compare-and-store macros. Each name appears twice: first routed
// through the mips_sync_* helper functions, then through the __sync_* builtins.
// NOTE(review): the conditional that chose between the two sets, and the (sv)
// continuation line of every macro, are not present in this copy.
#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \
mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv),\
#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \
mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv),\
#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \
mips_sync_val_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \
#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \
#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \
__sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \
#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \
__sync_val_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \
#define KMP_XCHG_FIXED8(p, v) \
__sync_lock_test_and_set((volatile kmp_uint8 *)(p), (kmp_uint8)(v))
// Fixed-width exchange macros: each expands to __sync_lock_test_and_set on the
// target cast to the unsigned type of the named width, with the new value cast
// to that same type.
#define KMP_XCHG_FIXED16(p, v) \
__sync_lock_test_and_set((volatile kmp_uint16 *)(p), (kmp_uint16)(v))
#define KMP_XCHG_FIXED32(p, v) \
__sync_lock_test_and_set((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
#define KMP_XCHG_FIXED64(p, v) \
__sync_lock_test_and_set((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
// Exchanges the kmp_real32 at *p with v and returns the value obtained from
// __sync_lock_test_and_set.
// The exchange is performed on the 32-bit integer bit pattern: v's storage is
// read through a kmp_uint32 pointer and the result is read back through a
// kmp_real32 pointer, so no numeric conversion takes place.
// NOTE(review): the pointer-cast reinterpretation relies on the compiler
// tolerating this aliasing; the closing brace of the function is missing from
// this copy.
inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) {
kmp_int32 tmp =
__sync_lock_test_and_set((volatile kmp_uint32 *)(p), *(kmp_uint32 *)&v);
return *(kmp_real32 *)&tmp;
// 64-bit counterpart of KMP_XCHG_REAL32: exchanges the kmp_real64 at *p with v
// and returns the value obtained from __sync_lock_test_and_set.
// The value travels as a 64-bit integer bit pattern (reinterpreted through
// pointer casts), not as a converted number.
// NOTE(review): same aliasing caveat as the 32-bit variant; the closing brace
// of the function is missing from this copy.
inline kmp_real64 KMP_XCHG_REAL64(volatile kmp_real64 *p, kmp_real64 v) {
kmp_int64 tmp =
__sync_lock_test_and_set((volatile kmp_uint64 *)(p), *(kmp_uint64 *)&v);
return *(kmp_real64 *)&tmp;
// Declarations of the out-of-line atomic helper routines that the macros
// further down forward to. Only the prototypes are given here; the
// implementations live elsewhere.

// Add / or / and helpers for 8-, 32- and 64-bit operands.
extern kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int32 __kmp_test_then_add32(volatile kmp_int32 *p, kmp_int32 v);
extern kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 v);
extern kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 v);
extern kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 v);
extern kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 v);
extern kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 v);
// Compare-and-store helpers (p = target, cv = compare value, sv = store
// value). Note that the 64-bit variant is declared to return kmp_int32.
extern kmp_int8 __kmp_compare_and_store8(volatile kmp_int8 *p, kmp_int8 cv,
kmp_int8 sv);
extern kmp_int16 __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv,
kmp_int16 sv);
extern kmp_int32 __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv,
kmp_int32 sv);
extern kmp_int32 __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv,
kmp_int64 sv);
// "_ret" compare-and-store helpers; each returns a value of the operand width.
extern kmp_int8 __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv,
kmp_int8 sv);
extern kmp_int16 __kmp_compare_and_store_ret16(volatile kmp_int16 *p,
kmp_int16 cv, kmp_int16 sv);
extern kmp_int32 __kmp_compare_and_store_ret32(volatile kmp_int32 *p,
kmp_int32 cv, kmp_int32 sv);
extern kmp_int64 __kmp_compare_and_store_ret64(volatile kmp_int64 *p,
kmp_int64 cv, kmp_int64 sv);
// Exchange helpers for fixed-width integers and for 32/64-bit reals.
extern kmp_int8 __kmp_xchg_fixed8(volatile kmp_int8 *p, kmp_int8 v);
extern kmp_int16 __kmp_xchg_fixed16(volatile kmp_int16 *p, kmp_int16 v);
extern kmp_int32 __kmp_xchg_fixed32(volatile kmp_int32 *p, kmp_int32 v);
extern kmp_int64 __kmp_xchg_fixed64(volatile kmp_int64 *p, kmp_int64 v);
extern kmp_real32 __kmp_xchg_real32(volatile kmp_real32 *p, kmp_real32 v);
extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
// KMP_TEST_THEN_* macros for the configuration that calls the
// __kmp_test_then_* helper routines. Every macro casts its pointer argument
// to the volatile type of the named width before the call.

// Increment: add 1. The _ACQ spellings expand to the same call.
#define KMP_TEST_THEN_INC32(p) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), 1)
#define KMP_TEST_THEN_INC_ACQ32(p) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), 1)
#define KMP_TEST_THEN_INC64(p) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), 1LL)
#define KMP_TEST_THEN_INC_ACQ64(p) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), 1LL)
// Add the constant 4.
#define KMP_TEST_THEN_ADD4_32(p) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), 4)
#define KMP_TEST_THEN_ADD4_ACQ32(p) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), 4)
#define KMP_TEST_THEN_ADD4_64(p) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), 4LL)
#define KMP_TEST_THEN_ADD4_ACQ64(p) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), 4LL)
// Decrement: implemented as an add of -1, since only add helpers are declared.
#define KMP_TEST_THEN_DEC32(p) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), -1)
#define KMP_TEST_THEN_DEC_ACQ32(p) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), -1)
#define KMP_TEST_THEN_DEC64(p) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), -1LL)
#define KMP_TEST_THEN_DEC_ACQ64(p) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), -1LL)
// Add a caller-supplied operand (cast to the named width).
#define KMP_TEST_THEN_ADD8(p, v) \
__kmp_test_then_add8((volatile kmp_int8 *)(p), (kmp_int8)(v))
#define KMP_TEST_THEN_ADD32(p, v) \
__kmp_test_then_add32((volatile kmp_int32 *)(p), (kmp_int32)(v))
#define KMP_TEST_THEN_ADD64(p, v) \
__kmp_test_then_add64((volatile kmp_int64 *)(p), (kmp_int64)(v))
// Bitwise OR / AND with a caller-supplied operand.
#define KMP_TEST_THEN_OR8(p, v) \
__kmp_test_then_or8((volatile kmp_int8 *)(p), (kmp_int8)(v))
#define KMP_TEST_THEN_AND8(p, v) \
__kmp_test_then_and8((volatile kmp_int8 *)(p), (kmp_int8)(v))
#define KMP_TEST_THEN_OR32(p, v) \
__kmp_test_then_or32((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
#define KMP_TEST_THEN_AND32(p, v) \
__kmp_test_then_and32((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
#define KMP_TEST_THEN_OR64(p, v) \
__kmp_test_then_or64((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
#define KMP_TEST_THEN_AND64(p, v) \
__kmp_test_then_and64((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
// Compare-and-store macros for the configuration that calls the
// __kmp_compare_and_store* helper routines. The ACQ and REL spellings of each
// width begin with the same helper call.
// KMP_COMPARE_AND_STORE_PTR picks the 32-bit helper under KMP_ARCH_X86 and the
// 64-bit helper otherwise.
// NOTE(review): the continuation line carrying the (sv) argument of every
// macro in this span is missing from this copy.
#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \
__kmp_compare_and_store8((volatile kmp_int8 *)(p), (kmp_int8)(cv), \
#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \
__kmp_compare_and_store8((volatile kmp_int8 *)(p), (kmp_int8)(cv), \
#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \
__kmp_compare_and_store16((volatile kmp_int16 *)(p), (kmp_int16)(cv), \
#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \
__kmp_compare_and_store16((volatile kmp_int16 *)(p), (kmp_int16)(cv), \
#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \
__kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \
__kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \
__kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \
__kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
#if KMP_ARCH_X86
#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \
__kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
#else /* 64 bit pointers */
#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \
__kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
#endif /* KMP_ARCH_X86 */
// Value-returning compare-and-store macros forwarding to the
// __kmp_compare_and_store_ret* helpers. The 8- and 16-bit forms pass their
// arguments through without casts; the 32- and 64-bit forms cast the pointer
// and compare value to the signed type of that width.
// NOTE(review): the 32- and 64-bit definitions end in a line continuation
// whose (sv) argument line is missing from this copy.
#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \
__kmp_compare_and_store_ret8((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \
__kmp_compare_and_store_ret16((p), (cv), (sv))
#define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) \
__kmp_compare_and_store_ret32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \
#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \
__kmp_compare_and_store_ret64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \
// Exchange macros forwarding to the __kmp_xchg_* helper routines.
// KMP_XCHG_FIXED8 casts its arguments to kmp_int8; the others pass them
// through unchanged.
// The definitions carry no trailing ';' so that each macro expands to a plain
// expression: it can be used as an initializer or operand, and as a statement
// when the caller supplies the terminating semicolon.
#define KMP_XCHG_FIXED8(p, v) \
__kmp_xchg_fixed8((volatile kmp_int8 *)(p), (kmp_int8)(v))
#define KMP_XCHG_FIXED16(p, v) __kmp_xchg_fixed16((p), (v))
#define KMP_XCHG_FIXED32(p, v) __kmp_xchg_fixed32((p), (v))
#define KMP_XCHG_FIXED64(p, v) __kmp_xchg_fixed64((p), (v))
#define KMP_XCHG_REAL32(p, v) __kmp_xchg_real32((p), (v))
#define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v))
#endif /* KMP_ASM_INTRINS */
/* ------------- relaxed consistency memory model stuff ------------------ */
// KMP_MB / KMP_IMB are the memory-barrier macros. Three alternative
// definitions are visible: an asm("nop") form, an empty form, and a
// __sync_synchronize() form for KMP_MB.
// NOTE(review): apart from "#ifdef __ABSOFT_WIN" the conditionals that select
// between these alternatives, and the "#endif" lines closing the "#ifndef"
// fallbacks below, are not present in this copy.
#ifdef __ABSOFT_WIN
#define KMP_MB() asm("nop")
#define KMP_IMB() asm("nop")
#define KMP_MB() /* _asm{ nop } */
#define KMP_IMB() /* _asm{ nop } */
#endif /* KMP_OS_WINDOWS */
#define KMP_MB() __sync_synchronize()
// Fallbacks used when nothing above supplied a definition: the barriers expand
// to nothing and the release-store / acquire-load macros expand to a plain
// assignment / dereference.
#ifndef KMP_MB
#define KMP_MB() /* nothing to do */
#ifndef KMP_IMB
#define KMP_IMB() /* nothing to do */
#ifndef KMP_ST_REL32
#define KMP_ST_REL32(A, D) (*(A) = (D))
#ifndef KMP_ST_REL64
#define KMP_ST_REL64(A, D) (*(A) = (D))
#ifndef KMP_LD_ACQ32
#define KMP_LD_ACQ32(A) (*(A))
#ifndef KMP_LD_ACQ64
#define KMP_LD_ACQ64(A) (*(A))
/* ------------------------------------------------------------------------ */
// FIXME - maybe this should be
// #define TCR_4(a) (*(volatile kmp_int32 *)(&a))
// #define TCW_4(a,b) (a) = (*(volatile kmp_int32 *)&(b))
// #define TCR_8(a) (*(volatile kmp_int64 *)(a))
// #define TCW_8(a,b) (a) = (*(volatile kmp_int64 *)(&b))
// I'm fairly certain this is the correct thing to do, but I'm afraid
// of performance regressions.

// TCR_n reads, TCW_n writes, TCI_n pre-increments and TCD_n pre-decrements
// an n-byte object. As defined here they are ordinary (non-volatile,
// non-atomic) accesses.
// The TCW_* expansions are not wrapped in outer parentheses, so they are only
// suitable as complete statements.
#define TCR_1(a) (a)
#define TCW_1(a, b) (a) = (b)
#define TCR_4(a) (a)
#define TCW_4(a, b) (a) = (b)
#define TCI_4(a) (++(a))
#define TCD_4(a) (--(a))
#define TCR_8(a) (a)
#define TCW_8(a, b) (a) = (b)
#define TCI_8(a) (++(a))
#define TCD_8(a) (--(a))
// The _SYNC read/write forms are also plain accesses; only TCX_SYNC_n is
// atomic: it compare-and-stores c into a when a equals b, via the
// KMP_COMPARE_AND_STORE_REL macro of the matching width.
#define TCR_SYNC_4(a) (a)
#define TCW_SYNC_4(a, b) (a) = (b)
#define TCX_SYNC_4(a, b, c) \
KMP_COMPARE_AND_STORE_REL32((volatile kmp_int32 *)(volatile void *)&(a), \
(kmp_int32)(b), (kmp_int32)(c))
#define TCR_SYNC_8(a) (a)
#define TCW_SYNC_8(a, b) (a) = (b)
#define TCX_SYNC_8(a, b, c) \
KMP_COMPARE_AND_STORE_REL64((volatile kmp_int64 *)(volatile void *)&(a), \
(kmp_int64)(b), (kmp_int64)(c))
// Pointer-sized variants of the TC* macros: the first group maps onto the
// 4-byte forms, the group after "#else" onto the 8-byte forms. The read and
// compare-exchange results are cast to void *.
// NOTE(review): the opening "#if" of this conditional is not present in this
// copy; the closing comment indicates it tests KMP_ARCH_X86.
// What about ARM?
#define TCR_PTR(a) ((void *)TCR_4(a))
#define TCW_PTR(a, b) TCW_4((a), (b))
#define TCR_SYNC_PTR(a) ((void *)TCR_SYNC_4(a))
#define TCW_SYNC_PTR(a, b) TCW_SYNC_4((a), (b))
#define TCX_SYNC_PTR(a, b, c) ((void *)TCX_SYNC_4((a), (b), (c)))
#else /* 64 bit pointers */
#define TCR_PTR(a) ((void *)TCR_8(a))
#define TCW_PTR(a, b) TCW_8((a), (b))
#define TCR_SYNC_PTR(a) ((void *)TCR_SYNC_8(a))
#define TCW_SYNC_PTR(a, b) TCW_SYNC_8((a), (b))
#define TCX_SYNC_PTR(a, b, c) ((void *)TCX_SYNC_8((a), (b), (c)))
#endif /* KMP_ARCH_X86 */
/* If these FTN_{TRUE,FALSE} values change, may need to change several places
where they are used to check that language is Fortran, not C. */
// NOTE(review): the "#define" and "#endif" lines belonging to the two
// "#ifndef" guards below are not present in this copy.
#ifndef FTN_TRUE
#ifndef FTN_FALSE
// Function-pointer type taking two int pointers (gtid, npr) followed by
// variadic arguments, and returning nothing.
typedef void (*microtask_t)(int *gtid, int *npr, ...);
// VOLATILE_CAST has two alternative definitions: one adds a volatile
// qualifier to the cast, the other is a plain cast. NOTE(review): the
// conditional selecting between them is not visible here.
#define VOLATILE_CAST(x) (volatile x)
#define VOLATILE_CAST(x) (x)
// Aliases binding the generic wait/compare names to the *_4 routines.
#define KMP_WAIT __kmp_wait_4
#define KMP_WAIT_PTR __kmp_wait_4_ptr
#define KMP_EQ __kmp_eq_4
#define KMP_NEQ __kmp_neq_4
#define KMP_LT __kmp_lt_4
#define KMP_GE __kmp_ge_4
#define KMP_LE __kmp_le_4
/* Workaround for Intel(R) 64 code gen bug when taking address of static array
* (Intel(R) 64 Tracker #138) */
// Support of BGET usage
// KMP_USE_BGET defaults to 1 unless it was already defined.
#ifndef KMP_USE_BGET
#define KMP_USE_BGET 1
// Switches for OSS builds
// Enable dynamic user lock
// Enable Intel(R) Transactional Synchronization Extensions (Intel(R) TSX) if
// dynamic user lock is turned on
// Visual studio can't handle the asm sections in this code
// Enable tick time conversion of ticks to seconds
// Warning levels
// Enumerator values: off = 0, low = 1 (implicit), explicit = 6, verbose = 7
// (implicit, one past kmp_warnings_explicit).
// NOTE(review): the "};" that closes the enum is missing from this copy; the
// brace below closes an extern "C" block opened earlier in the header.
enum kmp_warnings_level {
kmp_warnings_off = 0, /* No warnings */
kmp_warnings_low, /* Minimal warnings (default) */
kmp_warnings_explicit = 6, /* Explicitly set to ON - more warnings */
kmp_warnings_verbose /* reserved */
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
// Thin wrappers over std::atomic member functions.
// `p` is a pointer to a std::atomic object and `order` is the suffix of a
// std::memory_order_* enumerator (relaxed, acquire, release, acq_rel, ...).
#define KMP_ATOMIC_OP(op, p, v, order) (p)->op(v, std::memory_order_##order)
#define KMP_ATOMIC_LD(p, order) (p)->load(std::memory_order_##order)

// Loads and stores with an explicit, non-default ordering.
#define KMP_ATOMIC_LD_RLX(p) KMP_ATOMIC_LD(p, relaxed)
#define KMP_ATOMIC_LD_ACQ(p) KMP_ATOMIC_LD(p, acquire)
#define KMP_ATOMIC_ST_RLX(p, v) KMP_ATOMIC_OP(store, p, v, relaxed)
#define KMP_ATOMIC_ST_REL(p, v) KMP_ATOMIC_OP(store, p, v, release)

// Read-modify-write operations with acq_rel ordering; each yields the result
// of the underlying fetch_<op> call.
#define KMP_ATOMIC_ADD(p, v) KMP_ATOMIC_OP(fetch_add, p, v, acq_rel)
#define KMP_ATOMIC_SUB(p, v) KMP_ATOMIC_OP(fetch_sub, p, v, acq_rel)
#define KMP_ATOMIC_OR(p, v) KMP_ATOMIC_OP(fetch_or, p, v, acq_rel)
#define KMP_ATOMIC_AND(p, v) KMP_ATOMIC_OP(fetch_and, p, v, acq_rel)
// Increment / decrement are add / subtract of the constant 1.
#define KMP_ATOMIC_INC(p) KMP_ATOMIC_ADD(p, 1)
#define KMP_ATOMIC_DEC(p) KMP_ATOMIC_SUB(p, 1)

// Relaxed-ordering add and increment.
#define KMP_ATOMIC_ADD_RLX(p, v) KMP_ATOMIC_OP(fetch_add, p, v, relaxed)
#define KMP_ATOMIC_INC_RLX(p) KMP_ATOMIC_ADD_RLX(p, 1)
// Callers of the following functions cannot see the side effect on "expected".
// Strong compare-and-exchange on *p: if *p equals `expected`, stores `desired`
// and returns true; otherwise leaves *p unchanged and returns false.
// Ordering is acq_rel on success and relaxed on failure.
// `expected` is taken by value, so the observed value that
// compare_exchange_strong writes into it on failure is not visible to the
// caller.
template <typename T>
bool __kmp_atomic_compare_store(std::atomic<T> *p, T expected, T desired) {
  return p->compare_exchange_strong(
      expected, desired, std::memory_order_acq_rel, std::memory_order_relaxed);
}
// Strong compare-and-exchange on *p with acquire ordering on success and
// relaxed ordering on failure. Returns true when `desired` was stored.
// `expected` is taken by value, so the observed value written into it on
// failure is not visible to the caller.
template <typename T>
bool __kmp_atomic_compare_store_acq(std::atomic<T> *p, T expected, T desired) {
  return p->compare_exchange_strong(
      expected, desired, std::memory_order_acquire, std::memory_order_relaxed);
}
// Strong compare-and-exchange on *p with release ordering on success and
// relaxed ordering on failure. Returns true when `desired` was stored.
// `expected` is taken by value, so the observed value written into it on
// failure is not visible to the caller.
template <typename T>
bool __kmp_atomic_compare_store_rel(std::atomic<T> *p, T expected, T desired) {
  return p->compare_exchange_strong(
      expected, desired, std::memory_order_release, std::memory_order_relaxed);
}
#endif /* KMP_OS_H */
// Safe C API
#include "kmp_safe_c_api.h"
diff --git a/openmp/runtime/src/ompt-specific.cpp b/openmp/runtime/src/ompt-specific.cpp
index a7288f08a661..9be699110fc6 100644
--- a/openmp/runtime/src/ompt-specific.cpp
+++ b/openmp/runtime/src/ompt-specific.cpp
@@ -1,504 +1,504 @@
* ompt-specific.cpp -- OMPT internal functions
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// include files
#include "kmp.h"
#include "ompt-specific.h"
#include <dlfcn.h>
// THREAD_LOCAL is the thread-local storage specifier used in this file.
// NOTE(review): both the __declspec(thread) and the __thread spellings appear
// here; the conditional that picks one of them is not present in this copy.
#define THREAD_LOCAL __declspec(thread)
#define THREAD_LOCAL __thread
// macros
// Yields the ompt_serialized_team_info member found under the `t` member of
// the given team pointer.
#define LWT_FROM_TEAM(team) (team)->t.ompt_serialized_team_info
// private operations
// traverse the team and task hierarchy
// note: __ompt_get_teaminfo and __ompt_get_task_info_object
// traverse the hierarchy similarly and need to be
// kept consistent
//
// Returns the team-info record reached after walking `depth` levels up from
// the calling thread's current team, visiting a team's lightweight
// (serialized) teams before moving to its parent team.
//   depth - number of levels to walk; the loop runs while depth > 0.
//   size  - optional out-parameter; when non-NULL it receives 1 for a
//           lightweight team or t_nproc for a heavyweight team.
// Returns NULL when the thread has no team or no record is found.
// NOTE(review): closing braces and the statement that decrements `depth` are
// not present in this copy of the function.
ompt_team_info_t *__ompt_get_teaminfo(int depth, int *size) {
kmp_info_t *thr = ompt_get_thread();
if (thr) {
kmp_team *team = thr->th.th_team;
if (team == NULL)
return NULL;
// next_lwt holds the not-yet-visited lightweight list of `team`; lwt is the
// lightweight team currently being looked at (NULL while on a heavyweight
// team).
ompt_lw_taskteam_t *next_lwt = LWT_FROM_TEAM(team), *lwt = NULL;
while (depth > 0) {
// next lightweight team (if any)
if (lwt)
lwt = lwt->parent;
// next heavyweight team (if any) after
// lightweight teams are exhausted
if (!lwt && team) {
if (next_lwt) {
lwt = next_lwt;
next_lwt = NULL;
} else {
team = team->t.t_parent;
if (team) {
next_lwt = LWT_FROM_TEAM(team);
if (lwt) {
// lightweight teams have one task
if (size)
*size = 1;
// return team info for lightweight team
return &lwt->ompt_team_info;
} else if (team) {
// extract size from heavyweight team
if (size)
*size = team->t.t_nproc;
// return team info for heavyweight team
return &team->t.ompt_team_info;
return NULL;
// Returns the task-info record reached after walking `depth` levels up from
// the calling thread's current task. At each step the lightweight teams of
// the task's team are visited before moving to the parent task (td_parent).
// The result is the ompt_task_info of the lightweight team if the walk ended
// on one, otherwise that of the task; it stays NULL when nothing is found.
// NOTE(review): closing braces and the statement that decrements `depth` are
// not present in this copy of the function.
ompt_task_info_t *__ompt_get_task_info_object(int depth) {
ompt_task_info_t *info = NULL;
kmp_info_t *thr = ompt_get_thread();
if (thr) {
kmp_taskdata_t *taskdata = thr->th.th_current_task;
// lwt: lightweight team currently visited; next_lwt: pending lightweight list
// of the current task's team.
ompt_lw_taskteam_t *lwt = NULL,
*next_lwt = LWT_FROM_TEAM(taskdata->td_team);
while (depth > 0) {
// next lightweight team (if any)
if (lwt)
lwt = lwt->parent;
// next heavyweight team (if any) after
// lightweight teams are exhausted
if (!lwt && taskdata) {
if (next_lwt) {
lwt = next_lwt;
next_lwt = NULL;
} else {
taskdata = taskdata->td_parent;
if (taskdata) {
next_lwt = LWT_FROM_TEAM(taskdata->td_team);
if (lwt) {
info = &lwt->ompt_task_info;
} else if (taskdata) {
info = &taskdata->ompt_task_info;
return info;
// Like __ompt_get_task_info_object, but when stepping up from a task it first
// follows ompt_task_info.scheduling_parent if that is set, and only otherwise
// falls back to the lightweight teams and then td_parent.
// Returns the ompt_task_info of the lightweight team or task the walk ended
// on, or NULL when nothing is found.
// NOTE(review): closing braces and the statement that decrements `depth` are
// not present in this copy of the function.
ompt_task_info_t *__ompt_get_scheduling_taskinfo(int depth) {
ompt_task_info_t *info = NULL;
kmp_info_t *thr = ompt_get_thread();
if (thr) {
kmp_taskdata_t *taskdata = thr->th.th_current_task;
ompt_lw_taskteam_t *lwt = NULL,
*next_lwt = LWT_FROM_TEAM(taskdata->td_team);
while (depth > 0) {
// next lightweight team (if any)
if (lwt)
lwt = lwt->parent;
// next heavyweight team (if any) after
// lightweight teams are exhausted
if (!lwt && taskdata) {
// first try scheduling parent (for explicit task scheduling)
if (taskdata->ompt_task_info.scheduling_parent) {
taskdata = taskdata->ompt_task_info.scheduling_parent;
} else if (next_lwt) {
lwt = next_lwt;
next_lwt = NULL;
} else {
// then go for implicit tasks
taskdata = taskdata->td_parent;
if (taskdata) {
next_lwt = LWT_FROM_TEAM(taskdata->td_team);
if (lwt) {
info = &lwt->ompt_task_info;
} else if (taskdata) {
info = &taskdata->ompt_task_info;
return info;
// interface operations
// thread support
// Returns the address of the calling thread's OMPT thread_data slot.
// Yields NULL when __kmp_get_gtid() is negative or when ompt_get_thread()
// returns NULL.
// NOTE(review): closing braces are not present in this copy.
ompt_data_t *__ompt_get_thread_data_internal() {
if (__kmp_get_gtid() >= 0) {
kmp_info_t *thread = ompt_get_thread();
if (thread == NULL)
return NULL;
return &(thread->th.ompt_thread_info.thread_data);
return NULL;
// state support
// Records `variable` as the wait id of the calling thread: the pointer is
// converted through uintptr_t to ompt_wait_id_t and stored in
// ompt_thread_info.wait_id. Does nothing when ompt_get_thread() returns NULL.
// NOTE(review): the closing brace is not present in this copy.
void __ompt_thread_assign_wait_id(void *variable) {
kmp_info_t *ti = ompt_get_thread();
if (ti)
ti->th.ompt_thread_info.wait_id = (ompt_wait_id_t)(uintptr_t)variable;
// Returns the OMPT state stored for the calling thread.
//   omp_wait_id - optional out-parameter; when non-NULL it receives the
//                 thread's stored wait id.
// Returns ompt_state_undefined when ompt_get_thread() yields no thread.
// NOTE(review): closing braces are not present in this copy.
int __ompt_get_state_internal(ompt_wait_id_t *omp_wait_id) {
kmp_info_t *ti = ompt_get_thread();
if (ti) {
if (omp_wait_id)
*omp_wait_id = ti->th.ompt_thread_info.wait_id;
return ti->th.ompt_thread_info.state;
return ompt_state_undefined;
// parallel region support
// Looks up the team info for the given ancestor level via __ompt_get_teaminfo
// and reports it to the caller.
//   ancestor_level - number of levels to walk up.
//   parallel_data  - optional out-parameter; receives the address of the
//                    team's parallel_data, or NULL if no team info was found.
//   team_size      - optional out-parameter forwarded to __ompt_get_teaminfo.
// Returns 2 when team info was found, 0 otherwise (including when
// __kmp_get_gtid() is negative).
// NOTE(review): closing braces are not present in this copy.
int __ompt_get_parallel_info_internal(int ancestor_level,
ompt_data_t **parallel_data,
int *team_size) {
if (__kmp_get_gtid() >= 0) {
ompt_team_info_t *info;
if (team_size) {
info = __ompt_get_teaminfo(ancestor_level, team_size);
} else {
info = __ompt_get_teaminfo(ancestor_level, NULL);
if (parallel_data) {
*parallel_data = info ? &(info->parallel_data) : NULL;
return info ? 2 : 0;
} else {
return 0;
// lightweight task team support
// Initializes a lightweight task team record.
//   lwt      - record to fill in.
//   ompt_pid - parallel data copied (by value) into the record's team info.
//   codeptr  - stored as the team's master_return_address.
// The task data value is zeroed, both frame pointers are set to
// ompt_data_none, and scheduling_parent, heap and parent are cleared.
// `thr` and `gtid` are not used by the statements visible here.
// NOTE(review): the closing brace is not present in this copy.
void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, int gtid,
ompt_data_t *ompt_pid, void *codeptr) {
// initialize parallel_data with input, return address to parallel_data on
// exit
lwt->ompt_team_info.parallel_data = *ompt_pid;
lwt->ompt_team_info.master_return_address = codeptr;
lwt->ompt_task_info.task_data.value = 0;
lwt->ompt_task_info.frame.enter_frame = ompt_data_none;
lwt->ompt_task_info.frame.exit_frame = ompt_data_none;
lwt->ompt_task_info.scheduling_parent = NULL;
lwt->heap = 0;
lwt->parent = 0;
// Makes the team/task info held in `lwt` the current info of thread `thr`.
// When `always` is set or the thread's team has t_serialized > 1, the current
// team and task info are saved into a link record (a fresh __kmp_allocate'd
// record when on_heap is non-zero, otherwise `lwt` itself), the values from
// `lwt` become current, and the link record is put at the head of the team's
// ompt_serialized_team_info list.
// Otherwise the values from `lwt` are simply copied into the current team and
// task info.
// NOTE(review): closing braces and the initializer of `my_parent` are not
// present in this copy; the unlink routine suggests it is the previous list
// head -- confirm against the full source.
void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
int on_heap, bool always) {
ompt_lw_taskteam_t *link_lwt = lwt;
if (always ||
thr->th.th_team->t.t_serialized >
1) { // we already have a team, so link the new team and swap values
if (on_heap) { // the lw_taskteam cannot stay on stack, allocate it on heap
link_lwt =
(ompt_lw_taskteam_t *)__kmp_allocate(sizeof(ompt_lw_taskteam_t));
link_lwt->heap = on_heap;
// would be swap in the (on_stack) case.
// The temporaries are needed because link_lwt may alias lwt.
ompt_team_info_t tmp_team = lwt->ompt_team_info;
link_lwt->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr);
*OMPT_CUR_TEAM_INFO(thr) = tmp_team;
ompt_task_info_t tmp_task = lwt->ompt_task_info;
link_lwt->ompt_task_info = *OMPT_CUR_TASK_INFO(thr);
*OMPT_CUR_TASK_INFO(thr) = tmp_task;
// link the taskteam into the list of taskteams:
ompt_lw_taskteam_t *my_parent =
link_lwt->parent = my_parent;
thr->th.th_team->t.ompt_serialized_team_info = link_lwt;
} else {
// this is the first serialized team, so we just store the values in the
// team and drop the taskteam-object
*OMPT_CUR_TEAM_INFO(thr) = lwt->ompt_team_info;
*OMPT_CUR_TASK_INFO(thr) = lwt->ompt_task_info;
// Reverses __ompt_lw_taskteam_link for thread `thr`: if the team has a
// lightweight record at the head of ompt_serialized_team_info, that record is
// removed from the list (the head becomes its parent) and its saved team and
// task info are swapped back with the thread's current team and task info.
// NOTE(review): in the heap-allocated case only "lwtask = NULL" is visible; no
// deallocation call appears in this copy -- confirm against the full source.
// Closing braces are also not present here.
void __ompt_lw_taskteam_unlink(kmp_info_t *thr) {
ompt_lw_taskteam_t *lwtask = thr->th.th_team->t.ompt_serialized_team_info;
if (lwtask) {
thr->th.th_team->t.ompt_serialized_team_info = lwtask->parent;
// Swap the saved info with the current info.
ompt_team_info_t tmp_team = lwtask->ompt_team_info;
lwtask->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr);
*OMPT_CUR_TEAM_INFO(thr) = tmp_team;
ompt_task_info_t tmp_task = lwtask->ompt_task_info;
lwtask->ompt_task_info = *OMPT_CUR_TASK_INFO(thr);
*OMPT_CUR_TASK_INFO(thr) = tmp_task;
if (lwtask->heap) {
lwtask = NULL;
// return lwtask;
// task support
// Collects information about the task `ancestor_level` levels above the
// calling thread's current task.
//   ancestor_level - number of levels to walk up; negative values yield 0.
//   type           - optional out; task type flags (ompt_task_implicit for a
//                    lightweight team, ompt_task_initial for a task without
//                    td_parent, otherwise explicit/implicit per
//                    td_flags.tasktype).
//   task_data      - optional out; address of the task's task_data or NULL.
//   task_frame     - optional out; address of the task's frame or NULL.
//   parallel_data  - optional out; address of the team's parallel_data or
//                    NULL.
//   thread_num     - optional out; __kmp_get_tid() for level 0, 0 when the
//                    previous step was a lightweight team, otherwise the
//                    previous team's t_master_tid.
// Returns 2 when task info was found and 0 otherwise (no gtid, no thread, no
// current task, no team, or the walk ran off the hierarchy).
// The walk mirrors __ompt_get_scheduling_taskinfo while additionally tracking
// the team, and the previous team / lightweight team for thread_num.
// NOTE(review): this copy lacks closing braces, a bare "else" before the
// t_master_tid assignment, the operand following '|' in the type expression,
// and the statement that decrements ancestor_level -- compare with the full
// source before changing anything here.
int __ompt_get_task_info_internal(int ancestor_level, int *type,
ompt_data_t **task_data,
ompt_frame_t **task_frame,
ompt_data_t **parallel_data,
int *thread_num) {
if (__kmp_get_gtid() < 0)
return 0;
if (ancestor_level < 0)
return 0;
// copied from __ompt_get_scheduling_taskinfo
ompt_task_info_t *info = NULL;
ompt_team_info_t *team_info = NULL;
kmp_info_t *thr = ompt_get_thread();
// Keep the requested level; ancestor_level itself is the loop counter.
int level = ancestor_level;
if (thr) {
kmp_taskdata_t *taskdata = thr->th.th_current_task;
if (taskdata == NULL)
return 0;
kmp_team *team = thr->th.th_team, *prev_team = NULL;
if (team == NULL)
return 0;
ompt_lw_taskteam_t *lwt = NULL,
*next_lwt = LWT_FROM_TEAM(taskdata->td_team),
*prev_lwt = NULL;
while (ancestor_level > 0) {
// needed for thread_num
prev_team = team;
prev_lwt = lwt;
// next lightweight team (if any)
if (lwt)
lwt = lwt->parent;
// next heavyweight team (if any) after
// lightweight teams are exhausted
if (!lwt && taskdata) {
// first try scheduling parent (for explicit task scheduling)
if (taskdata->ompt_task_info.scheduling_parent) {
taskdata = taskdata->ompt_task_info.scheduling_parent;
} else if (next_lwt) {
lwt = next_lwt;
next_lwt = NULL;
} else {
// then go for implicit tasks
taskdata = taskdata->td_parent;
if (team == NULL)
return 0;
team = team->t.t_parent;
if (taskdata) {
next_lwt = LWT_FROM_TEAM(taskdata->td_team);
if (lwt) {
info = &lwt->ompt_task_info;
team_info = &lwt->ompt_team_info;
if (type) {
*type = ompt_task_implicit;
} else if (taskdata) {
info = &taskdata->ompt_task_info;
team_info = &team->t.ompt_team_info;
if (type) {
if (taskdata->td_parent) {
*type = (taskdata->td_flags.tasktype ? ompt_task_explicit
: ompt_task_implicit) |
} else {
*type = ompt_task_initial;
if (task_data) {
*task_data = info ? &info->task_data : NULL;
if (task_frame) {
// OpenMP spec asks for the scheduling task to be returned.
*task_frame = info ? &info->frame : NULL;
if (parallel_data) {
*parallel_data = team_info ? &(team_info->parallel_data) : NULL;
if (thread_num) {
if (level == 0)
*thread_num = __kmp_get_tid();
else if (prev_lwt)
*thread_num = 0;
*thread_num = prev_team->t.t_master_tid;
// *thread_num = team->t.t_master_tid;
return info ? 2 : 0;
return 0;
// Reports the memory block that follows the kmp_task_t header of the calling
// thread's current explicit task.
//   addr     - out; start address of the block.
//   size     - out; size of the block in bytes.
//   blocknum - must be 0; any other value yields 0.
// Returns 1 on success and 0 when there is no thread, the current task is not
// TASK_EXPLICIT, or the computed size is negative.
// The size starts as td_size_alloc minus sizeof(kmp_taskdata_t) and is then
// reduced by the distance from the task header to the computed start address.
// NOTE(review): the two ret_addr assignments appear back to back in this copy,
// which would make the destructors_thunk test pointless; a bare "else" between
// them was presumably lost -- confirm against the full source. The closing
// brace is also missing.
int __ompt_get_task_memory_internal(void **addr, size_t *size, int blocknum) {
if (blocknum != 0)
return 0; // support only a single block
kmp_info_t *thr = ompt_get_thread();
if (!thr)
return 0;
kmp_taskdata_t *taskdata = thr->th.th_current_task;
kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
if (taskdata->td_flags.tasktype != TASK_EXPLICIT)
return 0; // support only explicit task
void *ret_addr;
int64_t ret_size = taskdata->td_size_alloc - sizeof(kmp_taskdata_t);
// kmp_task_t->data1 is an optional member
if (taskdata->td_flags.destructors_thunk)
ret_addr = &task->data1 + 1;
ret_addr = &task->part_id + 1;
ret_size -= (char *)(ret_addr) - (char *)(task);
if (ret_size < 0)
return 0;
*addr = ret_addr;
*size = ret_size;
return 1;
// team support
// Stores `ompt_pid` (by value) as the parallel_data of the given team's OMPT
// team info.
// NOTE(review): the closing brace is not present in this copy.
void __ompt_team_assign_id(kmp_team_t *team, ompt_data_t ompt_pid) {
team->t.ompt_team_info.parallel_data = ompt_pid;
// misc
// Produces a new id on every call.
// `thread` is a function-local static counter; `ID` is a function-local
// THREAD_LOCAL value. On a thread's first call (ID == 0) the counter is
// advanced with KMP_TEST_THEN_INC64 and the obtained number is shifted into
// the top OMPT_THREAD_ID_BITS bits of ID; every call then returns the
// pre-incremented ID, so the low bits act as a per-thread sequence number.
// NOTE(review): closing braces are not present in this copy.
static uint64_t __ompt_get_unique_id_internal() {
static uint64_t thread = 1;
static THREAD_LOCAL uint64_t ID = 0;
if (ID == 0) {
uint64_t new_thread = KMP_TEST_THEN_INC64((kmp_int64 *)&thread);
ID = new_thread << (sizeof(uint64_t) * 8 - OMPT_THREAD_ID_BITS);
return ++ID;
// Maps a runtime barrier type to the OMPT sync-region kind to report.
//   bt  - runtime barrier type.
//   thr - thread whose th_ident flags refine the answer for plain barriers.
// Decision order:
//   fork/join barrier                 -> barrier_implicit
//   any other non-plain barrier       -> barrier_implementation
//   plain barrier without th_ident    -> barrier
//   KMP_IDENT_BARRIER_EXPL flag set   -> barrier_explicit
//   KMP_IDENT_BARRIER_IMPL flag set   -> barrier_implicit
//   otherwise                         -> barrier_implementation
// NOTE(review): the closing brace is not present in this copy.
ompt_sync_region_t __ompt_get_barrier_kind(enum barrier_type bt,
kmp_info_t *thr) {
if (bt == bs_forkjoin_barrier)
return ompt_sync_region_barrier_implicit;
if (bt != bs_plain_barrier)
return ompt_sync_region_barrier_implementation;
if (!thr->th.th_ident)
return ompt_sync_region_barrier;
kmp_int32 flags = thr->th.th_ident->flags;
if ((flags & KMP_IDENT_BARRIER_EXPL) != 0)
return ompt_sync_region_barrier_explicit;
if ((flags & KMP_IDENT_BARRIER_IMPL) != 0)
return ompt_sync_region_barrier_implicit;
return ompt_sync_region_barrier_implementation;

File Metadata

Mime Type
Wed, Jul 3, 9:54 AM (33 m, 12 s)
Storage Engine
Storage Format
Storage Handle
Default Alt Text
(7 MB)

Event Timeline